########################################
#
# Script: 10 February 2011 (20110210.R)
#
########################################

# Today:
#  ** Data transforms
#  ** Two-way Analysis of Variance

d <- read.csv("http://courses.kvasaheim.com/stat40x3/data/gdpcap.csv", header=TRUE)
names(d)
summary(d)

#####
# Let us determine which of these affects wealth in the world

# Effects of each categorical variable?
summary( aov(gdpcap ~ region, data=d) )  # Statistically significant!
summary( aov(gdpcap ~ OPEC,   data=d) )  # Statistically significant!
summary( aov(gdpcap ~ govt,   data=d) )  # Statistically significant!

# But, these effects are /not/ independent:

# Correlations among the three categorical variables?
with(d, table(region, OPEC) )  # correlated means not independent
with(d, table(region, govt) )  # correlated means not independent
with(d, table(govt,   OPEC) )  # correlated means not independent

with(d, chisq.test(region, OPEC) )  # read the p-value
with(d, chisq.test(region, govt) )  # read the p-value
with(d, chisq.test(govt,   OPEC) )  # read the p-value

#####
# Let us now determine their conditional effects. Conditional effects
# are those effects that belong to that variable alone and not to the
# other variables in the analysis. As these three variables are
# correlated, we expect the conditional effects to differ from the
# unconditional effects (above).

model.1 <- aov(gdpcap ~ region + OPEC + govt, data=d)
summary(model.1)

# Wait! The conclusion differs from the one above... why?

# What are the effect levels?
model.1$coefficients

# Interpret these (see notes)

# What about interactions?
model.2 <- aov(gdpcap ~ region * OPEC * govt, data=d)
summary(model.2)

# Note the asterisk to indicate interactions (think of interactions as
# being products of the variables).

# Again, what are the effect levels?
coefficients(model.2)

# Is this larger model 'better enough'?
anova(model.2, model.1)

# The anova() function compares the two models to determine if the larger
# is statistically better at prediction than the smaller. The null
# hypothesis is that there is no statistical difference between the two
# models.

# What about dropping the region:govt interaction?
model.3 <- aov(gdpcap ~ region + OPEC + govt + OPEC:govt + region:OPEC, data=d)
summary(model.3)

# Is this dropping acceptable?
anova(model.2, model.3)

model.4 <- aov(gdpcap ~ region + OPEC + govt + OPEC:govt, data=d)
summary(model.4)

# Is this dropping acceptable?
anova(model.3, model.4)

# So, this is the "minimally acceptable" model
summary(model.4)

# Effect levels:
coefficients(model.4)

# Minimally acceptable models are important for us because they include
# only those variables that are important in predicting our outcome
# variable. If we include non-significant effects, then our predictions
# have a larger variance (lower precision) than otherwise.

# What is the expected gdp per capita in a Democratic, African OPEC member?
2540.4683 + 3377.4044

# Alternatively (and more useful in the future):
ourstate <- data.frame(govt="Democracy", region="Eastern", OPEC="Non-Member")
predict(model.4, newdata=ourstate)

# This is the preferred manner of getting our predictions. It is cleaner
# and can be copy-and-pasted, changing only the values.

##
# The following gives an error. Beware!
ourstate <- data.frame(govt="Democratic", region="Africa", OPEC="Member")
predict(model.4, newdata=ourstate)

# Learn this error, since you will probably see it in the future.
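
##
# A minimal sketch of how to avoid that error: the only spellings
# predict() accepts are the labels actually present in the data, so
# list them first. (The factor() wrapper is just a precaution in case
# the columns were read in as plain character strings; the labels that
# print depend, of course, on the data file itself.)
levels( factor(d$govt)   )  # valid labels for govt
levels( factor(d$region) )  # valid labels for region
levels( factor(d$OPEC)   )  # valid labels for OPEC

# Then build the prediction frame from those exact labels -- here,
# purely for illustration, the first label of each:
ourstate <- data.frame(govt   = levels(factor(d$govt))[1],
                       region = levels(factor(d$region))[1],
                       OPEC   = levels(factor(d$OPEC))[1])
predict(model.4, newdata=ourstate)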
###
# Alternatively:
model.5 <- lm(gdpcap ~ region + OPEC + govt + OPEC:govt, data=d)
summary(model.5)
summary(aov(model.5))

# Differences?

#########################
#
# Now, for Honesty in Government
#
#
# Do this one on your own.

#########################
#
# Data transformation

# This is done when the assumptions of ANOVA are not met, and we really
# want to use ANOVA. The process consists of applying a function (one-to-one
# and onto) to the original data in the hope that the transformed data
# meet the assumptions of ANOVA. Conclusions reached in the transformed
# state are valid in the un-transformed state (predictions need to be back-
# transformed, however).

d <- read.csv("http://courses.kvasaheim.com/stat40x3/data/population1.csv", header=TRUE)
names(d)

boxplot(population~group, data=d)

bartlett.test( population~group, data=d )   # p << 0.0001

shapiro.test( d$population[d$group=="A"] )  # p << 0.0001
shapiro.test( d$population[d$group=="B"] )  # p << 0.0001
shapiro.test( d$population[d$group=="C"] )  # p = 0.0005
shapiro.test( d$population[d$group=="D"] )  # p << 0.0001

# The data fail the assumptions. Let us try the transformation method
# so that we can still use the ANOVA procedure.

# There are several options for transformation:
#  . square
#  . square root
#  . cube
#  . cube root
#  . logarithm
# ANY function that is one-to-one and onto will work.

# Let us try the log transformation:
bartlett.test( log(population)~group, data=d )   # p = 0.255

shapiro.test( log(d$population[d$group=="A"]) )  # p = 0.451
shapiro.test( log(d$population[d$group=="B"]) )  # p = 0.235
shapiro.test( log(d$population[d$group=="C"]) )  # p = 0.302
shapiro.test( log(d$population[d$group=="D"]) )  # p = 0.804

# The log-transformed data meet the assumptions of ANOVA, so let
# us use the ANOVA procedure on the transformed data:
model.tr <- aov( log(population)~group, data=d )
summary( model.tr )

# Because the log function is one-to-one and onto, the conclusions
# regarding statistical significance you reach in the transformed
# space are applicable in the non-transformed space.

# We can also make our predictions, but we have to back-transform
# our predictions:
coefficients(model.tr)
pr <- predict( model.tr, newdata=data.frame(group=c("A","B","C","D")) )
exp(pr)

# Why did I use the exponential function? It is the inverse of the
# logarithm function. The predictions in `pr' are in log-units; they are
# in the transformed space. To get real predictions, we must get back
# into our original measurements. We do that by applying the inverse of
# the transformation function to the predictions.

##################################################
#
# Finally ...

# All of the multiple-comparison tests we covered last time were
# based (in some way) on the parametric t-test. There is a non-
# parametric alternative based on the Kruskal-Wallis test; the
# kruskal() function in the agricolae package runs it and also
# reports the pairwise group comparisons.
library(agricolae)
with(d, kruskal(population, group) )
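
##
# As a cross-check (an addition here, using only base R): kruskal.test()
# ships with R and performs the same omnibus Kruskal-Wallis test. It
# reports only the global p-value, not the pairwise groupings that
# agricolae's kruskal() gives, but it confirms the overall result:
kruskal.test(population ~ group, data=d)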