######################################## # # Topic: ANOVA and Kruskal-Wallis # # # Script: 20101203.R # # Date: 3 December 2010 # ######################################## ### Functions # Create a function to calculate the logit: logit <- function(x){ log( x / (1-x) ) } ##### # ANOVA example 1 da1 <- read.csv("http://courses.kvasaheim.com/stat4073/data/anova1.csv", header=TRUE) names(da1) attach(da1) group <- as.factor(group) boxplot(width ~ group) summary(aov(width ~ group)) # From the ANOVA table, how many groups? How many data points? # What is the null hypothesis? What is the conclusion? # What are the three assumptions of ANOVA? Are they violated here? detach(da1) rm(list = ls()) ##### # ANOVA example 2 da2 <- read.csv("http://courses.kvasaheim.com/stat4073/data/anova2.csv", header=TRUE) names(da2) attach(da2) group <- as.factor(group) boxplot(width ~ group) summary(aov(width ~ group)) # From the ANOVA table, how many groups? How many data points? # What is the null hypothesis? What is the conclusion? # What are the three assumptions of ANOVA? Are they violated here? detach(da2) rm(list = ls()) ##### # ANOVA example 3 (fixed) da3 <- read.csv("http://courses.kvasaheim.com/stat4073/data/anova3.csv", header=TRUE) names(da3) attach(da3) region <- as.factor(region) boxplot(vote ~ region) summary(aov(vote ~ region)) # From the ANOVA table, how many groups? How many data points? # What is the null hypothesis? What is the conclusion? # What are the three assumptions of ANOVA? Are they violated here? # What do we do since the assumptions really are violated severely? # We can transform the dependent variable (as before). Since vote # is bounded below by 0 and above by 1, we perform a logit transform: vote2 <- logit(vote) boxplot(vote2 ~ region) summary(aov(vote2 ~ region)) # Is this better? detach(da3) rm(list = ls()) ##### # ANOVA example 4 (fixed) da4 <- read.csv("http://courses.kvasaheim.com/stat4073/data/anova4.csv", header=TRUE) names(da4) attach(da4) boxplot(wealth ~ region) summary(aov(wealth ~ region)) # From the ANOVA table, how many groups? How many data points? # What is the null hypothesis? What is the conclusion? # What are the three assumptions of ANOVA? Are they violated here? # What do we do since the assumptions really are violated severely? # We can transform the dependent variable (as before). Since wealth # is bounded below by 0, we perform a log transform: wealth2 <- log(wealth) boxplot(wealth2 ~ region) summary(aov(wealth2 ~ region)) # Is this better? detach(da4) rm(list = ls()) ##### # ANOVA example 5 (not done during class) da5 <- read.csv("http://courses.kvasaheim.com/stat4073/data/anova5.csv", header=TRUE) names(da5) attach(da5) plot <- as.factor(plot) # These are grass heights in six experimental plots boxplot(height ~ plot) summary(aov(height ~ plot)) # From the ANOVA table, how many groups? How many data points? # What is the null hypothesis? What is the conclusion? # What are the three assumptions of ANOVA? Are they violated here? # What do we do since the assumptions really are violated severely? # We can transform the dependent variable (as before). Since wealth # is bounded below by 0, we perform a log transform: height2 <- log(height) boxplot(height2 ~ plot) summary(aov(height2 ~ plot)) # Is this better? # Yes, but let's try some non-parametric stuff. Here, the non-parametric # ANOVA procedure is called the Kruskal-Wallis procedure. It only works # for one-way ANOVAs, however. kruskal.test(height ~ plot) # When the two give different answers, I usually err on the side of # 'fail to reject'. It is the more conservative position, and the # most defensible. detach(da5) rm(list = ls()) ##### # ANOVA example 6: Two-Factor ANOVA da6 <- read.csv("http://courses.kvasaheim.com/stat4073/data/glycerol.csv", header=TRUE) names(da6) attach(da6) # Make sure your window is wide enough to show all of the labels, # otherwise some will not show par(mar=c(3,4,2,2)+0.1) boxplot(foam ~ watertype + glycerol, las=1, ylab="Foaming Level") # Additive model (assumes water and glycerol are # independent in their effects) summary(aov(foam ~ watertype + glycerol)) # Interactive model (does not make that assumption) summary(aov(foam ~ watertype * glycerol)) # Since the interaction term (watertype:glycerol) is not # statistically significant, we conclude that there is no # interaction, that the additive model is correct, and that # the effects of the glycerol and of the water type are # independent of each other. # Thus, the better model is summary(aov(foam ~ watertype + glycerol)) # However, since the effect of the glycerol is not significant, # we drop that from the model and re-fit to get a stronger model: summary(aov(foam ~ watertype)) # Since the water type is statistically significant in this last # model, we conclude that only the water type is helpful in # determining (or affecting) the foaming measurement.