########################################
#
# Script: 1 February 2011 (preliminary)
#
########################################

# Today:
#
# ** Analysis of Variance
#
# But, before we do that, let us look at boxplots some more

# Read in the data
fball <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ncaa2009football.csv", header=TRUE)

boxplot(score~conference, data=fball)    # All of the conferences
boxplot(score~team, data=fball)          # All of the teams

par(mar=c(5,5,2,2)+0.1)                  # Sets the margins, order: (b,l,t,r)
boxplot(score~team, data=fball, horizontal=TRUE, las=1, cex.axis=0.5)   # Better ... maybe

# This colors the SEC conference teams light blue:
boxplot(score~team, data=fball, horizontal=TRUE, las=1, cex.axis=0.5,
        subset=conference=="SEC", col="light blue", add=TRUE)

# This colors the University of Oregon green:
boxplot(score~team, data=fball, horizontal=TRUE, las=1, cex.axis=0.5,
        subset=team=="Oregon", col="green4", add=TRUE)

# This colors Oklahoma State University orange:
boxplot(score~team, data=fball, horizontal=TRUE, las=1, cex.axis=0.5,
        subset=team=="Oklahoma State", col="orange", add=TRUE)

#####
# What if we only want to look at the teams in a specific conference?
# It is not as easy as it should be:

boxplot(score~team, data=fball, subset=conference=="Big 12")

# The easiest (to understand) way of accomplishing our goal is the following procedure
# (a shorter alternative is sketched at the end of this block):

big12      <- fball[fball$conference=="Big 12",]                       # Subset the fball dataset
big12teams <- unique(factor(fball$team[fball$conference=="Big 12"]))   # This lists all of the Big 12 teams
big12$team <- factor(as.character(big12$team), levels=big12teams)      # This is the important part

boxplot(score~team, data=big12, horizontal=TRUE, las=1)                # Now, the appropriate boxplot

boxplot(score~team, data=big12, horizontal=TRUE, las=1, cex.axis=0.5)  # or
par(mar=c(5,8,2,2)+0.1)
boxplot(score~team, data=big12, horizontal=TRUE, las=1)                # which is better?

# Of course, we can make this look more colorful (for Big 12 fans):
par(mar=c(5,8,2,2)+0.1)
boxplot(score~team, data=big12, horizontal=TRUE, las=1,
        border=c("#008000","#FFD700","#C41E3A","#0022B4","#7851A9","#FFCC33",
                 "#FF2400","#DC143C","#FF8C00","#CC5500","#800000","#FF2400"))
# These are the official team colors in hexadecimal #rrggbb (red, green, blue) format.
# I'm not sure this looks better than all-black, but it may be handy in a Big 12-based magazine?
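
##
# (Aside -- a sketch, not part of the procedure above.)  A shorter route to the same
# subsetted boxplot is droplevels(), which strips the unused factor levels after
# subsetting, so the empty non-Big 12 boxes disappear.  This assumes team was read in
# as a factor and that your R version has droplevels() (added in R 2.12.0).  The name
# big12alt is made up here just to avoid clobbering big12.
big12alt      <- fball[fball$conference=="Big 12", ]        # Same subset as before
big12alt$team <- droplevels(big12alt$team)                  # Drop the unused team levels
boxplot(score~team, data=big12alt, horizontal=TRUE, las=1)  # Should match the boxplot above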
# Now, back to analysis

# H0: There is no statistical difference among the Big 12 teams in terms of points per game (ppg)
# Multiple groups -> an ANOVA-like analysis

# Check assumptions of ANOVA:

#1: Normality
shapiro.test(big12$score[big12$team=="Oklahoma State"])           # Not significantly diff from Normal (p=0.39)
with(big12[big12$team=="Oklahoma State",], shapiro.test(score))   # Alternative way of doing this

shapiro.test(big12$score[big12$team=="Baylor"])                   # Not significantly diff from Normal (p=0.06)
shapiro.test(big12$score[big12$team=="Colorado"])                 # Not significantly diff from Normal (p=0.86)
shapiro.test(big12$score[big12$team=="Iowa State"])               # Not significantly diff from Normal (p=0.26)
shapiro.test(big12$score[big12$team=="Kansas"])                   # Not significantly diff from Normal (p=0.61)
shapiro.test(big12$score[big12$team=="Kansas State"])             # Significantly diff from Normal (p=0.04) *
shapiro.test(big12$score[big12$team=="Missouri"])                 # Not significantly diff from Normal (p=0.57)
shapiro.test(big12$score[big12$team=="Nebraska"])                 # Not significantly diff from Normal (p=0.34)
shapiro.test(big12$score[big12$team=="Oklahoma"])                 # Not significantly diff from Normal (p=0.40)
shapiro.test(big12$score[big12$team=="Texas"])                    # Not significantly diff from Normal (p=0.63)
shapiro.test(big12$score[big12$team=="Texas A&M"])                # Not significantly diff from Normal (p=0.50)
shapiro.test(big12$score[big12$team=="Texas Tech"])               # Not significantly diff from Normal (p=0.80)

# Remember the comment about inflated Type I error rates?

#2: Equal variances
bartlett.test(score~team, data=big12)   # Scores do not have different (enough) variances
fligner.test(score~team, data=big12)    # Scores do not have different (enough) variances

# Well, I think ANOVA is appropriate here
model1 <- aov(score~team, data=big12)
summary(model1)
model1$coefficients

# Alternatively, let's do the Kruskal-Wallis test just to make sure we are not too wrong
model2 <- kruskal.test(score~team, data=big12)
model2

# The conclusions are the same

# So, which team is not like the others?  To answer this question, we need to perform
# post-hoc tests.  We will discuss these when we start the Multiple Comparisons chapter.
# (A hedged preview is sketched at the end of this script.)

#####
# Something new, but related

# Let X ~ N(mx, sx) and Y ~ N(my, sy).  Also, let X and Y be independent.  These are the
# assumptions of the usual t-test.  Let us check to see if the t-test actually has a
# Type I error rate of alpha = 0.05:

mx <- 10   # Mean of X
sx <- 10   # SD of X
my <- 10   # Mean of Y
sy <- 10   # SD of Y
ss <- 30   # Sample size

# Thus, the null hypothesis IS true

X <- rnorm(ss, mx, sx)
Y <- rnorm(ss, my, sy)

t.test(X,Y)
t.test(X,Y)$p.value   # The p-value of this test

# To estimate the true Type I error rate:

mx <- 10        # Mean of X
sx <- 10        # SD of X
my <- 10        # Mean of Y
sy <- 10        # SD of Y
ss <- 30        # Sample size
trials <- 1e4   # Number of trials to run (should be large)

p <- numeric()  # Will hold the calculated p-values

for(i in 1:trials) {
  X <- rnorm(ss, mx, sx)
  Y <- rnorm(ss, my, sy)
  p[i] <- t.test(X, Y, var.equal=TRUE)$p.value
}

histdata <- hist(p, breaks=21, las=1, ylab="Rejections", xlab="p-value")
abline(h=trials/20, col=4)

histdata$counts[1]/trials   # The empirical Type I error rate; should be close to 0.05
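
##
# (Aside -- a sketch, not part of the original analysis.)  Two hedged follow-ups:
#
# First, a preview of one common post-hoc test for the ANOVA fit above.  TukeyHSD() on
# model1 gives all pairwise team comparisons with family-wise error control; the
# Multiple Comparisons chapter will treat this properly.
TukeyHSD(model1)   # Pairwise differences in mean score between Big 12 teams

# Second, the empirical Type I error rate can be read straight off the p vector from the
# simulation loop above, without relying on how hist() happened to bin the p-values.
mean(p < 0.05)     # Proportion of the 'trials' tests that (wrongly) reject H0 at alpha = 0.05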