########################################
#
# Script: 1 February 2011 (preliminary)
#
########################################

# Today:
#
# ** Analysis of Variance
#
# But, before we do that, let us look at boxplots some more

# Read in the data
fball <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ncaa2009football.csv", header=TRUE)

boxplot(score~conference, data=fball)    # All of the conferences
boxplot(score~team, data=fball)          # All of the teams

par(mar=c(5,5,2,2)+0.1)                  # Sets the margins, order: (b,l,t,r)
boxplot(score~team, data=fball, horizontal=TRUE, las=1, cex.axis=0.5)   # Better ... maybe

# This colors the SEC conference teams light blue:
boxplot(score~team, data=fball, horizontal=TRUE, las=1, cex.axis=0.5,
        subset=conference=="SEC", col="light blue", add=TRUE)

# This colors the University of Oregon green:
boxplot(score~team, data=fball, horizontal=TRUE, las=1, cex.axis=0.5,
        subset=team=="Oregon", col="green4", add=TRUE)

# This colors Oklahoma State University orange:
boxplot(score~team, data=fball, horizontal=TRUE, las=1, cex.axis=0.5,
        subset=team=="Oklahoma State", col="orange", add=TRUE)

#####
# What if we only want to look at the teams in a specific conference?
# It is not as easy as it should be:

boxplot(score~team, data=fball, subset=conference=="Big 12")

# The easiest (to understand) way of accomplishing our goal is the following procedure
# (a shorter alternative is sketched at the end of this block):

big12      <- fball[fball$conference=="Big 12",]                       # Subset the fball dataset
big12teams <- unique(factor(fball$team[fball$conference=="Big 12"]))   # This lists all of the Big 12 teams
big12$team <- factor(as.character(big12$team), levels=big12teams)      # This is the important part

boxplot(score~team, data=big12, horizontal=TRUE, las=1)                # Now, the appropriate boxplot

boxplot(score~team, data=big12, horizontal=TRUE, las=1, cex.axis=0.5)  # or
par(mar=c(5,8,2,2)+0.1)
boxplot(score~team, data=big12, horizontal=TRUE, las=1)                # which is better?

# Of course, we can make this look more colorful (for Big 12 fans):
par(mar=c(5,8,2,2)+0.1)
boxplot(score~team, data=big12, horizontal=TRUE, las=1,
        border=c("#008000","#FFD700","#C41E3A","#0022B4","#7851A9","#FFCC33",
                 "#FF2400","#DC143C","#FF8C00","#CC5500","#800000","#FF2400"))
# These are the official team colors in hexadecimal #rrggbb (red, green, blue) format.
# I'm not sure this looks better than all-black, but it may be handy in a Big 12-based magazine?
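
##
# (Aside -- a sketch, not part of the procedure above.)  A shorter route to the same
# subsetted boxplot is droplevels(), which strips the unused factor levels after
# subsetting, so the empty non-Big 12 boxes disappear.  This assumes team was read in
# as a factor and that your R version has droplevels() (added in R 2.12.0).  The name
# big12alt is made up here just to avoid clobbering big12.
big12alt      <- fball[fball$conference=="Big 12", ]        # Same subset as before
big12alt$team <- droplevels(big12alt$team)                  # Drop the unused team levels
boxplot(score~team, data=big12alt, horizontal=TRUE, las=1)  # Should match the boxplot above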
# Now, back to analysis

# H0: There is no statistical difference among the Big 12 teams in terms of points per game (ppg)
# Multiple groups -> an ANOVA-like analysis

# Check assumptions of ANOVA:

#1: Normality
shapiro.test(big12$score[big12$team=="Oklahoma State"])           # Not significantly diff from Normal (p=0.39)
with(big12[big12$team=="Oklahoma State",], shapiro.test(score))   # Alternative way of doing this

shapiro.test(big12$score[big12$team=="Baylor"])                   # Not significantly diff from Normal (p=0.06)
shapiro.test(big12$score[big12$team=="Colorado"])                 # Not significantly diff from Normal (p=0.86)
shapiro.test(big12$score[big12$team=="Iowa State"])               # Not significantly diff from Normal (p=0.26)
shapiro.test(big12$score[big12$team=="Kansas"])                   # Not significantly diff from Normal (p=0.61)
shapiro.test(big12$score[big12$team=="Kansas State"])             # Significantly diff from Normal (p=0.04) *
shapiro.test(big12$score[big12$team=="Missouri"])                 # Not significantly diff from Normal (p=0.57)
shapiro.test(big12$score[big12$team=="Nebraska"])                 # Not significantly diff from Normal (p=0.34)
shapiro.test(big12$score[big12$team=="Oklahoma"])                 # Not significantly diff from Normal (p=0.40)
shapiro.test(big12$score[big12$team=="Texas"])                    # Not significantly diff from Normal (p=0.63)
shapiro.test(big12$score[big12$team=="Texas A&M"])                # Not significantly diff from Normal (p=0.50)
shapiro.test(big12$score[big12$team=="Texas Tech"])               # Not significantly diff from Normal (p=0.80)

# Remember the comment about inflated Type I error rates?

#2: Equal variances
bartlett.test(score~team, data=big12)   # Scores do not have different (enough) variances
fligner.test(score~team, data=big12)    # Scores do not have different (enough) variances

# Well, I think ANOVA is appropriate here
model1 <- aov(score~team, data=big12)
summary(model1)
model1$coefficients

# Alternatively, let's do the Kruskal-Wallis test just to make sure we are not too wrong
model2 <- kruskal.test(score~team, data=big12)
model2

# The conclusions are the same

# So, which team is not like the others?  To answer this question, we need to perform
# post-hoc tests.  We will discuss these when we start the Multiple Comparisons chapter.
# (A hedged preview is sketched at the end of this script.)

#####
# Something new, but related

# Let X ~ N(mx, sx) and Y ~ N(my, sy).  Also, let X and Y be independent.  These are the
# assumptions of the usual t-test.  Let us check to see if the t-test actually has a
# Type I error rate of alpha = 0.05:

mx <- 10   # Mean of X
sx <- 10   # SD of X
my <- 10   # Mean of Y
sy <- 10   # SD of Y
ss <- 30   # Sample size

# Thus, the null hypothesis IS true

X <- rnorm(ss, mx, sx)
Y <- rnorm(ss, my, sy)

t.test(X,Y)
t.test(X,Y)$p.value   # The p-value of this test

# To estimate the true Type I error rate:

mx <- 10        # Mean of X
sx <- 10        # SD of X
my <- 10        # Mean of Y
sy <- 10        # SD of Y
ss <- 30        # Sample size
trials <- 1e4   # Number of trials to run (should be large)

p <- numeric()  # Will hold the calculated p-values

for(i in 1:trials) {
  X <- rnorm(ss, mx, sx)
  Y <- rnorm(ss, my, sy)
  p[i] <- t.test(X, Y, var.equal=TRUE)$p.value
}

histdata <- hist(p, breaks=21, las=1, ylab="Rejections", xlab="p-value")
abline(h=trials/20, col=4)

histdata$counts[1]/trials   # The empirical Type I error rate; should be close to 0.05
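
##
# (Aside -- a sketch, not part of the original analysis.)  Two hedged follow-ups:
#
# First, a preview of one common post-hoc test for the ANOVA fit above.  TukeyHSD() on
# model1 gives all pairwise team comparisons with family-wise error control; the
# Multiple Comparisons chapter will treat this properly.
TukeyHSD(model1)   # Pairwise differences in mean score between Big 12 teams

# Second, the empirical Type I error rate can be read straight off the p vector from the
# simulation loop above, without relying on how hist() happened to bin the p-values.
mean(p < 0.05)     # Proportion of the 'trials' tests that (wrongly) reject H0 at alpha = 0.05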