##### SCA-31a
##### 
##### Analysis of Variance
##### 

### This script gives a few examples of the ANOVA procedure


### Preamble

source("http://rfs.kvasaheim.com/stat200.R")
library(agricolae)


### Example 1: Majors
#
#   Does the average GPA differ among the four major types?
#

# GPA samples, one numeric vector per major type.
MNS <- c(3.0, 2.9, 3.3, 2.2, 3.3, 3.4, 2.5)
HSS <- c(3.5, 3.8, 3.1, 3.3, 2.2, 3.4, 4.0)
HUM <- c(3.8, 3.8, 3.7, 3.6, 1.7, 3.3, 2.4, 2.6)
ART <- c(4.0, 4.0, 3.8, 3.7, 3.8, 3.9, 3.8, 3.5)


# Combine the four samples into a single object. groupTransform() is
# defined in the sourced stat200.R; judging by the attach() and the
# formulas below it presumably yields a values column `xv` and a group
# column `gv` -- TODO confirm against stat200.R.
ad <- groupTransform(
  list(MNS, HSS, HUM, ART),
  names = c("MNS", "HSS", "HUM", "ART")
)
attach(ad)


# Assumption checks before the ANOVA. shapiroTest() also comes from
# stat200.R (presumably a per-group normality check); fligner.test() is
# the Fligner-Killeen test of homogeneity of variances.
shapiroTest(xv ~ gv)
fligner.test(xv ~ gv)

# One-way ANOVA of GPA on major type.
gpaMOD <- aov(formula = xv ~ gv)
summary(gpaMOD)


#   Conclusion:
#
#   Since the p-value of 0.0253 is less than our usual 
#   alpha of 0.05, we reject the null hypothesis and 
#   conclude that at least one population mean differs
#   from the others.

# Visual comparison of the groups.
boxplot(xv ~ gv)

# Post-hoc pairwise comparisons on the fitted ANOVA model.
TukeyHSD(gpaMOD)

#  According to the Tukey HSD test, the average GPA of 
#  the MNS majors is significantly lower than that of the
#  ART majors by 0.1 to 1.6 points.


### Example 2: Some School's GPA by source
#

# Load the college data set and put its columns on the search path; the
# remaining examples in this script refer to these columns by bare name.
dt <- read.csv(file = "http://rfs.kvasaheim.com/data/someCollege.csv")
attach(dt)

# Check of GPA within each high-school type (shapiroTest() is defined in
# the sourced stat200.R; presumably a per-group normality test).
shapiroTest(gpa ~ highschool)

# Kruskal-Wallis rank sum test of GPA across high-school types.
kruskal.test(x = gpa, g = highschool)

# agricolae's kruskal() adds the multiple-comparison grouping; its result
# is printed explicitly.
print(kruskal(gpa, highschool))

#   Conclusion
#   Because the p-value of 0.0008 is less than our usual alpha
#   of 0.05, we reject the null hypothesis that all means are
#   equal. 
#
#   According to Kruskal's multiple comparisons test, the
#   mean GPA of public high school students is significantly
#   lower than that of the other groups.
#


### Example 3: Some School's GPA by Gender

# Check of GPA within each gender (shapiroTest() is defined in the
# sourced stat200.R; presumably a per-group normality test).
shapiroTest(gpa ~ gender)

# Kruskal-Wallis rank sum test of GPA across genders.
kruskal.test(x = gpa, g = gender)

#   Conclusion:
#   Because the p-value of 0.7202 is greater than our usual
#   alpha of 0.05, we did not detect a difference in mean
#   GPA between the two genders.


### Example 4: Math SAT vs Gender

# Inspect the raw Math SAT scores first.
summary(math)

# Positions of scores recorded as 0 (presumably placeholders for a missing
# score -- confirm against the data dictionary).
drp <- which(math == 0)

# Keep everything that is not in `drp`. A positive index is used instead
# of `math[-drp]` because `x[-integer(0)]` selects NOTHING: if no score
# were 0, the negative-index form would silently empty both vectors.
mathR <- math[setdiff(seq_along(math), drp)]
gendR <- gender[setdiff(seq_along(gender), drp)]

# Check of the cleaned scores within each gender (shapiroTest() is defined
# in the sourced stat200.R; presumably a per-group normality test).
shapiroTest(mathR ~ gendR)

# Kruskal-Wallis rank sum test of Math SAT across genders.
kruskal.test(mathR, gendR)

# agricolae's kruskal() adds the multiple-comparison grouping.
print(kruskal(mathR, gendR))

#   Conclusion:
#   Because the p-value of 0.0001 is less than our usual
#   alpha of 0.05, we did detect a significant difference
#   in average SAT Math scores between females and males. 
#
#   In fact, females tended to score lower on the SAT Math 
#   than did males.


### Example 5: Reading SAT vs Gender

# Inspect the raw Reading SAT scores first.
summary(reading)

# Positions of scores recorded as 0 (presumably placeholders for a missing
# score -- confirm against the data dictionary).
drp <- which(reading == 0)

# Keep everything that is not in `drp`. A positive index is used instead
# of `reading[-drp]` because `x[-integer(0)]` selects NOTHING: if no score
# were 0, the negative-index form would silently empty both vectors.
# `gendR` is rebuilt here because the dropped rows are chosen from
# `reading` in this example.
readR <- reading[setdiff(seq_along(reading), drp)]
gendR <- gender[setdiff(seq_along(gender), drp)]

# Check of the cleaned scores within each gender (shapiroTest() is defined
# in the sourced stat200.R; presumably a per-group normality test).
shapiroTest(readR ~ gendR)

# Kruskal-Wallis rank sum test of Reading SAT across genders.
kruskal.test(readR ~ gendR)

#   Conclusion: 
#   Because the p-value of 0.6667 is greater than our usual 
#   alpha of 0.05, we cannot reject the null hypothesis. We
#   did not detect a difference in average SAT Reading scores
#   between females and males.


### Example 6: SAT Composite vs Origin

# Inspect the raw composite scores first.
summary(composite)

# Positions where either component score is recorded as 0 (presumably a
# placeholder for a missing score -- confirm against the data dictionary).
drp <- which(reading == 0 | math == 0)

# Keep everything that is not in `drp`. A positive index is used instead
# of `composite[-drp]` because `x[-integer(0)]` selects NOTHING: if no
# score were 0, the negative-index form would silently empty both vectors.
compR <- composite[setdiff(seq_along(composite), drp)]
highR <- highschool[setdiff(seq_along(highschool), drp)]

# Assumption checks: shapiroTest() is defined in the sourced stat200.R
# (presumably a per-group normality test); fligner.test() is the
# Fligner-Killeen test of homogeneity of variances.
shapiroTest(compR ~ highR)
fligner.test(compR ~ highR)

# Kruskal-Wallis rank sum test of composite score across high-school
# types. Run once: the formula form and the (x, g) form are the same test.
kruskal.test(compR ~ highR)

#   Conclusion: 
#   Because the p-value of 0.1104 is greater than our usual
#   alpha of 0.05, we cannot reject the null hypothesis. We
#   did not detect a difference in the average SAT composite
#   score across the four types of feeder schools.


### Note: You may think that the Fligner-Killeen p-value of 
#   0.04706 is 'close enough' to 0.05 that the equal-variance
#   check should not be counted as failed. That is an especially
#   good observation in light of the effect of multiple testing. 
#
#   And so, let's see if our conclusions are robust to using
#   a different test:

# Robustness check: one-way ANOVA of the cleaned composite score on
# high-school type.
compMOD <- aov(formula = compR ~ highR)
summary(compMOD)

#   Conclusion: 
#   Because the p-value of 0.0856 is greater than our usual
#   alpha of 0.05, we cannot reject the null hypothesis. We
#   did not detect a difference in the average SAT composite
#   score across the four types of feeder schools.

# Visual comparison of the groups.
boxplot(compR ~ highR)