####################
#
# Filename: 20110906.R
#
####################
#
# Purpose: Multiple testing (aka
#   post hoc testing), extending
#   the abilities of R, and linear
#   regression.
#

###
# Let us start with post hoc testing

# You have rejected the null hypothesis of an ANOVA
# or a Kruskal-Wallis test (which means at least one
# mean is different from the others).
# Post hoc tests indicate which groups are
# significantly different from one another and
# which are not.

# As usual, let us start by reading in a data file
bio <- read.csv("http://courses.kvasaheim.com/pols6123/data/biome2.csv")
names(bio)
summary(bio)
attach(bio)

# A utility boxplot helps suggest which groups are
# similar and which are not.
boxplot(mfri~biome)

# Remember that we could not use the ANOVA test here,
# so we use the Kruskal-Wallis test.
kruskal.test(mfri~biome)        # Kruskal-Wallis test

# Non-parametric post hoc testing:
# the Kruskal test (NOT the Kruskal-Wallis test).
# The kruskal() function lives in the agricolae
# package, so load that package first.
library(agricolae)
kruskal(mfri, biome)

# Note that this test suggests that TBF and TCF are
# similar enough that they cannot be distinguished, but
# all other biomes are different from this group and
# from each other.

# Now, what if we were able to use the ANOVA test? Well,
# since the assumptions of the ANOVA test are Normality
# and equal variance, we can use these (or, rather, other
# statisticians can use these) to distinguish among the
# groups.

# There are many different parametric post hoc tests. The
# first was Fisher's Least Significant Difference (LSD) test.
# The next was Tukey's Honestly Significant Difference (HSD)
# test. Then, a whole bunch were created. The best
# general-purpose test is Tukey's HSD.

# To perform the HSD test, you will need the agricolae
# package (loaded above for kruskal()):
library(agricolae)

# Now, you can perform the tests (and many others). For
# details on this package, type ?agricolae. For details
# on its available functions, click on 'index' at the
# bottom of that page.
LSD.test(mfri, biome, 4, 25)    # 4  = # groups - 1           (= df1)
HSD.test(mfri, biome, 4, 25)    # 25 = sample size - # groups (= df2)

# As we could not use ANOVA, we should not use these (and
# they gave different groupings from the better test).

# If you do not know df1 and df2, you can get them from
# the original aov function.

##############################################
# Now, let us do linear regression.

# First, load the RFS library for some
# helpful functions:
library(RFS)

# Now, load the data file
ssm <- read.csv("http://courses.kvasaheim.com/pols6123/data/ssm.csv")
names(ssm)
summary(ssm)
attach(ssm)

# Q: What is the correlation between the variables?
cor(ssm)
# The NAs are because of missing values in the
# churchAttendance variable.

# Q: Is the correlation between religPct and civilBan
#    statistically significant?
cor.test(religPct, civilBan)    # yes: p < alpha

# The null hypothesis is that yearPassed, civilBan, and
# religPct do not affect propWin.
mod1 <- lm(propWin ~ yearPassed + civilBan + religPct)
summary(mod1)

# Note the regression table and the adjusted R-squared. The
# adjusted R-squared helps with model selection: models with
# higher adjusted R-squared values are preferred over models
# with lower adjusted R-squared values.

# To see this, let us drop civilBan from our model and
# see if we get a better model.

# The restricted (or reduced) model
mod2 <- lm(propWin ~ yearPassed + religPct)

summary(mod1)   # adjusted R-squared = 0.7565
summary(mod2)   # adjusted R-squared = 0.7356

# As the adjusted R-squared is lower for mod2, we
# prefer mod1 (the one including civilBan).
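
# Aside (a minimal sketch, not part of the original script):
# besides comparing adjusted R-squared values, two nested
# models such as mod2 (restricted) and mod1 (full) can be
# compared with a partial F-test via anova(). This assumes
# both models were fit to the same observations.
anova(mod2, mod1)   # a small p-value favors keeping civilBan (mod1)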
# Now, let us make predictions

# First, let us create the ME (Maine) variable
# that holds all relevant information about Maine.
ME <- data.frame(
  yearPassed = 9,
  civilBan   = 0,
  religPct   = 48
)

# Now, let us predict the proportion of people voting
# in favor of this ballot measure:
predict(mod1, newdata=ME)

# Thus, we expect 42% of Mainers to vote in favor of
# this ballot measure.
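
# Aside (a sketch, not part of the original script): predict()
# can also report uncertainty around this point estimate.
# interval="confidence" gives an interval for the mean response,
# while interval="prediction" gives a wider interval for a single
# new observation; level=0.95 is the default.
predict(mod1, newdata=ME, interval="confidence")
predict(mod1, newdata=ME, interval="prediction")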