####################
#
# Filename: 20110906.R
#
####################
#
# Purpose: Multiple testing (aka
# post hoc testing), extending
# the abilities of R, and linear
# regression.
#
###
# Let us start with post hoc testing
# You have rejected the null hypothesis of an ANOVA
# or a Kruskal-Wallis test (which means at least one
# mean is different from the others).
# Post hoc tests indicate which groups differ
# significantly and which do not.
# As usual, let us start by reading in a data file
bio <- read.csv("http://courses.kvasaheim.com/pols6123/data/biome2.csv")
names(bio)
summary(bio)
attach(bio)
# A utility boxplot helps suggest which groups are
# similar and which are not.
boxplot(mfri~biome)
# Remember that we could not use the ANOVA test here,
# so we use the Kruskal-Wallis test.
kruskal.test(mfri~biome) # Kruskal-Wallis test
# Non-parametric post hoc testing uses the kruskal() function
# (NOT the kruskal.test() above). It lives in the agricolae
# package, so load that first:
library(agricolae)
kruskal(mfri, biome) # newer agricolae versions need console=TRUE to print
# Note that this test suggests that TBF and TCF are
# similar enough that they cannot be distinguished, but
# all other biomes are different from this group and
# from each other.
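# As a cross-check (base R, not part of the agricolae route),
# pairwise Wilcoxon rank-sum tests with a Holm correction
# give another non-parametric post hoc comparison: pairs
# whose adjusted p-values exceed alpha cannot be distinguished.
pairwise.wilcox.test(mfri, biome, p.adjust.method="holm")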
# Now, what if we had been able to use the ANOVA test? Since
# ANOVA assumes Normality and equal variance, statisticians
# have used those assumptions to build parametric post hoc
# tests that distinguish among the groups.
# There are many of these parametric tests. The first was
# Fisher's Least Significant Difference (LSD) test; the next
# was Tukey's Honestly Significant Difference (HSD) test.
# Many more followed. The best general-purpose choice is
# Tukey's HSD.
# The LSD and HSD functions also come from the agricolae
# package, which we loaded above.
# Now, you can perform the tests (and many others). For
# details on this package, type ?agricolae. For details
# on its available functions, click on 'index' at the
# bottom of that page.
# LSD.test() and HSD.test() take the ERROR degrees of freedom
# and the mean square error (agricolae's DFerror and MSerror)
# as their numeric arguments, not df1 and df2. The simplest
# route is to fit the aov model and pass it directly:
aovmod <- aov(mfri ~ biome)
LSD.test(aovmod, "biome")  # Fisher's LSD
HSD.test(aovmod, "biome")  # Tukey's HSD
# As we could not use ANOVA here, we should not rely on these
# tests (and they give different groupings from the kruskal()
# test above).
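# If you prefer the numeric interface, read the error degrees
# of freedom (the Df of the Residuals row) and the mean square
# error (its Mean Sq) from the ANOVA table:
anova(aovmod)
# then call, e.g., LSD.test(mfri, biome, DFerror, MSerror).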
##############################################
# Now, let us do linear regression.
# First, load the RFS library for some
# helpful functions:
library(RFS)
# Now, load the data file
ssm <- read.csv("http://courses.kvasaheim.com/pols6123/data/ssm.csv")
names(ssm)
summary(ssm)
attach(ssm)
# Q: What is the correlation between the variables?
cor(ssm)
# The NAs are because of missing values in the
# churchAttendance variable.
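# cor() can also be told how to handle those missing values;
# use= is a standard argument of base R's cor():
cor(ssm, use="pairwise.complete.obs")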
# Q: Is the correlation between religPct and civilBan
# statistically significant?
cor.test(religPct,civilBan) # yes: p < alpha
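# The test result can also be stored and inspected; the
# components below are standard parts of an htest object:
ct <- cor.test(religPct, civilBan)
ct$estimate  # the sample correlation
ct$p.value   # compare to alpha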
# The null hypothesis (for each slope) is that yearPassed,
# civilBan, and religPct have no effect on propWin.
mod1 <- lm(propWin ~ yearPassed + civilBan + religPct)
summary(mod1)
# Note the regression table and the adjusted R-squared. The
# adjusted R-squared helps with model selection. Models with
# higher adjusted R-squared values are preferred over models
# with lower adjusted R-squared values.
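# If you want to compare models programmatically, the
# adjusted R-squared can be pulled out of the summary object:
summary(mod1)$adj.r.squared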
# To see this, let us drop civilBan from our model and
# see whether we get a better model.
# The restricted (or reduced) model
mod2 <- lm(propWin ~ yearPassed + religPct)
summary(mod1) # adjusted R-squared = 0.7565
summary(mod2) # adjusted R-squared = 0.7356
# As the adjusted R-squared is lower for mod2, we
# prefer mod1 (the one including civilBan).
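# A more formal comparison of these nested models (not part
# of the original lesson) is the partial F test, which asks
# whether civilBan adds explanatory power:
anova(mod2, mod1)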
# Now, let us make predictions
# First, let us create the ME (Maine) variable
# that holds all relevant information about Maine.
ME <- data.frame( yearPassed=9, civilBan=0, religPct=48 )
# Now, let us predict the proportion of people voting
# in favor of this ballot measure:
predict(mod1, newdata=ME)
# Thus, we expect about 42% of Mainers to vote in favor of
# this ballot measure.
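# predict() can also report the uncertainty around this point
# estimate; interval= is a standard argument of predict.lm():
predict(mod1, newdata=ME, interval="prediction")  # 95% by default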