####################
#
# Filename: 20110920.R
#
####################
#
# Today: A full-length linear modeling
# action. Note that there are a
# few general steps:
# 1. Get to know your data!
# 2. Determine your research model
# Determine additive and interaction effects
# 3. Check the assumptions of the method
# 4. Answer the question
##################################################
# Preamble
library(RFS)   # course package; presumably provides logit()/logistic() used below -- confirm
# Read in data (requires a live internet connection to the course server)
ssm <- read.csv("http://courses.kvasaheim.com/pols6123/data/ssm.csv")
# Get to know the data
names(ssm)
summary(ssm)
cor(ssm)       # correlation matrix; cor() requires all-numeric columns
# NOTE(review): attach() is discouraged -- it masks same-named globals and
# makes the script order-dependent. Prefer data= in lm() or with(ssm, ...).
attach(ssm)
# Question: What is the probability that a SSM ballot
# measure will pass in Maine in 2010?
# Research model:
# dep var: propWin
# ind vars: yearPassed
# civilBan
# religPct
# Determine whether they enter additively or with interactions.
# As theory is no help here, let us try all possibilities
# Fit every candidate specification. Passing data = ssm makes each model
# self-contained instead of relying on the attach()ed search path, and it
# records a proper data reference inside each fitted object.
m1 <- lm( propWin ~ yearPassed + civilBan + religPct, data = ssm )
summary(m1) # adjusted R2 = 0.7565
m2 <- lm( propWin ~ yearPassed * civilBan * religPct, data = ssm )
summary(m2) # adjusted R2 = 0.7387
m3 <- lm( propWin ~ yearPassed * civilBan + religPct, data = ssm )
summary(m3) # adjusted R2 = 0.7577
m4 <- lm( propWin ~ yearPassed + civilBan * religPct, data = ssm )
summary(m4) # adjusted R2 = 0.7495
m5 <- lm( propWin ~ civilBan + yearPassed * religPct, data = ssm )
summary(m5) # adjusted R2 = 0.7550
# Compare the adjusted R2 values. Largest indicates the
# best model. In class, we used model m1. Here, we see
# that my short-term memory was wrong; we will use model m3
# Check assumptions
# Independent x-values (no severe multicollinearity)
cor(ssm)
# ssm$ qualification avoids depending on attach()
cor.test(ssm$religPct, ssm$civilBan) # Statistically significant
# But, is it substantively significant? I'm
# not sure. So, let's make a note for the future.
res <- residuals(m3) # residuals of the preferred model, m3
# mean = 0?
mean(res)
# Normally distributed?
shapiro.test(res) # Frowny face, FAIL
# So, let's choose the second best model from above, model m1
res <- residuals(m1) # residuals
# mean = 0?
mean(res) # check
# Normally distributed?
shapiro.test(res) # check
# Constant variance?
# NOTE(review): residuals are conventionally plotted against fitted(m1);
# plotting against the observed response induces an artificial correlation.
plot(res ~ ssm$propWin) # check (no apparent funnel)
# PASS!
# Prediction
# Maine variable: covariate values for the 2010 Maine ballot measure
# (yearPassed=10, civilBan=0, religPct=48 -- presumably from the codebook; confirm)
ME <- data.frame(yearPassed=10, civilBan=0, religPct=48)
# Predict the expected vote share under model m1
predict(m1, newdata=ME)
# Thus the point estimate is 40.4%
# That was not the question, however. To answer the question,
# we need to use Monte Carlo techniques (here, it is more
# specifically termed "parametric bootstrapping").
summary(m1) # to refresh the memory
set.seed(577)    # reproducibility of the Monte Carlo draws
n.sim <- 1000000 # number of draws (1e6); named n.sim because 't' masks base::t()
# Draw each coefficient from its estimated sampling distribution. Reading the
# estimates and standard errors straight off the fitted model avoids the
# transcription/rounding errors of re-typing the summary() output, and the
# full argument names mean=/sd= avoid fragile partial matching (m=/s=).
# NOTE(review): independent draws ignore the coefficients' covariances;
# MASS::mvrnorm(n.sim, coef(m1), vcov(m1)) would respect them.
cf <- coef(summary(m1)) # rows = terms; columns include Estimate, Std. Error
e.cont <- rnorm(n.sim, mean = cf["(Intercept)", "Estimate"], sd = cf["(Intercept)", "Std. Error"])
e.year <- rnorm(n.sim, mean = cf["yearPassed", "Estimate"], sd = cf["yearPassed", "Std. Error"])
e.civb <- rnorm(n.sim, mean = cf["civilBan", "Estimate"], sd = cf["civilBan", "Std. Error"])
e.relp <- rnorm(n.sim, mean = cf["religPct", "Estimate"], sd = cf["religPct", "Std. Error"])
# One prediction per draw at the Maine covariate values (10, 0, 48)
prd <- e.cont + 10*e.year + 0*e.civb + 48*e.relp
# These are the million predictions
# Histogram:
hist(prd)
# or
hist(prd, breaks=-11:111/100)
# or
par(mar=c(4.3,1,1,1))
hist(prd, breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="Vote in Favor of SSM", yaxs="i")
hist(prd[prd>0.500], breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="", add=TRUE, col=2)
# Which histogram is best?
# What proportion are wins? (mean of a logical vector = proportion TRUE)
mean(prd > 0.500)
# What is the 95% confidence interval?
Q13 <- quantile(prd, c(0.025,0.975))
segments(Q13[1],0,Q13[2],0, col=3, lwd=5)
segments(Q13[1],0,0,0, col=2, lwd=5)
segments(1,0,Q13[2],0, col=2, lwd=5)
# What's wrong? We have a prediction that does not make physical sense. In fact,
sum(prd < 0.000) # count of draws below 0% (4 in class, with the seed above)
# are below 0%, and
sum(prd > 1.000) # = 0
# are above 100%
# Why? Linear models assume the response variable can take on all values. When this
# is not the case, the dependent variable needs to be transformed appropriately
lgtWin <- logit(ssm$propWin) # logit-scale response; ssm$ avoids depending on attach()
# Research model? The same five specifications, now on the logit scale.
# data = ssm supplies the predictors; lgtWin is resolved from the global
# environment via standard lm() formula lookup.
md1 <- lm( lgtWin ~ yearPassed + civilBan + religPct, data = ssm )
summary(md1) # adjusted R2 = 0.7918
md2 <- lm( lgtWin ~ yearPassed * civilBan * religPct, data = ssm )
summary(md2) # adjusted R2 = 0.7771
md3 <- lm( lgtWin ~ yearPassed * civilBan + religPct, data = ssm )
summary(md3) # adjusted R2 = 0.7944
md4 <- lm( lgtWin ~ yearPassed + civilBan * religPct, data = ssm )
summary(md4) # adjusted R2 = 0.7845
md5 <- lm( lgtWin ~ civilBan + yearPassed * religPct, data = ssm )
summary(md5) # adjusted R2 = 0.7899
# Which is best? Model md3 (largest adjusted R2)
# Check assumptions
# The independent variables are not changed, so the results from above
# for them apply here, as well.
# Residuals
res <- residuals(md3)
# mean = 0?
mean(res) # Success
# Normal?
shapiro.test(res) # FAIL =(
# The second best model was md1
# Its residuals
res <- residuals(md1)
# mean = 0?
mean(res) # Pass! =)
# Normal?
shapiro.test(res) # Pass! =)
# Constant variance? (NOTE(review): conventionally plotted against fitted(md1))
plot(res~lgtWin, pch=16) # No funnel! =)
# It passes, so we will use this model
# Prediction? (this is on the logit scale)
predict(md1, newdata=ME)
# Oops! We forgot that we need to back-transform
# logistic() -- presumably the inverse of logit() from RFS; confirm
logistic( predict(md1, newdata=ME) )
# We predict that ~37.8% of the voters will vote for the SSM ballot measure.
# But, the question was about the probability of it passing. So, we do
# parametric bootstrapping (as above)
summary(md1) # to refresh the memory
set.seed(577)    # reproducibility of the Monte Carlo draws
n.sim <- 1000000 # number of draws (1e6); named n.sim because 't' masks base::t()
# As before, read the estimates and standard errors from the fitted model
# rather than re-typing them, and spell out mean=/sd= (no partial matching).
# NOTE(review): independent draws ignore the coefficients' covariances;
# MASS::mvrnorm(n.sim, coef(md1), vcov(md1)) would respect them.
cf <- coef(summary(md1)) # rows = terms; columns include Estimate, Std. Error
e.cont <- rnorm(n.sim, mean = cf["(Intercept)", "Estimate"], sd = cf["(Intercept)", "Std. Error"])
e.year <- rnorm(n.sim, mean = cf["yearPassed", "Estimate"], sd = cf["yearPassed", "Std. Error"])
e.civb <- rnorm(n.sim, mean = cf["civilBan", "Estimate"], sd = cf["civilBan", "Std. Error"])
e.relp <- rnorm(n.sim, mean = cf["religPct", "Estimate"], sd = cf["religPct", "Std. Error"])
# One prediction per draw at the Maine covariate values (10, 0, 48)
prd <- e.cont + 10*e.year + 0*e.civb + 48*e.relp
# These are the million predictions, BUT in logit units
# Back-transform!
prd <- logistic(prd)
# Histogram:
hist(prd)
# or
hist(prd, breaks=-11:111/100)
# or
par(mar=c(4.3,1,1,1))
hist(prd, breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="Vote in Favor of SSM", yaxs="i")
hist(prd[prd>0.500], breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="", add=TRUE, col=2)
# Which histogram is best?
# What proportion are wins? (mean of a logical vector = proportion TRUE)
pWin <- mean(prd > 0.500)
pWin
# What is the 95% confidence interval?
Q13 <- quantile(prd, c(0.025,0.975))
segments(Q13[1],0,Q13[2],0, col=3, lwd=5)
segments(Q13[1],0,0,0, col=2, lwd=5)
segments(1,0,Q13[2],0, col=2, lwd=5)
# Label with the computed win probability instead of a hard-coded "p = 0.106",
# so the annotation cannot drift out of sync with the simulation.
text(0.6, 15000, paste0("p = ", round(pWin, 3)))
##################################################
# So, you should be asking the following:
# 1. What are the steps I need to follow to do an analysis?
# 2. What do the graphs tell you? What aspects convey their meanings best?
# 3. Which assumptions need to be tested? How do I test them?
# 4. What is the question and how do I answer it?
# 5. What does each technique tell us?
#
# One of R's greatest strengths is that you can create very intricate graphs.
# One of R's drawbacks is that you have to figure out how to make them.
#