####################
#
# Filename: 20110920.R
#
####################
#
# Today: A full-length linear modeling
# action. Note that there are a
# few general steps:
# 1. Get to know your data!
# 2. Determine your research model
# Determine additive and interaction effects
# 3. Check the assumptions of the method
# 4. Answer the question
##################################################
# Preamble
library(RFS)   # course package; presumably provides logit()/logistic() used below -- confirm
# Read in data (requires a live internet connection to the course server)
ssm <- read.csv("http://courses.kvasaheim.com/pols6123/data/ssm.csv")
# Get to know the data
names(ssm)
summary(ssm)
cor(ssm)       # correlation matrix; cor() requires all-numeric columns
# NOTE(review): attach() is discouraged -- it masks same-named globals and
# makes the script order-dependent. Prefer data= in lm() or with(ssm, ...).
attach(ssm)
# Question: What is the probability that a SSM ballot
# measure will pass in Maine in 2010?
# Research model:
# dep var: propWin
# ind vars: yearPassed
# civilBan
# religPct
# Determine whether they enter additively or with interactions.
# As theory is no help here, let us try all possibilities
# Fit every candidate specification. Passing data = ssm makes each model
# self-contained instead of relying on the attach()ed search path, and it
# records a proper data reference inside each fitted object.
m1 <- lm( propWin ~ yearPassed + civilBan + religPct, data = ssm )
summary(m1) # adjusted R2 = 0.7565
m2 <- lm( propWin ~ yearPassed * civilBan * religPct, data = ssm )
summary(m2) # adjusted R2 = 0.7387
m3 <- lm( propWin ~ yearPassed * civilBan + religPct, data = ssm )
summary(m3) # adjusted R2 = 0.7577
m4 <- lm( propWin ~ yearPassed + civilBan * religPct, data = ssm )
summary(m4) # adjusted R2 = 0.7495
m5 <- lm( propWin ~ civilBan + yearPassed * religPct, data = ssm )
summary(m5) # adjusted R2 = 0.7550
# Compare the adjusted R2 values. Largest indicates the
# best model. In class, we used model m1. Here, we see
# that my short-term memory was wrong; we will use model m3
# Check assumptions
# Independent x-values (no severe multicollinearity)
cor(ssm)
# ssm$ qualification avoids depending on attach()
cor.test(ssm$religPct, ssm$civilBan) # Statistically significant
# But, is it substantively significant? I'm
# not sure. So, let's make a note for the future.
res <- residuals(m3) # residuals of the preferred model, m3
# mean = 0?
mean(res)
# Normally distributed?
shapiro.test(res) # Frowny face, FAIL
# So, let's choose the second best model from above, model m1
res <- residuals(m1) # residuals
# mean = 0?
mean(res) # check
# Normally distributed?
shapiro.test(res) # check
# Constant variance?
# NOTE(review): residuals are conventionally plotted against fitted(m1);
# plotting against the observed response induces an artificial correlation.
plot(res ~ ssm$propWin) # check (no apparent funnel)
# PASS!
# Prediction
# Maine variable: covariate values for the 2010 Maine ballot measure
# (yearPassed=10, civilBan=0, religPct=48 -- presumably from the codebook; confirm)
ME <- data.frame(yearPassed=10, civilBan=0, religPct=48)
# Predict the expected vote share under model m1
predict(m1, newdata=ME)
# Thus the point estimate is 40.4%
# That was not the question, however. To answer the question,
# we need to use Monte Carlo techniques (here, it is more
# specifically termed "parametric bootstrapping").
summary(m1) # to refresh the memory
set.seed(577)    # reproducibility of the Monte Carlo draws
n.sim <- 1000000 # number of draws (1e6); named n.sim because 't' masks base::t()
# Draw each coefficient from its estimated sampling distribution. Reading the
# estimates and standard errors straight off the fitted model avoids the
# transcription/rounding errors of re-typing the summary() output, and the
# full argument names mean=/sd= avoid fragile partial matching (m=/s=).
# NOTE(review): independent draws ignore the coefficients' covariances;
# MASS::mvrnorm(n.sim, coef(m1), vcov(m1)) would respect them.
cf <- coef(summary(m1)) # rows = terms; columns include Estimate, Std. Error
e.cont <- rnorm(n.sim, mean = cf["(Intercept)", "Estimate"], sd = cf["(Intercept)", "Std. Error"])
e.year <- rnorm(n.sim, mean = cf["yearPassed", "Estimate"], sd = cf["yearPassed", "Std. Error"])
e.civb <- rnorm(n.sim, mean = cf["civilBan", "Estimate"], sd = cf["civilBan", "Std. Error"])
e.relp <- rnorm(n.sim, mean = cf["religPct", "Estimate"], sd = cf["religPct", "Std. Error"])
# One prediction per draw at the Maine covariate values (10, 0, 48)
prd <- e.cont + 10*e.year + 0*e.civb + 48*e.relp
# These are the million predictions
# Histogram:
hist(prd)
# or
hist(prd, breaks=-11:111/100)
# or
par(mar=c(4.3,1,1,1))
hist(prd, breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="Vote in Favor of SSM", yaxs="i")
hist(prd[prd>0.500], breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="", add=TRUE, col=2)
# Which histogram is best?
# What proportion are wins? (mean of a logical vector = proportion TRUE)
mean(prd > 0.500)
# What is the 95% confidence interval?
Q13 <- quantile(prd, c(0.025,0.975))
segments(Q13[1],0,Q13[2],0, col=3, lwd=5)
segments(Q13[1],0,0,0, col=2, lwd=5)
segments(1,0,Q13[2],0, col=2, lwd=5)
# What's wrong? We have a prediction that does not make physical sense. In fact,
sum(prd < 0.000) # count of draws below 0% (4 in class, with the seed above)
# are below 0%, and
sum(prd > 1.000) # = 0
# are above 100%
# Why? Linear models assume the response variable can take on all values. When this
# is not the case, the dependent variable needs to be transformed appropriately
lgtWin <- logit(ssm$propWin) # logit-scale response; ssm$ avoids depending on attach()
# Research model? The same five specifications, now on the logit scale.
# data = ssm supplies the predictors; lgtWin is resolved from the global
# environment via standard lm() formula lookup.
md1 <- lm( lgtWin ~ yearPassed + civilBan + religPct, data = ssm )
summary(md1) # adjusted R2 = 0.7918
md2 <- lm( lgtWin ~ yearPassed * civilBan * religPct, data = ssm )
summary(md2) # adjusted R2 = 0.7771
md3 <- lm( lgtWin ~ yearPassed * civilBan + religPct, data = ssm )
summary(md3) # adjusted R2 = 0.7944
md4 <- lm( lgtWin ~ yearPassed + civilBan * religPct, data = ssm )
summary(md4) # adjusted R2 = 0.7845
md5 <- lm( lgtWin ~ civilBan + yearPassed * religPct, data = ssm )
summary(md5) # adjusted R2 = 0.7899
# Which is best? Model md3 (largest adjusted R2)
# Check assumptions
# The independent variables are not changed, so the results from above
# for them apply here, as well.
# Residuals
res <- residuals(md3)
# mean = 0?
mean(res) # Success
# Normal?
shapiro.test(res) # FAIL =(
# The second best model was md1
# Its residuals
res <- residuals(md1)
# mean = 0?
mean(res) # Pass! =)
# Normal?
shapiro.test(res) # Pass! =)
# Constant variance? (NOTE(review): conventionally plotted against fitted(md1))
plot(res~lgtWin, pch=16) # No funnel! =)
# It passes, so we will use this model
# Prediction? (this is on the logit scale)
predict(md1, newdata=ME)
# Oops! We forgot that we need to back-transform
# logistic() -- presumably the inverse of logit() from RFS; confirm
logistic( predict(md1, newdata=ME) )
# We predict that ~37.8% of the voters will vote for the SSM ballot measure.
# But, the question was about the probability of it passing. So, we do
# parametric bootstrapping (as above)
summary(md1) # to refresh the memory
set.seed(577)    # reproducibility of the Monte Carlo draws
n.sim <- 1000000 # number of draws (1e6); named n.sim because 't' masks base::t()
# As before, read the estimates and standard errors from the fitted model
# rather than re-typing them, and spell out mean=/sd= (no partial matching).
# NOTE(review): independent draws ignore the coefficients' covariances;
# MASS::mvrnorm(n.sim, coef(md1), vcov(md1)) would respect them.
cf <- coef(summary(md1)) # rows = terms; columns include Estimate, Std. Error
e.cont <- rnorm(n.sim, mean = cf["(Intercept)", "Estimate"], sd = cf["(Intercept)", "Std. Error"])
e.year <- rnorm(n.sim, mean = cf["yearPassed", "Estimate"], sd = cf["yearPassed", "Std. Error"])
e.civb <- rnorm(n.sim, mean = cf["civilBan", "Estimate"], sd = cf["civilBan", "Std. Error"])
e.relp <- rnorm(n.sim, mean = cf["religPct", "Estimate"], sd = cf["religPct", "Std. Error"])
# One prediction per draw at the Maine covariate values (10, 0, 48)
prd <- e.cont + 10*e.year + 0*e.civb + 48*e.relp
# These are the million predictions, BUT in logit units
# Back-transform!
prd <- logistic(prd)
# Histogram:
hist(prd)
# or
hist(prd, breaks=-11:111/100)
# or
par(mar=c(4.3,1,1,1))
hist(prd, breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="Vote in Favor of SSM", yaxs="i")
hist(prd[prd>0.500], breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="", add=TRUE, col=2)
# Which histogram is best?
# What proportion are wins? (mean of a logical vector = proportion TRUE)
pWin <- mean(prd > 0.500)
pWin
# What is the 95% confidence interval?
Q13 <- quantile(prd, c(0.025,0.975))
segments(Q13[1],0,Q13[2],0, col=3, lwd=5)
segments(Q13[1],0,0,0, col=2, lwd=5)
segments(1,0,Q13[2],0, col=2, lwd=5)
# Label with the computed win probability instead of a hard-coded "p = 0.106",
# so the annotation cannot drift out of sync with the simulation.
text(0.6, 15000, paste0("p = ", round(pWin, 3)))
##################################################
# So, you should be asking the following:
# 1. What are the steps I need to follow to do an analysis?
# 2. What do the graphs tell you? What aspects convey their meanings best?
# 3. Which assumptions need to be tested? How do I test them?
# 4. What is the question and how do I answer it?
# 5. What does each technique tell us?
#
# One of R's greatest strengths is that you can create very intricate graphs.
# One of R's drawbacks is that you have to figure out how to make them.
#