########################################
#
#  Script: 21 April 2011 (20110421.R)
#
########################################

# Today:
#
#  ** Monte Carlo and Testing tests
#     # Testing a different question
#


# Using Monte Carlo methods, we can also estimate the probability
# of events happening when all we have is a point estimate.

# Example: 
#   We modeled the data and predicted that the vote in favor of 
#   Maine's ssm ballot measure would be 45%. That's nice. But, a
#   more interesting question is to ask: "What is the probability
#   that the ballot measure will pass in New Mexico?"
#

# Needed (perhaps) functions

logit     <- function(p) log( p / (1-p) )
logit.inv <- function(x) exp(x)/(1+exp(x)) 


ssm <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ssm.csv", header=TRUE)
names(ssm)
summary(ssm)


# Review Task 1:
#
#   Fit an appropriate model where you can predict the percent
#   voting in favor of the ssm measure based on an additive
#   model of Religiosity, Year, South(-ness), and Civil Union.


p.favor <- ssm$pctfavor/100
n.favor <- p.favor*1000
p.agin  <- 1000-n.favor
y.favor <- cbind(n.favor,p.agin)

mod.yr  <- ssm$year-1998    # Why did I do this? Ask at the histogram!
cu      <- ssm$civilunion
st      <- ssm$south
relig   <- ssm$religiosity


model <- glm(y.favor ~ mod.yr + cu + st + relig, family=quasibinomial(link=logit))
summary(model)

plot(model)



# Review Task 2:
#
#   Predict the outcome of a ssm ballot measure in New Mexico this year (2011).
#   The necessary information for New Mexico and the ballot measure are:
#     Year = 2011 (mod.yr=13)
#     Civil Union = 0
#     South = 0
#     Religiosity = 1

newstate <- data.frame(mod.yr=13, cu=0,st=0,relig=1)
pr.NM <- predict(model, newdata=newstate)
pr.NM <- logit.inv(pr.NM)
pr.NM    # = 0.4608805, a loss




# Review Task 3: 
#
#   Create a prediction curve of percent in favor against year for
#   the two values of South. Only produce the predictions, do
#   not bother to predict the confidence bands. Make sure you add
#   a legend. Use years 1998 through 2012.

new.modyr <- 1998:2012-1998
pr0 <- predict(model, newdata=data.frame(mod.yr=new.modyr, st=0, cu=mean(cu), relig=mean(relig)) )
pr1 <- predict(model, newdata=data.frame(mod.yr=new.modyr, st=1, cu=mean(cu), relig=mean(relig)) )

pr0 <- logit.inv(pr0)
pr1 <- logit.inv(pr1)



  par(mar=c(5,4,2,2)+0.1)
  par(cex=0.6)

  plot(mod.yr+1998,p.favor, pch=16, col=4, cex=0.9, las=1, xlim=c(1998,2012), ylim=c(0.4,1),
    xlab="Year",ylab="Proportion of Vote in Favor")

  points(mod.yr[st==1]+1998,p.favor[st==1], col=2, pch=16)
  lines(new.modyr+1998, pr0, col=4)
  lines(new.modyr+1998, pr1, col=2)

  legend("topright",c("Southern State","Non-Southern State"), col=c(2,4), 
    lwd=1, bty="n", pch=16)

  text(2011,pr.NM,"NM")




# New stuff:

# Now, because of the fitting method (Maximum Likelihood Estimation), the parameter
# estimates are distributed Normally with mean the estimate and variance the square 
# of the standard error. 
#
# BUT, these are just estimates. Thus, were we to rerun the world, we would get 
# parameter estimates that are not the same, but are Normally distributed as
# described above.
#
# The predictions are proportion in favor.
# To win, that proportion must be above 0.500.
#
# See where this is going? If we rerun the world, the coefficients change, thus
# the predictions change, thus the measures may pass or fail. To see how often
# they pass (thus giving a probability of passing), we just perform a Monte
# Carlo experiment of a certain number of trials (10000?), count the numer of
# times the prediction is a win, then divide that by the number of trials to 
# get a probability of the ballot measure passing.
#
#
#
# I want to predict the probability that the ssm ballot measure passes in New Mexico. This
# is a different question than the one we answered previously (pr.NM). To answer this question, 
# we will need to re-run Earth's history a million times and re-reun the analysis each of
# those times ... or, just use the fact that the parameter estimates are Normally distributed:
#
# To wit:


trials <- 1e6

# Create the coefficients

c.int    <- rnorm(trials, m= 0.58389, sd=0.15973)
c.myr    <- rnorm(trials, m=-0.07015, sd=0.01811)
c.cu     <- rnorm(trials, m=-0.17846, sd=0.10392)
c.st     <- rnorm(trials, m= 0.44107, sd=0.12616)
c.relig  <- rnorm(trials, m= 0.17129, sd=0.04534)

# These values came from where?

summary(model)

# Now, we predict the outcome of New Mexico's ballot measure
# with these new coefficients (Where did the 13, 0, and 1 come from?)

p.NM <- c.int + 13*c.myr + 0*c.st + 1*c.relig
p.NM <- logit.inv(p.NM)


    p.NM>0.500                # A vector of TRUE/FALSE depending on whether the measure passed
sum(p.NM>0.500)               # The number of times the measure passed
sum(p.NM>0.500)/trials        # The probability of the measure passing = 0.292903



# Graphics tell the story: A histogram

par(mar=c(5,2,2,2)+0.1)
hist(p.NM, breaks=51, ylab="", yaxt="n",las=1, 
  xlab="Proportion Voting in Favor", main="")
abline(v=0.50, col=2)



# Graphics tell the story (again): A different histogram

h    <- hist(p.NM, plot=FALSE, breaks=10:80/100)
pass <- seq(0.5,1.0,0.001)+0.001

bin      <- cut(pass, h$breaks)
clr      <- rep("white", length(h$counts))
clr[bin] <- "#FF5A36"

plot(h, col=clr, ylab="", yaxt="n",las=1, 
  xlab="Proportion Voting in Favor", main="")
abline(v=0.50, col=1)


# What are the advantages of that first histogram? Of the second?



# Now, the need for using a transformed year variable (mod.yr) came about
# only because we were going to do some Monte Carlo experiments using the 
# model AND because the standard errors for the intercept were so much
# larger than the other standard errors. For homework, I will have you
# perform the same experiment, but without modifying the year variable to
# see the effects.

# If all we wanted to do was predict the proportion of the vote in New
# Mexico, we would not have to transform the year.