######################################## # # Script: 21 April 2011 (20110421.R) # ######################################## # Today: # # ** Monte Carlo and Testing tests # # Testing a different question # # Using Monte Carlo methods, we can also estimate the probability # of events happening when all we have is a point estimate. # Example: # We modeled the data and predicted that the vote in favor of # Maine's ssm ballot measure would be 45%. That's nice. But, a # more interesting question is to ask: "What is the probability # that the ballot measure will pass in New Mexico?" # # Needed (perhaps) functions logit <- function(p) log( p / (1-p) ) logit.inv <- function(x) exp(x)/(1+exp(x)) ssm <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ssm.csv", header=TRUE) names(ssm) summary(ssm) # Review Task 1: # # Fit an appropriate model where you can predict the percent # voting in favor of the ssm measure based on an additive # model of Religiosity, Year, South(-ness), and Civil Union. p.favor <- ssm$pctfavor/100 n.favor <- p.favor*1000 p.agin <- 1000-n.favor y.favor <- cbind(n.favor,p.agin) mod.yr <- ssm$year-1998 # Why did I do this? Ask at the histogram! cu <- ssm$civilunion st <- ssm$south relig <- ssm$religiosity model <- glm(y.favor ~ mod.yr + cu + st + relig, family=quasibinomial(link=logit)) summary(model) plot(model) # Review Task 2: # # Predict the outcome of a ssm ballot measure in New Mexico this year (2011). # The necessary information for New Mexico and the ballot measure are: # Year = 2011 (mod.yr=13) # Civil Union = 0 # South = 0 # Religiosity = 1 newstate <- data.frame(mod.yr=13, cu=0,st=0,relig=1) pr.NM <- predict(model, newdata=newstate) pr.NM <- logit.inv(pr.NM) pr.NM # = 0.4608805, a loss # Review Task 3: # # Create a prediction curve of percent in favor against year for # the two values of South. Only produce the predictions, do # not bother to predict the confidence bands. Make sure you add # a legend. Use years 1998 through 2012. new.modyr <- 1998:2012-1998 pr0 <- predict(model, newdata=data.frame(mod.yr=new.modyr, st=0, cu=mean(cu), relig=mean(relig)) ) pr1 <- predict(model, newdata=data.frame(mod.yr=new.modyr, st=1, cu=mean(cu), relig=mean(relig)) ) pr0 <- logit.inv(pr0) pr1 <- logit.inv(pr1) par(mar=c(5,4,2,2)+0.1) par(cex=0.6) plot(mod.yr+1998,p.favor, pch=16, col=4, cex=0.9, las=1, xlim=c(1998,2012), ylim=c(0.4,1), xlab="Year",ylab="Proportion of Vote in Favor") points(mod.yr[st==1]+1998,p.favor[st==1], col=2, pch=16) lines(new.modyr+1998, pr0, col=4) lines(new.modyr+1998, pr1, col=2) legend("topright",c("Southern State","Non-Southern State"), col=c(2,4), lwd=1, bty="n", pch=16) text(2011,pr.NM,"NM") # New stuff: # Now, because of the fitting method (Maximum Likelihood Estimation), the parameter # estimates are distributed Normally with mean the estimate and variance the square # of the standard error. # # BUT, these are just estimates. Thus, were we to rerun the world, we would get # parameter estimates that are not the same, but are Normally distributed as # described above. # # The predictions are proportion in favor. # To win, that proportion must be above 0.500. # # See where this is going? If we rerun the world, the coefficients change, thus # the predictions change, thus the measures may pass or fail. To see how often # they pass (thus giving a probability of passing), we just perform a Monte # Carlo experiment of a certain number of trials (10000?), count the numer of # times the prediction is a win, then divide that by the number of trials to # get a probability of the ballot measure passing. # # # # I want to predict the probability that the ssm ballot measure passes in New Mexico. This # is a different question than the one we answered previously (pr.NM). To answer this question, # we will need to re-run Earth's history a million times and re-reun the analysis each of # those times ... or, just use the fact that the parameter estimates are Normally distributed: # # To wit: trials <- 1e6 # Create the coefficients c.int <- rnorm(trials, m= 0.58389, sd=0.15973) c.myr <- rnorm(trials, m=-0.07015, sd=0.01811) c.cu <- rnorm(trials, m=-0.17846, sd=0.10392) c.st <- rnorm(trials, m= 0.44107, sd=0.12616) c.relig <- rnorm(trials, m= 0.17129, sd=0.04534) # These values came from where? summary(model) # Now, we predict the outcome of New Mexico's ballot measure # with these new coefficients (Where did the 13, 0, and 1 come from?) p.NM <- c.int + 13*c.myr + 0*c.st + 1*c.relig p.NM <- logit.inv(p.NM) p.NM>0.500 # A vector of TRUE/FALSE depending on whether the measure passed sum(p.NM>0.500) # The number of times the measure passed sum(p.NM>0.500)/trials # The probability of the measure passing = 0.292903 # Graphics tell the story: A histogram par(mar=c(5,2,2,2)+0.1) hist(p.NM, breaks=51, ylab="", yaxt="n",las=1, xlab="Proportion Voting in Favor", main="") abline(v=0.50, col=2) # Graphics tell the story (again): A different histogram h <- hist(p.NM, plot=FALSE, breaks=10:80/100) pass <- seq(0.5,1.0,0.001)+0.001 bin <- cut(pass, h$breaks) clr <- rep("white", length(h$counts)) clr[bin] <- "#FF5A36" plot(h, col=clr, ylab="", yaxt="n",las=1, xlab="Proportion Voting in Favor", main="") abline(v=0.50, col=1) # What are the advantages of that first histogram? Of the second? # Now, the need for using a transformed year variable (mod.yr) came about # only because we were going to do some Monte Carlo experiments using the # model AND because the standard errors for the intercept were so much # larger than the other standard errors. For homework, I will have you # perform the same experiment, but without modifying the year variable to # see the effects. # If all we wanted to do was predict the proportion of the vote in New # Mexico, we would not have to transform the year.