####################
#
# Filename: 20110920.R
#
####################
#
# Today: A full-length linear modeling
# action. Note that there are a
# few general steps:
# 1. Get to know your data!
# 2. Determine your research model
# Determine additive and interaction effects
# 3. Check the assumptions of the method
# 4. Answer the question
##################################################
# Preamble
library(RFS)   # course package; presumably provides logit()/logistic() used below -- confirm
# Read in data (requires a live internet connection to the course server)
ssm <- read.csv("http://courses.kvasaheim.com/pols6123/data/ssm.csv")
# Get to know the data
names(ssm)
summary(ssm)
cor(ssm)       # correlation matrix; cor() requires all-numeric columns
# NOTE(review): attach() is discouraged -- it masks same-named globals and
# makes the script order-dependent. Prefer data= in lm() or with(ssm, ...).
attach(ssm)
# Question: What is the probability that a SSM ballot
# measure will pass in Maine in 2010?
# Research model:
# dep var: propWin
# ind vars: yearPassed
# civilBan
# religPct
# Determine whether they enter additively or with interactions.
# As theory is no help here, let us try all possibilities
# Fit every candidate specification. Passing data = ssm makes each model
# self-contained instead of relying on the attach()ed search path, and it
# records a proper data reference inside each fitted object.
m1 <- lm( propWin ~ yearPassed + civilBan + religPct, data = ssm )
summary(m1) # adjusted R2 = 0.7565
m2 <- lm( propWin ~ yearPassed * civilBan * religPct, data = ssm )
summary(m2) # adjusted R2 = 0.7387
m3 <- lm( propWin ~ yearPassed * civilBan + religPct, data = ssm )
summary(m3) # adjusted R2 = 0.7577
m4 <- lm( propWin ~ yearPassed + civilBan * religPct, data = ssm )
summary(m4) # adjusted R2 = 0.7495
m5 <- lm( propWin ~ civilBan + yearPassed * religPct, data = ssm )
summary(m5) # adjusted R2 = 0.7550
# Compare the adjusted R2 values. Largest indicates the
# best model. In class, we used model m1. Here, we see
# that my short-term memory was wrong; we will use model m3
# Check assumptions
# Independent x-values (no severe multicollinearity)
cor(ssm)
# ssm$ qualification avoids depending on attach()
cor.test(ssm$religPct, ssm$civilBan) # Statistically significant
# But, is it substantively significant? I'm
# not sure. So, let's make a note for the future.
res <- residuals(m3) # residuals of the preferred model, m3
# mean = 0?
mean(res)
# Normally distributed?
shapiro.test(res) # Frowny face, FAIL
# So, let's choose the second best model from above, model m1
res <- residuals(m1) # residuals
# mean = 0?
mean(res) # check
# Normally distributed?
shapiro.test(res) # check
# Constant variance?
# NOTE(review): residuals are conventionally plotted against fitted(m1);
# plotting against the observed response induces an artificial correlation.
plot(res ~ ssm$propWin) # check (no apparent funnel)
# PASS!
# Prediction
# Maine variable: covariate values for the 2010 Maine ballot measure
# (yearPassed=10, civilBan=0, religPct=48 -- presumably from the codebook; confirm)
ME <- data.frame(yearPassed=10, civilBan=0, religPct=48)
# Predict the expected vote share under model m1
predict(m1, newdata=ME)
# Thus the point estimate is 40.4%
# That was not the question, however. To answer the question,
# we need to use Monte Carlo techniques (here, it is more
# specifically termed "parametric bootstrapping").
summary(m1) # to refresh the memory
set.seed(577)    # reproducibility of the Monte Carlo draws
n.sim <- 1000000 # number of draws (1e6); named n.sim because 't' masks base::t()
# Draw each coefficient from its estimated sampling distribution. Reading the
# estimates and standard errors straight off the fitted model avoids the
# transcription/rounding errors of re-typing the summary() output, and the
# full argument names mean=/sd= avoid fragile partial matching (m=/s=).
# NOTE(review): independent draws ignore the coefficients' covariances;
# MASS::mvrnorm(n.sim, coef(m1), vcov(m1)) would respect them.
cf <- coef(summary(m1)) # rows = terms; columns include Estimate, Std. Error
e.cont <- rnorm(n.sim, mean = cf["(Intercept)", "Estimate"], sd = cf["(Intercept)", "Std. Error"])
e.year <- rnorm(n.sim, mean = cf["yearPassed", "Estimate"], sd = cf["yearPassed", "Std. Error"])
e.civb <- rnorm(n.sim, mean = cf["civilBan", "Estimate"], sd = cf["civilBan", "Std. Error"])
e.relp <- rnorm(n.sim, mean = cf["religPct", "Estimate"], sd = cf["religPct", "Std. Error"])
# One prediction per draw at the Maine covariate values (10, 0, 48)
prd <- e.cont + 10*e.year + 0*e.civb + 48*e.relp
# These are the million predictions
# Histogram:
hist(prd)
# or
hist(prd, breaks=-11:111/100)
# or
par(mar=c(4.3,1,1,1))
hist(prd, breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="Vote in Favor of SSM", yaxs="i")
hist(prd[prd>0.500], breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="", add=TRUE, col=2)
# Which histogram is best?
# What proportion are wins? (mean of a logical vector = proportion TRUE)
mean(prd > 0.500)
# What is the 95% confidence interval?
Q13 <- quantile(prd, c(0.025,0.975))
segments(Q13[1],0,Q13[2],0, col=3, lwd=5)
segments(Q13[1],0,0,0, col=2, lwd=5)
segments(1,0,Q13[2],0, col=2, lwd=5)
# What's wrong? We have a prediction that does not make physical sense. In fact,
sum(prd < 0.000) # count of draws below 0% (4 in class, with the seed above)
# are below 0%, and
sum(prd > 1.000) # = 0
# are above 100%
# Why? Linear models assume the response variable can take on all values. When this
# is not the case, the dependent variable needs to be transformed appropriately
lgtWin <- logit(ssm$propWin) # logit-scale response; ssm$ avoids depending on attach()
# Research model? The same five specifications, now on the logit scale.
# data = ssm supplies the predictors; lgtWin is resolved from the global
# environment via standard lm() formula lookup.
md1 <- lm( lgtWin ~ yearPassed + civilBan + religPct, data = ssm )
summary(md1) # adjusted R2 = 0.7918
md2 <- lm( lgtWin ~ yearPassed * civilBan * religPct, data = ssm )
summary(md2) # adjusted R2 = 0.7771
md3 <- lm( lgtWin ~ yearPassed * civilBan + religPct, data = ssm )
summary(md3) # adjusted R2 = 0.7944
md4 <- lm( lgtWin ~ yearPassed + civilBan * religPct, data = ssm )
summary(md4) # adjusted R2 = 0.7845
md5 <- lm( lgtWin ~ civilBan + yearPassed * religPct, data = ssm )
summary(md5) # adjusted R2 = 0.7899
# Which is best? Model md3 (largest adjusted R2)
# Check assumptions
# The independent variables are not changed, so the results from above
# for them apply here, as well.
# Residuals
res <- residuals(md3)
# mean = 0?
mean(res) # Success
# Normal?
shapiro.test(res) # FAIL =(
# The second best model was md1
# Its residuals
res <- residuals(md1)
# mean = 0?
mean(res) # Pass! =)
# Normal?
shapiro.test(res) # Pass! =)
# Constant variance? (NOTE(review): conventionally plotted against fitted(md1))
plot(res~lgtWin, pch=16) # No funnel! =)
# It passes, so we will use this model
# Prediction? (this is on the logit scale)
predict(md1, newdata=ME)
# Oops! We forgot that we need to back-transform
# logistic() -- presumably the inverse of logit() from RFS; confirm
logistic( predict(md1, newdata=ME) )
# We predict that ~37.8% of the voters will vote for the SSM ballot measure.
# But, the question was about the probability of it passing. So, we do
# parametric bootstrapping (as above)
summary(md1) # to refresh the memory
set.seed(577)    # reproducibility of the Monte Carlo draws
n.sim <- 1000000 # number of draws (1e6); named n.sim because 't' masks base::t()
# As before, read the estimates and standard errors from the fitted model
# rather than re-typing them, and spell out mean=/sd= (no partial matching).
# NOTE(review): independent draws ignore the coefficients' covariances;
# MASS::mvrnorm(n.sim, coef(md1), vcov(md1)) would respect them.
cf <- coef(summary(md1)) # rows = terms; columns include Estimate, Std. Error
e.cont <- rnorm(n.sim, mean = cf["(Intercept)", "Estimate"], sd = cf["(Intercept)", "Std. Error"])
e.year <- rnorm(n.sim, mean = cf["yearPassed", "Estimate"], sd = cf["yearPassed", "Std. Error"])
e.civb <- rnorm(n.sim, mean = cf["civilBan", "Estimate"], sd = cf["civilBan", "Std. Error"])
e.relp <- rnorm(n.sim, mean = cf["religPct", "Estimate"], sd = cf["religPct", "Std. Error"])
# One prediction per draw at the Maine covariate values (10, 0, 48)
prd <- e.cont + 10*e.year + 0*e.civb + 48*e.relp
# These are the million predictions, BUT in logit units
# Back-transform!
prd <- logistic(prd)
# Histogram:
hist(prd)
# or
hist(prd, breaks=-11:111/100)
# or
par(mar=c(4.3,1,1,1))
hist(prd, breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="Vote in Favor of SSM", yaxs="i")
hist(prd[prd>0.500], breaks=-11:111/100, main="", yaxt="n", ylab="", xlim=c(0,1), xaxs="i", xlab="", add=TRUE, col=2)
# Which histogram is best?
# What proportion are wins? (mean of a logical vector = proportion TRUE)
pWin <- mean(prd > 0.500)
pWin
# What is the 95% confidence interval?
Q13 <- quantile(prd, c(0.025,0.975))
segments(Q13[1],0,Q13[2],0, col=3, lwd=5)
segments(Q13[1],0,0,0, col=2, lwd=5)
segments(1,0,Q13[2],0, col=2, lwd=5)
# Label with the computed win probability instead of a hard-coded "p = 0.106",
# so the annotation cannot drift out of sync with the simulation.
text(0.6, 15000, paste0("p = ", round(pWin, 3)))
##################################################
# So, you should be asking the following:
# 1. What are the steps I need to follow to do an analysis?
# 2. What do the graphs tell you? What aspects convey their meanings best?
# 3. Which assumptions need to be tested? How do I test them?
# 4. What is the question and how do I answer it?
# 5. What does each technique tell us?
#
# One of R's greatest strengths is that you can create very intricate graphs.
# One of R's drawbacks is that you have to figure out how to make them.
#