##############################
#
# Script: Solutions Midterm II
#         (exam2a.R)
#
##############################

# In case we need these functions:
logit.inv <- function(x) exp(x)/(1+exp(x))
logit     <- function(p) log( p/(1-p) )

# And so it begins! =)

# Read in the data
gdp <- read.csv("http://courses.kvasaheim.com/stat40x3/data/gdpcap.csv", header=TRUE)
names(gdp)

# Try the fullest model
m1 <- glm(gdpcap ~ hig * I(hig^2) * democracy * I(democracy^2),
          family=poisson(link=log), data=gdp)
summary(m1)

# Oy vey. Let's pare it down
m2 <- glm(gdpcap ~ hig * I(hig^2) * democracy * I(democracy^2) - hig:I(hig^2):democracy:I(democracy^2),
          family=quasipoisson(link=log), data=gdp)
summary(m2)

formula <- "gdpcap ~ ( hig + I(hig^2) ) * ( democracy + I(democracy^2) )"
m3 <- glm(formula, family=quasipoisson(link=log), data=gdp)
summary(m3)

formula <- "gdpcap ~ ( hig + I(hig^2) ) * ( democracy + I(democracy^2) ) - I(hig^2):I(democracy^2)"
m4 <- glm(formula, family=quasipoisson(link=log), data=gdp)
summary(m4)

formula <- "gdpcap ~ ( hig + I(hig^2) ) * ( democracy + I(democracy^2) ) - I(hig^2):I(democracy^2) - hig:I(democracy^2)"
m5 <- glm(formula, family=quasipoisson(link=log), data=gdp)
summary(m5)

formula <- "gdpcap ~ ( hig + I(hig^2) ) * ( democracy + I(democracy^2) ) - I(hig^2):I(democracy^2) - hig:I(democracy^2) - I(hig^2):democracy"
m6 <- glm(formula, family=quasipoisson(link=log), data=gdp)
summary(m6)

# And this looks best. It is equivalent to
formula <- "gdpcap ~ hig*democracy + I(hig^2) + I(democracy^2)"
m7 <- glm(formula, family=quasipoisson(link=log), data=gdp)
summary(m7)
# which may be easier to read

# Now to predict
newState <- data.frame(hig=5, democracy=5)
exp( predict(m7, newdata=newState) )
# $11,017.70

# The Neat-O graph:
png("plot6.png", width=6, height=4, units="in", res=600)
par(mar=c(5,6,2,2)+0.1)
par(cex=0.7, cex.axis=0.9)
plot(gdpcap~democracy, data=gdp,
     xlim=c(-10,10),   xlab="Democracy Level",
     ylim=c(0,150000), ylab="GDP per capita [$]\n\n",
     yaxt="n")
axis(2, labels=c("10,000","50,000","100,000","150,000"),
        at=c(10000,50000,100000,150000), las=1)

ndem <- -100:100/10

# Curve and pointwise 95% band for hig=7 (col=4; "Low Corruption" in the legend)
newState <- data.frame(hig=7, democracy=ndem)
pr1  <- predict(m7, newdata=newState, se.fit=TRUE)
pest <- exp(pr1$fit)
ucl  <- pr1$fit + 1.96*pr1$se.fit
lcl  <- pr1$fit - 1.96*pr1$se.fit
ucl  <- exp(ucl)
lcl  <- exp(lcl)
lines(ndem, pest, col=4)
lines(ndem, ucl,  col=4, lty=2)
lines(ndem, lcl,  col=4, lty=2)

# Curve and pointwise 95% band for hig=3 (col=2; "High Corruption" in the legend)
newState <- data.frame(hig=3, democracy=ndem)
pr1  <- predict(m7, newdata=newState, se.fit=TRUE)
pest <- exp(pr1$fit)
ucl  <- pr1$fit + 1.96*pr1$se.fit
lcl  <- pr1$fit - 1.96*pr1$se.fit
ucl  <- exp(ucl)
lcl  <- exp(lcl)
lines(ndem, pest, col=2)
lines(ndem, ucl,  col=2, lty=2)
lines(ndem, lcl,  col=2, lty=2)

legend("topright", c("High Corruption","Low Corruption"),
       col=c(2,4), bty="n", lty=1)
dev.off()


##################################################
# Problem 7

fbi <- read.csv("http://courses.kvasaheim.com/stat40x3/data/crime.csv", header=TRUE)
names(fbi)
summary(fbi)

formula <- "vcrime00 ~ vcrime90 * urbanp2000 * gspcap00 * conserve"
m1 <- glm(formula, family=quasipoisson(link=log), data=fbi)
summary(m1)

# Neat-O! The four-way interaction is statistically significant, so
# we may be able to stop here. (Remember that you cannot remove a
# lower-order term while higher-order terms remain in the model.)
# But there are 16 parameters fit using 51 pieces of data. Could this
# be a case of overfitting the data?
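# A quick sanity check on those counts (a sketch; assumes m1 and fbi from
# the fit above are still in the workspace):
length(coef(m1))   # number of fitted parameters -- 16 for the full four-way model
nrow(fbi)          # number of observations -- the 51 pieces of data noted above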
# One may be able to pare this down by removing ALL two- and
# three-way interactions and comparing the two models:
formula <- "vcrime00 ~ vcrime90 + urbanp2000 + gspcap00 + conserve"
m2 <- glm(formula, family=quasipoisson(link=log), data=fbi)
summary(m2)

anova(m2, m1)

# The difference in deviance here is approximately chi-squared with 11 df.
# With this, we note that the p-value will be much less than our alpha of
# 0.05. Thus, this model and the full model are significantly different,
# and we must return to using m1.

# Now to predict:
OK <- data.frame(vcrime90=547.5, urbanp2000=65.3, gspcap00=26352.36, conserve=0.12)
pr <- predict(m1, newdata=OK)
exp(pr)
# 401.5461

# The real vcrime00 is
fbi$vcrime00[fbi$scode=="OK"]
# 497.8

# This gives a prediction error of 497.8 - 401.5461 = 96.3

# As there are five dimensions to our graph, I will skip it.


##################################################
# Problem 8

filename <- "http://www.electoralforensics.org/datasets/xsd2011referendum.csv"
xsd <- read.csv(filename, header=TRUE)
names(xsd)

# Let me peel off some variables to make my life happier
v.yes     <- xsd$Secession
v.invalid <- xsd$Invalid
v.total   <- xsd$Votes

v.valid   <- v.total - v.invalid
p.yes     <- v.yes/v.valid
p.invalid <- v.invalid/v.total

y.invalid <- cbind(v.invalid, v.valid)

# I gave you the model
da.model <- glm(y.invalid ~ p.yes, family=quasibinomial(link=logit))
summary(da.model)
# Uf da! Statistical significance

# Now, to graphing:
png("plot8.png", width=6, height=7, units="in", res=600)
par(mar=c(5,4,2,2)+0.1)
par(cex=0.7, cex.axis=0.9)
plot(p.invalid~p.yes,
     xlim=c(0,1),   xlab="Proportion of Vote for Secession",
     ylim=c(0,0.1), ylab="Proportion of the Vote Declared Invalid",
     las=1)

newpyes <- 0:100/100
pr   <- predict(da.model, newdata=data.frame(p.yes=newpyes), se.fit=TRUE)
pest <- logit.inv(pr$fit)
ucl  <- pr$fit + 1.96*pr$se.fit
lcl  <- pr$fit - 1.96*pr$se.fit
ucl  <- logit.inv(ucl)
lcl  <- logit.inv(lcl)

lines(newpyes, pest, col=1, lwd=2)
lines(newpyes, ucl,  col=2, lty=2)
lines(newpyes, lcl,  col=2, lty=2)

# Reference line at the overall mean invalidation rate
m <- mean(p.invalid)
abline(h=m, col=4, lty=4)

dev.off()
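
# A small follow-up sketch (an illustration, not part of the assignment):
# assuming da.model and logit.inv from above, this pulls the estimated
# dispersion parameter from the quasibinomial fit and gives the fitted
# invalid-vote proportion, with an approximate 95% confidence interval,
# at a hypothetical vote split of p.yes = 0.5.
summary(da.model)$dispersion                      # estimated dispersion

pr0 <- predict(da.model, newdata=data.frame(p.yes=0.5), se.fit=TRUE)
logit.inv( pr0$fit )                              # point estimate on the proportion scale
logit.inv( pr0$fit + c(-1,1)*1.96*pr0$se.fit )    # approximate 95% CI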