##############################
#
# Script: Solutions 14
#
# (assignment14a.R)
#
##############################

logit     <- function(p) log( p/(1-p) )
logit.inv <- function(x) exp(x)/(1+exp(x))


##########
#
# Problem 14.1

# Get the data
ssm <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ssm.csv", header=TRUE)
names(ssm)
summary(ssm)

# Do some adjustments: turn the percentage in favor into counts
# in favor and against, out of a nominal 1000 voters
p.favor <- ssm$pctfavor/100
n.favor <- p.favor*1000
n.agin  <- 1000 - n.favor
y.favor <- cbind(n.favor, n.agin)

yr    <- ssm$year
cu    <- ssm$civilunion
st    <- ssm$south
relig <- ssm$religiosity

# Fit an appropriate model to the SSM data
model <- glm(y.favor ~ yr + cu + st + relig, family=quasibinomial(link=logit))
summary(model)
plot(model)

# Predict the proportion of the vote in favor of the SSM measure
newstate <- data.frame(yr=2011, cu=0, st=0, relig=1)
pr.NM <- predict(model, newdata=newstate)
pr.NM <- logit.inv(pr.NM)
pr.NM
# = 0.4608805, a loss

# Create prediction plot (optional)
new.yr <- 1998:2012
pr0 <- predict(model, newdata=data.frame(yr=new.yr, st=0, cu=mean(cu), relig=mean(relig)) )
pr1 <- predict(model, newdata=data.frame(yr=new.yr, st=1, cu=mean(cu), relig=mean(relig)) )
pr0 <- logit.inv(pr0)
pr1 <- logit.inv(pr1)

plot(yr, p.favor, pch=16, col=4, cex=0.9, las=1,
     xlim=c(1998,2012), ylim=c(0.4,1),
     xlab="Year", ylab="Proportion of Vote in Favor")
points(yr[st==1], p.favor[st==1], col=2, pch=16)
lines(new.yr, pr0, col=4)
lines(new.yr, pr1, col=2)
legend("topright", c("Southern State","Non-Southern State"),
       col=c(2,4), lwd=1, bty="n", pch=16)
text(2011, pr.NM, "NM")

# Predict the probability of the measure passing
trials <- 1e6

# Draw the coefficients (means and standard deviations come from
# the summary(model) output above)
c.int   <- rnorm(trials, m=140.74742, sd=36.29106)
c.yr    <- rnorm(trials, m= -0.07015, sd= 0.01811)
c.cu    <- rnorm(trials, m= -0.17846, sd= 0.10392)
c.st    <- rnorm(trials, m=  0.44107, sd= 0.12616)
c.relig <- rnorm(trials, m=  0.17129, sd= 0.04534)

# Now, we predict the outcome of New Mexico's ballot measure with these
# drawn coefficients (Where do the 2011, the two 0s, and the 1 come from?)
p.NM <- c.int + 2011*c.yr + 0*c.cu + 0*c.st + 1*c.relig
p.NM <- logit.inv(p.NM)

sum(p.NM>0.500)/trials
# The probability of the measure passing = 0.498675

# Graphics really tell the story
h <- hist(p.NM, plot=FALSE, breaks=0:100/100)
clr <- ifelse(h$mids > 0.50, "#FF5A36", "white")  # shade the "passing" bins

plot(h, col=clr, ylab="", yaxt="n", las=1,
     xlab="Proportion Voting in Favor", main="")
abline(v=0.50, col=1)
# Whoa! What happened???


##########
#
# Problem 14.2

fb <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ncaa2009football.csv")
names(fb)
summary(fb)

# a: Team score
boxplot(score~team, data=fb)

# Normality test:
shapiro.test(fb$score[fb$team=="LSU"])
shapiro.test(fb$score[fb$team=="Tennessee"])
shapiro.test(fb$score[fb$team=="Vanderbilt"])      # fail
shapiro.test(fb$score[fb$team=="South Carolina"])  # fail
shapiro.test(fb$score[fb$team=="Alabama"])
shapiro.test(fb$score[fb$team=="Florida"])
shapiro.test(fb$score[fb$team=="Mississippi"])
# etc. ...

# or, in one loop (unique() is safer than levels() here, in case
# team was not read in as a factor):
for(i in unique(fb$team) ) {
  pp <- shapiro.test(fb$score[fb$team==i])$p.value
  cat(pp, "\t", i, "\n")
}
# By inspection, only 6 teams fail, but with 65 tests a few chance
# failures are expected (the short sketch below makes this concrete).

# Equi-variance test:
fligner.test(score~team, data=fb)

# So, analysis of variance should be fine with this data! Phew!
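
# An aside (a minimal sketch, not part of the assigned solution): with 65
# teams tested at the 0.05 level, about 65*0.05 = 3.25 rejections are
# expected by chance alone even if every team's scores really were Normal.
# Comparing the raw rejection count against a Holm-adjusted count shows
# how many of the 6 failures survive a multiplicity correction:
pvals <- sapply(unique(fb$team),
                function(i) shapiro.test(fb$score[fb$team==i])$p.value)
sum(pvals < 0.05)                    # raw rejections (the 6 noted above)
sum(p.adjust(pvals, "holm") < 0.05)  # rejections after a Holm adjustment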
model.2a <- aov(score~team, data=fb)
summary(model.2a)
summary.lm(model.2a)
# The team with the largest coefficient is Texas: 9.000 (thus an average
# score of 40.66). Its nearest competitor is Cincinnati: 8.166 (thus an
# average of 39.83). The difference between these two is 0.833, which is
# less than 1.96 times the standard error (5.22253), so the difference is
# not statistically significant.

# b: Margin of victory (MOV)
model.2b <- aov(I(score-opscore)~team, data=fb)
summary(model.2b)
summary.lm(model.2b)
# The team with the highest MOV is Florida, with 5.8333 (so, an avg MOV of
# 26.66). The second team is Texas: 4.4167 (so, an avg MOV of 25.25). As
# this difference is less than 1.96 times the standard error, Florida did
# not do better than Texas, statistically speaking.

# c: OSU vs. Florida
#    coef for OSU     = -12.4167
#    coef for Florida =   5.8333
#    standard errors  =   7.7058

# Monte Carlo!
trials <- 1e6
c.int <- rnorm(trials, m= 20.8333, sd=5.4488)
c.osu <- rnorm(trials, m=-12.4167, sd=7.7058)
c.ufl <- rnorm(trials, m=  5.8333, sd=7.7058)

p.osu <- c.int + c.osu
p.ufl <- c.int + c.ufl

sum(p.osu>p.ufl)/trials
# = 0.04681
# Thus, OSU will win, on average, 4.681% of the time against Florida
# (according to this model and the 2009 data).


##########
#
# Problem 14.3

vit <- read.csv("http://courses.kvasaheim.com/stat40x3/data/vitaminC.csv")
names(vit)
summary(vit)
table(vit)

model.3 <- glm(cold ~ vitC, family=quasibinomial(link=logit), data=vit)
summary(model.3)


##########
#
# Problem 14.4

poi <- read.csv("http://courses.kvasaheim.com/stat40x3/data/fakepoisson.csv")
names(poi)
summary(poi)

# Here's the histogram
hist(poi$y)

# And the t-test
t.test(poi$y, mu=20)

# BUT, is n=100 large enough for the t-test to work? Let's find out!

long.way <- proc.time()    # To see how long this takes (for fun)

trials <- 1e5              # The number of trials to run
n      <- 100              # The sample size per trial
p      <- numeric(trials)  # Preallocate; growing p in the loop is slower

for(i in 1:trials) {
  z    <- rpois(n, lambda=20)
  p[i] <- t.test(z, mu=20)$p.value
}

hist(p, breaks=21)
abline(h=trials/20, lty=5, col=4)

proc.time() - long.way   # about 50s

# Faster:
short.way <- proc.time() # To see how long *this* takes

trials <- 1e5
n  <- 100
ss <- trials * n

z  <- rpois(ss, lambda=20)
zz <- matrix(z, ncol=n)  # one row per trial

tp <- function(x, mu) t.test(x, mu=mu)$p.value
p  <- apply(zz, 1, tp, 20)

hist(p, breaks=21)
abline(h=trials/20, lty=5, col=4)

proc.time() - short.way  # about 37s

# The second way is faster because it makes a single call to the random-
# number generator, while the first makes 1e5 separate calls.
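
# A final aside on speed (a minimal sketch, not part of the assigned
# solution): much of the remaining run time is spent inside t.test()
# itself. The same one-sample, two-sided p-values can be computed
# directly from the row means and standard deviations of zz (this
# assumes the zz, n, and p objects from the "faster" block above):
xbar  <- rowMeans(zz)
s     <- sqrt( rowSums( (zz - xbar)^2 ) / (n - 1) )  # row-wise sample SDs
tstat <- (xbar - 20) / (s / sqrt(n))                 # t statistics vs mu=20
p.vec <- 2 * pt( -abs(tstat), df = n - 1 )           # two-sided p-values

all.equal(p, p.vec)    # should be TRUE: matches the apply() results
hist(p.vec, breaks=21) # and the histogram looks the same as before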