############################## # # Script: Final Examination Solutions # # (exam3.R) # ############################## logit <- function(p) log( p / (1-p) ) logit.inv <- function(x) exp(x)/(1+exp(x)) ########## # # Problem X3.5 p <- numeric() tr <- 1e5 n <- 35 for(i in 1:tr) { z <- runif(n)*100 p[i] <- t.test(z, mu=50)$p.value } par(mar=c(5,2,3,2)+0.1) hist(p, breaks=21,main="Histogram of p-values", yaxt="n", xlab="p-values") abline(h=tr/20, lty=2, lwd=2, col="#FF9900") ########## # # Problem X3.6 sports <- data.frame( matrix( c("soccer",4.0, "soccer",4.0, "soccer",4.0, "soccer",4.0, "soccer",4.0, "soccer",3.0, "soccer",3.0, "soccer",3.0, "soccer",3.0, "soccer",2.0, "basketball",4.0, "basketball",4.0, "basketball",4.0, "basketball",3.0, "basketball",3.0, "basketball",3.0, "basketball",3.0, "basketball",3.0, "basketball",3.0, "basketball",1, "softball",4.0, "softball",3.0, "softball",3.0, "softball",3.0, "softball",3.0, "softball",2.0, "softball",2.0, "softball",2.0), ncol=2, byrow=TRUE) ) names(sports) <- c("sport", "grade") sports$sport <- as.factor(sports$sport) sports$grade <- as.numeric(sports$grade) summary(sports) names(sports) gpa <- sports$grade sport <- sports$sport shapiro.test(gpa[sport=="soccer"]) # Not Normal, so no need to go on kruskal.test(gpa~sport) par(mar=c(5,4,3,2)+0.1) boxplot(gpa~sport, main="Average Grade (4-point scale) vs. Sport", ylab="Average Grade", las=1, xlab="Sport Played", ylim=c(0,4)) ########## # # Problem X3.7 cdi <- read.csv("http://courses.kvasaheim.com/stat40x3/data/cdi2010pres2.csv") names(cdi) # Who won the REGION? # One way is to alter the original data # Another way is as follows: lev <- levels(cdi$REGION) reg.oua <- numeric() reg.gba <- numeric() for(i in lev) { reg.oua[i] <- sum(cdi$OUATTARA[cdi$REGION==i]) reg.gba[i] <- sum(cdi$GBAGBO[cdi$REGION==i]) } won.ou <- reg.oua > reg.gba won.oua <- numeric() for(i in 1:length(cdi$REGION)) { won.oua[i] <- won.ou[cdi$REGION[i]] } # Now, create proportions p.oua <- cdi$OUATTARA/cdi$TOTAL p.rej <- cdi$REJECTED/cdi$TOTAL # Boxplot par(mar=c(4,4,2,2)+0.1) boxplot(p.rej~won.oua, xaxt="n", las=1, ylab="Proportion of Votes Rejected") axis(1, label=c("Ouattara Won","Gbagbo Won"), at=c(2,1)) # Same distribution? var.test(p.rej~won.oua) fligner.test(p.rej~won.oua) # But this only tests if they are equally Normal. # Better: ks.test(p.rej[won.oua==1],p.rej[won.oua==0]) # Scatterplot par(mar=c(5,4,2,2)+0.1) plot(p.rej~p.oua, xlab="Proportion Vote for Ouattara", ylab="Proportion of Vote Rejected", las=1, xlim=c(0,1), ylim=c(0,0.1)) model.7 <- glm(p.rej~p.oua, family=quasibinomial(link=logit)) summary(model.7) new.oua <- 0:100/100 pr <- predict(model.7, newdata=data.frame(p.oua=new.oua), se.fit=TRUE) pest <- logit.inv(pr$fit) ucl <- pr$fit + 1.96*pr$se.fit lcl <- pr$fit - 1.96*pr$se.fit ucl <- logit.inv(ucl) lcl <- logit.inv(lcl) lines(new.oua,pest) lines(new.oua, ucl, col="#FF6600", lty=2) lines(new.oua, lcl, col="#FF6600", lty=2) ########## # # Problem X3.8 ear <- read.csv("http://courses.kvasaheim.com/stat40x3/data/earmarks.csv") names(ear) summary(ear) # Why did I do this? drop <- which(ear$peer<5) ear <- ear[-drop,] drop <- which(ear$earmarks<5) ear <- ear[-drop,] # The model search model.8a <- glm(peer ~ earmarks * carnegie, data=ear, family=gaussian(link=log)) summary(model.8a) model.8b <- glm(peer ~ earmarks + carnegie, data=ear, family=gaussian(link=log)) summary(model.8b) model.8p <- glm(peer ~ earmarks * carnegie, data=ear, family=quasipoisson(link=log)) summary(model.8p) model.8q <- glm(peer ~ earmarks * carnegie, data=ear, family=quasipoisson(link=log)) summary(model.8q) model.8r <- glm(peer ~ earmarks + carnegie, data=ear, family=quasipoisson(link=log)) summary(model.8r) plot(model.8r) # Plot plot(peer~earmarks, data=ear, las=1, xlab="Earmark Funding ($Million)", ylab="Peer-Reviewed Funding ($Million)", xaxt="n", yaxt="n") axis(1, labels=c("1","20","50","100"), at=c(1e6, 2e7,5e7,1e8) ) axis(2, labels=c("1","100","500","1,000"), at=c(1e6, 1e8,5e8,1e9) ,las=1) # Predictions new.ear <- seq(1,max(ear$earmarks),length=100) prH <- predict(model.8r, newdata=data.frame(earmarks=new.ear,carnegie="RU/H")) prH <- exp(prH) lines(new.ear,prH, col=4) prVH <- predict(model.8r, newdata=data.frame(earmarks=new.ear,carnegie="RU/VH")) prVH <- exp(prVH) lines(new.ear,prVH, col=3) legend("topright", c("RU/VH Schools","RU/H Schools"), col=c(3,4), lwd=1, bty="n") points(peer[carnegie=="RU/VH"]~earmarks[carnegie=="RU/VH"], data=ear,col=3, pch=".") points(peer[carnegie=="RU/H"]~earmarks[carnegie=="RU/H"], data=ear, col=4, pch=".") # Or plot(peer~earmarks, data=ear, las=1, xlab="Earmark Funding ($Million)", ylab="Peer-Reviewed Funding ($Million)", xaxt="n", yaxt="n", log="xy", xlim=c(1e4,1e9), ylim=c(1e4,1e9) ) axis(1, labels=c(1,20,50, 100), at=c(1e6, 2e7, 5e7, 1e8) ) axis(2, labels=c("1","100","500","1,000"), at=c(1e6, 1e8, 5e8,1e9) ,las=1) new.ear <- seq(1,max(ear$earmarks),length=100) prH <- predict(model.8r, newdata=data.frame(earmarks=new.ear,carnegie="RU/H")) prH <- exp(prH) lines(new.ear,prH, col=4) prVH <- predict(model.8r, newdata=data.frame(earmarks=new.ear,carnegie="RU/VH")) prVH <- exp(prVH) lines(new.ear,prVH, col=3) legend("bottomright", c("RU/VH Schools","RU/H Schools"), col=c(3,4), lwd=1,bty="n") points(peer[carnegie=="RU/VH"]~earmarks[carnegie=="RU/VH"], data=ear,col=3, pch=".") points(peer[carnegie=="RU/H"]~earmarks[carnegie=="RU/H"], data=ear, col=4, pch=".") # Predict OSU: newSchool <- data.frame(carnegie="RU/H", earmarks=400000) prOSU <- predict(model.8r, newdata=newSchool) prOSU <- exp(prOSU) prOSU # = $20,492,778 # Plot OSU's prediction on the graph points(400000,prOSU, pch=16, col="#FF9900", cex=2) # Thanks! # PS: I used some of Sebastian's code here. Hey, it was good and I was lazy. =)