##############################
#
# Script: Solutions 14
#
# (assignment14a.R)
#
##############################

logit     <- function(p) log( p/(1-p) )
logit.inv <- function(x) exp(x)/(1+exp(x))


##########
#
# Problem 14.1

# Get the data
ssm <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ssm.csv", header=TRUE)
names(ssm)
summary(ssm)

# Do some adjustments: turn the percentage in favor into counts
# in favor and against, out of a nominal 1000 voters
p.favor <- ssm$pctfavor/100
n.favor <- p.favor*1000
n.agin  <- 1000 - n.favor
y.favor <- cbind(n.favor, n.agin)

yr    <- ssm$year
cu    <- ssm$civilunion
st    <- ssm$south
relig <- ssm$religiosity

# Fit an appropriate model to the SSM data
model <- glm(y.favor ~ yr + cu + st + relig, family=quasibinomial(link=logit))
summary(model)
plot(model)

# Predict the proportion of the vote in favor of the SSM measure
newstate <- data.frame(yr=2011, cu=0, st=0, relig=1)
pr.NM <- predict(model, newdata=newstate)
pr.NM <- logit.inv(pr.NM)
pr.NM
# = 0.4608805, a loss

# Create prediction plot (optional)
new.yr <- 1998:2012
pr0 <- predict(model, newdata=data.frame(yr=new.yr, st=0, cu=mean(cu), relig=mean(relig)) )
pr1 <- predict(model, newdata=data.frame(yr=new.yr, st=1, cu=mean(cu), relig=mean(relig)) )
pr0 <- logit.inv(pr0)
pr1 <- logit.inv(pr1)

plot(yr, p.favor, pch=16, col=4, cex=0.9, las=1,
     xlim=c(1998,2012), ylim=c(0.4,1),
     xlab="Year", ylab="Proportion of Vote in Favor")
points(yr[st==1], p.favor[st==1], col=2, pch=16)
lines(new.yr, pr0, col=4)
lines(new.yr, pr1, col=2)
legend("topright", c("Southern State","Non-Southern State"),
       col=c(2,4), lwd=1, bty="n", pch=16)
text(2011, pr.NM, "NM")

# Predict the probability of the measure passing
trials <- 1e6

# Draw the coefficients (means and standard deviations come from
# the summary(model) output above)
c.int   <- rnorm(trials, m=140.74742, sd=36.29106)
c.yr    <- rnorm(trials, m= -0.07015, sd= 0.01811)
c.cu    <- rnorm(trials, m= -0.17846, sd= 0.10392)
c.st    <- rnorm(trials, m=  0.44107, sd= 0.12616)
c.relig <- rnorm(trials, m=  0.17129, sd= 0.04534)

# Now, we predict the outcome of New Mexico's ballot measure with these
# drawn coefficients (Where do the 2011, the two 0s, and the 1 come from?)
p.NM <- c.int + 2011*c.yr + 0*c.cu + 0*c.st + 1*c.relig
p.NM <- logit.inv(p.NM)

sum(p.NM>0.500)/trials
# The probability of the measure passing = 0.498675

# Graphics really tell the story
h <- hist(p.NM, plot=FALSE, breaks=0:100/100)
clr <- ifelse(h$mids > 0.50, "#FF5A36", "white")  # shade the "passing" bins

plot(h, col=clr, ylab="", yaxt="n", las=1,
     xlab="Proportion Voting in Favor", main="")
abline(v=0.50, col=1)
# Whoa! What happened???


##########
#
# Problem 14.2

fb <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ncaa2009football.csv")
names(fb)
summary(fb)

# a: Team score
boxplot(score~team, data=fb)

# Normality test:
shapiro.test(fb$score[fb$team=="LSU"])
shapiro.test(fb$score[fb$team=="Tennessee"])
shapiro.test(fb$score[fb$team=="Vanderbilt"])      # fail
shapiro.test(fb$score[fb$team=="South Carolina"])  # fail
shapiro.test(fb$score[fb$team=="Alabama"])
shapiro.test(fb$score[fb$team=="Florida"])
shapiro.test(fb$score[fb$team=="Mississippi"])
# etc. ...

# or, in one loop (unique() is safer than levels() here, in case
# team was not read in as a factor):
for(i in unique(fb$team) ) {
  pp <- shapiro.test(fb$score[fb$team==i])$p.value
  cat(pp, "\t", i, "\n")
}
# By inspection, only 6 teams fail, but with 65 tests a few chance
# failures are expected (the short sketch below makes this concrete).

# Equi-variance test:
fligner.test(score~team, data=fb)

# So, analysis of variance should be fine with this data! Phew!
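
# An aside (a minimal sketch, not part of the assigned solution): with 65
# teams tested at the 0.05 level, about 65*0.05 = 3.25 rejections are
# expected by chance alone even if every team's scores really were Normal.
# Comparing the raw rejection count against a Holm-adjusted count shows
# how many of the 6 failures survive a multiplicity correction:
pvals <- sapply(unique(fb$team),
                function(i) shapiro.test(fb$score[fb$team==i])$p.value)
sum(pvals < 0.05)                    # raw rejections (the 6 noted above)
sum(p.adjust(pvals, "holm") < 0.05)  # rejections after a Holm adjustment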
model.2a <- aov(score~team, data=fb)
summary(model.2a)
summary.lm(model.2a)
# The team with the largest coefficient is Texas: 9.000 (thus an average
# score of 40.66). Its nearest competitor is Cincinnati: 8.166 (thus an
# average of 39.83). The difference between these two is 0.833, which is
# less than 1.96 times the standard error (5.22253), so the difference is
# not statistically significant.

# b: Margin of victory (MOV)
model.2b <- aov(I(score-opscore)~team, data=fb)
summary(model.2b)
summary.lm(model.2b)
# The team with the highest MOV is Florida, with 5.8333 (so, an avg MOV of
# 26.66). The second team is Texas: 4.4167 (so, an avg MOV of 25.25). As
# this difference is less than 1.96 times the standard error, Florida did
# not do better than Texas, statistically speaking.

# c: OSU vs. Florida
#    coef for OSU     = -12.4167
#    coef for Florida =   5.8333
#    standard errors  =   7.7058

# Monte Carlo!
trials <- 1e6
c.int <- rnorm(trials, m= 20.8333, sd=5.4488)
c.osu <- rnorm(trials, m=-12.4167, sd=7.7058)
c.ufl <- rnorm(trials, m=  5.8333, sd=7.7058)

p.osu <- c.int + c.osu
p.ufl <- c.int + c.ufl

sum(p.osu>p.ufl)/trials
# = 0.04681
# Thus, OSU will win, on average, 4.681% of the time against Florida
# (according to this model and the 2009 data).


##########
#
# Problem 14.3

vit <- read.csv("http://courses.kvasaheim.com/stat40x3/data/vitaminC.csv")
names(vit)
summary(vit)
table(vit)

model.3 <- glm(cold ~ vitC, family=quasibinomial(link=logit), data=vit)
summary(model.3)


##########
#
# Problem 14.4

poi <- read.csv("http://courses.kvasaheim.com/stat40x3/data/fakepoisson.csv")
names(poi)
summary(poi)

# Here's the histogram
hist(poi$y)

# And the t-test
t.test(poi$y, mu=20)

# BUT, is n=100 large enough for the t-test to work? Let's find out!

long.way <- proc.time()    # To see how long this takes (for fun)

trials <- 1e5              # The number of trials to run
n      <- 100              # The sample size per trial
p      <- numeric(trials)  # Preallocate; growing p in the loop is slower

for(i in 1:trials) {
  z    <- rpois(n, lambda=20)
  p[i] <- t.test(z, mu=20)$p.value
}

hist(p, breaks=21)
abline(h=trials/20, lty=5, col=4)

proc.time() - long.way   # about 50s

# Faster:
short.way <- proc.time() # To see how long *this* takes

trials <- 1e5
n  <- 100
ss <- trials * n

z  <- rpois(ss, lambda=20)
zz <- matrix(z, ncol=n)  # one row per trial

tp <- function(x, mu) t.test(x, mu=mu)$p.value
p  <- apply(zz, 1, tp, 20)

hist(p, breaks=21)
abline(h=trials/20, lty=5, col=4)

proc.time() - short.way  # about 37s

# The second way is faster because it makes a single call to the random-
# number generator, while the first makes 1e5 separate calls.
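
# A final aside on speed (a minimal sketch, not part of the assigned
# solution): much of the remaining run time is spent inside t.test()
# itself. The same one-sample, two-sided p-values can be computed
# directly from the row means and standard deviations of zz (this
# assumes the zz, n, and p objects from the "faster" block above):
xbar  <- rowMeans(zz)
s     <- sqrt( rowSums( (zz - xbar)^2 ) / (n - 1) )  # row-wise sample SDs
tstat <- (xbar - 20) / (s / sqrt(n))                 # t statistics vs mu=20
p.vec <- 2 * pt( -abs(tstat), df = n - 1 )           # two-sided p-values

all.equal(p, p.vec)    # should be TRUE: matches the apply() results
hist(p.vec, breaks=21) # and the histogram looks the same as before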