##############################
#
# Script: Solutions Midterm II
#         (exam2a.R)
#
##############################

# In case we need these functions:
logit.inv <- function(x) exp(x)/(1+exp(x))
logit     <- function(p) log( p/(1-p) )

# And so it begins! =)

# Read in the data
gdp <- read.csv("http://courses.kvasaheim.com/stat40x3/data/gdpcap.csv", header=TRUE)
names(gdp)

# Try the fullest model
m1 <- glm(gdpcap ~ hig * I(hig^2) * democracy * I(democracy^2),
          family=poisson(link=log), data=gdp)
summary(m1)

# Oy vey. Let's pare it down
m2 <- glm(gdpcap ~ hig * I(hig^2) * democracy * I(democracy^2) - hig:I(hig^2):democracy:I(democracy^2),
          family=quasipoisson(link=log), data=gdp)
summary(m2)

formula <- "gdpcap ~ ( hig + I(hig^2) ) * ( democracy + I(democracy^2) )"
m3 <- glm(formula, family=quasipoisson(link=log), data=gdp)
summary(m3)

formula <- "gdpcap ~ ( hig + I(hig^2) ) * ( democracy + I(democracy^2) ) - I(hig^2):I(democracy^2)"
m4 <- glm(formula, family=quasipoisson(link=log), data=gdp)
summary(m4)

formula <- "gdpcap ~ ( hig + I(hig^2) ) * ( democracy + I(democracy^2) ) - I(hig^2):I(democracy^2) - hig:I(democracy^2)"
m5 <- glm(formula, family=quasipoisson(link=log), data=gdp)
summary(m5)

formula <- "gdpcap ~ ( hig + I(hig^2) ) * ( democracy + I(democracy^2) ) - I(hig^2):I(democracy^2) - hig:I(democracy^2) - I(hig^2):democracy"
m6 <- glm(formula, family=quasipoisson(link=log), data=gdp)
summary(m6)

# And this looks best. It is equivalent to
formula <- "gdpcap ~ hig*democracy + I(hig^2) + I(democracy^2)"
m7 <- glm(formula, family=quasipoisson(link=log), data=gdp)
summary(m7)
# which may be easier to read

# Now to predict
newState <- data.frame(hig=5, democracy=5)
exp( predict(m7, newdata=newState) )
# $11,017.70

# The Neat-O graph:
png("plot6.png", width=6, height=4, units="in", res=600)
par(mar=c(5,6,2,2)+0.1)
par(cex=0.7, cex.axis=0.9)
plot(gdpcap~democracy, data=gdp,
     xlim=c(-10,10),   xlab="Democracy Level",
     ylim=c(0,150000), ylab="GDP per capita [$]\n\n",
     yaxt="n")
axis(2, labels=c("10,000","50,000","100,000","150,000"),
        at=c(10000,50000,100000,150000), las=1)

ndem <- -100:100/10

# Curve and pointwise 95% band for hig=7 (col=4; "Low Corruption" in the legend)
newState <- data.frame(hig=7, democracy=ndem)
pr1  <- predict(m7, newdata=newState, se.fit=TRUE)
pest <- exp(pr1$fit)
ucl  <- pr1$fit + 1.96*pr1$se.fit
lcl  <- pr1$fit - 1.96*pr1$se.fit
ucl  <- exp(ucl)
lcl  <- exp(lcl)
lines(ndem, pest, col=4)
lines(ndem, ucl,  col=4, lty=2)
lines(ndem, lcl,  col=4, lty=2)

# Curve and pointwise 95% band for hig=3 (col=2; "High Corruption" in the legend)
newState <- data.frame(hig=3, democracy=ndem)
pr1  <- predict(m7, newdata=newState, se.fit=TRUE)
pest <- exp(pr1$fit)
ucl  <- pr1$fit + 1.96*pr1$se.fit
lcl  <- pr1$fit - 1.96*pr1$se.fit
ucl  <- exp(ucl)
lcl  <- exp(lcl)
lines(ndem, pest, col=2)
lines(ndem, ucl,  col=2, lty=2)
lines(ndem, lcl,  col=2, lty=2)

legend("topright", c("High Corruption","Low Corruption"),
       col=c(2,4), bty="n", lty=1)
dev.off()


##################################################
# Problem 7

fbi <- read.csv("http://courses.kvasaheim.com/stat40x3/data/crime.csv", header=TRUE)
names(fbi)
summary(fbi)

formula <- "vcrime00 ~ vcrime90 * urbanp2000 * gspcap00 * conserve"
m1 <- glm(formula, family=quasipoisson(link=log), data=fbi)
summary(m1)

# Neat-O! The four-way interaction is statistically significant, so
# we may be able to stop here. (Remember that you cannot remove a
# lower-order term while higher-order terms remain in the model.)
# But there are 16 parameters fit using 51 pieces of data. Could this
# be a case of overfitting the data?
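# A quick sanity check on those counts (a sketch; assumes m1 and fbi from
# the fit above are still in the workspace):
length(coef(m1))   # number of fitted parameters -- 16 for the full four-way model
nrow(fbi)          # number of observations -- the 51 pieces of data noted above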
# One may be able to pare this down by removing ALL two- and
# three-way interactions and comparing the two models:
formula <- "vcrime00 ~ vcrime90 + urbanp2000 + gspcap00 + conserve"
m2 <- glm(formula, family=quasipoisson(link=log), data=fbi)
summary(m2)

anova(m2, m1)

# The difference in deviance here is approximately chi-squared with 11 df.
# With this, we note that the p-value will be much less than our alpha of
# 0.05. Thus, this model and the full model are significantly different,
# and we must return to using m1.

# Now to predict:
OK <- data.frame(vcrime90=547.5, urbanp2000=65.3, gspcap00=26352.36, conserve=0.12)
pr <- predict(m1, newdata=OK)
exp(pr)
# 401.5461

# The real vcrime00 is
fbi$vcrime00[fbi$scode=="OK"]
# 497.8

# This gives a prediction error of 497.8 - 401.5461 = 96.3

# As there are five dimensions to our graph, I will skip it.


##################################################
# Problem 8

filename <- "http://www.electoralforensics.org/datasets/xsd2011referendum.csv"
xsd <- read.csv(filename, header=TRUE)
names(xsd)

# Let me peel off some variables to make my life happier
v.yes     <- xsd$Secession
v.invalid <- xsd$Invalid
v.total   <- xsd$Votes

v.valid   <- v.total - v.invalid
p.yes     <- v.yes/v.valid
p.invalid <- v.invalid/v.total

y.invalid <- cbind(v.invalid, v.valid)

# I gave you the model
da.model <- glm(y.invalid ~ p.yes, family=quasibinomial(link=logit))
summary(da.model)
# Uf da! Statistical significance

# Now, to graphing:
png("plot8.png", width=6, height=7, units="in", res=600)
par(mar=c(5,4,2,2)+0.1)
par(cex=0.7, cex.axis=0.9)
plot(p.invalid~p.yes,
     xlim=c(0,1),   xlab="Proportion of Vote for Secession",
     ylim=c(0,0.1), ylab="Proportion of the Vote Declared Invalid",
     las=1)

newpyes <- 0:100/100
pr   <- predict(da.model, newdata=data.frame(p.yes=newpyes), se.fit=TRUE)
pest <- logit.inv(pr$fit)
ucl  <- pr$fit + 1.96*pr$se.fit
lcl  <- pr$fit - 1.96*pr$se.fit
ucl  <- logit.inv(ucl)
lcl  <- logit.inv(lcl)

lines(newpyes, pest, col=1, lwd=2)
lines(newpyes, ucl,  col=2, lty=2)
lines(newpyes, lcl,  col=2, lty=2)

# Reference line at the overall mean invalidation rate
m <- mean(p.invalid)
abline(h=m, col=4, lty=4)

dev.off()
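
# A small follow-up sketch (an illustration, not part of the assignment):
# assuming da.model and logit.inv from above, this pulls the estimated
# dispersion parameter from the quasibinomial fit and gives the fitted
# invalid-vote proportion, with an approximate 95% confidence interval,
# at a hypothetical vote split of p.yes = 0.5.
summary(da.model)$dispersion                      # estimated dispersion

pr0 <- predict(da.model, newdata=data.frame(p.yes=0.5), se.fit=TRUE)
logit.inv( pr0$fit )                              # point estimate on the proportion scale
logit.inv( pr0$fit + c(-1,1)*1.96*pr0$se.fit )    # approximate 95% CI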