########################################
#
# Script: 22 March 2011 (20110322.R)
#
########################################
# Today:
#
# ** Fitting GLMs
#    . Assumptions
#    . Testing Assumptions
#    . More Graphing (!!)
#

### New data: Belgian AIDS cases

t <- 1980+1:13
y <- c(12,14,33,50,67,74,123,141,165,204,253,246,240)
AIDS <- data.frame(year=t, cases=y)

# Note that I have now created a new dataset called AIDS. I can
# do to AIDS what I usually do to my other datasets (as I should).

# What is stored in t? What does 1:13 do?

# Quick and dirty plot (as AIDS has only two variables). NOT
# presentation quality, however (nor homework quality).

plot(AIDS)

# Q: Has the rate of HIV infection actually leveled off, or is it
#    within expected random variation?

# Ask yourself: What is the distribution of Y? Why is this an
# important question?

model1 <- glm(cases ~ year, family=poisson(link=log), data=AIDS)
summary(model1)

# How good is the fit? It looks good to me, but there is an issue with
# overdispersion, which is the residual deviance divided by the residual
# degrees of freedom. If the model is a 'good' model, this ratio should
# be close to one. Here, it is definitely not.

# This tells us we should fix our model before proceeding. To fix the
# model, we can add independent variables, interactions between
# independent variables, or powers of independent variables (in that
# order). Remember not to add too many variables, or you will overfit
# the data.

# One can also adjust the standard errors if one cannot fix the model.
# Overdispersion is an issue only in Poisson and Binomial models.
# As social scientists, we would rather fix the model than fix the
# standard errors.

# Let's create a better model

model2 <- glm(cases ~ year + I(year^2), family=poisson(link=log), data=AIDS)
summary(model2)

# Here, the dispersion is close enough to one, so we fixed the model.
# You will not always be able to fix the model. Why?

# Let us check some assumptions of the generalized linear model using
# these four graphs:

plot(model2)

# Those plots appear to indicate that there is no extreme violation of
# the assumptions, so our next step is to interpret the coefficients
# (and p-values) and graph the model:

ynew <- 1981:1993
pr <- predict(model2, newdata=data.frame(year=ynew))
pr <- exp(pr)

plot(AIDS, xlim=c(1980,1994), las=1,
  xlab="Year", ylab="AIDS Cases", ylim=c(0,300))
lines(ynew, pr)

# This is as we have done several times in the past.

# However, we can indicate on the graph the precision of our estimates
# of the population curve by using confidence bands.
# These are analogous to confidence intervals from last term.

ynew <- 1981:1993
pr <- predict(model2, newdata=data.frame(year=ynew), se.fit=TRUE)  # Note the change
pest <- exp(pr$fit)

plot(AIDS, xlim=c(1980,1994), las=1,
  xlab="Year", ylab="AIDS Cases", ylim=c(0,300))
lines(ynew, pest)

ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- exp(ucl)
lcl <- exp(lcl)

lines(ynew, ucl, lty=2)
lines(ynew, lcl, lty=2)

# What does this plot tell us that the previous one did not?
# Where did the 1.96 come from?
# What do I get when I type just pr? This indicates why we need to use
# the $ notation.
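
# An aside (my addition, not part of the original script), sketching
# answers to two of the questions above. The dispersion statistic is
# just the residual deviance over the residual degrees of freedom, so a
# small helper function saves us from eyeballing the summary() output
# each time. And the 1.96 is the two-sided 95% critical value of the
# standard Normal distribution:

dispersion <- function(mod) mod$deviance / mod$df.residual
dispersion(model1)  # far from one: overdispersed
dispersion(model2)  # close to one: acceptable

qnorm(0.975)  # approximately 1.96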
##################################################

rm(list=ls())  # Cleaning up our memory

# Two functions that will come in handy

logit     <- function(p) log( p/(1-p) )
logit.inv <- function(x) exp(x)/(1+exp(x))

# Let's load the Sri Lanka election data and play with it some more

sri <- read.csv("http://courses.kvasaheim.com/stat40x3/data/sri2010pres.csv", header=TRUE)
names(sri)
summary(sri)

# Fix the data (as we know we should)

v.raja   <- sri$RAJAPAKSE
v.rej    <- sri$REJECTED
v.total  <- sri$TOTAL
wonprov  <- sri$WONPROVINCE
division <- sri$DIVISION

drop <- which(division=="Postal")
v.raja   <- v.raja[-drop]
v.rej    <- v.rej[-drop]
v.total  <- v.total[-drop]
wonprov  <- wonprov[-drop]
division <- division[-drop]

drop <- which(division=="Displace")
v.raja   <- v.raja[-drop]
v.rej    <- v.rej[-drop]
v.total  <- v.total[-drop]
wonprov  <- wonprov[-drop]
division <- division[-drop]

# Next, we create two important variables

p.raja <- v.raja/v.total
p.rej  <- v.rej/v.total

# Q: Is there a significant relationship between the proportion of votes
#    declared invalid and the proportion of votes for Candidate
#    Rajapaksa?

# What kind of variable is the dependent variable?

model1 <- glm(p.rej~p.raja, family=binomial(link=logit))
summary(model1)

# Oops! A warning message. Note that we are using a Binomial
# distribution. Recall from last term that the binomial distribution is
# discrete; it can only take on integer values. Thus, we need to provide
# it a different type of dependent variable.

# New stuff. The dependent variable needs to be a two-column matrix of
# successes and failures:

y.rej <- cbind(v.rej, v.total-v.rej)  ######### A big change from the preliminary script!!

# The right model

model1 <- glm(y.rej~p.raja, family=binomial(link=logit))
summary(model1)

# Check the dispersion. Oops! This model is highly overdispersed. Let us
# try adding powers of our independent variable:

model2 <- glm(y.rej~p.raja+I(p.raja^2), family=binomial(link=logit))
summary(model2)

# Nope, no impressive change. Again, adding cubes:

model3 <- glm(y.rej~p.raja+I(p.raja^2)+I(p.raja^3), family=binomial(link=logit))
summary(model3)

# Nope, still heavily overdispersed. You should not try higher powers,
# as you do NOT want to overfit the data. So, the next step would be to
# add other variables to the model (which means collecting more data).
# Such variables may include wealth or education or median age.

# I will let you do that. I happen to know that it will not work, but
# only because I already know the issues with vote data. You, too, will
# be expert enough in certain areas to be able to know when to stop.

# So, we cannot fix the model. Thus, we return to our original model and
# adjust the standard errors. We do this by using a quasi- family:

model4 <- glm(y.rej~p.raja, family=quasibinomial(link=logit))
summary(model4)

# As we are fitting a quasi- family, the overdispersion is irrelevant
# and the standard errors are fixed for us. It is not the best solution,
# but it is the best we can do given these data.
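
# An aside (my addition, not part of the original script): the estimated
# dispersion parameter that the quasibinomial fit uses to inflate the
# standard errors can be pulled straight out of the summary object:

summary(model4)$dispersion

# Also note that R ships with this link built in: qlogis() is the logit
# and plogis() is its inverse, so plogis(x) agrees with our logit.inv(x).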
# Are the assumptions violated by this model?

plot(model4)

# Graph the results: the data, the model curve, and the confidence bands

par(mar=c(5,5,2,2)+0.1)
plot(p.rej~p.raja, las=1, xlim=c(0,1), ylim=c(0,0.05),
  xlab="Proportion of Vote for Rajapaksa",
  ylab="Proportion of Vote Rejected")

newraja <- seq(min(p.raja), max(p.raja), length=100)
pr <- predict(model4, newdata=data.frame(p.raja=newraja), se.fit=TRUE)
pest <- logit.inv(pr$fit)

ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- logit.inv(ucl)
lcl <- logit.inv(lcl)

lines(newraja, pest)
lines(newraja, ucl, lty=2)
lines(newraja, lcl, lty=2)

# Next question: Is this pattern the same in provinces he won as in
# provinces he lost? Let's see.

# One way of answering this is to add a variable indicating whether he
# won the province. Another way is to do separate analyses on the two
# subsets of data. The first is usually preferred. Let us do the first
# here, and the second below.

# The interaction model

modelw1 <- glm(y.rej~p.raja*wonprov, family=quasibinomial(link=logit))
summary(modelw1)

# Why did I start with the interaction model?

modelw2 <- glm(y.rej~p.raja+wonprov, family=quasibinomial(link=logit))
summary(modelw2)

# What are our null hypotheses? What are our conclusions? Are you sure?

# Check assumptions

plot(modelw2)

# Graphs:

par(mar=c(5,5,2,2)+0.1)
plot(p.rej~p.raja, las=1, xlim=c(0,1), ylim=c(0,0.05),
  xlab="Proportion of Vote for Rajapaksa",
  ylab="Proportion of Vote Rejected")

newraja <- seq(min(p.raja), max(p.raja), length=100)

pr <- predict(modelw2, newdata=data.frame(p.raja=newraja, wonprov=1), se.fit=TRUE)
pest <- logit.inv(pr$fit)
ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- logit.inv(ucl)
lcl <- logit.inv(lcl)

lines(newraja, pest, col=2)
lines(newraja, ucl, lty=2, col=2)
lines(newraja, lcl, lty=2, col=2)

pr <- predict(modelw2, newdata=data.frame(p.raja=newraja, wonprov=0), se.fit=TRUE)
pest <- logit.inv(pr$fit)
ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- logit.inv(ucl)
lcl <- logit.inv(lcl)

lines(newraja, pest, col=4)
lines(newraja, ucl, lty=2, col=4)
lines(newraja, lcl, lty=2, col=4)

# Alternatively, we can model the two subsets of data separately:

model.w1 <- glm(y.rej~p.raja, family=quasibinomial(link=logit), subset=wonprov==1)
summary(model.w1)

model.l1 <- glm(y.rej~p.raja, family=quasibinomial(link=logit), subset=wonprov==0)
summary(model.l1)

# Test assumptions

plot(model.w1)
plot(model.l1)

# Actually, I do not like the look of the third graph for model.l1. That
# makes me think this is not a good model and I should fix it somehow.
# However, since this is not the preferred way of comparing the two
# groupings, I won't bother.
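
# One more aside before the graphs (my addition, not part of the
# original script): with a quasi- family, a standard way to test whether
# the interaction term earns its keep is an F test comparing the nested
# models:

anova(modelw2, modelw1, test="F")  # H0: no interaction between p.raja and wonprov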
# The new usual graphs:

par(mar=c(5,5,2,2)+0.1)
plot(p.rej~p.raja, las=1, xlim=c(0,1), ylim=c(0,0.05),
  xlab="Proportion of Vote for Rajapaksa",
  ylab="Proportion of Vote Rejected")

newraja <- seq(min(p.raja[wonprov==1]), max(p.raja[wonprov==1]), length=100)
pr <- predict(model.w1, newdata=data.frame(p.raja=newraja), se.fit=TRUE)
pest <- logit.inv(pr$fit)
ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- logit.inv(ucl)
lcl <- logit.inv(lcl)

lines(newraja, pest, col=2)
lines(newraja, ucl, lty=2, col=2)
lines(newraja, lcl, lty=2, col=2)

newraja <- seq(min(p.raja[wonprov==0]), max(p.raja[wonprov==0]), length=100)
pr <- predict(model.l1, newdata=data.frame(p.raja=newraja), se.fit=TRUE)
pest <- logit.inv(pr$fit)
ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- logit.inv(ucl)
lcl <- logit.inv(lcl)

lines(newraja, pest, col=4)
lines(newraja, ucl, lty=2, col=4)
lines(newraja, lcl, lty=2, col=4)

# By now, this graphing should be second nature to you.

# Many people want a single number that tells them how well the model
# describes the data. In OLS, this is called the R^2 measure. It is a
# measure of the proportion of the error reduced by using the model.

# Technically, there is no R^2 measure for GLMs, but we can give a
# generalized R^2 measure that has the same properties. Note that the
# deviance is a type of error. Thus, this generalized R^2 measure is the
# improvement in deviance divided by the initial (null) deviance.

# We can get the numbers from our model summary, or we can create a
# function that calculates it for us:

devr2 <- function(mod) (mod$null.deviance - mod$deviance)/mod$null.deviance

# Try it.

summary(model.l1)
devr2(model.l1)
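
# A final aside (my addition, not part of the original script): devr2()
# is a drop-in for any of today's models, since every glm object carries
# a deviance and a null.deviance. Note, too, that for a Gaussian GLM
# (i.e., OLS) the deviance is the residual sum of squares and the null
# deviance is the total sum of squares, so devr2() reduces to the usual
# R^2:

devr2(model4)   # the quasibinomial model for all divisions
devr2(model.w1) # provinces Rajapaksa won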