########################################
#
# Script: 10 March 2011 (20110310.R)
#
########################################
# Today:
#
# ** Fitting GLMs
#    . Count data: Poisson, Quasipoisson, Negative Binomial
#    . Graphing
#    . Overfitting (!!)

# Read in simulated (pseudo) count data
dp <- read.csv("http://courses.kvasaheim.com/stat40x3/data/fakepoisson.csv",
               header = TRUE)
summary(dp$y)

# A histogram — also known as an empirical pdf — is another way
# of getting a feel for your data
hist(dp$y)

# The classical linear model (CLM) assumes linearity in the variables,
# Gaussian residuals, and the identity link.  It is numerically equivalent
# to ordinary least squares (OLS), although a completely different model.
model.l <- lm(y ~ x1, data = dp)
summary(model.l)

# Akaike's Information Criterion measures how well the model fits the
# data, BUT it is only useful for comparing models fit to the SAME data.
AIC(model.l)

# As the data are counts, fit them with a standard Poisson model
model.c <- glm(y ~ x1, family = poisson(link = log), data = dp)
summary(model.c)
# Is this model better than model.l?  Compare AIC scores.

# Comparison using graphs
with(dp, plot(x1, y, pch = 16, las = 1))
abline(model.l, col = 2, lwd = 2)  # abline() shortcut works only for lm

# For other models, predict on a grid and draw the fitted curve
x.pred <- 0:100 / 100
y.pred <- predict(model.c, newdata = data.frame(x1 = x.pred))
y.pred <- exp(y.pred)  # back-transform from the log link
lines(x.pred, y.pred, col = 4, lwd = 2)
# So, which model appears to fit the data better?

# Under certain circumstances we fit a quasi-Poisson instead of a Poisson.
# The coefficient estimates are unchanged; only the standard errors differ.
model.q <- glm(y ~ x1, family = quasipoisson(link = log), data = dp)
summary(model.q)
summary(model.c)
# Also note that we lost the AIC measure.
# The AIC is a function of the likelihood, but the quasipoisson only has
# a quasilikelihood — hence no AIC.  If you care, look it up.

# The Negative Binomial distribution is also used for counts.
# Here is how you fit the same data with a Negative Binomial family.
library(MASS)
model.nb <- glm.nb(y ~ x1, link = log, data = dp)
warnings()
summary(model.nb)
# The Negative Binomial estimates an additional (dispersion) parameter,
# so it depends more heavily on the data to produce correct estimates.
# Warnings indicate the fit did not converge: treat these estimates with
# a grain of salt (or increase the number of allowable iterations).

# A graph of the four models
with(dp, plot(x1, y, pch = 16, las = 1))
abline(model.l, col = 2, lwd = 2)
lines(x.pred, y.pred, col = 4, lwd = 2)

# The quasipoisson will ALWAYS give the same model plot as the poisson
q.pred <- exp(predict(model.q, newdata = data.frame(x1 = x.pred)))
lines(x.pred, q.pred, col = 6, lwd = 4, lty = 2)

# The negative binomial will often give something close
n.pred <- exp(predict(model.nb, newdata = data.frame(x1 = x.pred)))
lines(x.pred, n.pred, col = 8, lwd = 6, lty = 4)

# Another example — this one demonstrates the importance of the offset.
# Read in the terrorism data.
terrorism <- read.csv("http://courses.kvasaheim.com/stat40x3/data/terrorism.csv",
                      header = TRUE)
names(terrorism)
summary(terrorism)
### What do we do about the days variable?
# Option One: include exposure (days) as an ordinary independent variable
model.1p <- glm(pira ~ riteleft + days, family = poisson(link = log),
                data = terrorism)
model.1q <- glm(pira ~ riteleft + days, family = quasipoisson(link = log),
                data = terrorism)
model.1n <- glm.nb(pira ~ riteleft + days, link = log, data = terrorism)
summary(model.1p)
summary(model.1q)
summary(model.1n)

# Option Two: include it as an offset (preferred)
model.2p <- glm(pira ~ riteleft, offset = log(days),
                family = poisson(link = log), data = terrorism)
model.2q <- glm(pira ~ riteleft, offset = log(days),
                family = quasipoisson(link = log), data = terrorism)
# BUG FIX: glm.nb()'s second positional argument is `data`, so the original
# call glm.nb(pira ~ riteleft, offset(log(days)), ...) handed the offset to
# the data argument and the exposure never entered the model.  For glm.nb,
# the offset belongs inside the formula.
model.2n <- glm.nb(pira ~ riteleft + offset(log(days)), link = log,
                   data = terrorism)
summary(model.2p)
summary(model.2q)
summary(model.2n)

# Now add multiple independent variables, driven by our *literature review*
# and our *research question*.  Note the I() notation: I() means 'as-is'.
# To ease model creation, store the research formula in a variable.
formula <- "pira ~ riteleft + I(riteleft^2) + I(riteleft^3) + I(riteleft^4) + labour"

# Fit the research model with a quasi-Poisson and a negative binomial
model.3q <- glm(as.formula(formula), offset = log(days),
                family = quasipoisson(link = log), data = terrorism)
# Same fix as model.2n: put the offset in the formula for glm.nb
model.3n <- glm.nb(update(as.formula(formula), . ~ . + offset(log(days))),
                   link = log, data = terrorism)
summary(model.3q)
summary(model.3n)
# Yay!  Statistical significance!
# Predictions over a grid of riteleft values, for each labour status.
x <- -50:50
# NOTE(review): for models fitted with the offset= *argument* (rather than
# offset() inside the formula), predict() does not apply an offset from
# newdata — the days = 365 column may have no effect here; confirm against
# how model.3q / model.3n were fitted.
prediction.q0 <- exp(predict(model.3q, newdata = data.frame(riteleft = x, labour = 0, days = 365)))
prediction.q1 <- exp(predict(model.3q, newdata = data.frame(riteleft = x, labour = 1, days = 365)))
prediction.n0 <- exp(predict(model.3n, newdata = data.frame(riteleft = x, labour = 0, days = 365)))
prediction.n1 <- exp(predict(model.3n, newdata = data.frame(riteleft = x, labour = 1, days = 365)))

# Helper: draw the annualized-deaths scatter plus two fitted curves.
# Used for both the quasi-Poisson and the negative-binomial displays.
plot.fits <- function(pred0, pred1) {
  plot(total/days*365 ~ riteleft, data = terrorism, type = "n",
       xlim = c(-75, 75), col = 2, las = 1,
       ylab = "Annualized Deaths due to Terrorism",
       xlab = "Level of conservatism in the UK Parliament")
  abline(v = 0, col = 'grey')
  with(terrorism,
       points(riteleft[labour == 1], total[labour == 1]/days[labour == 1]*365,
              col = 4, pch = 16))
  with(terrorism,
       points(riteleft[labour == 0], total[labour == 0]/days[labour == 0]*365,
              col = 2, pch = 17))
  lines(x, pred0, col = 3, lwd = 2)
  lines(x, pred1, col = 6, lwd = 2)
}

# Plot 1: quasi-Poisson
plot.fits(prediction.q0, prediction.q1)

# Plot 2: negative binomial
plot.fits(prediction.n0, prediction.n1)

# Wait!  What happened?  Overfitting!
# This is what happens when you have too many independent variables for
# your data: you are fitting the data, not the population.  Too many
# independent variables lets those curves jump around to hit all of the
# data points, yielding predictions that are worthless for explaining
# the underlying population relationships.
# Beware!