########################################
#
# Script: 10 March 2011 (20110310.R)
#
########################################
# Today:
#
# ** Fitting GLMs
#    . Count data: Poisson, Quasipoisson, Negative Binomial
#    . Graphing
#    . Overfitting (!!)

# Read in simulated (pseudo) count data
dp <- read.csv("http://courses.kvasaheim.com/stat40x3/data/fakepoisson.csv",
               header = TRUE)
summary(dp$y)

# A histogram — also known as an empirical pdf — is another way
# of getting a feel for your data
hist(dp$y)

# The classical linear model (CLM) assumes linearity in the variables,
# Gaussian residuals, and the identity link.  It is numerically equivalent
# to ordinary least squares (OLS), although a completely different model.
model.l <- lm(y ~ x1, data = dp)
summary(model.l)

# Akaike's Information Criterion measures how well the model fits the
# data, BUT it is only useful for comparing models fit to the SAME data.
AIC(model.l)

# As the data are counts, fit them with a standard Poisson model
model.c <- glm(y ~ x1, family = poisson(link = log), data = dp)
summary(model.c)
# Is this model better than model.l?  Compare AIC scores.

# Comparison using graphs
with(dp, plot(x1, y, pch = 16, las = 1))
abline(model.l, col = 2, lwd = 2)  # abline() shortcut works only for lm

# For other models, predict on a grid and draw the fitted curve
x.pred <- 0:100 / 100
y.pred <- predict(model.c, newdata = data.frame(x1 = x.pred))
y.pred <- exp(y.pred)  # back-transform from the log link
lines(x.pred, y.pred, col = 4, lwd = 2)
# So, which model appears to fit the data better?

# Under certain circumstances we fit a quasi-Poisson instead of a Poisson.
# The coefficient estimates are unchanged; only the standard errors differ.
model.q <- glm(y ~ x1, family = quasipoisson(link = log), data = dp)
summary(model.q)
summary(model.c)
# Also note that we lost the AIC measure.
# The AIC is a function of the likelihood, but the quasipoisson only has
# a quasilikelihood — hence no AIC.  If you care, look it up.

# The Negative Binomial distribution is also used for counts.
# Here is how you fit the same data with a Negative Binomial family.
library(MASS)
model.nb <- glm.nb(y ~ x1, link = log, data = dp)
warnings()
summary(model.nb)
# The Negative Binomial estimates an additional (dispersion) parameter,
# so it depends more heavily on the data to produce correct estimates.
# Warnings indicate the fit did not converge: treat these estimates with
# a grain of salt (or increase the number of allowable iterations).

# A graph of the four models
with(dp, plot(x1, y, pch = 16, las = 1))
abline(model.l, col = 2, lwd = 2)
lines(x.pred, y.pred, col = 4, lwd = 2)

# The quasipoisson will ALWAYS give the same model plot as the poisson
q.pred <- exp(predict(model.q, newdata = data.frame(x1 = x.pred)))
lines(x.pred, q.pred, col = 6, lwd = 4, lty = 2)

# The negative binomial will often give something close
n.pred <- exp(predict(model.nb, newdata = data.frame(x1 = x.pred)))
lines(x.pred, n.pred, col = 8, lwd = 6, lty = 4)

# Another example — this one demonstrates the importance of the offset.
# Read in the terrorism data.
terrorism <- read.csv("http://courses.kvasaheim.com/stat40x3/data/terrorism.csv",
                      header = TRUE)
names(terrorism)
summary(terrorism)
### What do we do about the days variable?
# Option One: include exposure (days) as an ordinary independent variable
model.1p <- glm(pira ~ riteleft + days, family = poisson(link = log),
                data = terrorism)
model.1q <- glm(pira ~ riteleft + days, family = quasipoisson(link = log),
                data = terrorism)
model.1n <- glm.nb(pira ~ riteleft + days, link = log, data = terrorism)
summary(model.1p)
summary(model.1q)
summary(model.1n)

# Option Two: include it as an offset (preferred)
model.2p <- glm(pira ~ riteleft, offset = log(days),
                family = poisson(link = log), data = terrorism)
model.2q <- glm(pira ~ riteleft, offset = log(days),
                family = quasipoisson(link = log), data = terrorism)
# BUG FIX: glm.nb()'s second positional argument is `data`, so the original
# call glm.nb(pira ~ riteleft, offset(log(days)), ...) handed the offset to
# the data argument and the exposure never entered the model.  For glm.nb,
# the offset belongs inside the formula.
model.2n <- glm.nb(pira ~ riteleft + offset(log(days)), link = log,
                   data = terrorism)
summary(model.2p)
summary(model.2q)
summary(model.2n)

# Now add multiple independent variables, driven by our *literature review*
# and our *research question*.  Note the I() notation: I() means 'as-is'.
# To ease model creation, store the research formula in a variable.
formula <- "pira ~ riteleft + I(riteleft^2) + I(riteleft^3) + I(riteleft^4) + labour"

# Fit the research model with a quasi-Poisson and a negative binomial
model.3q <- glm(as.formula(formula), offset = log(days),
                family = quasipoisson(link = log), data = terrorism)
# Same fix as model.2n: put the offset in the formula for glm.nb
model.3n <- glm.nb(update(as.formula(formula), . ~ . + offset(log(days))),
                   link = log, data = terrorism)
summary(model.3q)
summary(model.3n)
# Yay!  Statistical significance!
# Predictions over a grid of riteleft values, for each labour status.
x <- -50:50
# NOTE(review): for models fitted with the offset= *argument* (rather than
# offset() inside the formula), predict() does not apply an offset from
# newdata — the days = 365 column may have no effect here; confirm against
# how model.3q / model.3n were fitted.
prediction.q0 <- exp(predict(model.3q, newdata = data.frame(riteleft = x, labour = 0, days = 365)))
prediction.q1 <- exp(predict(model.3q, newdata = data.frame(riteleft = x, labour = 1, days = 365)))
prediction.n0 <- exp(predict(model.3n, newdata = data.frame(riteleft = x, labour = 0, days = 365)))
prediction.n1 <- exp(predict(model.3n, newdata = data.frame(riteleft = x, labour = 1, days = 365)))

# Helper: draw the annualized-deaths scatter plus two fitted curves.
# Used for both the quasi-Poisson and the negative-binomial displays.
plot.fits <- function(pred0, pred1) {
  plot(total/days*365 ~ riteleft, data = terrorism, type = "n",
       xlim = c(-75, 75), col = 2, las = 1,
       ylab = "Annualized Deaths due to Terrorism",
       xlab = "Level of conservatism in the UK Parliament")
  abline(v = 0, col = 'grey')
  with(terrorism,
       points(riteleft[labour == 1], total[labour == 1]/days[labour == 1]*365,
              col = 4, pch = 16))
  with(terrorism,
       points(riteleft[labour == 0], total[labour == 0]/days[labour == 0]*365,
              col = 2, pch = 17))
  lines(x, pred0, col = 3, lwd = 2)
  lines(x, pred1, col = 6, lwd = 2)
}

# Plot 1: quasi-Poisson
plot.fits(prediction.q0, prediction.q1)

# Plot 2: negative binomial
plot.fits(prediction.n0, prediction.n1)

# Wait!  What happened?  Overfitting!
# This is what happens when you have too many independent variables for
# your data: you are fitting the data, not the population.  Too many
# independent variables lets those curves jump around to hit all of the
# data points, yielding predictions that are worthless for explaining
# the underlying population relationships.
# Beware!