########################################
#
# Script: 22 March 2011 (20110322.R)
#
########################################
# Today:
#
# ** Fitting GLMs
#    . Assumptions
#    . Testing Assumptions
#    . More Graphing (!!)
#

### New data: Belgian AIDS cases

t <- 1980+1:13
y <- c(12,14,33,50,67,74,123,141,165,204,253,246,240)
AIDS <- data.frame(year=t, cases=y)

# Note that I have now created a new dataset called AIDS. I can
# do to AIDS what I usually do to my other datasets (as I should).

# What is stored in t? What does 1:13 do?

# Quick and dirty plot (as AIDS has only two variables). NOT
# presentation quality, however (nor homework quality).

plot(AIDS)

# Q: Has the rate of HIV infection actually leveled off, or is it
#    within expected random variation?

# Ask yourself: What is the distribution of Y? Why is this an
# important question?

model1 <- glm(cases ~ year, family=poisson(link=log), data=AIDS)
summary(model1)

# How good is the fit? It looks good to me, but there is an issue with
# overdispersion, which is the residual deviance divided by the residual
# degrees of freedom. If the model is a 'good' model, this ratio should
# be close to one. Here, it is definitely not.

# This tells us we should fix our model before proceeding. To fix the
# model, we can add independent variables, interactions between
# independent variables, or powers of independent variables (in that
# order). Remember not to add too many variables, or you will overfit
# the data.

# One can also adjust the standard errors if one cannot fix the model.
# Overdispersion is an issue only in Poisson and Binomial models.
# As social scientists, we would rather fix the model than fix the
# standard errors.

# Let's create a better model

model2 <- glm(cases ~ year + I(year^2), family=poisson(link=log), data=AIDS)
summary(model2)

# Here, the dispersion is close enough to one, so we fixed the model.
# You will not always be able to fix the model. Why?

# Let us check some assumptions of the generalized linear model using
# these four graphs:

plot(model2)

# Those plots appear to indicate that there is no extreme violation of
# the assumptions, so our next step is to interpret the coefficients
# (and p-values) and graph the model:

ynew <- 1981:1993
pr <- predict(model2, newdata=data.frame(year=ynew))
pr <- exp(pr)

plot(AIDS, xlim=c(1980,1994), las=1,
  xlab="Year", ylab="AIDS Cases", ylim=c(0,300))
lines(ynew, pr)

# This is as we have done several times in the past.

# However, we can indicate on the graph the precision of our estimates
# of the population curve by using confidence bands.
# These are analogous to confidence intervals from last term.

ynew <- 1981:1993
pr <- predict(model2, newdata=data.frame(year=ynew), se.fit=TRUE)  # Note the change
pest <- exp(pr$fit)

plot(AIDS, xlim=c(1980,1994), las=1,
  xlab="Year", ylab="AIDS Cases", ylim=c(0,300))
lines(ynew, pest)

ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- exp(ucl)
lcl <- exp(lcl)

lines(ynew, ucl, lty=2)
lines(ynew, lcl, lty=2)

# What does this plot tell us that the previous one did not?
# Where did the 1.96 come from?
# What do I get when I type just pr? This indicates why we need to use
# the $ notation.
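
# An aside (my addition, not part of the original script), sketching
# answers to two of the questions above. The dispersion statistic is
# just the residual deviance over the residual degrees of freedom, so a
# small helper function saves us from eyeballing the summary() output
# each time. And the 1.96 is the two-sided 95% critical value of the
# standard Normal distribution:

dispersion <- function(mod) mod$deviance / mod$df.residual
dispersion(model1)  # far from one: overdispersed
dispersion(model2)  # close to one: acceptable

qnorm(0.975)  # approximately 1.96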
##################################################

rm(list=ls())  # Cleaning up our memory

# Two functions that will come in handy

logit     <- function(p) log( p/(1-p) )
logit.inv <- function(x) exp(x)/(1+exp(x))

# Let's load the Sri Lanka election data and play with it some more

sri <- read.csv("http://courses.kvasaheim.com/stat40x3/data/sri2010pres.csv", header=TRUE)
names(sri)
summary(sri)

# Fix the data (as we know we should)

v.raja   <- sri$RAJAPAKSE
v.rej    <- sri$REJECTED
v.total  <- sri$TOTAL
wonprov  <- sri$WONPROVINCE
division <- sri$DIVISION

drop <- which(division=="Postal")
v.raja   <- v.raja[-drop]
v.rej    <- v.rej[-drop]
v.total  <- v.total[-drop]
wonprov  <- wonprov[-drop]
division <- division[-drop]

drop <- which(division=="Displace")
v.raja   <- v.raja[-drop]
v.rej    <- v.rej[-drop]
v.total  <- v.total[-drop]
wonprov  <- wonprov[-drop]
division <- division[-drop]

# Next, we create two important variables

p.raja <- v.raja/v.total
p.rej  <- v.rej/v.total

# Q: Is there a significant relationship between the proportion of votes
#    declared invalid and the proportion of votes for Candidate
#    Rajapaksa?

# What kind of variable is the dependent variable?

model1 <- glm(p.rej~p.raja, family=binomial(link=logit))
summary(model1)

# Oops! A warning message. Note that we are using a Binomial
# distribution. Recall from last term that the binomial distribution is
# discrete; it can only take on integer values. Thus, we need to provide
# it a different type of dependent variable.

# New stuff. The dependent variable needs to be a two-column matrix of
# successes and failures:

y.rej <- cbind(v.rej, v.total-v.rej)  ######### A big change from the preliminary script!!

# The right model

model1 <- glm(y.rej~p.raja, family=binomial(link=logit))
summary(model1)

# Check the dispersion. Oops! This model is highly overdispersed. Let us
# try adding powers of our independent variable:

model2 <- glm(y.rej~p.raja+I(p.raja^2), family=binomial(link=logit))
summary(model2)

# Nope, no impressive change. Again, adding cubes:

model3 <- glm(y.rej~p.raja+I(p.raja^2)+I(p.raja^3), family=binomial(link=logit))
summary(model3)

# Nope, still heavily overdispersed. You should not try higher powers,
# as you do NOT want to overfit the data. So, the next step would be to
# add other variables to the model (which means collecting more data).
# Such variables may include wealth or education or median age.

# I will let you do that. I happen to know that it will not work, but
# only because I already know the issues with vote data. You, too, will
# be expert enough in certain areas to be able to know when to stop.

# So, we cannot fix the model. Thus, we return to our original model and
# adjust the standard errors. We do this by using a quasi- family:

model4 <- glm(y.rej~p.raja, family=quasibinomial(link=logit))
summary(model4)

# As we are fitting a quasi- family, the overdispersion is irrelevant
# and the standard errors are fixed for us. It is not the best solution,
# but it is the best we can do given these data.
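
# An aside (my addition, not part of the original script): the estimated
# dispersion parameter that the quasibinomial fit uses to inflate the
# standard errors can be pulled straight out of the summary object:

summary(model4)$dispersion

# Also note that R ships with this link built in: qlogis() is the logit
# and plogis() is its inverse, so plogis(x) agrees with our logit.inv(x).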
# Are the assumptions violated by this model?

plot(model4)

# Graph the results: the data, the model curve, and the confidence bands

par(mar=c(5,5,2,2)+0.1)
plot(p.rej~p.raja, las=1, xlim=c(0,1), ylim=c(0,0.05),
  xlab="Proportion of Vote for Rajapaksa",
  ylab="Proportion of Vote Rejected")

newraja <- seq(min(p.raja), max(p.raja), length=100)
pr <- predict(model4, newdata=data.frame(p.raja=newraja), se.fit=TRUE)
pest <- logit.inv(pr$fit)

ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- logit.inv(ucl)
lcl <- logit.inv(lcl)

lines(newraja, pest)
lines(newraja, ucl, lty=2)
lines(newraja, lcl, lty=2)

# Next question: Is this pattern the same in provinces he won as in
# provinces he lost? Let's see.

# One way of answering this is to add a variable indicating whether he
# won the province. Another way is to do separate analyses on the two
# subsets of data. The first is usually preferred. Let us do the first
# here, and the second below.

# The interaction model

modelw1 <- glm(y.rej~p.raja*wonprov, family=quasibinomial(link=logit))
summary(modelw1)

# Why did I start with the interaction model?

modelw2 <- glm(y.rej~p.raja+wonprov, family=quasibinomial(link=logit))
summary(modelw2)

# What are our null hypotheses? What are our conclusions? Are you sure?

# Check assumptions

plot(modelw2)

# Graphs:

par(mar=c(5,5,2,2)+0.1)
plot(p.rej~p.raja, las=1, xlim=c(0,1), ylim=c(0,0.05),
  xlab="Proportion of Vote for Rajapaksa",
  ylab="Proportion of Vote Rejected")

newraja <- seq(min(p.raja), max(p.raja), length=100)

pr <- predict(modelw2, newdata=data.frame(p.raja=newraja, wonprov=1), se.fit=TRUE)
pest <- logit.inv(pr$fit)
ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- logit.inv(ucl)
lcl <- logit.inv(lcl)

lines(newraja, pest, col=2)
lines(newraja, ucl, lty=2, col=2)
lines(newraja, lcl, lty=2, col=2)

pr <- predict(modelw2, newdata=data.frame(p.raja=newraja, wonprov=0), se.fit=TRUE)
pest <- logit.inv(pr$fit)
ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- logit.inv(ucl)
lcl <- logit.inv(lcl)

lines(newraja, pest, col=4)
lines(newraja, ucl, lty=2, col=4)
lines(newraja, lcl, lty=2, col=4)

# Alternatively, we can model the two subsets of data separately:

model.w1 <- glm(y.rej~p.raja, family=quasibinomial(link=logit), subset=wonprov==1)
summary(model.w1)

model.l1 <- glm(y.rej~p.raja, family=quasibinomial(link=logit), subset=wonprov==0)
summary(model.l1)

# Test assumptions

plot(model.w1)
plot(model.l1)

# Actually, I do not like the look of the third graph for model.l1. That
# makes me think this is not a good model and I should fix it somehow.
# However, since this is not the preferred way of comparing the two
# groupings, I won't bother.
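
# One more aside before the graphs (my addition, not part of the
# original script): with a quasi- family, a standard way to test whether
# the interaction term earns its keep is an F test comparing the nested
# models:

anova(modelw2, modelw1, test="F")  # H0: no interaction between p.raja and wonprov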
# The new usual graphs:

par(mar=c(5,5,2,2)+0.1)
plot(p.rej~p.raja, las=1, xlim=c(0,1), ylim=c(0,0.05),
  xlab="Proportion of Vote for Rajapaksa",
  ylab="Proportion of Vote Rejected")

newraja <- seq(min(p.raja[wonprov==1]), max(p.raja[wonprov==1]), length=100)
pr <- predict(model.w1, newdata=data.frame(p.raja=newraja), se.fit=TRUE)
pest <- logit.inv(pr$fit)
ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- logit.inv(ucl)
lcl <- logit.inv(lcl)

lines(newraja, pest, col=2)
lines(newraja, ucl, lty=2, col=2)
lines(newraja, lcl, lty=2, col=2)

newraja <- seq(min(p.raja[wonprov==0]), max(p.raja[wonprov==0]), length=100)
pr <- predict(model.l1, newdata=data.frame(p.raja=newraja), se.fit=TRUE)
pest <- logit.inv(pr$fit)
ucl <- pr$fit + 1.96*pr$se.fit
lcl <- pr$fit - 1.96*pr$se.fit
ucl <- logit.inv(ucl)
lcl <- logit.inv(lcl)

lines(newraja, pest, col=4)
lines(newraja, ucl, lty=2, col=4)
lines(newraja, lcl, lty=2, col=4)

# By now, this graphing should be second nature to you.

# Many people want a single number that tells them how well the model
# describes the data. In OLS, this is called the R^2 measure. It is a
# measure of the proportion of the error reduced by using the model.

# Technically, there is no R^2 measure for GLMs, but we can give a
# generalized R^2 measure that has the same properties. Note that the
# deviance is a type of error. Thus, this generalized R^2 measure is the
# improvement in deviance divided by the initial (null) deviance.

# We can get the numbers from our model summary, or we can create a
# function that calculates it for us:

devr2 <- function(mod) (mod$null.deviance - mod$deviance)/mod$null.deviance

# Try it.

summary(model.l1)
devr2(model.l1)
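
# A final aside (my addition, not part of the original script): devr2()
# is a drop-in for any of today's models, since every glm object carries
# a deviance and a null.deviance. Note, too, that for a Gaussian GLM
# (i.e., OLS) the deviance is the residual sum of squares and the null
# deviance is the total sum of squares, so devr2() reduces to the usual
# R^2:

devr2(model4)   # the quasibinomial model for all divisions
devr2(model.w1) # provinces Rajapaksa won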