########################################
#
# Script: 29 March 2011 (20110329.R)
#
########################################

# Today:
#
# ** Errors
#
# ** More Fitting GLMs
#     . More assumptions (variability)
#     . Results when assumptions are violated


ssm <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ssm.csv", header=TRUE)
names(ssm)
summary(ssm)

# A first attempt: pctfavor is a percentage (0 to 100), so the binomial family
# refuses it ("y values must be 0 <= y <= 1")
model.1 <- glm(pctfavor ~ year * povertyRate * gspcap, data=ssm, family=binomial(link=logit) )

# Rescale to a proportion.  This fits, but R warns about non-integer
# "successes" because no counts (weights) were supplied
ssm$propfavor <- ssm$pctfavor/100
model.2 <- glm(propfavor ~ year * povertyRate * gspcap, data=ssm, family=binomial(link=logit) )

# The two-column (successes, failures) response is the standard way to hand
# aggregated counts to a binomial GLM
successes <- 1000*ssm$propfavor
failures  <- 1000 - successes
y <- cbind(successes, failures)

model.3 <- glm(y ~ year * povertyRate * gspcap, data=ssm, family=binomial(link=logit) )
summary(model.3)

# Refit with quasibinomial, which estimates the dispersion instead of fixing it at 1
model.4 <- glm(y ~ year * povertyRate * gspcap, data=ssm, family=quasibinomial(link=logit) )
summary(model.4)

# Paring down the model:
#   Drop the three-way interaction (year:povertyRate:gspcap).   (still no good)
#   Test the three pairs of two-way interactions.               (still no good)
#   Test each of the three single two-way interactions.         (still no good)
# So, we are left with an additive model

model.9 <- glm(y ~ year + povertyRate + gspcap, data=ssm, family=quasibinomial(link=logit) )
summary(model.9)


###
# What about Poisson models?

model.10 <- glm(successes ~ year * povertyRate * gspcap, data=ssm, family=poisson(link=log) )
summary(model.10)
# Why no need for an offset?  The divisor is constant (every count is out of
# the same 1000).

model.11 <- glm(successes ~ year * povertyRate * gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.11)

# Same paring steps as above

model.12 <- glm(successes ~ year * povertyRate * gspcap - year:povertyRate:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.12)

# Thus, no three-way interaction

model.13 <- glm(successes ~ year * povertyRate * gspcap - povertyRate:gspcap - year:povertyRate:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.13)

model.14 <- glm(successes ~ year * povertyRate * gspcap - year:gspcap - year:povertyRate:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.14)

model.15 <- glm(successes ~ year * povertyRate * gspcap - year:povertyRate - year:povertyRate:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.15)

# Thus, no pairs of two-way interactions

model.16 <- glm(successes ~ year + povertyRate + gspcap + year:povertyRate, data=ssm, family=quasipoisson(link=log) )
summary(model.16)

model.17 <- glm(successes ~ year + povertyRate + gspcap + year:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.17)

model.18 <- glm(successes ~ year + povertyRate + gspcap + povertyRate:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.18)

# Thus, no two-way interactions.
# Thus, we are left with an additive model

model.19 <- glm(successes ~ year + povertyRate + gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.19)
# Everything is statistically significant except gspcap, which is close enough
# for me to keep in the model


# Which is better, model.9 or model.19?

summary(model.9)
summary(model.19)

# Test assumptions
plot(model.9)
plot(model.19)

# Check Plot 3.  The plot from model.19 does have a tail to it, but that is
# caused by a single data point.  The plot from model.9 has no tail.  Thus, the
# differences are not that obvious to me.  As such, I would lean toward
# model.9, but I do not hate model.19.
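
# (Extra check, not in the original script; just one possible way to compare
# the two finalists.)  Plot the fitted values from each model against the
# observed proportions.  Note that model.19 predicts counts out of 1000, so
# its fitted values are divided by 1000 to put both models on the same scale.
plot(ssm$propfavor, fitted(model.9), pch=16,
     xlab="Observed proportion in favor", ylab="Fitted proportion")
points(ssm$propfavor, fitted(model.19)/1000, pch=1, col="red")
abline(0, 1, lty=2)   # perfect-fit reference line
legend("topleft", legend=c("model.9 (quasibinomial)", "model.19 (quasipoisson)"),
       pch=c(16, 1), col=c("black", "red"))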
# We could also check to see how well the models fit the data.  In other
# words, let us check the proportional reduction in variance (error).  First,
# a helpful function:

eov <- function(model) {
  fit.var  <- var( residuals(model, type="response") )
  null.var <- var( model$y )
  ret <- (null.var - fit.var)/null.var
  return(ret)
}

# Now for the reduction-in-variance measures
eov(model.9)
eov(model.19)

# So, the second model reduced the variance more than the first did.  However,
# still not by a lot.  Thus, either model is fine.
# PS: You should be able to tell what the eov() function actually does

# So, we are left with graphing our results.  The graph will depend on our
# variable of interest.


##################################################
##################################################
##################################################

# Now for the effects of variability

# When there is little variation in the dependent variable:

model.x1 <- glm(won ~ year + povertyRate + gspcap, data=ssm, family=quasibinomial(link=logit) )
model.x2 <- glm(won ~ year + povertyRate + gspcap, data=ssm, family=binomial(link=logit) )

# These errors are the effects of too little variation in the y-variable

# When there is too little variation in the independent variable:

model.x3 <- glm(povertyRate ~ ok + gspcap, data=ssm, family=gaussian(link=identity) )
summary(model.x3)
# Note the dispersion

# Thus, too little variation is a severe problem.  If the y-variable has too
# little variation, then the entire model is affected.  If the x-variable has
# too little variation, only that x-variable has an issue.  Usually that issue
# is very large standard errors.

# Lesson: Make sure there is variability in your variables
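
# (Extra illustration, not from the class data.)  A quick simulation of the
# lesson above: a predictor that is nearly constant gets a huge standard
# error, while a predictor with plenty of variation is unaffected.
set.seed(1)
n  <- 100
x1 <- rnorm(n)                  # plenty of variation
x2 <- c(rep(0, n-1), 1)         # almost no variation
y  <- 2 + 3*x1 + rnorm(n)
summary( glm(y ~ x1 + x2, family=gaussian(link=identity)) )
# Compare the standard error on x2 with the one on x1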