########################################
#
# Script: 29 March 2011 (20110329.R)
#
########################################

# Today:
#
# ** Errors
#
# ** More Fitting GLMs
#     . More assumptions (variability)
#     . Results when assumptions are violated


ssm <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ssm.csv", header=TRUE)
names(ssm)
summary(ssm)

# A first attempt: pctfavor is a percentage (0 to 100), so the binomial family
# refuses it ("y values must be 0 <= y <= 1")
model.1 <- glm(pctfavor ~ year * povertyRate * gspcap, data=ssm, family=binomial(link=logit) )

# Rescale to a proportion.  This fits, but R warns about non-integer
# "successes" because no counts (weights) were supplied
ssm$propfavor <- ssm$pctfavor/100
model.2 <- glm(propfavor ~ year * povertyRate * gspcap, data=ssm, family=binomial(link=logit) )

# The two-column (successes, failures) response is the standard way to hand
# aggregated counts to a binomial GLM
successes <- 1000*ssm$propfavor
failures  <- 1000 - successes
y <- cbind(successes, failures)

model.3 <- glm(y ~ year * povertyRate * gspcap, data=ssm, family=binomial(link=logit) )
summary(model.3)

# Refit with quasibinomial, which estimates the dispersion instead of fixing it at 1
model.4 <- glm(y ~ year * povertyRate * gspcap, data=ssm, family=quasibinomial(link=logit) )
summary(model.4)

# Paring down the model:
#   Drop the three-way interaction (year:povertyRate:gspcap).   (still no good)
#   Test the three pairs of two-way interactions.               (still no good)
#   Test each of the three single two-way interactions.         (still no good)
# So, we are left with an additive model

model.9 <- glm(y ~ year + povertyRate + gspcap, data=ssm, family=quasibinomial(link=logit) )
summary(model.9)


###
# What about Poisson models?

model.10 <- glm(successes ~ year * povertyRate * gspcap, data=ssm, family=poisson(link=log) )
summary(model.10)
# Why no need for an offset?  The divisor is constant (every count is out of
# the same 1000).

model.11 <- glm(successes ~ year * povertyRate * gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.11)

# Same paring steps as above

model.12 <- glm(successes ~ year * povertyRate * gspcap - year:povertyRate:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.12)

# Thus, no three-way interaction

model.13 <- glm(successes ~ year * povertyRate * gspcap - povertyRate:gspcap - year:povertyRate:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.13)

model.14 <- glm(successes ~ year * povertyRate * gspcap - year:gspcap - year:povertyRate:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.14)

model.15 <- glm(successes ~ year * povertyRate * gspcap - year:povertyRate - year:povertyRate:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.15)

# Thus, no pairs of two-way interactions

model.16 <- glm(successes ~ year + povertyRate + gspcap + year:povertyRate, data=ssm, family=quasipoisson(link=log) )
summary(model.16)

model.17 <- glm(successes ~ year + povertyRate + gspcap + year:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.17)

model.18 <- glm(successes ~ year + povertyRate + gspcap + povertyRate:gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.18)

# Thus, no two-way interactions.
# Thus, we are left with an additive model

model.19 <- glm(successes ~ year + povertyRate + gspcap, data=ssm, family=quasipoisson(link=log) )
summary(model.19)
# Everything is statistically significant except gspcap, which is close enough
# for me to keep in the model


# Which is better, model.9 or model.19?

summary(model.9)
summary(model.19)

# Test assumptions
plot(model.9)
plot(model.19)

# Check Plot 3.  The plot from model.19 does have a tail to it, but that is
# caused by a single data point.  The plot from model.9 has no tail.  Thus, the
# differences are not that obvious to me.  As such, I would lean toward
# model.9, but I do not hate model.19.
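
# (Extra check, not in the original script; just one possible way to compare
# the two finalists.)  Plot the fitted values from each model against the
# observed proportions.  Note that model.19 predicts counts out of 1000, so
# its fitted values are divided by 1000 to put both models on the same scale.
plot(ssm$propfavor, fitted(model.9), pch=16,
     xlab="Observed proportion in favor", ylab="Fitted proportion")
points(ssm$propfavor, fitted(model.19)/1000, pch=1, col="red")
abline(0, 1, lty=2)   # perfect-fit reference line
legend("topleft", legend=c("model.9 (quasibinomial)", "model.19 (quasipoisson)"),
       pch=c(16, 1), col=c("black", "red"))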
# We could also check to see how well the models fit the data.  In other
# words, let us check the proportional reduction in variance (error).  First,
# a helpful function:

eov <- function(model) {
  fit.var  <- var( residuals(model, type="response") )
  null.var <- var( model$y )
  ret <- (null.var - fit.var)/null.var
  return(ret)
}

# Now for the reduction-in-variance measures
eov(model.9)
eov(model.19)

# So, the second model reduced the variance more than the first did.  However,
# still not by a lot.  Thus, either model is fine.
# PS: You should be able to tell what the eov() function actually does

# So, we are left with graphing our results.  The graph will depend on our
# variable of interest.


##################################################
##################################################
##################################################

# Now for the effects of variability

# When there is little variation in the dependent variable:

model.x1 <- glm(won ~ year + povertyRate + gspcap, data=ssm, family=quasibinomial(link=logit) )
model.x2 <- glm(won ~ year + povertyRate + gspcap, data=ssm, family=binomial(link=logit) )

# These errors are the effects of too little variation in the y-variable

# When there is too little variation in the independent variable:

model.x3 <- glm(povertyRate ~ ok + gspcap, data=ssm, family=gaussian(link=identity) )
summary(model.x3)
# Note the dispersion

# Thus, too little variation is a severe problem.  If the y-variable has too
# little variation, then the entire model is affected.  If the x-variable has
# too little variation, only that x-variable has an issue.  Usually that issue
# is very large standard errors.

# Lesson: Make sure there is variability in your variables
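
# (Extra illustration, not from the class data.)  A quick simulation of the
# lesson above: a predictor that is nearly constant gets a huge standard
# error, while a predictor with plenty of variation is unaffected.
set.seed(1)
n  <- 100
x1 <- rnorm(n)                  # plenty of variation
x2 <- c(rep(0, n-1), 1)         # almost no variation
y  <- 2 + 3*x1 + rnorm(n)
summary( glm(y ~ x1 + x2, family=gaussian(link=identity)) )
# Compare the standard error on x2 with the one on x1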