#########################
#
# File: 20111011.R
#
#       Generalized Linear Models (Gaussian)
#
#########################


# Preamble

library(RFS)


#
# New data file : 
#   2008 presidential elections data for Colorado


# Read in data
elxn <- read.csv("http://courses.kvasaheim.com/pols6123/data/co2008pres2.csv")
attach(elxn)
names(elxn)
summary(elxn)

# Plot to get a feel for the relationship (cor only returns linear correlation)
plot(pMcCain,pSpoiled)



# We are going to test the Colorado election for a specific type of
# electoral fraud. *IF* an election is free and fair, then each person's 
# vote counts the same. This is equivalent to saying that the probability
# of a person's vote being rejected is independent of several different
# covariates. Such covariates include demographic features (age, gender,
# ethnicity) as well as whom the person voted for. Here, we test if the
# probability of a person's vote being rejected is independent of the
# candidate voted for.

# The term "independent of" indicates we need to perform an independence
# test. For two continuous variables, we might think that we could use a 
# correlation test. However, recall that independent covariates have a zero
# correlation, BUT the converse is not necessarily true.

# In addition to the logic issues using correlation to test independence,
# it is well neigh impossible to perform the test when there are control
# variables present or when the two variables are not Normally distributed.

# Thus, we will perform linear regression. 


# The plot again, but improved (not for publication)
plot(pMcCain,pSpoiled, pch=16, las=1, xlim=c(0,1), ylim=c(0,0.05) )


# So, we will perform linear regression as we have before
m1 <- lm(pSpoiled ~ pMcCain)
summary(m1)


# Whoops, the dependent variable is bounded, so we transform it
m2 <- lm( logit(pSpoiled) ~ pMcCain)
summary(m2)


# plot prediction curve
newM <- seq(0:1, length=100)
pr <- predict(m2, newdata=data.frame(pMcCain=newM) )
pr <- logistic(pr)
lines(newM, pr, col=2, lwd=2, lty=2)

# Notice that none of this is new

# Now, let us use GLMs instead of CLMs to fit the model
m3 <- glm( pSpoiled ~ pMcCain, family=gaussian(link=identity) )
summary(m3)
summary(m1) # Compare the results

# Note that the two models m1 and m3 are equivalent. This is true since
# GLMs and CLMs are equivalent when using the Gaussian (Normal) distribution
# and the canonical link (identity).


# As before, we note the dependent variable is bounded, so we transform
# it using the logit function
m4 <- glm( logit(pSpoiled) ~ pMcCain, family=gaussian(link=identity) )
summary(m4)
summary(m2) # Compare the results

# Again, note that the two models m2 and m4 are equivalent. This is true since
# GLMs and CLMs are equivalent when using the Gaussian (Normal) distribution
# and the canonical link (identity).


# An alternate (and very appropriate) model for fitting the data (and 
# testing the hypothesis) is to use the logit as the link function

m5 <- glm( pSpoiled ~ pMcCain, family=gaussian( link=make.link("logit") ) )
summary(m5)
summary(m2) # Compare the results

# Note that the results between these two models are not the same. We should
# not expect them to be equivalent, since we are not using the canonical link
# with a Gaussian distribution.



##################################################

# Let's do this analysis on another election dataset (with some quirks)


# The South Sudan unity referendum of January 2011

# Read in the data
ssd <- read.csv("http://courses.kvasaheim.com/pols6123/data/xsd2011referendum.csv")
attach(ssd)
names(ssd)
summary(ssd)


# Create the proportion variables

pInv <- Invalid/Votes
pSec <- Secession/(Votes-Invalid-Blank)   # This is a change! Elections theory demands it


# Feel the data
plot(pSec, pInv, pch=16, col=2)  # Not for publication (Why?)


# Our first model
s1 <- glm(pInv ~ pSec, family=gaussian(link=identity))
summary(s1)

# Whoops! We should not have done that, since we need to transform the dep var

s2 <- glm( logit(pInv) ~ pSec, family=gaussian(link=identity))
s3 <- glm( pInv ~ pSec, family=gaussian(link=make.link("logit")))

# Oh bother! An error message. This happened because I had proportions equal to 0 or 
# to 1. The logit has a domain of 0 < p < 1, which does not include 0 or 1. So, we
# MUST remove the counties that had no invalid votes

d <- which(pInv == 0)   # The identity of the zero-invalid counties

  pInv <- pInv[-d]      # Remove those counties from pInv
  pSec <- pSec[-d]      # Remove those counties from pSec

# They must be removed from BOTH variables because regression is based on the 
# assumption that the data points have a meaning, that is, you remove a record 
# (county), not just a value in a record.


# Let's try fitting again.

s2 <- glm( logit(pInv) ~ pSec, family=gaussian(link=identity))
s3 <- glm( pInv ~ pSec, family=gaussian(link=make.link("logit")))

summary(s2)
summary(s3)

# There is no way we can determine which of the two models is correct. They
# are not equivalent, but they are both appropriate when either is appropriate.


# Prediction curves
newS <- seq(0,1, length=1000)
pr2 <- predict(s2, newdata=data.frame(pSec=newS))
pr3 <- predict(s3, newdata=data.frame(pSec=newS))
pr2 <- logistic(pr2)  # Remember to back-transform!
pr3 <- logistic(pr3)  # Remember to back-transform!

par(mar=c(4,4,1,1)+0.1)
plot(pSec, pInv, pch=16, col=2, las=1, xlim=c(0,1), ylim=c(0,0.1),
  xlab="Proportion of Vote in Favor of Secession", ylab="Proportion of the Vote Invalidated") 
lines(newS,pr2, col=2, lwd=2)
lines(newS,pr3, col=3, lwd=2)

legend("bottomleft",bty="n", lwd=2, col=c(2,3), c("Link: Identity","Link: Logit") )


###

# Now, the interesting thing is that we can use additional information to determine
# where the vote was unfair. We will use that in a future class (or assignment). But, 
# until then, see if you can color the dots differently depending on whether the
# county is North, South, or OCV.
#
# Then, see if you can run individual regressions on the three regions.