#########################
#
# File: 20111011.R
#
# Generalized Linear Models (Gaussian)
#
#########################
# Preamble
library(RFS)
#
# New data file :
# 2008 presidential elections data for Colorado
# Read in data
elxn <- read.csv("http://courses.kvasaheim.com/pols6123/data/co2008pres2.csv")
attach(elxn)
names(elxn)
summary(elxn)
# Plot to get a feel for the relationship (cor only returns linear correlation)
plot(pMcCain,pSpoiled)
# We are going to test the Colorado election for a specific type of
# electoral fraud. *IF* an election is free and fair, then each person's
# vote counts the same. This is equivalent to saying that the probability
# of a person's vote being rejected is independent of several different
# covariates. Such covariates include demographic features (age, gender,
# ethnicity) as well as whom the person voted for. Here, we test if the
# probability of a person's vote being rejected is independent of the
# candidate voted for.
# The term "independent of" indicates we need to perform an independence
# test. For two continuous variables, we might think that we could use a
# correlation test. However, recall that independent covariates have a zero
# correlation, BUT the converse is not necessarily true.
# In addition to the logic issues using correlation to test independence,
# it is well neigh impossible to perform the test when there are control
# variables present or when the two variables are not Normally distributed.
# Thus, we will perform linear regression.
# The plot again, but improved (not for publication)
plot(pMcCain,pSpoiled, pch=16, las=1, xlim=c(0,1), ylim=c(0,0.05) )
# So, we will perform linear regression as we have before
m1 <- lm(pSpoiled ~ pMcCain)
summary(m1)
# Whoops, the dependent variable is bounded, so we transform it
m2 <- lm( logit(pSpoiled) ~ pMcCain)
summary(m2)
# plot prediction curve
newM <- seq(0:1, length=100)
pr <- predict(m2, newdata=data.frame(pMcCain=newM) )
pr <- logistic(pr)
lines(newM, pr, col=2, lwd=2, lty=2)
# Notice that none of this is new
# Now, let us use GLMs instead of CLMs to fit the model
m3 <- glm( pSpoiled ~ pMcCain, family=gaussian(link=identity) )
summary(m3)
summary(m1) # Compare the results
# Note that the two models m1 and m3 are equivalent. This is true since
# GLMs and CLMs are equivalent when using the Gaussian (Normal) distribution
# and the canonical link (identity).
# As before, we note the dependent variable is bounded, so we transform
# it using the logit function
m4 <- glm( logit(pSpoiled) ~ pMcCain, family=gaussian(link=identity) )
summary(m4)
summary(m2) # Compare the results
# Again, note that the two models m2 and m4 are equivalent. This is true since
# GLMs and CLMs are equivalent when using the Gaussian (Normal) distribution
# and the canonical link (identity).
# An alternate (and very appropriate) model for fitting the data (and
# testing the hypothesis) is to use the logit as the link function
m5 <- glm( pSpoiled ~ pMcCain, family=gaussian( link=make.link("logit") ) )
summary(m5)
summary(m2) # Compare the results
# Note that the results between these two models are not the same. We should
# not expect them to be equivalent, since we are not using the canonical link
# with a Gaussian distribution.
##################################################
# Let's do this analysis on another election dataset (with some quirks)
# The South Sudan unity referendum of January 2011
# Read in the data
ssd <- read.csv("http://courses.kvasaheim.com/pols6123/data/xsd2011referendum.csv")
attach(ssd)
names(ssd)
summary(ssd)
# Create the proportion variables
pInv <- Invalid/Votes
pSec <- Secession/(Votes-Invalid-Blank) # This is a change! Elections theory demands it
# Feel the data
plot(pSec, pInv, pch=16, col=2) # Not for publication (Why?)
# Our first model
s1 <- glm(pInv ~ pSec, family=gaussian(link=identity))
summary(s1)
# Whoops! We should not have done that, since we need to transform the dep var
s2 <- glm( logit(pInv) ~ pSec, family=gaussian(link=identity))
s3 <- glm( pInv ~ pSec, family=gaussian(link=make.link("logit")))
# Oh bother! An error message. This happened because I had proportions equal to 0 or
# to 1. The logit has a domain of 0 < p < 1, which does not include 0 or 1. So, we
# MUST remove the counties that had no invalid votes
d <- which(pInv == 0) # The identity of the zero-invalid counties
pInv <- pInv[-d] # Remove those counties from pInv
pSec <- pSec[-d] # Remove those counties from pSec
# They must be removed from BOTH variables because regression is based on the
# assumption that the data points have a meaning, that is, you remove a record
# (county), not just a value in a record.
# Let's try fitting again.
s2 <- glm( logit(pInv) ~ pSec, family=gaussian(link=identity))
s3 <- glm( pInv ~ pSec, family=gaussian(link=make.link("logit")))
summary(s2)
summary(s3)
# There is no way we can determine which of the two models is correct. They
# are not equivalent, but they are both appropriate when either is appropriate.
# Prediction curves
newS <- seq(0,1, length=1000)
pr2 <- predict(s2, newdata=data.frame(pSec=newS))
pr3 <- predict(s3, newdata=data.frame(pSec=newS))
pr2 <- logistic(pr2) # Remember to back-transform!
pr3 <- logistic(pr3) # Remember to back-transform!
par(mar=c(4,4,1,1)+0.1)
plot(pSec, pInv, pch=16, col=2, las=1, xlim=c(0,1), ylim=c(0,0.1),
xlab="Proportion of Vote in Favor of Secession", ylab="Proportion of the Vote Invalidated")
lines(newS,pr2, col=2, lwd=2)
lines(newS,pr3, col=3, lwd=2)
legend("bottomleft",bty="n", lwd=2, col=c(2,3), c("Link: Identity","Link: Logit") )
###
# Now, the interesting thing is that we can use additional information to determine
# where the vote was unfair. We will use that in a future class (or assignment). But,
# until then, see if you can color the dots differently depending on whether the
# county is North, South, or OCV.
#
# Then, see if you can run individual regressions on the three regions.