######################### # # File: 20111011.R # # Generalized Linear Models (Gaussian) # ######################### # Preamble library(RFS) # # New data file : # 2008 presidential elections data for Colorado # Read in data elxn <- read.csv("http://courses.kvasaheim.com/pols6123/data/co2008pres2.csv") attach(elxn) names(elxn) summary(elxn) # Plot to get a feel for the relationship (cor only returns linear correlation) plot(pMcCain,pSpoiled) # We are going to test the Colorado election for a specific type of # electoral fraud. *IF* an election is free and fair, then each person's # vote counts the same. This is equivalent to saying that the probability # of a person's vote being rejected is independent of several different # covariates. Such covariates include demographic features (age, gender, # ethnicity) as well as whom the person voted for. Here, we test if the # probability of a person's vote being rejected is independent of the # candidate voted for. # The term "independent of" indicates we need to perform an independence # test. For two continuous variables, we might think that we could use a # correlation test. However, recall that independent covariates have a zero # correlation, BUT the converse is not necessarily true. # In addition to the logic issues using correlation to test independence, # it is well neigh impossible to perform the test when there are control # variables present or when the two variables are not Normally distributed. # Thus, we will perform linear regression. # The plot again, but improved (not for publication) plot(pMcCain,pSpoiled, pch=16, las=1, xlim=c(0,1), ylim=c(0,0.05) ) # So, we will perform linear regression as we have before m1 <- lm(pSpoiled ~ pMcCain) summary(m1) # Whoops, the dependent variable is bounded, so we transform it m2 <- lm( logit(pSpoiled) ~ pMcCain) summary(m2) # plot prediction curve newM <- seq(0:1, length=100) pr <- predict(m2, newdata=data.frame(pMcCain=newM) ) pr <- logistic(pr) lines(newM, pr, col=2, lwd=2, lty=2) # Notice that none of this is new # Now, let us use GLMs instead of CLMs to fit the model m3 <- glm( pSpoiled ~ pMcCain, family=gaussian(link=identity) ) summary(m3) summary(m1) # Compare the results # Note that the two models m1 and m3 are equivalent. This is true since # GLMs and CLMs are equivalent when using the Gaussian (Normal) distribution # and the canonical link (identity). # As before, we note the dependent variable is bounded, so we transform # it using the logit function m4 <- glm( logit(pSpoiled) ~ pMcCain, family=gaussian(link=identity) ) summary(m4) summary(m2) # Compare the results # Again, note that the two models m2 and m4 are equivalent. This is true since # GLMs and CLMs are equivalent when using the Gaussian (Normal) distribution # and the canonical link (identity). # An alternate (and very appropriate) model for fitting the data (and # testing the hypothesis) is to use the logit as the link function m5 <- glm( pSpoiled ~ pMcCain, family=gaussian( link=make.link("logit") ) ) summary(m5) summary(m2) # Compare the results # Note that the results between these two models are not the same. We should # not expect them to be equivalent, since we are not using the canonical link # with a Gaussian distribution. ################################################## # Let's do this analysis on another election dataset (with some quirks) # The South Sudan unity referendum of January 2011 # Read in the data ssd <- read.csv("http://courses.kvasaheim.com/pols6123/data/xsd2011referendum.csv") attach(ssd) names(ssd) summary(ssd) # Create the proportion variables pInv <- Invalid/Votes pSec <- Secession/(Votes-Invalid-Blank) # This is a change! Elections theory demands it # Feel the data plot(pSec, pInv, pch=16, col=2) # Not for publication (Why?) # Our first model s1 <- glm(pInv ~ pSec, family=gaussian(link=identity)) summary(s1) # Whoops! We should not have done that, since we need to transform the dep var s2 <- glm( logit(pInv) ~ pSec, family=gaussian(link=identity)) s3 <- glm( pInv ~ pSec, family=gaussian(link=make.link("logit"))) # Oh bother! An error message. This happened because I had proportions equal to 0 or # to 1. The logit has a domain of 0 < p < 1, which does not include 0 or 1. So, we # MUST remove the counties that had no invalid votes d <- which(pInv == 0) # The identity of the zero-invalid counties pInv <- pInv[-d] # Remove those counties from pInv pSec <- pSec[-d] # Remove those counties from pSec # They must be removed from BOTH variables because regression is based on the # assumption that the data points have a meaning, that is, you remove a record # (county), not just a value in a record. # Let's try fitting again. s2 <- glm( logit(pInv) ~ pSec, family=gaussian(link=identity)) s3 <- glm( pInv ~ pSec, family=gaussian(link=make.link("logit"))) summary(s2) summary(s3) # There is no way we can determine which of the two models is correct. They # are not equivalent, but they are both appropriate when either is appropriate. # Prediction curves newS <- seq(0,1, length=1000) pr2 <- predict(s2, newdata=data.frame(pSec=newS)) pr3 <- predict(s3, newdata=data.frame(pSec=newS)) pr2 <- logistic(pr2) # Remember to back-transform! pr3 <- logistic(pr3) # Remember to back-transform! par(mar=c(4,4,1,1)+0.1) plot(pSec, pInv, pch=16, col=2, las=1, xlim=c(0,1), ylim=c(0,0.1), xlab="Proportion of Vote in Favor of Secession", ylab="Proportion of the Vote Invalidated") lines(newS,pr2, col=2, lwd=2) lines(newS,pr3, col=3, lwd=2) legend("bottomleft",bty="n", lwd=2, col=c(2,3), c("Link: Identity","Link: Logit") ) ### # Now, the interesting thing is that we can use additional information to determine # where the vote was unfair. We will use that in a future class (or assignment). But, # until then, see if you can color the dots differently depending on whether the # county is North, South, or OCV. # # Then, see if you can run individual regressions on the three regions.