########################################
#
# Script: 8 March 2011 (20110308.R)
#
########################################
# Today:
#
# ** Fitting GLMs
#  . Non-canonical links
#  . Graphing predictions (!)
#  . Dealing with Binomial models (!)

# Plot GDP per capita against the honesty-in-government score from
# the gdpcap dataset.
gdp <- read.csv("http://courses.kvasaheim.com/stat40x3/data/gdpcap.csv",
                header=TRUE)
names(gdp)

# There are several ways of plotting these data. Each of the following
# offers certain advantages, and the later solutions give better
# results than the earlier ones. Keep this in mind when making graphs
# for homework and for life.

# Solution 1
# The usual plot, for what it is worth.
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(0,10))

# Solution 2
# Note the addition of the 'yaxt' parameter and the axis() call.
# What are the benefits here?
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(0,10), yaxt="n")
axis(2, labels=c(0,10000, 50000, 100000),
     at=c(0,10000, 50000, 100000), las=1)

# Solution 3
# Note the changes to the axis() call: format() suppresses scientific
# notation and adds thousands separators. Not my favorite way of
# accomplishing this, but it is doable.
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(0,10), yaxt="n")
axis(2,
     labels=format(c(0,10000, 50000, 100000), nsmall=0,
                   scientific=FALSE, big.mark=","),
     at=c(0,10000, 50000, 100000), las=1)

# Solution 4
# Still making changes to the axis() call. I actually like how this
# came out. Why?
# Solution 4 (continued): hand-written axis labels with commas.
par(mar=c(8,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(1,10), yaxt="n", pch=16)
axis(2, labels=c("0","10,000", "50,000", "100,000"),
     at=c(0,10000, 50000, 100000), las=1)

# Back to analysis:
#
# It certainly looks as if these two variables are related, so let us
# estimate that relationship with a GLM. Second question: what are the
# restrictions on the dependent variable?
# GDP per capita is positive and integer (but essentially continuous),
# so either a Gaussian or a Poisson distribution is defensible — with a
# log link in either case.

# Fit both and compare.
mod.gaus <- glm(gdpcap ~ hig, data=gdp,
                family=gaussian(link=log))   # note the non-canonical link
summary(mod.gaus)

mod.pois <- glm(gdpcap ~ hig, data=gdp,
                family=poisson(link=log))
summary(mod.pois)

# Which is better? Well, which fits the data better?

# Usual procedure for drawing a fitted curve: predict new responses
# over a grid of the x-axis variable (holding any other covariates
# constant — here there are none).

# Grid of new x-values (hig).
newxvals <- 1:10

# Predictions are on the LINK (log) scale...
pred.gaus <- predict(mod.gaus, newdata=data.frame(hig=newxvals))
pred.pois <- predict(mod.pois, newdata=data.frame(hig=newxvals))

# ...so remember to back-transform them to the response scale.
pred.gaus <- exp(pred.gaus)
pred.pois <- exp(pred.pois)

# Overlay the two fitted curves on the current plot.
lines(newxvals, pred.gaus, col=2, lwd=2)
lines(newxvals, pred.pois, col=4, lwd=2)

# Not much difference between the two, as expected.

# A legend keeps track of which color is which model.
legend("topright",
       c("Gaussian","Poisson"),
       lwd=2, col=c(2,4), bty="n" )

# What if we did not use a log link?
# Gaussian with the (canonical) identity link, for comparison.
mod.gaus2 <- glm(gdpcap ~ hig, data=gdp,
                 family=gaussian(link=identity))
summary(mod.gaus2)

# Graph it the same way: grid of x-values, predict, draw.
# (Identity link, so no back-transformation is needed.)
newxvals <- 1:10
pred.gaus2 <- predict(mod.gaus2, newdata=data.frame(hig=newxvals))
lines(newxvals, pred.gaus2, col="#008000", lwd=2)

# So, which is the best model? Why?

###
# Let us now focus on the OPEC States.
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(1,10), yaxt="n")
axis(2, labels=c("0","10,000", "50,000", "100,000"),
     at=c(0,10000, 50000, 100000), las=1)
lines(newxvals, pred.pois, col=4, lwd=2)

# Highlight the OPEC members.
points(gdpcap ~ hig, data=gdp, subset=(OPEC=="Member"),
       pch=16, col='green4')

# Almost all OPEC members sit above their prediction, which suggests
# OPEC members tend to be richer than other States. Check this.

# Fit an interaction model.
mod.pois2 <- glm(gdpcap ~ hig*OPEC, data=gdp,
                 family=poisson(link=log))
summary(mod.pois2)

# The interaction is statistically significant, so we should use this
# model. As an exercise, also fit the additive model.
mod.pois3 <- glm(gdpcap ~ hig+OPEC, data=gdp,
                 family=poisson(link=log))
summary(mod.pois3)

# Which is appropriate: interaction term or no interaction term? Why?
# Graphing the additive model:
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(1,10), yaxt="n")
axis(2, labels=c("0","10,000", "50,000", "100,000"),
     at=c(0,10000, 50000, 100000), las=1)
points(gdpcap ~ hig, data=gdp, subset=(OPEC=="Member"),
       pch=16, col='green')

# Predictions, as usual — one curve per OPEC status.
newxvals <- 1:10
pred.pois3o <- predict(mod.pois3,
                       newdata=data.frame(hig=newxvals, OPEC="Member"))
pred.pois3o <- exp(pred.pois3o)
lines(newxvals, pred.pois3o, col=3, lwd=2)
pred.pois3n <- predict(mod.pois3,
                       newdata=data.frame(hig=newxvals, OPEC="Non-Member"))
pred.pois3n <- exp(pred.pois3n)
lines(newxvals, pred.pois3n, col=4, lwd=2)

# Alternatively, the same additive-model graph on the log scale
# (if you like): note log="y" and the matching axis labels.
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(1,10), yaxt="n",
     log="y", ylim=c(1e2,2e5))
axis(2, labels=c("100","1000","10,000","100,000"),
     at=c(100,1000,10000,100000), las=1)
points(gdpcap ~ hig, data=gdp, subset=(OPEC=="Member"),
       pch=16, col='green')
newxvals <- 1:10
pred.pois3o <- predict(mod.pois3,
                       newdata=data.frame(hig=newxvals, OPEC="Member"))
pred.pois3o <- exp(pred.pois3o)
lines(newxvals, pred.pois3o, col=3, lwd=2)
pred.pois3n <- predict(mod.pois3,
                       newdata=data.frame(hig=newxvals, OPEC="Non-Member"))
pred.pois3n <- exp(pred.pois3n)
lines(newxvals, pred.pois3n, col=4, lwd=2)

##################################################
# Start with a fresh slate.
# NOTE(review): rm(list=ls()) in a script is generally discouraged;
# kept here because it is part of the lecture flow.
rm(list=ls())

# A new dataset: Single-Sex Marriage.
ssm <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ssm.csv",
                header=TRUE)
# If loaded from your flash drive, you would use instead:
# ssm <- read.csv("E:/STAT40x3/data/ssm.csv", header=TRUE)

names(ssm)
summary(ssm)
hist(ssm$pctfavor, xlab="Percent in Favor of Measure")

# Plot the data.
plot(pctfavor ~ religiosity, data=ssm, las=1,
     xlab="Religiosity", ylab="Percent in Favor",
     xlim=c(0,6), ylim=c(40,90) )

# Q:
# Which factors can be used to predict the outcome of an anti-SSM
# ballot measure? Let us try these three: year of the ballot measure,
# whether the measure also seeks to ban single-sex civil unions, and
# the level of religiosity in the US state.

# Fit a straight, linear relationship (no pun intended);
# the default family for glm() is gaussian(link=identity).
mod.gau <- glm(pctfavor ~ year + civilunion + religiosity, data=ssm)
summary(mod.gau)

# Predict the outcome of a year-2000 ballot measure that also bans
# SS civil unions, as religiosity varies.
newrelg <- 1:5
pred.gau1 <- predict(mod.gau,
                     newdata=data.frame(year=2000, civilunion=1,
                                        religiosity=newrelg) )
lines(newrelg, pred.gau1, col=2, lwd=2)

# Same measure, but one that does NOT ban SS civil unions.
pred.gau0 <- predict(mod.gau,
                     newdata=data.frame(year=2000, civilunion=0,
                                        religiosity=newrelg) )
lines(newrelg, pred.gau0, col=4, lwd=2)

# A legend keeps things straight.
legend("bottomright",
       c("Does Not Ban Civil Unions","Bans Civil Unions"),
       lwd=2, col=c(4,2), bty="n" )

# If we do not believe the dependent variable is Gaussian, we can fit
# with a Binomial distribution instead. Why?

# We will need these helper functions:
logit     <- function(x) log( x/(1-x) )
logit.inv <- function(x) exp(x)/(1+exp(x))

plot(pctfavor ~ religiosity, data=ssm, las=1,
     xlab="Religiosity", ylab="Percent in Favor",
     xlim=c(0,6), ylim=c(40,90) )

# This call fails on purpose — pctfavor is a percent, not in [0,1].
mod.bin <- glm(pctfavor ~ year + civilunion + religiosity, data=ssm,
               family=binomial(link=logit))
# Oops! An error. What does it mean?

# Create a variable holding the PROPORTION of the vote, not the percent.
ssm$propfavor <- ssm$pctfavor/100
mod.bin <- glm(propfavor ~ year + civilunion + religiosity, data=ssm,
               family=binomial(link=logit))

# Oh dear, a warning this time... Why? Think: what are the aspects of
# the Binomial distribution you learned last term?

# We can fix this in a few ways. First, we can gather the actual vote
# counts for and against in each of the ballot measures.
# Second, we can multiply the vote percent by 10 (as one is measured
# in tenths). Third, we can logit-transform the proportion and fit it
# with a linear model. Let us do the second one.

# To do the second, create three new variables: the number of votes
# for the measure, the number against, and the two bound together —
# R needs this last two-column form to fit the model.
v.for     <- ssm$pctfavor*10
v.against <- 1000 - v.for
y.vote    <- cbind(v.for,v.against)

# Now fit the model; note the y-variable.
mod.bin <- glm(y.vote ~ year + civilunion + religiosity, data=ssm,
               family=binomial(link=logit))
summary(mod.bin)

# Graph the predictions. They come back on the logit scale, so apply
# the inverse-logit and rescale to percent before drawing.
# NOTE(review): year=2001 here, while the Gaussian predictions above
# used year=2000 — confirm this difference is intended.
newrelg <- 1:5
pred.bin1 <- predict(mod.bin,
                     newdata=data.frame(year=2001, civilunion=1,
                                        religiosity=newrelg) )
pred.bin1 <- logit.inv(pred.bin1) * 100
lines(newrelg, pred.bin1, col=2, lwd=2)

pred.bin0 <- predict(mod.bin,
                     newdata=data.frame(year=2001, civilunion=0,
                                        religiosity=newrelg) )
pred.bin0 <- logit.inv(pred.bin0) * 100
lines(newrelg, pred.bin0, col=4, lwd=2)

legend("bottomright",
       c("Does Not Ban Civil Unions","Bans Civil Unions"),
       lwd=2, col=c(4,2), bty="n" )

# So, take the following away:
#
#  . Graphing
#  . Fitting Binomials