#########################
#
# File: 20111018.R
# Generalized Linear Models (Binomial)
# Binary Dependent Variables
#
#########################

# Preamble
library(RFS)

# Read in pseudo-data
cn <- read.csv("http://courses.kvasaheim.com/pols6123/data/coins2.csv")
names(cn)
summary(cn)
attach(cn)

# Linear probability model
m1 <- lm(head ~ coin)
summary(m1)

plot(coin, head)
newX <- 1:100
pr <- predict(m1, newdata=data.frame(coin=newX))

# Check CLM model
plot(m1)
# Lots of bad things!

# Another shot at graphing, to make the point about
# the dangers of extrapolation
par(mar=c(4,4,1,1)+0.1)
plot(coin, head, las=1, xlab="Coin Number", ylab="Probability of a Head",
     pch=16, xlim=c(-100,200))

newX <- seq(-100, 200, length=1000)
pr.l <- predict(m1, newdata=data.frame(coin=newX))
lines(newX, pr.l, col="grey", lwd=3)

abline(h=0.500)
abline(v=35)

# A fix is to use GLMs. Remember that you need to know three things
# to use GLMs properly: the distribution of the dependent variable,
# the link function, and the linear predictor.

# Since the y-variable is 0/1, we will
# first use the Binomial distribution

#****
# The canonical link
gm1 <- glm(head ~ coin, family=binomial(link=logit))
summary(gm1)

pr.logit <- logistic( predict(gm1, newdata=data.frame(coin=newX)) )
lines(newX, pr.logit, col="red4", lwd=3)

# A different link function (see Forsberg Section 7.6)
gm2 <- glm(head ~ coin, family=binomial(link=probit))
summary(gm2)
summary(gm1)

pr.probit <- probit.inv( predict(gm2, newdata=data.frame(coin=newX)) )
lines(newX, pr.probit, col="red1", lwd=3)

# Another link function
gm3 <- glm(head ~ coin, family=binomial(link=cloglog))
summary(gm3)

pr.cloglog <- cloglog.inv( predict(gm3, newdata=data.frame(coin=newX)) )
lines(newX, pr.cloglog, col="blue4", lwd=3)

# Another link function
gm4 <- glm(head ~ coin, family=binomial(link=make.link("loglog")))
summary(gm4)

pr.loglog <- loglog.inv( predict(gm4, newdata=data.frame(coin=newX)) )
lines(newX, pr.loglog, col="blue1", lwd=3)

# Another link function
gm5 <- glm(head ~ coin, family=binomial(link=cauchit))
summary(gm5)

pr.cauchit <- cauchit.inv( predict(gm5, newdata=data.frame(coin=newX)) )
lines(newX, pr.cauchit, col="tomato", lwd=3)

# Add a legend to help us tell these curves apart
legend("topleft",
       c("Linear","Logit","Probit","Cauchit","Log-Log","Complementary Log-Log"),
       lwd=2, bty="n",
       col=c("grey","red4","red1","tomato","blue1","blue4") )


##################################################

# Read in data
fuf <- read.csv("http://courses.kvasaheim.com/pols6123/data/fuf.csv")
names(fuf)
summary(fuf)
attach(fuf)

# Linear probability model
m1 <- lm(fuffer ~ milex * majpower)
summary(m1)

# Plot the data and prediction lines
plot(milex, fuffer, las=1, xlab="Military Expenditures ($)",
     ylab="First User of Force", pch=16)

newX <- seq(0, 1e9, length=100)
pr1 <- predict(m1, newdata=data.frame(milex=newX, majpower=1))
pr0 <- predict(m1, newdata=data.frame(milex=newX, majpower=0))
lines(newX, pr1, col=2, lwd=3)
lines(newX, pr0, col=3, lwd=3)
abline(h=0.5)   # The 50-50 line

# Brief check of the model assumptions
plot(m1)
# Stupendous fail!
# Why a fail?
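# A quick numeric check of the failure (a sketch added here, not part
# of the original script): a linear probability model can predict
# "probabilities" outside [0,1], and a 0/1 response makes the errors
# heteroskedastic, so the usual CLM assumptions cannot hold.
range(fitted(m1))   # any fitted values below 0 or above 1?
range(pr1)          # predicted values for major powers
range(pr0)          # predicted values for non-major powers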
# Logistic regression model
fm2 <- glm(fuffer ~ milex * majpower, family=binomial(link=logit))
summary(fm2)

# Plot the data and prediction curves
plot(milex, fuffer, las=1, xlab="Military Expenditures ($)",
     ylab="First User of Force", pch=16)

newX <- seq(0, 1e9, length=100)
pr1 <- logistic( predict(fm2, newdata=data.frame(milex=newX, majpower=1)) )
pr0 <- logistic( predict(fm2, newdata=data.frame(milex=newX, majpower=0)) )
lines(newX, pr1, col=2, lwd=3)
lines(newX, pr0, col=3, lwd=3)
abline(h=0.500)

plot(fm2)
# Why no improvement???

# Checks:
# Same as for the CLM:
#   Is the linear predictor correct?
#   Is the dependent variable appropriately distributed?
#     - Overdispersion (and underdispersion)
#   Are there any outliers?

# Linear predictor
#   a. What does theory tell you?
#   b. Any multicollinearity?
cor(fuf)
# What does this mean?

# Appropriate distribution?
#   Two possible outcomes -> Binomial (and similar)

# Overdispersion?
1826.5/1620   # 1.13
# What does this mean?

# Outliers:
# There is the 'outliers' package, but let us
# examine the residual plot for outliers.
plot(residuals(fm2))
# Nopers

# Model checks out =)

par(mar=c(5,5,1,1)+0.1)
plot(milex, fuffer, las=1, xlab="Military Expenditures ($)",
     ylab="First User of Force", pch=16, yaxt="n")
axis(2, las=1, at=c(1,0), labels=c("User","Non-User"))

newX <- seq(0, 1e9, length=100)
pr1 <- logistic( predict(fm2, newdata=data.frame(milex=newX, majpower=1)) )
pr0 <- logistic( predict(fm2, newdata=data.frame(milex=newX, majpower=0)) )
lines(newX, pr1, col=2, lwd=3)
lines(newX, pr0, col=3, lwd=3)

legend(2e8, 0.75, c("Major Power","Non-Major Power"),
       lwd=2, col=c(2,3), bty="n")

# Note the use of the axis() function and its effects.


##################################################

# What if you have overdispersion? Three options:
#   1. Post hoc adjustment using the dispersion ratio (BAD!!)
#   2. Adjust your research model, since there may be something
#      wrong with it (BEST for us)
#   3. Use quasi-likelihood estimation (QLE) instead of maximum
#      likelihood estimation (MLE) (best if the model fix did not work)

# Example:
fg2 <- glm(fuffer ~ milper + polity2 * cinc, family=binomial(link=logit))
summary(fg2)

# Let us pretend there is significant overdispersion.
# Using quasi-likelihood:
fq2 <- glm(fuffer ~ milper + polity2 * cinc, family=quasibinomial(link=logit))
summary(fq2)

# Distribution: Binomial
# Estimation method: Quasi-Likelihood Estimation
#   ... as opposed to Maximum Likelihood Estimation

# Notice:
# The coefficients do not change, but the standard errors do. The
# predictions will not change, but the statistical significance may.
# Ignoring overdispersion underestimates the standard errors, which
# leads to p-values that are too low; ignoring underdispersion
# overestimates the standard errors, which leads to p-values that
# are too high.
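# A sketch (using the models fitted above) of computing the dispersion
# estimate directly, rather than typing the numbers in by hand:
deviance(fg2) / df.residual(fg2)   # residual deviance / residual df
summary(fq2)$dispersion            # dispersion estimated by quasibinomial
# The quasibinomial standard errors are (approximately) the MLE
# standard errors multiplied by sqrt(dispersion), which is why the
# coefficients match while the p-values differ.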
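# A closing note (an assumption, not from the original script): the
# inverse-link helpers used above (logistic, probit.inv, cloglog.inv,
# loglog.inv, cauchit.inv) appear to come from the RFS course package.
# If RFS is unavailable, base R provides equivalents; the last two are
# written out from the link definitions. The names below are our own.
inv.logit   <- plogis                        # inverse logit
inv.probit  <- pnorm                         # inverse probit
inv.cauchit <- pcauchy                       # inverse cauchit
inv.cloglog <- function(x) 1 - exp(-exp(x))  # inverse complementary log-log
inv.loglog  <- function(x) exp(-exp(-x))     # inverse log-log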