########################################
#
# Script: 24 March 2011 (20110324.R)
#
########################################

# Today:
#
#  ** Fitting GLMs
#     . Assumptions
#     . Testing Assumptions
#     . Binary Response Model
#  ** Model goodness of fit
#     . Deviance R^2
#     . Receiver Operating Characteristic curve
#
# Recall that this lecture was presented without a complete script available
# to y'all. During class, we organically created our script to reflect our
# needs at the time. The functions we wrote were also written live, which
# showed the back-and-forth procedure by which functions are really written.
# In fact, the function that I had prepared beforehand was improved upon
# during class.

###
# Useful functions

logit     <- function(p) log( p/(1-p) )
logit.inv <- function(x) exp(x)/(1+exp(x))
devr2     <- function(mod) (mod$null.deviance-mod$deviance)/mod$null.deviance

# Note: A 'Binary Response Variable' (or 'Binary Dependent Variable')
# regression differs in only two respects from our previous regressions.
# First, the outcome variable is limited to 0 and 1, which makes this
# regression perfectly suited to predicting the existence of a
# characteristic. Second, the predictions the computer provides are not 0/1;
# they are in (0,1): decimals between 0 and 1. So, we have to determine a
# threshold (or cutpoint) that separates the predictions we will call 1 from
# those we will call 0.

###
# New data: Terrorism (Binary Response Variable)

ter <- read.csv("http://courses.kvasaheim.com/stat40x3/data/terror.csv")
summary(ter)
names(ter)
dim(ter)    # There are 144 records with 8 variables per record

# First, we find a 'best' model. We pay attention to dispersion and to the
# statistical significance of the independent variables. As social
# scientists, we also pay attention to the current literature, which will
# tell us which variables should be in there, which are optional, and which
# need to be in there. Our research question/hypothesis also helps us
# determine which variables (our research variables) MUST be in our final
# model. Here, riteleft is our research variable. Also, the literature tells
# me that democ really should be there, as well as world.trade (a measure of
# globalization).

# So, let us try to fit the correct model

# First, the full model
model.1 <- glm(attack ~ riteleft + parlilean + year + world.trade +
               gdp.real + democ + I(democ^2) + democ*riteleft + ccode2,
               family=binomial(link=logit), data=ter )

# This warning should actually be an error. It indicates you need to remove
# some variables or increase the amount of data.

# Remove the factor, as it has the greatest effect on the model
model.2 <- glm(attack ~ riteleft + parlilean + year + world.trade +
               gdp.real + democ + I(democ^2) + democ*riteleft,
               family=binomial(link=logit), data=ter )
summary(model.2)

# Begin paring variables off. Start with the interaction terms (especially
# if the underlying variables are not statistically significant).
model.3 <- glm(attack ~ riteleft + parlilean + year + world.trade +
               gdp.real + democ + I(democ^2),
               family=binomial(link=logit), data=ter )
summary(model.3)

model.4 <- glm(attack ~ riteleft + parlilean + year + world.trade +
               gdp.real + democ,
               family=binomial(link=logit), data=ter )
summary(model.4)
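
# An aside (not part of the class script): models 2 through 4 are nested, so
# a likelihood-ratio test is one way to confirm that the terms we pared off
# were safe to drop. Large p-values here suggest the smaller models fit
# about as well as the larger ones.
anova(model.4, model.3, model.2, test="Chisq")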
# For the remaining variables, which should we think about removing?
# Remember that the independent variables are called independent for a
# reason. Let us check which variables are not independent, that is, which
# are highly correlated.

cor(ter)    # Unfortunately, this no longer works as a shortcut, since
            # ccode2 is a character variable. =(

# A work-around is to remove the column with the character data:
tempdata <- ter[,-2]    # -2, because we are dropping the second column
cor(tempdata)

# So, it seems as though year and world.trade are highly correlated, as are
# riteleft and parlilean. Since my research is about riteleft, we'll drop
# parlilean. Also, as world.trade is substantively more interesting than
# year, we will drop year.

# Thus, this leads to our final, best model
model.5 <- glm(attack ~ riteleft + world.trade + gdp.real + democ,
               family=binomial(link=logit), data=ter )
summary(model.5)

# Now, an important question is:
#   Q: We know we have statistically significant results, but are the
#      results *substantively* significant?
summary(ter)

# Our last fundamental question is:
#   Q: How good is the model?
# Ans: We want to determine the model's goodness of fit.

# One way, using a deviance-R^2:
devr2(model.5)    # The deviance-R^2 value

# How do we interpret it? As the proportion of the null deviance explained
# by the model -- the GLM analogue of the R^2 from linear regression.

# Another way, leveraging the structure of the data, is to ask questions
# like:
#   How accurate is the model?
#   What is the FPR?
#   What is the FNR?
# Remember that the dependent variable is binary!

# If you think about how the ultimate predictions are made, the number of 1s
# and 0s depends on the threshold you select. Thus, a global measure of
# goodness of fit will depend on all possible values of that threshold.

# One way is to graph the accuracy as the threshold changes. Let us do this
# here (and now). This is the part that was not done during class. I left it
# off because there is no distribution (and thus no p-value) for this graph.
# It is good, however, at showing how sensitive our accuracy is to our
# choice of threshold.

# First, an accuracy function (from an earlier class)
ACC <- function(t) (t[1,1]+t[2,2])/(t[1,1]+t[1,2]+t[2,1]+t[2,2])

# Next, a function that calculates the prediction table. It requires both
# the model and the threshold.

# Recall the prediction table:
#
#                        Reality
#                       P      N
#   Prediction   P'    TP     FP
#                N'    FN     TN
#

# Let's create the function that calculates the prediction table (as a
# function of the threshold)
predictiontable <- function(model, tau) {
  p <- predict(model, type="response")    # predicted probabilities
  p <- as.numeric(p >= tau)               # 1 if at or above the threshold
  p <- as.factor(p)
  y <- model$y                            # the observed 0/1 outcomes

  t11 <- length(which(p==1 & y==1))    # true positives
  t21 <- length(which(p==1 & y==0))    # false positives
  t12 <- length(which(p==0 & y==1))    # false negatives
  t22 <- length(which(p==0 & y==0))    # true negatives

  ret <- matrix( c(t11,t12,t21,t22), nrow=2 )
  dimnames(ret) <- list("Predicted"=c("T","F"), "Actual"=c("T","F"))

  return(ret)
}

# So, our accuracy graph will loop through many different thresholds,
# calculate the accuracy for each threshold, then plot the results
a <- numeric()
for(i in 1:1000) {
  tau  <- i/1000
  t    <- predictiontable(model.5, tau)
  a[i] <- ACC(t)
}
plot(a)

# Of course, we can pretty-up that graph to make it presentation (homework)
# quality. The key here is that there is a small spike in accuracy at
# tau=0.483, but every threshold between 0.3 and 0.6 gives about the same
# accuracy (0.8-ish).
par(mar=c(5,4,2,2)+0.1)
plot(1:1000/1000, a, ylim=c(0,1), xlim=c(0,1), las=1, type="l",
     xlab="Threshold", ylab="Accuracy" )

# Since there is a wide range of thresholds that give essentially the same
# accuracy, we can conclude that the accuracy of this model is very stable
# -- or robust -- to changes in the threshold.
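
# A quick aside (not part of the class script): instead of reading the spike
# off the graph, we can pull the accuracy-maximizing threshold straight from
# the vector 'a' that we just filled in.
which.max(a)/1000    # the threshold with the highest accuracy
max(a)               # the accuracy achieved at that threshold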
###
# A second way is to plot the changes in the TPR and the FPR as the
# threshold changes. This is called ROC analysis. It consists of a curve
# (the ROC curve) and an area under the ROC curve (the AUC).

# There exists a package which calculates the AUC and plots the ROC curve,
# but let us do these by hand to help us understand what is going on and to
# help us become more familiar with R.

# The Receiver Operating Characteristic (ROC) curve is a plot of the TPR,
# also known as Sensitivity, against the FPR, which is equivalent to
# 1-Specificity.

# So, we need to create functions for the TPR and the FPR:
TPR <- function(t) t[1,1]/(t[1,1]+t[2,1])    # TP/(TP+FN)
FPR <- function(t) t[1,2]/(t[1,2]+t[2,2])    # FP/(FP+TN)

# Now, we can write a function that plots the ROC curve (as well as
# determining the actual FPR and TPR positions for each threshold value).
# Since we will use this to calculate the area under the curve, I am making
# a function out of it. (Note that using T and F as variable names shadows
# the TRUE/FALSE shorthands; it works here, but be careful.)
ROC <- function(model, n=100, plot=TRUE, vals=FALSE) {
  T <- numeric()
  F <- numeric()

  for( i in 1:n ) {
    tau  <- i/n
    t    <- predictiontable(model, tau)
    T[i] <- TPR(t)
    F[i] <- FPR(t)
  }

  if(plot) {
    plot(1,1, xlim=c(0,1), ylim=c(0,1), type="n",
         xlab="False Positive Rate", ylab="True Positive Rate",
         main=paste("For Model:", deparse(substitute(model))), las=1 )
    points(F,T)
    abline(0,1, col="grey")    # the diagonal: a model with no discrimination
    for(i in 2:length(F)) {
      segments( F[i],  T[i], F[i-1], T[i]   )
      segments( F[i-1],T[i], F[i-1], T[i-1] )
    }
  }

  if(vals) {
    return( list(FPR=F, TPR=T) )
  }
}

# How do we calculate the area under the curve? By using simple grade-school
# geometry (areas of rectangles):
AUC <- function(model, n=100) {
  v    <- ROC(model, n=n, plot=FALSE, vals=TRUE)
  area <- -sum( diff(v$FPR)*v$TPR[2:length(v$TPR)] )
  return(area)
}

# A test to see if it all works
ROC(model.5, n=100)
AUC(model.5, n=100)

# A package that helps you do ROC analysis is the verification package. It
# is not a part of the base distribution of R, but it is a valuable package,
# so you should install it.

# Once it is installed, you activate it with
library(verification)

# The function of interest is roc.plot. To learn about it,
?roc.plot

# Here it is in action. (Note the type="response", so that we hand roc.plot
# the predicted probabilities; by default, predict() on a glm returns the
# log-odds.)
roc.curve <- roc.plot(ter$attack, pred=predict(model.5, type="response"),
                      binormal=TRUE, plot="both" )
roc.curve$roc.vol

# What does the area under the curve (AUC) tell us? The area measures
# discrimination, that is, the ability of the test to correctly classify
# those with and without the condition.

# For more, please read the following:
#   http://gim.unmc.edu/dxtests/roc3.htm
#   http://www.childrens-mercy.org/stats/ask/roc.asp
#   http://www.cs.nyu.edu/~mohri/pub/area.pdf
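
# One last cross-check (not from class): the verification package also has
# roc.area(), which computes the area under the empirical ROC curve along
# with a p-value; if I recall its interface correctly, it takes the observed
# 0/1 outcomes and then the predicted probabilities. Its area should land
# close to our hand-rolled AUC(model.5) value.
roc.area(ter$attack, predict(model.5, type="response"))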