######################### # # File: 20111025.R # Generalized Linear Models (Binomial) # Binary Dependent Variables # Accuracy Measures # ######################### # Preamble library(RFS) # What does package RFS supply? help(package="RFS") ?"RFS-package" # You can click on the 'index' link at the bottom to get # a clickable list of all functions in the package # Also helpful at times: sessionInfo() # Read in data coin <- read.csv("http://courses.kvasaheim.com/pols6123/data/coinflips.csv") names(coin) summary(coin) attach(coin) # Let us model this data in the usual manner gm <- glm(head ~ trial, family=binomial(link=logit)) summary(gm) # Of course, we would try different link functions and # different powers of the trial variable if we wanted. # From the homework, this model is as good as any we # tried. So, let's go with it. plot(trial,head) newX <- seq(1,100) pr <- logit.inv( predict(gm, newdata=data.frame(trial=newX)) ) lines(newX,pr, col=2, lwd=2) abline(h=0.500) # Horizontal black line at 0.500 abline(v=66) # Vertical black line at coin #66 # Note that the graph is now divided into 4 parts. Two are # errors (BR and TL) and two are correct predictions (TR and BL). # Counting the correct predictions gives 71, thus our accuracy # is 71 / 100 = 0.710 accuracy(model=gm, t=0.5) # This is a function of the chosen threshold. If we vary the # threshold, the accuracy will (probably) change: accuracy(model=gm, t=0.25) accuracy(model=gm, t=0.75) accuracy(model=gm, t=0.05) accuracy(model=gm, t=0.95) # So, which threshold give the best accuracy? # To determine this, we would have to calculate the accuracy # for all possible values of the threshold, then find the # threshold with the largest accuracy. # We can make R do it quickly: a <- numeric() # Create a variable for accuracy for(i in 1:100) # Loop for each threshold { # Begin loop a[i] <- accuracy(model=gm, t=i/100) # Calculate accuracy } # End loop # So, what information can a give us? max(a) # The best accuracy which(a==max(a)) # Which threshold gave us the best accuracy plot(a) # A plot of the accuracies points(which(a==max(a)), max(a), col=3, pch=16) points(50,accuracy(model=gm,t=0.50), col=2, pch=16) # ... etc. # The receivor operating characteristic curve library(Epi) r <- ROC(pr, head, plot="ROC") # Plots the ROC curve r$AUC # Provides just the A' value # Why is A' important? It is an absolute measure of goodness # of fit for the binary dependent variable model.