#########################
#
# File: 20111025.R
# Generalized Linear Models (Binomial)
# Binary Dependent Variables
# Accuracy Measures
#
#########################
# Preamble
library(RFS)
# What does package RFS supply?
help(package="RFS")
?"RFS-package"
# You can click on the 'index' link at the bottom to get
# a clickable list of all functions in the package
# Also helpful at times:
sessionInfo()
# Read in data
coin <- read.csv("http://courses.kvasaheim.com/pols6123/data/coinflips.csv")
names(coin)
summary(coin)
attach(coin)
# Let us model this data in the usual manner
gm <- glm(head ~ trial, family=binomial(link=logit))
summary(gm)
# Of course, we would try different link functions and
# different powers of the trial variable if we wanted.
# From the homework, this model is as good as any we
# tried. So, let's go with it.
plot(trial,head)
newX <- seq(1,100)
pr <- logit.inv( predict(gm, newdata=data.frame(trial=newX)) )
lines(newX,pr, col=2, lwd=2)
abline(h=0.500) # Horizontal black line at 0.500
abline(v=66) # Vertical black line at coin #66
# Note that the graph is now divided into 4 parts. Two are
# errors (BR and TL) and two are correct predictions (TR and BL).
# Counting the correct predictions gives 71, thus our accuracy
# is 71 / 100 = 0.710
accuracy(model=gm, t=0.5)
# This is a function of the chosen threshold. If we vary the
# threshold, the accuracy will (probably) change:
accuracy(model=gm, t=0.25)
accuracy(model=gm, t=0.75)
accuracy(model=gm, t=0.05)
accuracy(model=gm, t=0.95)
# So, which threshold give the best accuracy?
# To determine this, we would have to calculate the accuracy
# for all possible values of the threshold, then find the
# threshold with the largest accuracy.
# We can make R do it quickly:
a <- numeric() # Create a variable for accuracy
for(i in 1:100) # Loop for each threshold
{ # Begin loop
a[i] <- accuracy(model=gm, t=i/100) # Calculate accuracy
} # End loop
# So, what information can a give us?
max(a) # The best accuracy
which(a==max(a)) # Which threshold gave us the best accuracy
plot(a) # A plot of the accuracies
points(which(a==max(a)), max(a), col=3, pch=16)
points(50,accuracy(model=gm,t=0.50), col=2, pch=16)
# ... etc.
# The receivor operating characteristic curve
library(Epi)
r <- ROC(pr, head, plot="ROC") # Plots the ROC curve
r$AUC # Provides just the A' value
# Why is A' important? It is an absolute measure of goodness
# of fit for the binary dependent variable model.