#########################
#
# File: 20111025.R
#       Generalized Linear Models (Binomial)
#       Binary Dependent Variables
#       Accuracy Measures
#
#########################


# Preamble
library(RFS)


# What does package RFS supply?
help(package="RFS")
?"RFS-package"
# You can click on the 'index' link at the bottom to get 
# a clickable list of all functions in the package


# Also helpful at times:
sessionInfo()


# Read in data
coin <- read.csv("http://courses.kvasaheim.com/pols6123/data/coinflips.csv")
names(coin)
summary(coin)
attach(coin)


# Let us model this data in the usual manner

gm <- glm(head ~ trial, family=binomial(link=logit))
summary(gm)

# Of course, we would try different link functions and 
# different powers of the trial variable if we wanted. 
# From the homework, this model is as good as any we
# tried. So, let's go with it.


plot(trial,head)
newX <- seq(1,100)
pr <- logit.inv( predict(gm, newdata=data.frame(trial=newX)) )
lines(newX,pr, col=2, lwd=2)

abline(h=0.500) # Horizontal black line at 0.500
abline(v=66)    # Vertical black line at coin #66

# Note that the graph is now divided into 4 parts. Two are
# errors (BR and TL) and two are correct predictions (TR and BL).
# Counting the correct predictions gives 71, thus our accuracy 
# is 71 / 100 = 0.710

accuracy(model=gm, t=0.5)


# This is a function of the chosen threshold. If we vary the 
# threshold, the accuracy will (probably) change:

accuracy(model=gm, t=0.25)
accuracy(model=gm, t=0.75)
accuracy(model=gm, t=0.05)
accuracy(model=gm, t=0.95)

# So, which threshold give the best accuracy? 

# To determine this, we would have to calculate the accuracy
# for all possible values of the threshold, then find the 
# threshold with the largest accuracy.

# We can make R do it quickly:

a <- numeric()                            # Create a variable for accuracy
for(i in 1:100)                           # Loop for each threshold
{                                         # Begin loop
  a[i] <- accuracy(model=gm, t=i/100)     # Calculate accuracy 
}                                         # End loop

# So, what information can a give us?
max(a)               # The best accuracy
which(a==max(a))     # Which threshold gave us the best accuracy
plot(a)              # A plot of the accuracies
points(which(a==max(a)), max(a), col=3, pch=16)
points(50,accuracy(model=gm,t=0.50), col=2, pch=16)

# ... etc.


# The receivor operating characteristic curve
library(Epi)

r <- ROC(pr, head, plot="ROC")     # Plots the ROC curve 
r$AUC                              # Provides just the A' value

# Why is A' important? It is an absolute measure of goodness
# of fit for the binary dependent variable model.