########################################
#
# Script: 24 March 2011 (20110324.R)
#
########################################

# Today:
#
#  ** Fitting GLMs
#     . Assumptions
#     . Testing Assumptions
#     . Binary Response Model
#  ** Model goodness of fit
#     . Deviance R^2
#     . Receiver Operating Characteristic curve
#
# Recall that this lecture was presented without a complete script available
# to y'all. During class, we organically created our script to reflect our
# needs at the time. The functions we wrote were also written live, which
# showed the back-and-forth procedure by which functions are really written.
# In fact, the function that I had prepared beforehand was improved upon
# during class.

###
# Useful functions

logit     <- function(p) log( p/(1-p) )
logit.inv <- function(x) exp(x)/(1+exp(x))
devr2     <- function(mod) (mod$null.deviance-mod$deviance)/mod$null.deviance

# Note: A 'Binary Response Variable' (or 'Binary Dependent Variable')
# regression differs in only two respects from our previous regressions.
# First, the outcome variable is limited to 0 and 1, which makes this
# regression perfectly suited to predicting the existence of a
# characteristic. Second, the predictions the computer provides are not 0/1;
# they are in (0,1): decimals between 0 and 1. So, we have to determine a
# threshold (or cutpoint) that separates the predictions we will call 1 from
# those we will call 0.

###
# New data: Terrorism (Binary Response Variable)

ter <- read.csv("http://courses.kvasaheim.com/stat40x3/data/terror.csv")
summary(ter)
names(ter)
dim(ter)    # There are 144 records with 8 variables per record

# First, we find a 'best' model. We pay attention to dispersion and to the
# statistical significance of the independent variables. As social
# scientists, we also pay attention to the current literature, which will
# tell us which variables should be in there, which are optional, and which
# need to be in there. Our research question/hypothesis also helps us
# determine which variables (our research variables) MUST be in our final
# model. Here, riteleft is our research variable. Also, the literature tells
# me that democ really should be there, as well as world.trade (a measure of
# globalization).

# So, let us try to fit the correct model

# First, the full model
model.1 <- glm(attack ~ riteleft + parlilean + year + world.trade +
               gdp.real + democ + I(democ^2) + democ*riteleft + ccode2,
               family=binomial(link=logit), data=ter )

# This warning should actually be an error. It indicates you need to remove
# some variables or increase the amount of data.

# Remove the factor, as it has the greatest effect on the model
model.2 <- glm(attack ~ riteleft + parlilean + year + world.trade +
               gdp.real + democ + I(democ^2) + democ*riteleft,
               family=binomial(link=logit), data=ter )
summary(model.2)

# Begin paring variables off. Start with the interaction terms (especially
# if the underlying variables are not statistically significant).
model.3 <- glm(attack ~ riteleft + parlilean + year + world.trade +
               gdp.real + democ + I(democ^2),
               family=binomial(link=logit), data=ter )
summary(model.3)

model.4 <- glm(attack ~ riteleft + parlilean + year + world.trade +
               gdp.real + democ,
               family=binomial(link=logit), data=ter )
summary(model.4)
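
# An aside (not part of the class script): models 2 through 4 are nested, so
# a likelihood-ratio test is one way to confirm that the terms we pared off
# were safe to drop. Large p-values here suggest the smaller models fit
# about as well as the larger ones.
anova(model.4, model.3, model.2, test="Chisq")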
# For the remaining variables, which should we think about removing?
# Remember that the independent variables are called independent for a
# reason. Let us check which variables are not independent, that is, which
# are highly correlated.

cor(ter)    # Unfortunately, this no longer works as a shortcut, since
            # ccode2 is a character variable. =(

# A work-around is to remove the column with the character data:
tempdata <- ter[,-2]    # -2, because we are dropping the second column
cor(tempdata)

# So, it seems as though year and world.trade are highly correlated, as are
# riteleft and parlilean. Since my research is about riteleft, we'll drop
# parlilean. Also, as world.trade is substantively more interesting than
# year, we will drop year.

# Thus, this leads to our final, best model
model.5 <- glm(attack ~ riteleft + world.trade + gdp.real + democ,
               family=binomial(link=logit), data=ter )
summary(model.5)

# Now, an important question is:
#   Q: We know we have statistically significant results, but are the
#      results *substantively* significant?
summary(ter)

# Our last fundamental question is:
#   Q: How good is the model?
# Ans: We want to determine the model's goodness of fit.

# One way, using a deviance-R^2:
devr2(model.5)    # The deviance-R^2 value

# How do we interpret it? As the proportion of the null deviance explained
# by the model -- the GLM analogue of the R^2 from linear regression.

# Another way, leveraging the structure of the data, is to ask questions
# like:
#   How accurate is the model?
#   What is the FPR?
#   What is the FNR?
# Remember that the dependent variable is binary!

# If you think about how the ultimate predictions are made, the number of 1s
# and 0s depends on the threshold you select. Thus, a global measure of
# goodness of fit will depend on all possible values of that threshold.

# One way is to graph the accuracy as the threshold changes. Let us do this
# here (and now). This is the part that was not done during class. I left it
# off because there is no distribution (and thus no p-value) for this graph.
# It is good, however, at showing how sensitive our accuracy is to our
# choice of threshold.

# First, an accuracy function (from an earlier class)
ACC <- function(t) (t[1,1]+t[2,2])/(t[1,1]+t[1,2]+t[2,1]+t[2,2])

# Next, a function that calculates the prediction table. It requires both
# the model and the threshold.

# Recall the prediction table:
#
#                        Reality
#                       P      N
#   Prediction   P'    TP     FP
#                N'    FN     TN
#

# Let's create the function that calculates the prediction table (as a
# function of the threshold)
predictiontable <- function(model, tau) {
  p <- predict(model, type="response")    # predicted probabilities
  p <- as.numeric(p >= tau)               # 1 if at or above the threshold
  p <- as.factor(p)
  y <- model$y                            # the observed 0/1 outcomes

  t11 <- length(which(p==1 & y==1))    # true positives
  t21 <- length(which(p==1 & y==0))    # false positives
  t12 <- length(which(p==0 & y==1))    # false negatives
  t22 <- length(which(p==0 & y==0))    # true negatives

  ret <- matrix( c(t11,t12,t21,t22), nrow=2 )
  dimnames(ret) <- list("Predicted"=c("T","F"), "Actual"=c("T","F"))

  return(ret)
}

# So, our accuracy graph will loop through many different thresholds,
# calculate the accuracy for each threshold, then plot the results
a <- numeric()
for(i in 1:1000) {
  tau  <- i/1000
  t    <- predictiontable(model.5, tau)
  a[i] <- ACC(t)
}
plot(a)

# Of course, we can pretty-up that graph to make it presentation (homework)
# quality. The key here is that there is a small spike in accuracy at
# tau=0.483, but every threshold between 0.3 and 0.6 gives about the same
# accuracy (0.8-ish).
par(mar=c(5,4,2,2)+0.1)
plot(1:1000/1000, a, ylim=c(0,1), xlim=c(0,1), las=1, type="l",
     xlab="Threshold", ylab="Accuracy" )

# Since there is a wide range of thresholds that give essentially the same
# accuracy, we can conclude that the accuracy of this model is very stable
# -- or robust -- to changes in the threshold.
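
# A quick aside (not part of the class script): instead of reading the spike
# off the graph, we can pull the accuracy-maximizing threshold straight from
# the vector 'a' that we just filled in.
which.max(a)/1000    # the threshold with the highest accuracy
max(a)               # the accuracy achieved at that threshold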
###
# A second way is to plot the changes in the TPR and the FPR as the
# threshold changes. This is called ROC analysis. It consists of a curve
# (the ROC curve) and an area under the ROC curve (the AUC).

# There exists a package which calculates the AUC and plots the ROC curve,
# but let us do these by hand to help us understand what is going on and to
# help us become more familiar with R.

# The Receiver Operating Characteristic (ROC) curve is a plot of the TPR,
# also known as Sensitivity, against the FPR, which is equivalent to
# 1-Specificity.

# So, we need to create functions for the TPR and the FPR:
TPR <- function(t) t[1,1]/(t[1,1]+t[2,1])    # TP/(TP+FN)
FPR <- function(t) t[1,2]/(t[1,2]+t[2,2])    # FP/(FP+TN)

# Now, we can write a function that plots the ROC curve (as well as
# determining the actual FPR and TPR positions for each threshold value).
# Since we will use this to calculate the area under the curve, I am making
# a function out of it. (Note that using T and F as variable names shadows
# the TRUE/FALSE shorthands; it works here, but be careful.)
ROC <- function(model, n=100, plot=TRUE, vals=FALSE) {
  T <- numeric()
  F <- numeric()

  for( i in 1:n ) {
    tau  <- i/n
    t    <- predictiontable(model, tau)
    T[i] <- TPR(t)
    F[i] <- FPR(t)
  }

  if(plot) {
    plot(1,1, xlim=c(0,1), ylim=c(0,1), type="n",
         xlab="False Positive Rate", ylab="True Positive Rate",
         main=paste("For Model:", deparse(substitute(model))), las=1 )
    points(F,T)
    abline(0,1, col="grey")    # the diagonal: a model with no discrimination
    for(i in 2:length(F)) {
      segments( F[i],  T[i], F[i-1], T[i]   )
      segments( F[i-1],T[i], F[i-1], T[i-1] )
    }
  }

  if(vals) {
    return( list(FPR=F, TPR=T) )
  }
}

# How do we calculate the area under the curve? By using simple grade-school
# geometry (areas of rectangles):
AUC <- function(model, n=100) {
  v    <- ROC(model, n=n, plot=FALSE, vals=TRUE)
  area <- -sum( diff(v$FPR)*v$TPR[2:length(v$TPR)] )
  return(area)
}

# A test to see if it all works
ROC(model.5, n=100)
AUC(model.5, n=100)

# A package that helps you do ROC analysis is the verification package. It
# is not a part of the base distribution of R, but it is a valuable package,
# so you should install it.

# Once it is installed, you activate it with
library(verification)

# The function of interest is roc.plot. To learn about it,
?roc.plot

# Here it is in action. (Note the type="response", so that we hand roc.plot
# the predicted probabilities; by default, predict() on a glm returns the
# log-odds.)
roc.curve <- roc.plot(ter$attack, pred=predict(model.5, type="response"),
                      binormal=TRUE, plot="both" )
roc.curve$roc.vol

# What does the area under the curve (AUC) tell us? The area measures
# discrimination, that is, the ability of the test to correctly classify
# those with and without the condition.

# For more, please read the following:
#   http://gim.unmc.edu/dxtests/roc3.htm
#   http://www.childrens-mercy.org/stats/ask/roc.asp
#   http://www.cs.nyu.edu/~mohri/pub/area.pdf
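
# One last cross-check (not from class): the verification package also has
# roc.area(), which computes the area under the empirical ROC curve along
# with a p-value; if I recall its interface correctly, it takes the observed
# 0/1 outcomes and then the predicted probabilities. Its area should land
# close to our hand-rolled AUC(model.5) value.
roc.area(ter$attack, predict(model.5, type="response"))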