########################################
#
# Script: 3 March 2011 (20110303.R)
#
########################################

# Today:
#
# ** Scatterplots
#
# ** Generalized Linear Models
#    . Linear predictor
#    . Conditional distribution of Y
#    . Link function

# Let us plot the points scored (score) against the points scored against
# (opscore) from our NCAA 2009 football data.

fb <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ncaa2009football.csv")
names(fb)

plot(score~opscore, data=fb)     # y~x (formula interface)
with(fb, plot(opscore, score))   # Equivalent, in x,y order (the default
                                 # plot() method has no data= argument)

# Ways to fix it up are very similar to what we did with boxplots, with
# a couple of additions, notably the pch parameter, which specifies the
# plotting character.

plot(score~opscore, data=fb, pch=16)
plot(score~opscore, data=fb, pch=67)
plot(score~opscore, data=fb, pch="X")

# There is also, of course, the color parameter.

plot(score~opscore, data=fb, pch=10, col="tomato")
plot(score~opscore, data=fb, pch=4,  col=4)

# and so forth...

# The rules of a good graph are the same, so we need to label our axes,
# get the values heading the right way, and even adjust our axis limits.

par( mar=c(5,5,2,2)+0.1 )
plot(score~opscore, data=fb, pch=4, col=4, las=1,
     xlab="Points Scored Against", ylab="Points Scored",
     xlim=c(0,80), ylim=c(0,80))

# Or something like that.

# The rest of today will be lecture. So, I am putting some additional
# examples for you to go through for practice.

# Let us examine the accuracy function you created for the midterm.
# Here is mine:

accuracy <- function(t) {
  acc <- (t[1,1]+t[2,2])/sum(t)
  return(acc)
}

# It is available online at

source("http://courses.kvasaheim.com/stat40x3/scripts/accuracy.R")

# A model for predicting the winner of NCAA football games

fb <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ncaa2009football.csv")
names(fb)

fb$win <- fb$score>fb$opscore    # What does this line do?
names(fb)                        # Hint.

pred.win <- fb$score>10

# Here is a table of count data. These are the results from my model on
# NCAA Football games:

t <- table(pred.win, fb$win)
dimnames(t) <- list( predicted=c("Loss","Win"), actual=c("Loss","Win") )

accuracy(t)

# Q1: Calculate the relative accuracy.
# Ans. The relative accuracy is 1.157233

# For practice, you may want to write a relative accuracy function, but
# we will never use it in the future, so it is not necessary.

# Table t is not in the form we usually see it. To flip it so that it
# is in canonical form, we can do this:

t <- t[2:1, 2:1]     # Why does this work?

# Q2: Which value of ww gives the highest accuracy?
# Ans. The value ww=477 gives an accuracy of 0.7925532.

# Q3: What is the false positive rate (FPR) of t?
# Ans. FPR = 0.7392740

# Q4: What is the false negative rate (FNR) of t?
# Ans. FNR = 0.0083857

# You may want to write those last two as functions.
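# Here is one possible sketch of those two functions. This is my own
# illustration, not the official class solution: it assumes the table is
# in the canonical form created above (row 1 = predicted Win, row 2 =
# predicted Loss; column 1 = actual Win, column 2 = actual Loss) and uses
# the common definitions FPR = FP/(FP+TN) and FNR = FN/(FN+TP). If your
# notes use a different convention, adjust the indices accordingly.

fpr <- function(t) {
  fp <- t[1,2]                   # predicted Win,  actual Loss
  tn <- t[2,2]                   # predicted Loss, actual Loss
  return( fp/(fp+tn) )
}

fnr <- function(t) {
  fn <- t[2,1]                   # predicted Loss, actual Win
  tp <- t[1,1]                   # predicted Win,  actual Win
  return( fn/(fn+tp) )
}

# With t in canonical form, fpr(t) and fnr(t) then give the two rates.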
# Here is my model from above:

pred.win <- fb$score>10

# It is not a stretch to say that the accuracy of my model depends on the
# threshold value I selected (10, here). What happens to the accuracy if
# I change the threshold?

# Let's do that:

a <- numeric()
for(threshold in 1:(max(fb$score)-1) ) {
  pred.win <- fb$score>threshold
  t <- table(pred.win, fb$win)
  a[threshold] <- accuracy(t)
}

plot(a, las=1, ylim=c(0,1), ylab="Accuracy", xlab="Score Threshold", type="l")

# Q5: What is the best threshold level to choose to maximize the accuracy?
# Ans. Using a cutoff of 22 gives an accuracy of 0.7935897

# Q6: What threshold minimizes the FPR?
# Ans. Any threshold below 7 gives a FPR of 0.

# Q7: What threshold minimizes the FNR?
# Ans. Any threshold above 51 gives a FNR of 0.

# What do these two results actually tell us that we already knew?
# More importantly, how do we balance these two errors?
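# One way to see that balance is to compute the FPR and the FNR at every
# threshold and plot them on the same axes. The sketch below is only for
# practice and is not part of the assignment; it assumes the fpr() and
# fnr() functions sketched above and the same canonical table orientation.

fpr.by.threshold <- numeric()
fnr.by.threshold <- numeric()

for(threshold in 1:(max(fb$score)-1) ) {
  # Forcing the factor levels guarantees a 2x2 table (in canonical order)
  # even when an extreme threshold predicts every game the same way.
  pred.win <- factor(fb$score>threshold, levels=c(TRUE,FALSE))
  act.win  <- factor(fb$win,             levels=c(TRUE,FALSE))
  t <- table(pred.win, act.win)
  fpr.by.threshold[threshold] <- fpr(t)
  fnr.by.threshold[threshold] <- fnr(t)
}

plot(fpr.by.threshold, type="l", las=1, ylim=c(0,1), col="blue",
     xlab="Score Threshold", ylab="Error Rate")
lines(fnr.by.threshold, col="red")
legend("right", legend=c("FPR","FNR"), col=c("blue","red"), lty=1)

# Where the two curves sit relative to each other is one way to think about
# the balance: which threshold is "best" depends on which mistake is more
# costly to us.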