########################################
#
# Script: 8 March 2011 (20110308.R)
#
########################################
# Today:
#
# ** Fitting GLMs
#  . Non-canonical links
#  . Graphing predictions (!)
#  . Dealing with Binomial models (!)

# Plot GDP per capita against the honesty-in-government score from
# the gdpcap dataset.
gdp <- read.csv("http://courses.kvasaheim.com/stat40x3/data/gdpcap.csv",
                header=TRUE)
names(gdp)

# There are several ways of plotting these data. Each of the following
# offers certain advantages, and the later solutions give better
# results than the earlier ones. Keep this in mind when making graphs
# for homework and for life.

# Solution 1
# The usual plot, for what it is worth.
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(0,10))

# Solution 2
# Note the addition of the 'yaxt' parameter and the axis() call.
# What are the benefits here?
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(0,10), yaxt="n")
axis(2, labels=c(0,10000, 50000, 100000),
     at=c(0,10000, 50000, 100000), las=1)

# Solution 3
# Note the changes to the axis() call: format() suppresses scientific
# notation and adds thousands separators. Not my favorite way of
# accomplishing this, but it is doable.
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(0,10), yaxt="n")
axis(2,
     labels=format(c(0,10000, 50000, 100000), nsmall=0,
                   scientific=FALSE, big.mark=","),
     at=c(0,10000, 50000, 100000), las=1)

# Solution 4
# Still making changes to the axis() call. I actually like how this
# came out. Why?
# Solution 4 (continued): hand-written axis labels with commas.
par(mar=c(8,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(1,10), yaxt="n", pch=16)
axis(2, labels=c("0","10,000", "50,000", "100,000"),
     at=c(0,10000, 50000, 100000), las=1)

# Back to analysis:
#
# It certainly looks as if these two variables are related, so let us
# estimate that relationship with a GLM. Second question: what are the
# restrictions on the dependent variable?
# GDP per capita is positive and integer (but essentially continuous),
# so either a Gaussian or a Poisson distribution is defensible — with a
# log link in either case.

# Fit both and compare.
mod.gaus <- glm(gdpcap ~ hig, data=gdp,
                family=gaussian(link=log))   # note the non-canonical link
summary(mod.gaus)

mod.pois <- glm(gdpcap ~ hig, data=gdp,
                family=poisson(link=log))
summary(mod.pois)

# Which is better? Well, which fits the data better?

# Usual procedure for drawing a fitted curve: predict new responses
# over a grid of the x-axis variable (holding any other covariates
# constant — here there are none).

# Grid of new x-values (hig).
newxvals <- 1:10

# Predictions are on the LINK (log) scale...
pred.gaus <- predict(mod.gaus, newdata=data.frame(hig=newxvals))
pred.pois <- predict(mod.pois, newdata=data.frame(hig=newxvals))

# ...so remember to back-transform them to the response scale.
pred.gaus <- exp(pred.gaus)
pred.pois <- exp(pred.pois)

# Overlay the two fitted curves on the current plot.
lines(newxvals, pred.gaus, col=2, lwd=2)
lines(newxvals, pred.pois, col=4, lwd=2)

# Not much difference between the two, as expected.

# A legend keeps track of which color is which model.
legend("topright",
       c("Gaussian","Poisson"),
       lwd=2, col=c(2,4), bty="n" )

# What if we did not use a log link?
# Gaussian with the (canonical) identity link, for comparison.
mod.gaus2 <- glm(gdpcap ~ hig, data=gdp,
                 family=gaussian(link=identity))
summary(mod.gaus2)

# Graph it the same way: grid of x-values, predict, draw.
# (Identity link, so no back-transformation is needed.)
newxvals <- 1:10
pred.gaus2 <- predict(mod.gaus2, newdata=data.frame(hig=newxvals))
lines(newxvals, pred.gaus2, col="#008000", lwd=2)

# So, which is the best model? Why?

###
# Let us now focus on the OPEC States.
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(1,10), yaxt="n")
axis(2, labels=c("0","10,000", "50,000", "100,000"),
     at=c(0,10000, 50000, 100000), las=1)
lines(newxvals, pred.pois, col=4, lwd=2)

# Highlight the OPEC members.
points(gdpcap ~ hig, data=gdp, subset=(OPEC=="Member"),
       pch=16, col='green4')

# Almost all OPEC members sit above their prediction, which suggests
# OPEC members tend to be richer than other States. Check this.

# Fit an interaction model.
mod.pois2 <- glm(gdpcap ~ hig*OPEC, data=gdp,
                 family=poisson(link=log))
summary(mod.pois2)

# The interaction is statistically significant, so we should use this
# model. As an exercise, also fit the additive model.
mod.pois3 <- glm(gdpcap ~ hig+OPEC, data=gdp,
                 family=poisson(link=log))
summary(mod.pois3)

# Which is appropriate: interaction term or no interaction term? Why?
# Graphing the additive model:
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(1,10), yaxt="n")
axis(2, labels=c("0","10,000", "50,000", "100,000"),
     at=c(0,10000, 50000, 100000), las=1)
points(gdpcap ~ hig, data=gdp, subset=(OPEC=="Member"),
       pch=16, col='green')

# Predictions, as usual — one curve per OPEC status.
newxvals <- 1:10
pred.pois3o <- predict(mod.pois3,
                       newdata=data.frame(hig=newxvals, OPEC="Member"))
pred.pois3o <- exp(pred.pois3o)
lines(newxvals, pred.pois3o, col=3, lwd=2)
pred.pois3n <- predict(mod.pois3,
                       newdata=data.frame(hig=newxvals, OPEC="Non-Member"))
pred.pois3n <- exp(pred.pois3n)
lines(newxvals, pred.pois3n, col=4, lwd=2)

# Alternatively, the same additive-model graph on the log scale
# (if you like): note log="y" and the matching axis labels.
par(mar=c(5,5,2,2)+0.1)
plot(gdpcap ~ hig, data=gdp, las=1,
     xlab="Honesty in Government Index",
     ylab="GDP per capita\n",
     xlim=c(1,10), yaxt="n",
     log="y", ylim=c(1e2,2e5))
axis(2, labels=c("100","1000","10,000","100,000"),
     at=c(100,1000,10000,100000), las=1)
points(gdpcap ~ hig, data=gdp, subset=(OPEC=="Member"),
       pch=16, col='green')
newxvals <- 1:10
pred.pois3o <- predict(mod.pois3,
                       newdata=data.frame(hig=newxvals, OPEC="Member"))
pred.pois3o <- exp(pred.pois3o)
lines(newxvals, pred.pois3o, col=3, lwd=2)
pred.pois3n <- predict(mod.pois3,
                       newdata=data.frame(hig=newxvals, OPEC="Non-Member"))
pred.pois3n <- exp(pred.pois3n)
lines(newxvals, pred.pois3n, col=4, lwd=2)

##################################################
# Start with a fresh slate.
# NOTE(review): rm(list=ls()) in a script is generally discouraged;
# kept here because it is part of the lecture flow.
rm(list=ls())

# A new dataset: Single-Sex Marriage.
ssm <- read.csv("http://courses.kvasaheim.com/stat40x3/data/ssm.csv",
                header=TRUE)
# If loaded from your flash drive, you would use instead:
# ssm <- read.csv("E:/STAT40x3/data/ssm.csv", header=TRUE)

names(ssm)
summary(ssm)
hist(ssm$pctfavor, xlab="Percent in Favor of Measure")

# Plot the data.
plot(pctfavor ~ religiosity, data=ssm, las=1,
     xlab="Religiosity", ylab="Percent in Favor",
     xlim=c(0,6), ylim=c(40,90) )

# Q:
# Which factors can be used to predict the outcome of an anti-SSM
# ballot measure? Let us try these three: year of the ballot measure,
# whether the measure also seeks to ban single-sex civil unions, and
# the level of religiosity in the US state.

# Fit a straight, linear relationship (no pun intended);
# the default family for glm() is gaussian(link=identity).
mod.gau <- glm(pctfavor ~ year + civilunion + religiosity, data=ssm)
summary(mod.gau)

# Predict the outcome of a year-2000 ballot measure that also bans
# SS civil unions, as religiosity varies.
newrelg <- 1:5
pred.gau1 <- predict(mod.gau,
                     newdata=data.frame(year=2000, civilunion=1,
                                        religiosity=newrelg) )
lines(newrelg, pred.gau1, col=2, lwd=2)

# Same measure, but one that does NOT ban SS civil unions.
pred.gau0 <- predict(mod.gau,
                     newdata=data.frame(year=2000, civilunion=0,
                                        religiosity=newrelg) )
lines(newrelg, pred.gau0, col=4, lwd=2)

# A legend keeps things straight.
legend("bottomright",
       c("Does Not Ban Civil Unions","Bans Civil Unions"),
       lwd=2, col=c(4,2), bty="n" )

# If we do not believe the dependent variable is Gaussian, we can fit
# with a Binomial distribution instead. Why?

# We will need these helper functions:
logit     <- function(x) log( x/(1-x) )
logit.inv <- function(x) exp(x)/(1+exp(x))

plot(pctfavor ~ religiosity, data=ssm, las=1,
     xlab="Religiosity", ylab="Percent in Favor",
     xlim=c(0,6), ylim=c(40,90) )

# This call fails on purpose — pctfavor is a percent, not in [0,1].
mod.bin <- glm(pctfavor ~ year + civilunion + religiosity, data=ssm,
               family=binomial(link=logit))
# Oops! An error. What does it mean?

# Create a variable holding the PROPORTION of the vote, not the percent.
ssm$propfavor <- ssm$pctfavor/100
mod.bin <- glm(propfavor ~ year + civilunion + religiosity, data=ssm,
               family=binomial(link=logit))

# Oh dear, a warning this time... Why? Think: what are the aspects of
# the Binomial distribution you learned last term?

# We can fix this in a few ways. First, we can gather the actual vote
# counts for and against in each of the ballot measures.
# Second, we can multiply the vote percent by 10 (as one is measured
# in tenths). Third, we can logit-transform the proportion and fit it
# with a linear model. Let us do the second one.

# To do the second, create three new variables: the number of votes
# for the measure, the number against, and the two bound together —
# R needs this last two-column form to fit the model.
v.for     <- ssm$pctfavor*10
v.against <- 1000 - v.for
y.vote    <- cbind(v.for,v.against)

# Now fit the model; note the y-variable.
mod.bin <- glm(y.vote ~ year + civilunion + religiosity, data=ssm,
               family=binomial(link=logit))
summary(mod.bin)

# Graph the predictions. They come back on the logit scale, so apply
# the inverse-logit and rescale to percent before drawing.
# NOTE(review): year=2001 here, while the Gaussian predictions above
# used year=2000 — confirm this difference is intended.
newrelg <- 1:5
pred.bin1 <- predict(mod.bin,
                     newdata=data.frame(year=2001, civilunion=1,
                                        religiosity=newrelg) )
pred.bin1 <- logit.inv(pred.bin1) * 100
lines(newrelg, pred.bin1, col=2, lwd=2)

pred.bin0 <- predict(mod.bin,
                     newdata=data.frame(year=2001, civilunion=0,
                                        religiosity=newrelg) )
pred.bin0 <- logit.inv(pred.bin0) * 100
lines(newrelg, pred.bin0, col=4, lwd=2)

legend("bottomright",
       c("Does Not Ban Civil Unions","Bans Civil Unions"),
       lwd=2, col=c(4,2), bty="n" )

# So, take the following away:
#
#  . Graphing
#  . Fitting Binomials