##### SCA-42a
##### 
##### Correlation, Regression, and Predictions
##### 

### This gives a few examples of the analysis process for using 
### correlation and regression to answer questions of interest.



### Preamble

source("http://rfs.kvasaheim.com/stat200.R")



### Part 1: Biomes
dt = read.csv("http://rfs.kvasaheim.com/data/biome2.csv")
attach(dt)

## Is there a correlation between the mfri and the elevation?

cor.test(mfri,elevation)

# According to Pearson's product-moment correlation test, the biome's elevation
# and mean fire return interval are negatively correlated (p-value=0.04112). A 95%
# confidence interval for that correlation is from -0.65 to -0.12, with a point
# estimate of -0.38. 


## Perhaps a better question:
## How are elevation and mean fire return interval related?

mod1 = lm(mfri~elevation)
summary(mod1)
confint(mod1)

# We are 95% confident that for every increase of 1000 feet in elevation, the mean
# fire return interval decreases be between 6 and 277 years, with a point estimate 
# of 141 years.



# Putting those two conclusions together provides a much better understanding of the
# relationship between the two variables. Remember back to ANOVA, where the ANOVA test
# only told us if there was a difference, not which was different. An additional 
# test needed to be performed to determine the difference.


# What is the expected mfri for a site at elevation 1000?
predict(mod1, newdata=data.frame(elevation=1000))
# The expected mfri for a site at 1000 feet is 52.7 years.

# What about a confidence interval for that mean?
predict(mod1, newdata=data.frame(elevation=1000), interval="confidence")
# We are 95% confident that the expected (mean) mfri for a site at 1000 feet is
# between 0 and 162 years.


# What is the predicted mfri for Bear Hollow, an area in Central Oregon at 
# 1000 ft elevation? Include a prediction interval.
predict(mod1, newdata=data.frame(elevation=1000), interval="prediction")
# There is a 95% chance that the mfri for Bear Hollow is between 0 and 461 years.


detach(dt)







### Part 2: Cattle

dt = read.csv("http://rfs.kvasaheim.com/data/cattleData2.csv")
attach(dt)

## Is there a relationship between age and weight for feeder cattle?

cor.test(age,weight)

# According to Pearson's product-moment correlation test, there is a significant
# positive relationship between the cattle weight and age (p-value<<0.0001). We 
# are 95% confident that the correlation between age and weight for feeder cattle
# is between 0.69 and 0.74, with a point estimate of 0.71.

mod2 = lm(weight~age)
summary(mod2)
confint(mod2)

# A 95% confidence interval for the effect of age on weight is from 47 to 53 pounds 
# per year, with a point estimate of 50 pounds per year.

# Bessie is a 20 year-old cow. Predict her weight, and give a 95% interval
predict(mod2, newdata=data.frame(age=20), interval="prediction")
# She is most likely 1304 pounds, with a 95% prediction interval from 1168
# to 1441 pounds.


detach(dt)







### Part 3: Unfairness in Cote d'Ivoire elections?

# Is there are relationship between the invalidation rate and the candidate
# support rate in the 2010 Ivoirian presidential election?

dt = read.csv("http://rfs.kvasaheim.com/data/cdi2010pres2.csv")
attach(dt)
summary(dt)

# create variables

pRejected = REJECTED/TOTAL
pGbagbo = GBAGBO/VALID

mod3 = lm(pRejected~pGbagbo) 
summary(mod3)

# There is no statistically significant relationship between the invalidation rate and
# the support level for Gbagbo in the 2010 Ivoirian presidential election (p-value = 0.656).

## Graphic

plot(pGbagbo,pRejected)

# better

plot(pGbagbo,pRejected, xlim=c(0,1), ylim=c(0,0.1), xlab="Gbagbo Support",ylab="Rejection Rate")

# even better

plot(pGbagbo,pRejected, xlim=c(0,1), ylim=c(0,0.1), xlab="Gbagbo Support",ylab="Rejection Rate")
abline(mod3)

# pretty graphic

par(mar=c(3,3,0,0)+1)
par(las=1)
par(family="serif")
par(xaxs="i", yaxs="i")
par(cex.lab=1.2, font.lab=2)

plot.new()
plot.window( xlim=c(0,1), ylim=c(0,0.07) )

axis(1, at=0:10/10)
axis(2, at=0:10/100)

title(xlab="Gbagbo Support Level", line=2.5)
title(ylab="Rejection Rate", line=3)

abline(mod3)

points(pGbagbo,pRejected, pch=21, bg="dodgerblue")


detach(dt)





### Part 4: South Sudanese Independence

# Is there are relationship between the invalidation rate and the support
# rate for independence in the 2014 South Sudanese referendum?

dt = read.csv("http://rfs.kvasaheim.com/data/xsd2011referendum.csv")
attach(dt)
summary(dt)

# create variables

pRejected = Invalid/Votes
pIndependence = Secession/(Votes-Invalid)


mod4 = lm(pRejected~pIndependence)
summary(mod4)
confint(mod4)

# There is a statistically significant relationship between the invalidation rate
# and the support for independence in the 2014 South Sudanese independence referendum
# (p-value << 0.0001). A 95% confidence interval for the relationship between the two
# is from -0.06 to -0.08, with a point estimate of -0.07. The following scatter plot
# illustrates this relationship.

par(mar=c(3,3,0,0)+1)
par(las=1)
par(family="serif")
par(xaxs="i", yaxs="i")
par(cex.lab=1.1, cex.font=2)

plot.new()
plot.window( xlim=c(0,1), ylim=c(0,0.08) )

axis(1, at=0:10/10)
axis(2, at=0:10/100)

title(xlab="Independence Support Level", line=2.5)
title(ylab="Rejection Rate", line=3)

abline(mod4)

points(pIndependence,pRejected, pch=21, bg="salmon")


# End of graphic

detach(dt)




### Part 5: Some College

dt = read.csv("http://rfs.kvasaheim.com/data/someCollegeClean.csv")
attach(dt)
summary(dt)

# Which is a better predictor of college success (gpa): SAT reading, SAT math, or
# SAT composite?

mod5r = lm(gpa~reading)
mod5m = lm(gpa~math)
mod5c = lm(gpa~composite)

summary(mod5r)
summary(mod5m)
summary(mod5c)

# NOTE: In all three cases, the gpa is highly correlated with the SAT test area. Thus, all
# three will be good predictors... fantastic predictors! However, the last one has the lowest
# "residual standard error," which is a measure of how well the model fails to fit the data. 
# Thus, we can conclude that there is evidence that the SAT composite score predicts the gpa 
# better than either score alone.

# Could we have done this using a correlation test? ABSOLUTELY! All we are doing is determining
# which of the three predictors is best -- in terms of lowest uncertainty. If we asked which 
# predictor had the highest effect on the GPA -- a different question -- we would have to use
# linear regression.

# By the way, if we ignore the composite score, the math score is a better predictor of 
# college success than is the reading score (t=8.005 vs. t=6.543)

# Bob received a 800 on his reading, 500 on his math, and 1300 on his composite SAT score. What
# do we predict his GPA to be? What is the 95% prediction interval?
BOB = data.frame(reading=800, math=500, composite=1300)
predict(mod5r, newdata=BOB, interval="prediction")
predict(mod5m, newdata=BOB, interval="prediction")
predict(mod5c, newdata=BOB, interval="prediction")

# Note that the GPA prediction based on the Reading score is highest, but that there is definite
# overlap