##############################
#
#
# November 29, 2011
# Topics:
#   Time Series Analysis
#   Cross-Sectional Time Series data
#
#
##############################



# Preamble

# We will need these packages today
library(RFS)
library(MASS)
library(tseries)
library(forecast)




ipa <- read.csv("beer.csv")
names(ipa)
attach(ipa)

plot(ipa)

m1 <- glm(beer ~ party, family=gaussian(link=identity))
summary(m1)

m2 <- glm(beer ~ party, family=gaussian(link=log))
summary(m2)

m3 <- glm(beer ~ party, family=poisson(link=log))
summary(m3)

m4 <- glm.nb(beer ~ party, link=log)
summary(m4)



# Go with m2


# or, do we?

m6 <- glm(beer ~ party + year + month, family=gaussian(link=identity))
summary(m6)

m7 <- glm(beer ~ party + year + month, family=gaussian(link=log))
summary(m7)

m8 <- glm(beer ~ party + year + month, family=poisson(link=log))
summary(m8)

m9 <- glm.nb(beer ~ party + year + month, link=log)
summary(m9)


m6a <- glm(beer ~ party + year + month + I(month^2), family=gaussian(link=identity))
summary(m6a)

m6b <- glm(beer ~ party + year + I(year^2) + month + I(month^2), family=gaussian(link=identity))
summary(m6b)

# go with m6b  (why no log transform????)



# plot and predict

plot(year, beer)

newYear <- seq(1950,2000, length=100)
prR <- predict(m6b, newdata=data.frame(year=newYear, party="Republican", month=9) )
prD <- predict(m6b, newdata=data.frame(year=newYear, party="Democrat", month=9) )

lines(newYear,prR, col=2, lwd=2)
lines(newYear,prD, col=4, lwd=2)





# Wait!?!?!?



# New time variable

# Months after starting


mas <- (year-1956)*12+month

m11 <- glm(beer ~ party + mas + I(mas^2), family=gaussian(link=identity))
summary(m11)

m12 <- glm(beer ~ party + mas + I(mas^2), family=gaussian(link=log))
summary(m12)

m13 <- glm(beer ~ party + mas + I(mas^2), family=poisson(link=log))
summary(m13)

m14 <- glm.nb(beer ~ party + mas + I(mas^2))
summary(m14)



# Let's go with m14

summary(m14)


# Plot and predict over time

masN <- seq(1, 500)
pr14R <- predict(m11, newdata=data.frame(mas=masN, party="Republican"))
pr14D <- predict(m11, newdata=data.frame(mas=masN, party="Democrat"))

plot(mas, beer, las=1, xlim=c(0,500), col="pink", pch=16,
  ylab="Beer Consumption [Bottles per capita]", xlab="Months after January 1956")
points(mas[party=="Democrat"], beer[party=="Democrat"], col="lightblue", pch=16)
lines(masN, pr14R, col="red", lwd=2)
lines(masN, pr14D, col="blue", lwd=2)

legend("topleft", c("Republican President","Democratic President"), lwd=2, 
  col=c("red","blue"), bty="n")



# Similarly, we could:

plot(mas, beer, las=1, xlim=c(0,500), col="pink", pch=16, xaxt="n",
  ylab="Beer Consumption [Bottles per capita]", xlab="Year")
points(mas[party=="Democrat"], beer[party=="Democrat"], col="lightblue", pch=16)
lines(masN, pr14R, col="red", lwd=2)
lines(masN, pr14D, col="blue", lwd=2)
axis(1, labels=1956:1996, at=0:40*12)

legend("topleft", c("Republican President","Democratic President"), lwd=2, 
  col=c("red","blue"), bty="n")


# Conclusions??

summary(m14)










# Additional time series analysis

ipa <- ts(ipa[,1],start=1956,freq=12)
plot(ipa)


# seasonal decomposition
decomp <- stl(ipa, s.window="periodic")
plot( decomp )

trend <- decomp$time.series[,"trend"]
plot(trend)



# Ignore the seasonality and focus on the underlying trend

mt1 <- glm(trend~party+mas+I(mas^2), family=gaussian(link=log) )
summary(mt1)

masN <- seq(1, 500)
prt1R <- exp( predict(mt1, newdata=data.frame(mas=masN, party="Republican"))  )
prt1D <- exp( predict(mt1, newdata=data.frame(mas=masN, party="Democrat"))    )

plot(mas, beer, las=1, xlim=c(0,500), col="pink", pch=16, xaxt="n", type="l",
  ylab="Beer Consumption [Bottles per capita]", xlab="Year")
lines(mas[party=="Democrat"], beer[party=="Democrat"], col="lightblue")
lines(masN, pr14R, col="red", lwd=2)
lines(masN, pr14D, col="blue", lwd=2)
axis(1, labels=1956:1996, at=0:40*12)

legend("topleft", c("Republican President","Democratic President"), lwd=2, 
  col=c("red","blue"), bty="n")


# Check differences from above

lines(masN, pr14R, col="pink", lwd=1)
lines(masN, pr14D, col="lightblue", lwd=1)

# No real difference!









plot(ipa)
decomp <- stl(ipa, s.window="periodic")
plot( decomp )


# Explain the seasonality???
seas <- decomp$time.series[,"seasonal"]
plot(seas[1:12], type="h")
abline(h=0)



ms1 <- glm(seas~party, family=gaussian(link=identity))
summary(ms1)

ms2 <- aov(beer~as.factor(month))
summary(ms2)

library(agricolae)
HSD.test(beer,as.factor(month), 464, 897)


kruskal.test(beer~as.factor(month))
kruskal(beer,as.factor(month))


detach(ipa)
rm(list=ls())


##################################################

##################################################


# Preamble
# We will need these packages here

library(RFS)
library(MASS)
library(geepack)



# Use internal data

data(ohio)
head(ohio,20)
str(ohio)
summary(ohio)

attach(ohio)




# Need to model respiratory disease as a function of age and status
# as a smoker. In fact, I hypothesize that smokers suffer from 
# respiratory disease at a higher rate than non-smokers!

m.glma <- glm(resp~age+smoke, family=binomial(link=logit), data=ohio)
m.glmm <- glm(resp~age*smoke, family=binomial(link=logit), data=ohio)

summary(m.glma)
summary(m.glmm)

# Anything to change?

# Which is better?

# What are the appropriate conclusions?

# Graphing!



plot(age,resp, xlab="Age", ylab="Respiratory Disease", yaxt="n", xaxt="n")
axis(1, label=-2:1, at=-2:1)
axis(2, label=c("Present","Absent"), at=1:0, las=1)


# Maybe better (why?):
plot(jitter(age),jitter(resp), xlab="Age", ylab="Respiratory Disease", yaxt="n", xaxt="n" )
axis(1, label=-2:1, at=-2:1)
axis(2, label=c("Present","Absent"), at=1:0, las=1)

# Prediction curve:
newAge <- seq(-3,2,length=100)
pr1 <- logistic( predict(m.glma, newdata=data.frame(smoke=1, age=newAge) ) )
lines(newAge,pr1, lwd=2, col=2)
pr0 <- logistic( predict(m.glma, newdata=data.frame(smoke=0, age=newAge) ) )
lines(newAge,pr0, lwd=2, col=4)
legend("left", c("Smokers","Non-Smokers"), lwd=2, bty="n", col=c(2,4) )



# Now, let's think for a moment: What assumptions did we make?
#  . Proper distribution
#  . Proper link
#  . Proper linear predictor
#  . Residuals are iid (independent and identically distributed)
 


# Oops! We have a problem (perhaps)








# Note:
#   The experimental unit is the person
#     On each person, measured for four years:
#     This is typical cross-sectional, time series data
#


# The above analysis, we assumed the errors were independent
#   It may not actually be the case, since we are taking *repeated*
#   measures on the people
#
# The above analysis 'pools' the data, ignoring the effect of the
#  individual. Let's add it back in


# New thing to know: 
#    The correlation structure within the individual


# Possibilities are endless, but there are some 'typical' ones:

#  . Independence
#  . Exchangeable
#  . Autoregressive of order 1 (AR1)
#  . Unstructured

m.ind <- geeglm(resp~age+smoke, family=binomial(link=logit), id=id, corstr = "independence" )
m.exc <- geeglm(resp~age+smoke, family=binomial(link=logit), id=id, corstr = "exchangeable" )
m.ar1 <- geeglm(resp~age+smoke, family=binomial(link=logit), id=id, corstr = "ar1" )
m.uns <- geeglm(resp~age+smoke, family=binomial(link=logit), id=id, corstr = "unstructured" )

summary(m.ind)
summary(m.exc)
summary(m.ar1)
summary(m.uns)


# Which is best?


# ??



# Graphing our best:


# Maybe better (why?):
plot(jitter(age),jitter(resp), xlab="Age", ylab="Respiratory Disease", yaxt="n", xaxt="n")
axis(1, label=-2:1, at=-2:1)
axis(2, label=c("Present","Absent"), at=1:0, las=1)

# Prediction curve:
newAge <- seq(-3,2,length=100)
pr1 <- logistic( predict(m.ar1, newdata=data.frame(smoke=1, age=newAge) ) )
lines(newAge,pr1, lwd=2, col=2)
pr0 <- logistic( predict(m.ar1, newdata=data.frame(smoke=0, age=newAge) ) )
lines(newAge,pr0, lwd=2, col=4)

legend("left", c("Smokers","Non-Smokers"), lwd=2, bty="n", col=c(2,4) )




detach(ohio)
rm(list=ls())


##################################################



# Another

data(dietox)
summary(dietox)
head(dietox,20)
str(dietox)
attach(dietox)



# RQ: What is the effect of copper supplements on the weight of the porcine?

m.uns <- geeglm (Weight ~ Cu*(Evit + Time), id=Pig, family=gaussian(link=identity), corstr="unstructured")
summary(m.uns)
m.uns <- geeglm (Weight ~ Cu + Evit + Time, id=Pig, family=gaussian(link=identity), corstr="unstructured")
summary(m.uns)


m.ind <- geeglm (Weight ~ Cu + Time, id=Pig, family=gaussian(link=identity), corstr="independence")
summary(m.ind)

m.exc <- geeglm (Weight ~ Cu + Time, id=Pig, family=gaussian(link=identity), corstr="exchangeable")
summary(m.exc)

m.ar1 <- geeglm (Weight ~ Cu + Time, id=Pig, family=gaussian(link=identity), corstr="ar1")
summary(m.ar1)

# What about 

m.ar1b <- geeglm (Weight ~ Cu + Time, id=Pig, family=gaussian(link=log), corstr="ar1")
summary(m.ar1b)


# Which is best?







# Graphing

plot(Time,Weight, las=1)
newTime <- seq(0,15,length=10)
pr1 <- predict(m.ar1, newdata=data.frame(Time=newTime, Cu=1, Evit=mean(Evit) ) )
lines(newTime,pr1, col="orange",lwd=2)
pr0 <- predict(m.ar1, newdata=data.frame(Time=newTime, Cu=0, Evit=mean(Evit) ) )
lines(newTime,pr0, col="green4",lwd=2)
legend("topleft",bty="n", c("With Copper Supplement","Without Copper Supplement"), lwd=2, col=c("orange","green4"))







plot(Time,Weight, las=1)
newTime <- seq(0,15,length=10)
pr1 <- exp( predict(m.ar1b, newdata=data.frame(Time=newTime, Cu=1, Evit=mean(Evit) ) ) )
lines(newTime,pr1, col="orange",lwd=2)
pr0 <- exp( predict(m.ar1b, newdata=data.frame(Time=newTime, Cu=0, Evit=mean(Evit) ) ) )
lines(newTime,pr0, col="green4",lwd=2)
legend("topleft",bty="n", c("With Copper Supplement","Without Copper Supplement"), lwd=2, col=c("orange","green4"))


# What is our conclusion?