############################## # # # November 29, 2011 # Topics: # Time Series Analysis # Cross-Sectional Time Series data # # ############################## # Preamble # We will need these packages today library(RFS) library(MASS) library(tseries) library(forecast) ipa <- read.csv("beer.csv") names(ipa) attach(ipa) plot(ipa) m1 <- glm(beer ~ party, family=gaussian(link=identity)) summary(m1) m2 <- glm(beer ~ party, family=gaussian(link=log)) summary(m2) m3 <- glm(beer ~ party, family=poisson(link=log)) summary(m3) m4 <- glm.nb(beer ~ party, link=log) summary(m4) # Go with m2 # or, do we? m6 <- glm(beer ~ party + year + month, family=gaussian(link=identity)) summary(m6) m7 <- glm(beer ~ party + year + month, family=gaussian(link=log)) summary(m7) m8 <- glm(beer ~ party + year + month, family=poisson(link=log)) summary(m8) m9 <- glm.nb(beer ~ party + year + month, link=log) summary(m9) m6a <- glm(beer ~ party + year + month + I(month^2), family=gaussian(link=identity)) summary(m6a) m6b <- glm(beer ~ party + year + I(year^2) + month + I(month^2), family=gaussian(link=identity)) summary(m6b) # go with m6b (why no log transform????) # plot and predict plot(year, beer) newYear <- seq(1950,2000, length=100) prR <- predict(m6b, newdata=data.frame(year=newYear, party="Republican", month=9) ) prD <- predict(m6b, newdata=data.frame(year=newYear, party="Democrat", month=9) ) lines(newYear,prR, col=2, lwd=2) lines(newYear,prD, col=4, lwd=2) # Wait!?!?!? # New time variable # Months after starting mas <- (year-1956)*12+month m11 <- glm(beer ~ party + mas + I(mas^2), family=gaussian(link=identity)) summary(m11) m12 <- glm(beer ~ party + mas + I(mas^2), family=gaussian(link=log)) summary(m12) m13 <- glm(beer ~ party + mas + I(mas^2), family=poisson(link=log)) summary(m13) m14 <- glm.nb(beer ~ party + mas + I(mas^2)) summary(m14) # Let's go with m14 summary(m14) # Plot and predict over time masN <- seq(1, 500) pr14R <- predict(m11, newdata=data.frame(mas=masN, party="Republican")) pr14D <- predict(m11, newdata=data.frame(mas=masN, party="Democrat")) plot(mas, beer, las=1, xlim=c(0,500), col="pink", pch=16, ylab="Beer Consumption [Bottles per capita]", xlab="Months after January 1956") points(mas[party=="Democrat"], beer[party=="Democrat"], col="lightblue", pch=16) lines(masN, pr14R, col="red", lwd=2) lines(masN, pr14D, col="blue", lwd=2) legend("topleft", c("Republican President","Democratic President"), lwd=2, col=c("red","blue"), bty="n") # Similarly, we could: plot(mas, beer, las=1, xlim=c(0,500), col="pink", pch=16, xaxt="n", ylab="Beer Consumption [Bottles per capita]", xlab="Year") points(mas[party=="Democrat"], beer[party=="Democrat"], col="lightblue", pch=16) lines(masN, pr14R, col="red", lwd=2) lines(masN, pr14D, col="blue", lwd=2) axis(1, labels=1956:1996, at=0:40*12) legend("topleft", c("Republican President","Democratic President"), lwd=2, col=c("red","blue"), bty="n") # Conclusions?? summary(m14) # Additional time series analysis ipa <- ts(ipa[,1],start=1956,freq=12) plot(ipa) # seasonal decomposition decomp <- stl(ipa, s.window="periodic") plot( decomp ) trend <- decomp$time.series[,"trend"] plot(trend) # Ignore the seasonality and focus on the underlying trend mt1 <- glm(trend~party+mas+I(mas^2), family=gaussian(link=log) ) summary(mt1) masN <- seq(1, 500) prt1R <- exp( predict(mt1, newdata=data.frame(mas=masN, party="Republican")) ) prt1D <- exp( predict(mt1, newdata=data.frame(mas=masN, party="Democrat")) ) plot(mas, beer, las=1, xlim=c(0,500), col="pink", pch=16, xaxt="n", type="l", ylab="Beer Consumption [Bottles per capita]", xlab="Year") lines(mas[party=="Democrat"], beer[party=="Democrat"], col="lightblue") lines(masN, pr14R, col="red", lwd=2) lines(masN, pr14D, col="blue", lwd=2) axis(1, labels=1956:1996, at=0:40*12) legend("topleft", c("Republican President","Democratic President"), lwd=2, col=c("red","blue"), bty="n") # Check differences from above lines(masN, pr14R, col="pink", lwd=1) lines(masN, pr14D, col="lightblue", lwd=1) # No real difference! plot(ipa) decomp <- stl(ipa, s.window="periodic") plot( decomp ) # Explain the seasonality??? seas <- decomp$time.series[,"seasonal"] plot(seas[1:12], type="h") abline(h=0) ms1 <- glm(seas~party, family=gaussian(link=identity)) summary(ms1) ms2 <- aov(beer~as.factor(month)) summary(ms2) library(agricolae) HSD.test(beer,as.factor(month), 464, 897) kruskal.test(beer~as.factor(month)) kruskal(beer,as.factor(month)) detach(ipa) rm(list=ls()) ################################################## ################################################## # Preamble # We will need these packages here library(RFS) library(MASS) library(geepack) # Use internal data data(ohio) head(ohio,20) str(ohio) summary(ohio) attach(ohio) # Need to model respiratory disease as a function of age and status # as a smoker. In fact, I hypothesize that smokers suffer from # respiratory disease at a higher rate than non-smokers! m.glma <- glm(resp~age+smoke, family=binomial(link=logit), data=ohio) m.glmm <- glm(resp~age*smoke, family=binomial(link=logit), data=ohio) summary(m.glma) summary(m.glmm) # Anything to change? # Which is better? # What are the appropriate conclusions? # Graphing! plot(age,resp, xlab="Age", ylab="Respiratory Disease", yaxt="n", xaxt="n") axis(1, label=-2:1, at=-2:1) axis(2, label=c("Present","Absent"), at=1:0, las=1) # Maybe better (why?): plot(jitter(age),jitter(resp), xlab="Age", ylab="Respiratory Disease", yaxt="n", xaxt="n" ) axis(1, label=-2:1, at=-2:1) axis(2, label=c("Present","Absent"), at=1:0, las=1) # Prediction curve: newAge <- seq(-3,2,length=100) pr1 <- logistic( predict(m.glma, newdata=data.frame(smoke=1, age=newAge) ) ) lines(newAge,pr1, lwd=2, col=2) pr0 <- logistic( predict(m.glma, newdata=data.frame(smoke=0, age=newAge) ) ) lines(newAge,pr0, lwd=2, col=4) legend("left", c("Smokers","Non-Smokers"), lwd=2, bty="n", col=c(2,4) ) # Now, let's think for a moment: What assumptions did we make? # . Proper distribution # . Proper link # . Proper linear predictor # . Residuals are iid (independent and identically distributed) # Oops! We have a problem (perhaps) # Note: # The experimental unit is the person # On each person, measured for four years: # This is typical cross-sectional, time series data # # The above analysis, we assumed the errors were independent # It may not actually be the case, since we are taking *repeated* # measures on the people # # The above analysis 'pools' the data, ignoring the effect of the # individual. Let's add it back in # New thing to know: # The correlation structure within the individual # Possibilities are endless, but there are some 'typical' ones: # . Independence # . Exchangeable # . Autoregressive of order 1 (AR1) # . Unstructured m.ind <- geeglm(resp~age+smoke, family=binomial(link=logit), id=id, corstr = "independence" ) m.exc <- geeglm(resp~age+smoke, family=binomial(link=logit), id=id, corstr = "exchangeable" ) m.ar1 <- geeglm(resp~age+smoke, family=binomial(link=logit), id=id, corstr = "ar1" ) m.uns <- geeglm(resp~age+smoke, family=binomial(link=logit), id=id, corstr = "unstructured" ) summary(m.ind) summary(m.exc) summary(m.ar1) summary(m.uns) # Which is best? # ?? # Graphing our best: # Maybe better (why?): plot(jitter(age),jitter(resp), xlab="Age", ylab="Respiratory Disease", yaxt="n", xaxt="n") axis(1, label=-2:1, at=-2:1) axis(2, label=c("Present","Absent"), at=1:0, las=1) # Prediction curve: newAge <- seq(-3,2,length=100) pr1 <- logistic( predict(m.ar1, newdata=data.frame(smoke=1, age=newAge) ) ) lines(newAge,pr1, lwd=2, col=2) pr0 <- logistic( predict(m.ar1, newdata=data.frame(smoke=0, age=newAge) ) ) lines(newAge,pr0, lwd=2, col=4) legend("left", c("Smokers","Non-Smokers"), lwd=2, bty="n", col=c(2,4) ) detach(ohio) rm(list=ls()) ################################################## # Another data(dietox) summary(dietox) head(dietox,20) str(dietox) attach(dietox) # RQ: What is the effect of copper supplements on the weight of the porcine? m.uns <- geeglm (Weight ~ Cu*(Evit + Time), id=Pig, family=gaussian(link=identity), corstr="unstructured") summary(m.uns) m.uns <- geeglm (Weight ~ Cu + Evit + Time, id=Pig, family=gaussian(link=identity), corstr="unstructured") summary(m.uns) m.ind <- geeglm (Weight ~ Cu + Time, id=Pig, family=gaussian(link=identity), corstr="independence") summary(m.ind) m.exc <- geeglm (Weight ~ Cu + Time, id=Pig, family=gaussian(link=identity), corstr="exchangeable") summary(m.exc) m.ar1 <- geeglm (Weight ~ Cu + Time, id=Pig, family=gaussian(link=identity), corstr="ar1") summary(m.ar1) # What about m.ar1b <- geeglm (Weight ~ Cu + Time, id=Pig, family=gaussian(link=log), corstr="ar1") summary(m.ar1b) # Which is best? # Graphing plot(Time,Weight, las=1) newTime <- seq(0,15,length=10) pr1 <- predict(m.ar1, newdata=data.frame(Time=newTime, Cu=1, Evit=mean(Evit) ) ) lines(newTime,pr1, col="orange",lwd=2) pr0 <- predict(m.ar1, newdata=data.frame(Time=newTime, Cu=0, Evit=mean(Evit) ) ) lines(newTime,pr0, col="green4",lwd=2) legend("topleft",bty="n", c("With Copper Supplement","Without Copper Supplement"), lwd=2, col=c("orange","green4")) plot(Time,Weight, las=1) newTime <- seq(0,15,length=10) pr1 <- exp( predict(m.ar1b, newdata=data.frame(Time=newTime, Cu=1, Evit=mean(Evit) ) ) ) lines(newTime,pr1, col="orange",lwd=2) pr0 <- exp( predict(m.ar1b, newdata=data.frame(Time=newTime, Cu=0, Evit=mean(Evit) ) ) ) lines(newTime,pr0, col="green4",lwd=2) legend("topleft",bty="n", c("With Copper Supplement","Without Copper Supplement"), lwd=2, col=c("orange","green4")) # What is our conclusion?