########################################
#
#  Script: 26 April 2011 (preliminary)
#
########################################

# Today:
#
#  ** Survival Time Analysis
#     . alias: Event History Analysis
#

# We need to load this package

library(survival)


# To get help with an entire library, you can access the 
# Revolution Analytics community site. It just makes it
# easier to find information on R packages. It adds very
# little new information. The URL is
#                       http://www.inside-r.org/packages





# A contrived example

# Create a dataset of size 100 of T and X, where T depends on X.
  set.seed(30)
  n <- 100
  X <- runif(n)
  T <- ceiling(rexp(n,rate=X))


# What does the distribution of T look like?
  hist(T, breaks=51)  


# Let us model it using GLMs as usual. Here, since the dependent variable
# is a "Time until..." some event, we know that an appropriate distribution
# may be the Gamma distribution (with an inverse link):

  model.g <- glm(T ~ X, family=Gamma(link=inverse))
  summary(model.g)



# Alternatively, we can see this dataset as an example of Survival-Time (or
# Event History) analysis. There are two main options for distribution in 
# this realm: Exponential and Weibull.

# Step 1 is to create the "Survival Time" object

  y.time <- Surv(T)  # Looks familiar, no?

# Step 2 ?????
# Step 3 is to fit the regression as usual (note the command):

  model.e <- survreg(y.time ~ X, dist="exponential")
  summary(model.e)

# The other distribution of interest is

  model.w <- survreg(y.time ~ X, dist="weibull")
  summary(model.w)

# Technically, the Weibull is a general case of the Exponential, as is the
# Gamma. BUT, the Gamma is NOT a special case of the Weibull (or vice-versa).


# So, which of the three models fits best? Could use AIC:
# Recall that the formula for AIC is: AIC = -2 lnLik + 2k

# So,
# What is AIC(model.g)?   =                       461.15
# What is AIC(model.e)?   = -2 * -294.5 + 2 * 2 = 593.0
# What is AIC(model.w)?   = -2 * -283.1 + 2 * 3 = 572.2


# What was the rule of thumb?
#   Lower by 5 (or 8)

# Alternatively, we could compare the models using a graph:


plot(T~X, ylim=c(0,50), ylab="Time until Death", xlab="Independent Variable")

newx <- 1:1000/1000

pr <- predict(model.g, newdata=data.frame(X=newx), type="response")
lines(newx,pr, col=2, lty=3)

pr <- predict(model.e, newdata=data.frame(X=newx), type="response")
lines(newx,pr, col=4, lty=3)

pr <- predict(model.w, newdata=data.frame(X=newx), type="response")
lines(newx,pr, col=3, lty=3)

legend("topright", c("Gamma","Exponential","Weibull"), col=c(2,4,3), 
   lty=3, bty="n")



# So, which fits best? What evidence are you using? 

# Predict the lifetime of a person with X = 0.01 using all three models. 

newperson <- data.frame(X=0.01)
pr.g <- predict(model.g, newdata=newperson, type="response")
pr.e <- predict(model.e, newdata=newperson, type="response")
pr.w <- predict(model.w, newdata=newperson, type="response")

pr.g  # 128.77910
pr.e  #  42.94992
pr.w  #  27.09441



# Now, predict the lifetime of a person with X = 0.99 using all three models

newperson <- data.frame(X=0.99)
pr.g <- predict(model.g, newdata=newperson, type="response")
pr.e <- predict(model.e, newdata=newperson, type="response")
pr.w <- predict(model.w, newdata=newperson, type="response")

pr.g  # 1.2131750
pr.e  # 0.6573755
pr.w  # 0.6602893


# What are the appropriate conclusions you can draw?


# Step 4: Profit!



##################################################


# Next Example, in which I give all three steps

# Let us use an interior dataset, the LUNG dataset.

summary(lung)

# How does this dataset differ from the previous?

# Censoring! Not everyone died by the end of the period, so the value
#  of time is the time until I stop measuring, which is either an event
#  (death) or an end to the study (censoring).



# Step 1: Specify the structure of the dataset, specifically the
#         (starting time,) the ending time, and the status at 
#         the ending time (0=event occurred; 1=censored). This
#         varies according to the statistical program. Be aware!

y.time <- Surv(lung$time, lung$status) 


# Step 2: Determine the systematic component


# Step 3: Determine the distribution of the survival function. The 
#         Exponential distribution has a constant hazard rate, continuously
#         decreasing survival rate. Does this match your expectations?

model.e <- survreg(y.time ~ ph.ecog + age + sex, lung, dist="exponential")
summary(model.e)


#         The Weibull is a two-parameter distribution, which allows a much
#         higher degree of flexibility in fitting humped survival curves.
#         Does this match your expectations?

model.w <- survreg(y.time ~ ph.ecog + age + sex, lung, dist="weibull")
summary(model.w)


# Which fits better?

anova(model.e,model.w)


# Prediction curves

par(mar=c(5,4,2,2)+0.1)

with( lung, 
  plot(age,time , ylab="Time until death [days]",xlab="Patient Age", las=1, 
    xlim=c(30,90), ylim=c(0,1200), pch=16 )
  )

newage <- 30:90

pr0 <- predict(model.w, newdata=data.frame(age=newage, ph.ecog=0,sex=1), type="response" )
lines(newage,pr0, col=2)

pr1 <- predict(model.w, newdata=data.frame(age=newage, ph.ecog=1,sex=1), type="response" )
lines(newage,pr1, col=3)

pr2 <- predict(model.w, newdata=data.frame(age=newage, ph.ecog=2,sex=0), type="response" )
lines(newage,pr2, col=4)

pr3 <- predict(model.w, newdata=data.frame(age=newage, ph.ecog=3,sex=0), type="response" )
lines(newage,pr3, col=5)

legend("topright",c("ph.ecog=0","ph.ecog=1","ph.ecog=2","ph.ecog=3"), col=c(2,3,4,5), 
  lwd=1, bty="n")





##################################################
#
# Note: The process for prediction and graphing is the same as we have 
#       been doing since the start. There is nothing new to do (hence, 
#       this part was subtle review for the final).
#