######################################## # # Script: 26 April 2011 (preliminary) # ######################################## # Today: # # ** Survival Time Analysis # . alias: Event History Analysis # # We need to load this package library(survival) # To get help with an entire library, you can access the # Revolution Analytics community site. It just makes it # easier to find information on R packages. It adds very # little new information. The URL is # http://www.inside-r.org/packages # A contrived example # Create a dataset of size 100 of T and X, where T depends on X. set.seed(30) n <- 100 X <- runif(n) T <- ceiling(rexp(n,rate=X)) # What does the distribution of T look like? hist(T, breaks=51) # Let us model it using GLMs as usual. Here, since the dependent variable # is a "Time until..." some event, we know that an appropriate distribution # may be the Gamma distribution (with an inverse link): model.g <- glm(T ~ X, family=Gamma(link=inverse)) summary(model.g) # Alternatively, we can see this dataset as an example of Survival-Time (or # Event History) analysis. There are two main options for distribution in # this realm: Exponential and Weibull. # Step 1 is to create the "Survival Time" object y.time <- Surv(T) # Looks familiar, no? # Step 2 ????? # Step 3 is to fit the regression as usual (note the command): model.e <- survreg(y.time ~ X, dist="exponential") summary(model.e) # The other distribution of interest is model.w <- survreg(y.time ~ X, dist="weibull") summary(model.w) # Technically, the Weibull is a general case of the Exponential, as is the # Gamma. BUT, the Gamma is NOT a special case of the Weibull (or vice-versa). # So, which of the three models fits best? Could use AIC: # Recall that the formula for AIC is: AIC = -2 lnLik + 2k # So, # What is AIC(model.g)? = 461.15 # What is AIC(model.e)? = -2 * -294.5 + 2 * 2 = 593.0 # What is AIC(model.w)? = -2 * -283.1 + 2 * 3 = 572.2 # What was the rule of thumb? # Lower by 5 (or 8) # Alternatively, we could compare the models using a graph: plot(T~X, ylim=c(0,50), ylab="Time until Death", xlab="Independent Variable") newx <- 1:1000/1000 pr <- predict(model.g, newdata=data.frame(X=newx), type="response") lines(newx,pr, col=2, lty=3) pr <- predict(model.e, newdata=data.frame(X=newx), type="response") lines(newx,pr, col=4, lty=3) pr <- predict(model.w, newdata=data.frame(X=newx), type="response") lines(newx,pr, col=3, lty=3) legend("topright", c("Gamma","Exponential","Weibull"), col=c(2,4,3), lty=3, bty="n") # So, which fits best? What evidence are you using? # Predict the lifetime of a person with X = 0.01 using all three models. newperson <- data.frame(X=0.01) pr.g <- predict(model.g, newdata=newperson, type="response") pr.e <- predict(model.e, newdata=newperson, type="response") pr.w <- predict(model.w, newdata=newperson, type="response") pr.g # 128.77910 pr.e # 42.94992 pr.w # 27.09441 # Now, predict the lifetime of a person with X = 0.99 using all three models newperson <- data.frame(X=0.99) pr.g <- predict(model.g, newdata=newperson, type="response") pr.e <- predict(model.e, newdata=newperson, type="response") pr.w <- predict(model.w, newdata=newperson, type="response") pr.g # 1.2131750 pr.e # 0.6573755 pr.w # 0.6602893 # What are the appropriate conclusions you can draw? # Step 4: Profit! ################################################## # Next Example, in which I give all three steps # Let us use an interior dataset, the LUNG dataset. summary(lung) # How does this dataset differ from the previous? # Censoring! Not everyone died by the end of the period, so the value # of time is the time until I stop measuring, which is either an event # (death) or an end to the study (censoring). # Step 1: Specify the structure of the dataset, specifically the # (starting time,) the ending time, and the status at # the ending time (0=event occurred; 1=censored). This # varies according to the statistical program. Be aware! y.time <- Surv(lung$time, lung$status) # Step 2: Determine the systematic component # Step 3: Determine the distribution of the survival function. The # Exponential distribution has a constant hazard rate, continuously # decreasing survival rate. Does this match your expectations? model.e <- survreg(y.time ~ ph.ecog + age + sex, lung, dist="exponential") summary(model.e) # The Weibull is a two-parameter distribution, which allows a much # higher degree of flexibility in fitting humped survival curves. # Does this match your expectations? model.w <- survreg(y.time ~ ph.ecog + age + sex, lung, dist="weibull") summary(model.w) # Which fits better? anova(model.e,model.w) # Prediction curves par(mar=c(5,4,2,2)+0.1) with( lung, plot(age,time , ylab="Time until death [days]",xlab="Patient Age", las=1, xlim=c(30,90), ylim=c(0,1200), pch=16 ) ) newage <- 30:90 pr0 <- predict(model.w, newdata=data.frame(age=newage, ph.ecog=0,sex=1), type="response" ) lines(newage,pr0, col=2) pr1 <- predict(model.w, newdata=data.frame(age=newage, ph.ecog=1,sex=1), type="response" ) lines(newage,pr1, col=3) pr2 <- predict(model.w, newdata=data.frame(age=newage, ph.ecog=2,sex=0), type="response" ) lines(newage,pr2, col=4) pr3 <- predict(model.w, newdata=data.frame(age=newage, ph.ecog=3,sex=0), type="response" ) lines(newage,pr3, col=5) legend("topright",c("ph.ecog=0","ph.ecog=1","ph.ecog=2","ph.ecog=3"), col=c(2,3,4,5), lwd=1, bty="n") ################################################## # # Note: The process for prediction and graphing is the same as we have # been doing since the start. There is nothing new to do (hence, # this part was subtle review for the final). #