############################## # # Kaplan - Meier Estimator (plus graphing) # ############################## library(survival) # We will need this package today set.seed(370) # This sets the random-number generator seed, so we agree on our random numbers # Create a fake data set time <- c( 1, 4, 4, 5, 6, 7, 9, 10, 15) # Time when something happens to Z died <- c( 1, 1, 1, 1, 0, 1, 0, 1, 1) # Binary variable y.var <- Surv(time,died) m.c <- survfit(y.var~1) # Fits a model with only the intercept summary( m.c ) plot( m.c ) # Plots a Kaplan-Meier estimator plot( m.c, conf.int=FALSE ) # Gets rid of the 95% confidence bands # Note that the confidence interval was almost worthless in this data set. This # is because the number of data points is small (n=9). This is common throughout # everything we've talked about: Larger sample sizes will give us estimates that # are more precise (all things being equal). ## Create larger dataset (and random) set.seed(370) time <- ceiling(rexp(1000,0.1)) # This gives us 1000 random draws from an Exponential distribution died <- rbinom(1000, 1,0.80) # This gives us 1000 random draws from a Binomial distribution (about 80% are 1) group<- rbinom(1000, 1,0.75) # This gives us 1000 random draws from a Binomial distribution (about 75% are 1) y <- Surv(time,died) # You must ALWAYS create this 'survival object'. All programs require it plot( survfit( y~1) ) # Kaplan-Meier estimator for the groups together (no indep var) lines( survfit( y~group , subset=group==0), col="blue" ) # Kaplan-Meier for Group 0, only lines( survfit( y~group , subset=group==1), col="red" ) # Kaplan-Meier for Group 1, only ##### Parametric models # There are several options for the distribution: # weibull, exponential, gaussian, logistic, lognormal, and loglogistic # The Weibull is the default. You can also create your own distribution if # you have a reason to do so. Type: ?"survreg.distributions" for help doing # this. ## What some of these look like: ## Weibull x <- seq(0,5, length=1000) h1 <- -log( 1 - pweibull(x, shape=1, scale=1) ) h2 <- -log( 1 - pweibull(x, shape=2, scale=1) ) h3 <- -log( 1 - pweibull(x, shape=3, scale=1) ) h4 <- -log( 1 - pweibull(x, shape=4, scale=1) ) plot(x,h1, type="l", col=1, lwd=2, las=1, ylab="Cumulative Hazard",xlab="Time", ylim=c(0,20), main="Weibull Distribution" ) lines(x,h2, col=2, lwd=2) lines(x,h3, col=3, lwd=2) lines(x,h4, col=4, lwd=2) ## Gaussian x <- seq(0,5, length=1000) h1 <- -log( 1 - pnorm(x, m=0, s=1) ) h2 <- -log( 1 - pnorm(x, m=0, s=2) ) h3 <- -log( 1 - pnorm(x, m=1, s=3) ) h4 <- -log( 1 - pnorm(x, m=1, s=4) ) plot(x,h1, type="l", col=1, lwd=2, las=1, ylab="Cumulative Hazard",xlab="Time", ylim=c(0,15), main="Gaussian Distribution" ) lines(x,h2, col=2, lwd=2) lines(x,h3, col=3, lwd=2) lines(x,h4, col=4, lwd=2) ## Lognormal x <- seq(0,5, length=1000) h1 <- -log( 1 - plnorm(x, m=0, s=1) ) h2 <- -log( 1 - plnorm(x, m=0, s=2) ) h3 <- -log( 1 - plnorm(x, m=1, s=3) ) h4 <- -log( 1 - plnorm(x, m=1, s=4) ) plot(x,h1, type="l", col=1, lwd=2, las=1, ylab="Cumulative Hazard",xlab="Time", ylim=c(0,2), main="Log-Normal Distribution" ) lines(x,h2, col=2, lwd=2) lines(x,h3, col=3, lwd=2) lines(x,h4, col=4, lwd=2) ##### Back to modeling # Create fake data set.seed(370) died <- rbinom(1000, 1,0.95) # 1000 random binary values (about 95% are 1) weight <- runif(1000)*100+100 # 1000 random values between 100 and 200 drug <- rbinom(1000,1,0.15) # 1000 random binary values (about 15% are 1) time <- ceiling(rexp(1000,0.03)) + -10*drug + weight # Define our dep var in terms of our indep vars time <- time / 3 # Done to create realistic life times m1 <- survreg(Surv(time,died) ~ drug + weight ) # Parametric model summary(m1) # Shows the direction and stat sig of the vars # A plot to *illustrate* the effects of the independent variables prweight <- min(weight):max(weight) prweight <- seq(min(weight),max(weight), length=1000) pdrug0 <- predict( m1, newdata=data.frame(weight=prweight, drug=0) ) pdrug1 <- predict( m1, newdata=data.frame(weight=prweight, drug=1) ) plot(weight,time, las=1, lwd=2, ylim=c(40,100), xlim=c(90,210), xlab="Weight (pounds)", ylab="Predicted Age at Death", col="gray90") lines(prweight, pdrug0, col="blue", lwd=2) lines(prweight, pdrug1, col="red", lwd=2) # Add an appropriate legend legend("topleft", c("Drug-free", "Drug-user"), col=c(2,4), lwd=2, bty="n" ) ##### Load a new data set (a package dataset, so real data) data(lung) names(lung) attach(lung) summary(lung) lg1 <- survreg(Surv(time,status) ~ age + sex, data=lung, dist="lognormal") summary(lg1) # Another plot to *illustrate* the effects of the independent variables prage <- min(lung$age):max(lung$age) psex1 <- predict(lg1, newdata=data.frame(age=prage, sex=1)) psex2 <- predict(lg1, newdata=data.frame(age=prage, sex=2)) plot(prage,psex1, type="l", col=4, xlab="Current Age", ylim=c(0,1000), ylab="Predicted Longevity (days)", las=1, lwd=2) points(prage,psex2, type="l", col=2, lwd=2) legend("topright", c("Male", "Female"), col=c(4,2), lty=1, lwd=2, bty="n" ) detach(lung) ##### Load a new data set (another package dataset) data(stanford2) names(stanford2) attach(stanford2) summary(stanford2) st1 <- survreg(Surv(time,status) ~ age + t5, data=stanford2, dist="loglogistic" ) summary(st1) # Note that t5 is a continuous variable. How do we graph both age and t5 on # the same graph? Select a few representative values for t5 and just plot # those few values... like so: prage <- min(age):max(age) pt50 <- predict(st1, newdata=data.frame(age=prage, t5=0)) pt51 <- predict(st1, newdata=data.frame(age=prage, t5=1)) pt52 <- predict(st1, newdata=data.frame(age=prage, t5=2)) pt55 <- predict(st1, newdata=data.frame(age=prage, t5=5)) plot(prage, pt50, type="l", xlab="Current Age", lty=1, ylab="Predicted Longevity (days)", las=1, ylim=c(0,3000)) points(prage, pt51, type="l", lty=2) points(prage, pt52, type="l", lty=3) points(prage, pt55, type="l", lty=4) legend("topright", c("Mismatch distance = 0", "Mismatch distance = 1", "Mismatch distance = 2"), lty=c(1,2,3), lwd=2, bty="n" ) detach(stanford2) # Note that the effect of t5 is illustrated, even though t5 is continuous. This # technique can always be done. By default, perhaps, select the min, median, and # max values? # We have long passed from the realm where coefficients tell us enough. Graphs, # like those here, illustrate your findings like tables cannot.