##############################
#
# Kaplan - Meier Estimator (plus graphing)
#
##############################

library(survival)   # We will need this package today
set.seed(370)       # This sets the random-number generator seed, so we agree on our random numbers


# Create a fake data set

time <- c( 1, 4, 4, 5, 6, 7, 9, 10, 15)  # Time when something happens to Z
died <- c( 1, 1, 1, 1, 0, 1, 0,  1,  1)  # Binary variable

y.var <- Surv(time,died)
m.c <- survfit(y.var~1)      # Fits a model with only the intercept
summary( m.c )
plot( m.c )                  # Plots a Kaplan-Meier estimator
plot( m.c, conf.int=FALSE )  # Gets rid of the 95% confidence bands


# Note that the confidence interval was almost worthless in this data set. This
# is because the number of data points is small (n=9). This is common throughout
# everything we've talked about: Larger sample sizes will give us estimates that 
# are more precise (all things being equal).


## Create larger dataset (and random)
set.seed(370)
time <- ceiling(rexp(1000,0.1))  # This gives us 1000 random draws from an Exponential distribution
died <- rbinom(1000, 1,0.80)     # This gives us 1000 random draws from a Binomial distribution (about 80% are 1)
group<- rbinom(1000, 1,0.75)     # This gives us 1000 random draws from a Binomial distribution (about 75% are 1)

y <- Surv(time,died)             # You must ALWAYS create this 'survival object'. All programs require it
plot( survfit( y~1) )            # Kaplan-Meier estimator for the groups together (no indep var)
lines( survfit( y~group , subset=group==0), col="blue" )  # Kaplan-Meier for Group 0, only
lines( survfit( y~group , subset=group==1), col="red" )   # Kaplan-Meier for Group 1, only




##### Parametric models

# There are several options for the distribution:
#   weibull, exponential, gaussian, logistic, lognormal, and loglogistic
# The Weibull is the default. You can also create your own distribution if
# you have a reason to do so. Type: ?"survreg.distributions" for help doing 
# this.

## What some of these look like:

## Weibull
x <- seq(0,5, length=1000)
h1 <-  -log( 1 - pweibull(x, shape=1, scale=1) )
h2 <-  -log( 1 - pweibull(x, shape=2, scale=1) )
h3 <-  -log( 1 - pweibull(x, shape=3, scale=1) )
h4 <-  -log( 1 - pweibull(x, shape=4, scale=1) )
plot(x,h1, type="l", col=1, lwd=2, las=1, ylab="Cumulative Hazard",xlab="Time", ylim=c(0,20), main="Weibull Distribution" )
lines(x,h2, col=2, lwd=2)
lines(x,h3, col=3, lwd=2)
lines(x,h4, col=4, lwd=2)

## Gaussian
x <- seq(0,5, length=1000)
h1 <-  -log( 1 - pnorm(x, m=0, s=1) )
h2 <-  -log( 1 - pnorm(x, m=0, s=2) )
h3 <-  -log( 1 - pnorm(x, m=1, s=3) )
h4 <-  -log( 1 - pnorm(x, m=1, s=4) )
plot(x,h1, type="l", col=1, lwd=2, las=1, ylab="Cumulative Hazard",xlab="Time", ylim=c(0,15), main="Gaussian Distribution" )
lines(x,h2, col=2, lwd=2)
lines(x,h3, col=3, lwd=2)
lines(x,h4, col=4, lwd=2)

## Lognormal
x <- seq(0,5, length=1000)
h1 <-  -log( 1 - plnorm(x, m=0, s=1) )
h2 <-  -log( 1 - plnorm(x, m=0, s=2) )
h3 <-  -log( 1 - plnorm(x, m=1, s=3) )
h4 <-  -log( 1 - plnorm(x, m=1, s=4) )
plot(x,h1, type="l", col=1, lwd=2, las=1, ylab="Cumulative Hazard",xlab="Time", ylim=c(0,2), main="Log-Normal Distribution" )
lines(x,h2, col=2, lwd=2)
lines(x,h3, col=3, lwd=2)
lines(x,h4, col=4, lwd=2)



##### Back to modeling

# Create fake data
set.seed(370)
died   <- rbinom(1000, 1,0.95)   # 1000 random binary values (about 95% are 1)
weight <- runif(1000)*100+100    # 1000 random values between 100 and 200
drug   <- rbinom(1000,1,0.15)    # 1000 random binary values (about 15% are 1)

time   <- ceiling(rexp(1000,0.03)) + -10*drug + weight   # Define our dep var in terms of our indep vars
time   <- time / 3                                       # Done to create realistic life times

m1 <- survreg(Surv(time,died) ~ drug + weight )          # Parametric model
summary(m1)                                              # Shows the direction and stat sig of the vars


# A plot to *illustrate* the effects of the independent variables

prweight <- min(weight):max(weight)
prweight <- seq(min(weight),max(weight), length=1000)

pdrug0   <- predict( m1, newdata=data.frame(weight=prweight, drug=0) )
pdrug1   <- predict( m1, newdata=data.frame(weight=prweight, drug=1) )

plot(weight,time, las=1, lwd=2, ylim=c(40,100), xlim=c(90,210),
    xlab="Weight (pounds)", ylab="Predicted Age at Death", col="gray90")
lines(prweight, pdrug0, col="blue", lwd=2)
lines(prweight, pdrug1, col="red",  lwd=2)

# Add an appropriate legend
legend("topleft", c("Drug-free", "Drug-user"), col=c(2,4), lwd=2, bty="n" )



##### Load a new data set (a package dataset, so real data)

data(lung)
names(lung)
attach(lung)
summary(lung)


lg1 <- survreg(Surv(time,status) ~ age + sex, data=lung, dist="lognormal")
summary(lg1)


# Another plot to *illustrate* the effects of the independent variables

prage <- min(lung$age):max(lung$age)
psex1 <- predict(lg1, newdata=data.frame(age=prage, sex=1))
psex2 <- predict(lg1, newdata=data.frame(age=prage, sex=2))

plot(prage,psex1, type="l", col=4, xlab="Current Age", ylim=c(0,1000),
	ylab="Predicted Longevity (days)", las=1, lwd=2)
points(prage,psex2, type="l", col=2, lwd=2)
legend("topright", c("Male", "Female"),	col=c(4,2), lty=1, lwd=2, bty="n" )

detach(lung)


##### Load a new data set (another package dataset)

data(stanford2)
names(stanford2)
attach(stanford2)
summary(stanford2)


st1 <- survreg(Surv(time,status) ~ age + t5, data=stanford2, dist="loglogistic" )
summary(st1)

# Note that t5 is a continuous variable. How do we graph both age and t5 on
# the same graph? Select a few representative values for t5 and just plot
# those few values... like so:

prage <- min(age):max(age)
pt50  <- predict(st1, newdata=data.frame(age=prage, t5=0))
pt51  <- predict(st1, newdata=data.frame(age=prage, t5=1))
pt52  <- predict(st1, newdata=data.frame(age=prage, t5=2))
pt55  <- predict(st1, newdata=data.frame(age=prage, t5=5))

plot(prage, pt50, type="l", xlab="Current Age", lty=1,
	ylab="Predicted Longevity (days)", las=1, ylim=c(0,3000))
points(prage, pt51, type="l", lty=2)
points(prage, pt52, type="l", lty=3)
points(prage, pt55, type="l", lty=4)

legend("topright", c("Mismatch distance = 0", "Mismatch distance = 1", "Mismatch distance = 2"),
	lty=c(1,2,3), lwd=2, bty="n" )

detach(stanford2)

# Note that the effect of t5 is illustrated, even though t5 is continuous. This 
# technique can always be done. By default, perhaps, select the min, median, and 
# max values?


# We have long passed from the realm where coefficients tell us enough. Graphs,
# like those here, illustrate your findings like tables cannot.