This file contains R code for the analyses in Chapter 11 of the book Dynamic Prediction in Clinical Survival Analysis (CRC Chapman & Hall) by Hans C. van Houwelingen and Hein Putter
R code written by Hein Putter (H.Putter@lumc.nl for comments/questions) The dynpred package is available from CRAN
Consistency with the book has been checked with - R version 2.14.0 - survival version 2.36-10 - dynpred version 0.1.1
Code
library(dynpred)library(penalized)# The following is to install and load Biobase from Bioconductor. Biobase# contains standardized data structures to represent genomic data.# If you have not already installed Biobase, please uncomment# source("http://bioconductor.org/biocLite.R")# biocLite("Biobase")library(Biobase)# The van de Vijver data, not part of dynpred, should be placed in the# working directoryload("VanDeVijver.Rdata")univar <-apply(t(exprs(VanDeVijver)), 2,function(x) { c1 <-coxph(Surv(survival.death.,event_death) ~ x,data=pData(VanDeVijver));return(c(c1$coef,c1$var))})univar <-t(univar)univar <-as.data.frame(univar)names(univar) <-c("coef","var")univar$z <- univar$coef/sqrt(univar$var)mean(abs(univar$z)>2)# Plotunivar$info <-1/univar$varplot(univar$info,univar$coef,pch=20,cex=0.5,ylim=c(-5,5),xlab="Information",ylab="Regression coefficient")info <-seq(0.08,max(univar$info),length=500)lines(info,2*sqrt(1/info),type="l",lwd=2)lines(info,-2*sqrt(1/info),type="l",lwd=2)### Lasso# Note, this is time-consuming, so instead of running this part, the# second part, now commented out, could be run instead. optlam1 <-optL1(Surv(survival.death.,event_death), t(exprs(VanDeVijver)),data=pData(VanDeVijver))lamlass <- optlam1$lambda # optimal lambdalasso <-cvl(Surv(survival.death.,event_death), t(exprs(VanDeVijver)),lambda1=lamlass, data=pData(VanDeVijver))Surv.lasso <-as.matrix(lasso$predictions)colnames(Surv.lasso) <-paste("time=", colnames(Surv.lasso), sep="")PI.lasso <-linear.predictors(lasso$ful)# Write result to tab-delimited text file (uncomment next line)# write.table(cbind(PI.lasso, Surv.lasso), file="cvlassopred.txt", sep="\t", col.names=NA, quote=FALSE)# Uncomment this if the above part was not (completely) run.# Please make sure that the file "cvlassopred.txt", available from# the book website, is placed in the working directory# cvlasso <- read.table("cvlassopred.txt",header=TRUE,sep="\t")# PI.lasso <- cvlasso$PI.lasso# Surv.lasso <- cvlasso[,-(1:2)]### Ridge# Note, this is very time-consuming, so instead of running this part, the# second part, now commented out, could be run instead. Please make# sure that the file "cvlassopred.txt", available from the book website# is placed in the working directoryoptlam2 <-optL2(Surv(survival.death.,event_death), t(exprs(VanDeVijver)),data=pData(VanDeVijver))lamridg <- optlam2$lambda # optimal lambdaridg <-cvl(Surv(survival.death.,event_death), t(exprs(VanDeVijver)),lambda2=lamridg, data=pData(VanDeVijver))Surv.ridge <-as.matrix(ridg$predictions)colnames(Surv.ridge) <-paste("time=", colnames(Surv.ridge), sep="")PI.ridge <-linear.predictors(ridg$ful)# Write result to tab-delimited text file (uncomment next line)# write.table(data.frame(PI.ridge, Surv.ridge), file="cvridgepred.txt", sep="\t", col.names=NA, quote=FALSE)# Uncomment this if the above part was not (completely) run.# Please make sure that the file "cvridgepred.txt", available from# the book website, is placed in the working directory# cvridge <- read.table("cvridgepred.txt",header=TRUE,sep="\t")# PI.ridge <- cvridge$PI.ridge# Surv.ridge <- cvridge[,-(1:2)]################################################################################################################################################################# Figure 11.1: CVPL for ridge (left) and lasso (right); below is the### number of non-zero coefficients of the lasso################################################################################################################################################################# !!! NOTE, this takes a very long time to run !!!fit1 <-profL1(Surv(survival.death.,event_death), t(exprs(VanDeVijver)),minlambda1=2, maxlambda1=28, data=pData(VanDeVijver), steps=250)fit2 <-profL2(Surv(survival.death.,event_death), t(exprs(VanDeVijver)),minlambda2=10^1.5, maxlambda2=10^6.5, data=pData(VanDeVijver), steps=100)plot(fit1$lambda,fit1$cvl,type="l",xlab="lambda", ylab="Cross-validated partial log-likelihood")nnonzero <-unlist(lapply(fit1$fullfit,function(x) length(coef(x))))plot(fit1$lambda,nnonzero,type="s",xlab="lambda",ylab="Number of non-zero coefficients")plot(fit2$lambda,fit2$cvl,type="l",log="x",xlim=c(min(fit2$lambda),100000),xlab="lambda",ylab="Cross-validated partial log-likelihood",axes=FALSE)axis(1,at=c(100,1000,10000,100000),labels=as.character(c(100,1000,10000,100000)))axis(2)box()################################################################################################################################################################# Figure 11.2: Scatterplot of the prognostic indices for ridge and lasso##############################################################################################################################################################oldpar <-par(no.readonly=TRUE) # save graphical parameterslayout(matrix(c(2,0,1,3),2,2,byrow=TRUE), widths=c(2,1), heights=c(1,2))# Scatterplotpar(mar=c(5,5,1,1))plot(PI.ridge,PI.lasso,xlim=c(-2.5,2.5),ylim=c(-2.5,2.5),xlab="",ylab="")mtext("Ridge",side=1,line=3,cex=1.5,font=2)mtext("Lasso",side=2,line=3,cex=1.5,font=2)# Histogram of ridgepar(mar=c(0,5,2,1))yhist <-hist(PI.ridge, breaks=seq(-2.5,2.5,0.5), plot=FALSE)top <-max(yhist$counts)barplot(yhist$counts, axes=TRUE, ylim=c(0, top), space=0, horiz=FALSE)leg <-legend("topright","",bty="n")$texttext(leg$x,leg$y,paste("Mean =",format(round(mean(PI.ridge),2),nsmall=2),"\nSD =",round(sd(PI.ridge),2)),adj=c(1,1))par(mar=c(5,0,1,2))yhist <-hist(PI.lasso, breaks=seq(-2.5,2.5,0.5), plot=FALSE)top <-max(yhist$counts)barplot(yhist$counts, axes=TRUE, xlim=c(0, top), space=0, horiz=TRUE)leg <-legend("topright","",bty="n")$texttext(leg$x,leg$y,paste("Mean =",format(round(mean(PI.lasso),2),nsmall=2),"\nSD =",round(sd(PI.lasso),2)),adj=c(1,1))par(oldpar) # reset graphical parameters### For future reference (Chapter 12), save the follow-up data### of VanDeVijver in separate data set, rename time and status### variablesvdv <-pData(VanDeVijver)vdv$time <- vdv$survival.death.vdv$status <- vdv$event_death################################################################################################################################################################# Figure 11.3: Survival curves for ridge (left) and lasso (right)################################################################################################################################################################# Survival curves for these selected percentilesqs <-c(0.125,0.375,0.5,0.625,0.875)## Ridgen <-nrow(Surv.ridge)nc <-ncol(Surv.ridge)require(mstate)Surv.ridge <-t(apply(Surv.ridge,1,mstate:::NAfix))ord.ridge <-order(-Surv.ridge[,nc])Surv.ridge.ord <- Surv.ridge[ord.ridge,]idxs <-round(qs*(n-1)+1)# Extract time points from the Surv.ridge object# These are stored as character strings, strip off first character,# except for 0tt <-as.numeric(substr(dimnames(Surv.ridge)[[2]],start=6,stop=100))plot(tt,Surv.ridge.ord[idxs[1],],type="s",lwd=2,ylim=c(0,1),xlab="Time (years)",ylab="Survival")for (i in2:length(idxs))lines(tt,Surv.ridge.ord[idxs[i],],type="s",lwd=2,lty=i)legend("bottomleft",paste(100*qs,"% percentile"),lwd=2,lty=1:5,bty="n")## Lasson <-nrow(Surv.lasso)nc <-ncol(Surv.lasso)Surv.lasso <-t(apply(Surv.lasso,1,mstate:::NAfix))ord.lasso <-order(-Surv.lasso[,nc])Surv.lasso.ord <- Surv.lasso[ord.lasso,]idxs <-round(qs*(n-1)+1)# Extract time points from the Surv.ridge objecttt <-as.numeric(substr(dimnames(Surv.lasso)[[2]],start=6,stop=100))plot(tt,Surv.lasso.ord[idxs[1],],type="s",lwd=2,ylim=c(0,1),xlab="Time (years)",ylab="Survival")for (i in2:length(idxs))lines(tt,Surv.lasso.ord[idxs[i],],type="s",lwd=2,lty=i)legend("bottomleft",paste(100*qs,"% percentile"),lwd=2,lty=1:5,bty="n")################################################################################################################################################################# Figure 11.4: Cross-validated Kaplan-Meier curves for ridge (left)### and lasso (right)############################################################################################################################################################### New cross-validated PI's, append them to vdv datat0 <-5idxt0 <-max(which(tt<=t0))CVPI.lasso <-log(-log(Surv.lasso[,idxt0]))CVPI.lasso <- CVPI.lasso -mean(CVPI.lasso)CVPI.ridge <-log(-log(Surv.ridge[,idxt0]))CVPI.ridge <- CVPI.ridge -mean(CVPI.ridge)vdv$group.lasso <-as.numeric(cut(CVPI.lasso,c(-Inf,quantile(CVPI.lasso)[2:4],Inf)))vdv$group.ridge <-as.numeric(cut(CVPI.ridge,c(-Inf,quantile(CVPI.ridge)[2:4],Inf)))## Ridgeplot(survfit(Surv(time,status)~group.ridge,data=vdv),mark.time=FALSE,lwd=2,lty=1:4,xlab="Time (years)",ylab="Survival")lines(survfit(Surv(time,status)~1,data=vdv),mark.time=FALSE,lwd=2,lty=1,col=8)legend("bottomleft",c("All tumours","Percentile 0-25","Percentile 25-50","Percentile 50-75","Percentile 75-100"),lwd=2,col=c(8,1,1,1,1),lty=c(1,1,2,3,4),bty="n")# Lassoplot(survfit(Surv(time,status)~group.lasso,data=vdv),mark.time=FALSE,lwd=2,lty=1:4,xlab="Time (years)",ylab="Survival")lines(survfit(Surv(time,status)~1,data=vdv),mark.time=FALSE,lwd=2,lty=1,col=8)legend("bottomleft",c("All tumours","Percentile 0-25","Percentile 25-50","Percentile 50-75","Percentile 75-100"),lwd=2,col=c(8,1,1,1,1),lty=c(1,1,2,3,4),bty="n")table(vdv$group.lasso,vdv$group.ridge)c1 <-coxph(Surv(time,status)~CVPI.lasso,data=vdv,method="breslow")c12*diff(c1$loglik)c2 <-coxph(Surv(time,status)~CVPI.ridge,data=vdv,method="breslow")c22*diff(c2$loglik)c3 <-coxph(Surv(time,status)~CVPI.lasso+CVPI.ridge,data=vdv,method="breslow")c32*diff(c3$loglik)################################################################################################################################################################# Section 11.4: Adding clinical predictors##############################################################################################################################################################library(dynpred)data(nki)### Crude Cox model with the clinical covariatescoxph(Surv(tyears,d) ~ chemotherapy + hormonaltherapy + typesurgery + histolgrade + vasc.invasion + diameter + posnodes + age + mlratio,data = nki, method ="breslow")### Cross-validated prognostic indicesCVPI.clin <-rep(NA,n)for (i in1:n) {# leave i out ci <-coxph(Surv(tyears,d) ~ chemotherapy + hormonaltherapy + typesurgery + histolgrade + vasc.invasion + diameter + posnodes + age + mlratio,data=nki[-i,], method="breslow") sfi <-survfit(ci, newdata=nki[i,]) summi <-summary(sfi, times=5) CVPI.clin[i] <-log(-log(summi$surv))}mean(CVPI.clin)CVPI.clin <- CVPI.clin-mean(CVPI.clin)mean(CVPI.clin)sd(CVPI.clin)nki$CVPI.clin <- CVPI.clinplot(CVPI.clin,CVPI.ridge)cor(CVPI.clin,CVPI.ridge)################################################################################################################################################################# Table 11.3: Super model Cox regression##############################################################################################################################################################c1 <-coxph(Surv(time,status)~CVPI.clin,data=vdv,method="breslow")c12*diff(c1$loglik)c2 <-coxph(Surv(time,status)~CVPI.ridge,data=vdv,method="breslow")c22*diff(c2$loglik)c3 <-coxph(Surv(time,status)~CVPI.clin+CVPI.ridge,data=vdv,method="breslow")c32*diff(c3$loglik)################################################################################################################################################################# Figure 11.5 Kullback-Leibler prediction error curves (left) and ### prediction error reduction curves (right) for the null model (Kaplan-Meier)### and for the three models of Table 11.3.##############################################################################################################################################################KL0 <-pecox(Surv(time,status)~1,Surv(time,status==0)~1,data=vdv)KL0$Err[is.na(KL0$Err)] <-0KL1 <-pecox(Surv(time,status)~CVPI.clin,Surv(time,status==0)~1,data=vdv)KL1$Err[is.na(KL1$Err)] <-0KL2 <-pecox(Surv(time,status)~CVPI.ridge,Surv(time,status==0)~1,data=vdv)KL2$Err[is.na(KL2$Err)] <-0KL3 <-pecox(Surv(time,status)~CVPI.clin+CVPI.ridge,Surv(time,status==0)~1,data=vdv)KL3$Err[is.na(KL3$Err)] <-0### Prediction error curvesplot(KL0$time,KL0$Err,type="s",lwd=2,lty=2,col="#646060",xlim=c(0,12.5),xlab="Time (years)",ylab="Prediction error")lines(KL1$time,KL1$Err,type="s",lwd=2,lty=2)lines(KL2$time,KL2$Err,type="s",lwd=2,col="#646060")lines(KL3$time,KL3$Err,type="s",lwd=2)legend("topleft",c("Null model","Clinical only","Genomic only","Clinical + genomic"),lwd=2,lty=c(2,2,1,1),col=c("#646060",1,"#646060",1),bty="n")KL <-data.frame(time=KL0$time,Err0=KL0$Err,Err1=KL1$Err,Err2=KL2$Err,Err3=KL3$Err)KL$ErrRed1 <- (KL$Err0-KL$Err1)/KL$Err0KL$ErrRed2 <- (KL$Err0-KL$Err2)/KL$Err0KL$ErrRed3 <- (KL$Err0-KL$Err3)/KL$Err0### Prediction error reduction curvesplot(KL$time,KL$ErrRed3,type="s",lwd=2,xlim=c(0,12.5),ylim=c(0,0.25),xlab="Time (years)",ylab="Prediction error reduction")lines(KL$time,KL$ErrRed2,type="s",lwd=2,col="#646060")lines(KL$time,KL$ErrRed1,type="s",lwd=2,lty=2)legend("topright",c("Clinical only","Genomic only","Clinical + genomic"),lwd=2,lty=c(2,1,1),col=c(1,"#646060",1),bty="n")