% % NOTE -- ONLY EDIT THE .Rnw FILE!!! The .tex file is % likely to be overwritten. % \documentclass[12pt]{article} \usepackage{amsmath,pstricks} \usepackage[authoryear,round]{natbib} \usepackage{hyperref} \textwidth=6.2in \textheight=8.5in %\parskip=.3cm \oddsidemargin=.1in \evensidemargin=.1in \headheight=-.3in \newcommand{\scscst}{\scriptscriptstyle} \newcommand{\scst}{\scriptstyle} \newcommand{\Rfunction}[1]{{\texttt{#1}}} \newcommand{\Robject}[1]{{\texttt{#1}}} \newcommand{\Rpackage}[1]{{\textit{#1}}} \newcommand{\Rmethod}[1]{{\texttt{#1}}} \newcommand{\Rfunarg}[1]{{\texttt{#1}}} \newcommand{\Rclass}[1]{{\textit{#1}}} \textwidth=6.2in \bibliographystyle{plainnat} \begin{document} %\setkeys{Gin}{width=0.55\textwidth} \title{Reproducibility of Dressman 2007} \author{VJ Carey} \maketitle The dispute between Baggerly and Coombes (BC) and Dressman, Potti and Nevins (DPN) in J Clin Oncology, 2008; 26(7):1186-1187, is of broad interest. Because general reproducibility of genome-scale data analysis has been questioned at high levels (cite Ioannidis and others), specific debates on difficulties of reproduction of analytic findings should be examined carefully. Study of these debates can reveal patterns of analysis and interpretation that should be avoided to reduce risks of conflicts over reproducibility, or promoted to increase ease with which new findings can be prioritized for application and extension. I have reviewed BC's supplementary materials with some care, to see whether a concise exposition of the conflict can be achieved. Briefly, in their original paper, Dressman et al. (JCO 25(5):517-25) showed that among individuals with ovarian tumors non-responsive to platinum, activation of Src or E2F3 pathways was significantly associated with survival (Figs 2B and 2C, p521). Below I present figures illustrating partial reproducibility of Dressman's original analyses, computed independently of the materials assembled by BC. Figure 1(a) shows the association between Src dysregulation and survival among non-responders using Dressman et al.'s 'corrected RMA' from the supplementary web site \url{http://data.cgt.duke.edu/platinum.php} (retrieved April 4 2009) with sample identifiers given in that file. Figure 1(b) shows the same association after sample identifiers are redefined using the method of BC to map Dressman's quantifications to the original CEL files which are assumed to be accurately labeled to correspond to records in the clinical data. Figure 1(b) is extremely similar to Dressman's original Figure 2B, and shows that BC are correct when they say that the labels in the 'corrected RMA' archive need to be revised. The mild discrepancies between Figure 1(b) and Dressman's original 2B might be explained through differences in tumor scoring coefficients, or through differences in exact numbers of patients/events available -- only 116 of 119 transcript profiles could be reliably mapped to CEL files for relabeling. In any case, Figure 1(b) suggests that we can use the public archive, with some adjustments, to technically reproduce Dressman et al.'s original findings to a reasonable approximation. Figures 1(c) and 1(d) are more troubling. Figure 1(c) reproduces the methods for Figure 1(b) in application to E2F3 activation. Among patients labeled as nonresponders by Dressman in the public archive, there is no association between pathway activation and survival. Figure 1(d) shows that there is an association, but among the responders. These findings are congruent with those of BC, who explored more variations on the data sources and could not recover the finding of the original Figure 2C. We now have two basic problems. First, ignoring concerns about confounding, and using only (and all) quantifications provided by Dressman, and thereby 'following their methods', we cannot reproduce Figure 2C. Second, the significance of the association indicated in my Figure 1(b) is lost when a simple allowance for batch effects is made in the test for different survival distributions by pathway activation. <>= psurv = function (x, digits = max(options()$digits - 4, 3), ...) { saveopt <- options(digits = digits) on.exit(options(saveopt)) if (!inherits(x, "survdiff")) stop("Object is not the result of survdiff") if (!is.null(cl <- x$call)) { } omit <- x$na.action if (length(omit)) { } if (length(x$n) == 1) { z <- sign(x$exp - x$obs) * sqrt(x$chisq) temp <- c(x$obs, x$exp, z, signif(1 - pchisq(x$chisq, 1), digits)) names(temp) <- c("Observed", "Expected", "Z", "p") print(temp) } else { if (is.matrix(x$obs)) { otmp <- apply(x$obs, 1, sum) etmp <- apply(x$exp, 1, sum) } else { otmp <- x$obs etmp <- x$exp } df <- (sum(1 * (etmp > 0))) - 1 temp <- cbind(x$n, otmp, etmp, ((otmp - etmp)^2)/etmp, ((otmp - etmp)^2)/diag(x$var)) dimnames(temp) <- list(names(x$n), c("N", "Observed", "Expected", "(O-E)^2/E", "(O-E)^2/V")) uu <- 1 - pchisq(x$chisq, df) } uu } library(dressCheck) library(chron) data(DrAsGiven) data(srcWts) giv = DrAsGiven[intersect(featureNames(DrAsGiven), names(srcWts)),] srcWtsL = srcWts[featureNames(giv)] sco = t(exprs(giv))%*%srcWtsL sdys = 1*(sco>median(sco)) par(mfrow=c(2,2)) library(survival) with(pData(DrAsGiven), plot(survfit(Surv(Survival, X0...alive...1...dead)~sdys, subset=response.0.NR..1.CR==0),col=c("blue", "yellow"), lwd=3, xlab="Months", ylab="survival (%)", main="(a)")) with(pData(DrAsGiven), d1 <<- survdiff(Surv(Survival, X0...alive...1...dead)~sdys, subset=response.0.NR..1.CR==0)) text(37,.05, paste("logrank p=", round(psurv(d1),3))) data(corrp116) corr = corrp116[intersect(featureNames(corrp116), names(srcWts)),] srcWtsL = srcWts[featureNames(corr)] sco = t(exprs(corr))%*%srcWtsL sdys = 1*(sco>median(sco)) with(pData(corrp116), plot(survfit(Surv(Survival, dead)~sdys, subset=CR==0),col=c("blue", "yellow"), lwd=3, xlab="Months", ylab="survival (%)", main="(b)")) with(pData(corrp116), d2 <<- survdiff(Surv(Survival, dead)~sdys, subset=CR==0)) text(37,.05, paste("logrank p=", round(psurv(d2),3))) data(e2f3Wts) corr = corrp116[intersect(featureNames(corrp116), names(e2f3Wts)),] eWtsL = e2f3Wts[featureNames(corr)] sco = t(exprs(corr))%*%eWtsL edys = 1*(sco>median(sco)) with(pData(corrp116), d1 <<- survdiff(Surv(Survival, dead)~edys, subset=CR==0)) with(pData(corrp116), plot(survfit(Surv(Survival, dead)~edys, subset=CR==0),col=c("blue", "yellow"), lwd=3, xlab="Months", ylab="survival (%)", main="(c)")) text(37,.05, paste("logrank p=", round(psurv(d1),3))) with(pData(corrp116), d2 <<- survdiff(Surv(Survival, dead)~edys, subset=CR==1)) with(pData(corrp116), plot(survfit(Surv(Survival, dead)~edys, subset=CR==1),col=c("blue", "yellow"), lwd=3, xlab="Months", ylab="survival (%)", main="(d)")) text(57,.05, paste("logrank p=", round(psurv(d2),3))) with(pData(corrp116), m2 <<- survreg(Surv(Survival, dead)~sdys+poly(chron(rundate),2), subset=CR==0)) summary(m2) @ \end{document}