## ----setup, message=FALSE, echo = FALSE------------------------------------------------- library(BiocStyle) library(knitr) options(digits=3) options(width=90) ## ----setup2, message=FALSE, eval=TRUE--------------------------------------------------- library(limma) library(Glimma) library(edgeR) library(Mus.musculus) ## ----downloadData, eval=TRUE------------------------------------------------------------ url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE63310&format=file" utils::download.file(url, destfile="GSE63310_RAW.tar", mode="wb") utils::untar("GSE63310_RAW.tar", exdir = ".") files <- c("GSM1545535_10_6_5_11.txt", "GSM1545536_9_6_5_11.txt", "GSM1545538_purep53.txt", "GSM1545539_JMS8-2.txt", "GSM1545540_JMS8-3.txt", "GSM1545541_JMS8-4.txt", "GSM1545542_JMS8-5.txt", "GSM1545544_JMS9-P7c.txt", "GSM1545545_JMS9-P8c.txt") for(i in paste(files, ".gz", sep="")) R.utils::gunzip(i, overwrite=TRUE) ## ----import1---------------------------------------------------------------------------- files <- c("GSM1545535_10_6_5_11.txt", "GSM1545536_9_6_5_11.txt", "GSM1545538_purep53.txt", "GSM1545539_JMS8-2.txt", "GSM1545540_JMS8-3.txt", "GSM1545541_JMS8-4.txt", "GSM1545542_JMS8-5.txt", "GSM1545544_JMS9-P7c.txt", "GSM1545545_JMS9-P8c.txt") read.delim(files[1], nrow=5) ## ----import2---------------------------------------------------------------------------- x <- readDGE(files, columns=c(1,3)) class(x) dim(x) ## ----annotatesamples-------------------------------------------------------------------- samplenames <- substring(colnames(x), 12, nchar(colnames(x))) samplenames colnames(x) <- samplenames group <- as.factor(c("LP", "ML", "Basal", "Basal", "ML", "LP", "Basal", "ML", "LP")) x$samples$group <- group lane <- as.factor(rep(c("L004","L006","L008"), c(3,4,2))) x$samples$lane <- lane x$samples ## ----annotategenes, message=FALSE------------------------------------------------------- geneid <- rownames(x) genes <- select(Mus.musculus, keys=geneid, columns=c("SYMBOL", "TXCHROM"), keytype="ENTREZID") head(genes) ## ----removedups------------------------------------------------------------------------- genes <- genes[!duplicated(genes$ENTREZID),] ## ----assigngeneanno--------------------------------------------------------------------- x$genes <- genes x ## ----cpm-------------------------------------------------------------------------------- cpm <- cpm(x) lcpm <- cpm(x, log=TRUE, prior.count=2) ## ----lcpm------------------------------------------------------------------------------- L <- mean(x$samples$lib.size) * 1e-6 M <- median(x$samples$lib.size) * 1e-6 c(L, M) summary(lcpm) ## ----zeroes----------------------------------------------------------------------------- table(rowSums(x$counts==0)==9) ## ----filter----------------------------------------------------------------------------- keep.exprs <- filterByExpr(x, group=group) x <- x[keep.exprs,, keep.lib.sizes=FALSE] dim(x) ## ----filterplot1, fig.height=4, fig.width=8, fig.cap="æ¯ä¸ªæ ·æœ¬è¿‡æ»¤å‰çš„原始数æ®ï¼ˆA)和过滤åŽï¼ˆB)的数æ®çš„log-CPMå€¼å¯†åº¦ã€‚ç«–ç›´è™šçº¿æ ‡å‡ºäº†è¿‡æ»¤æ¥éª¤ä¸æ‰€ç”¨é˜ˆå€¼ï¼ˆç›¸å½“于CPM值为约0.2)。"---- lcpm.cutoff <- log2(10/M + 2/L) library(RColorBrewer) nsamples <- ncol(x) col <- brewer.pal(nsamples, "Paired") par(mfrow=c(1,2)) plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="") title(main="A. Raw data", xlab="Log-cpm") abline(v=lcpm.cutoff, lty=3) for (i in 2:nsamples){ den <- density(lcpm[,i]) lines(den$x, den$y, col=col[i], lwd=2) } legend("topright", samplenames, text.col=col, bty="n") lcpm <- cpm(x, log=TRUE) plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="") title(main="B. Filtered data", xlab="Log-cpm") abline(v=lcpm.cutoff, lty=3) for (i in 2:nsamples){ den <- density(lcpm[,i]) lines(den$x, den$y, col=col[i], lwd=2) } legend("topright", samplenames, text.col=col, bty="n") ## ----normalize-------------------------------------------------------------------------- x <- calcNormFactors(x, method = "TMM") x$samples$norm.factors ## ----normalizemodifieddata-------------------------------------------------------------- x2 <- x x2$samples$norm.factors <- 1 x2$counts[,1] <- ceiling(x2$counts[,1]*0.05) x2$counts[,2] <- x2$counts[,2]*5 ## ----plotmodifieddata, fig.height=4, fig.width=8, fig.cap="æ ·ä¾‹æ•°æ®ï¼šlog-CPM值的箱线图展示了未ç»å½’一化的数æ®ï¼ˆA)åŠå½’一化åŽçš„æ•°æ®ï¼ˆB)ä¸æ¯ä¸ªæ ·æœ¬çš„表达分布。数æ®é›†ç»è¿‡è°ƒæ•´ï¼Œæ ·æœ¬1å’Œ2ä¸çš„表达计数分别被缩放到其原始值的5%å’Œ500%。"---- par(mfrow=c(1,2)) lcpm <- cpm(x2, log=TRUE) boxplot(lcpm, las=2, col=col, main="") title(main="A. Example: Unnormalised data",ylab="Log-cpm") x2 <- calcNormFactors(x2) x2$samples$norm.factors lcpm <- cpm(x2, log=TRUE) boxplot(lcpm, las=2, col=col, main="") title(main="B. Example: Normalised data",ylab="Log-cpm") ## ----MDS1, fig.height=4, fig.width=8, fig.cap="log-CPM值在维度1å’Œ2çš„MDSå›¾ï¼Œä»¥æ ·å“åˆ†ç»„ä¸Šè‰²å¹¶æ ‡è®°ï¼ˆA)和维度3å’Œ4çš„MDS图,以测åºé“ä¸Šè‰²å¹¶æ ‡è®°ï¼ˆB)。图ä¸çš„è·ç¦»å¯¹åº”于最主è¦çš„å€æ•°å˜åŒ–(fold change),默认情况下也就是å‰500个在æ¯å¯¹æ ·å“ä¹‹é—´å·®å¼‚æœ€å¤§çš„åŸºå› çš„å¹³å‡ï¼ˆå‡æ–¹æ ¹ï¼‰log2å€æ•°å˜åŒ–。"---- lcpm <- cpm(x, log=TRUE) par(mfrow=c(1,2)) col.group <- group levels(col.group) <- brewer.pal(nlevels(col.group), "Set1") col.group <- as.character(col.group) col.lane <- lane levels(col.lane) <- brewer.pal(nlevels(col.lane), "Set2") col.lane <- as.character(col.lane) plotMDS(lcpm, labels=group, col=col.group) title(main="A. Sample groups") plotMDS(lcpm, labels=lane, col=col.lane, dim=c(3,4)) title(main="B. Sequencing lanes") ## ----GlimmaMDSplot---------------------------------------------------------------------- glMDSPlot(lcpm, labels=paste(group, lane, sep="_"), groups=x$samples[,c(2,5)], launch=FALSE) ## ----design----------------------------------------------------------------------------- design <- model.matrix(~0+group+lane) colnames(design) <- gsub("group", "", colnames(design)) design ## ----contrasts-------------------------------------------------------------------------- contr.matrix <- makeContrasts( BasalvsLP = Basal-LP, BasalvsML = Basal - ML, LPvsML = LP - ML, levels = colnames(design)) contr.matrix ## ----voom, fig.height=4, fig.width=8, fig.cap="图ä¸ç»˜åˆ¶äº†æ¯ä¸ªåŸºå› çš„å‡å€¼ï¼ˆx轴)和方差(y轴),显示了在该数æ®ä¸Šä½¿ç”¨`voom`å‰å®ƒä»¬ä¹‹é—´çš„相关性(左),以åŠå½“è¿ç”¨`voom`的精确æƒé‡åŽè¿™ç§è¶‹åŠ¿æ˜¯å¦‚何消除的(å³ï¼‰ã€‚左侧的图是使用`voom`函数绘制的,它为进行log-CPM转æ¢åŽçš„æ•°æ®æ‹Ÿåˆçº¿æ€§æ¨¡åž‹ä»Žè€Œæå–残差方差。然åŽï¼Œå¯¹æ–¹å·®å–å¹³æ–¹æ ¹ï¼ˆæˆ–å¯¹æ ‡å‡†å·®å–å¹³æ–¹æ ¹ï¼‰ï¼Œå¹¶ç›¸å¯¹æ¯ä¸ªåŸºå› çš„å¹³å‡è¡¨è¾¾ä½œå›¾ã€‚å‡å€¼é€šè¿‡å¹³å‡è®¡æ•°åŠ 上2å†è¿›è¡Œlog2转æ¢è®¡ç®—得到。å³ä¾§çš„图使用`plotSA`绘制了log2æ®‹å·®æ ‡å‡†å·®ä¸Žlog-CPMå‡å€¼çš„关系。平å‡log2æ®‹å·®æ ‡å‡†å·®ç”±æ°´å¹³è“çº¿æ ‡å‡ºã€‚åœ¨è¿™ä¸¤å¹…å›¾ä¸ï¼Œæ¯ä¸ªé»‘ç‚¹è¡¨ç¤ºä¸€ä¸ªåŸºå› ï¼Œçº¢çº¿ä¸ºå¯¹è¿™äº›ç‚¹çš„æ‹Ÿåˆã€‚"---- par(mfrow=c(1,2)) v <- voom(x, design, plot=TRUE) v vfit <- lmFit(v, design) vfit <- contrasts.fit(vfit, contrasts=contr.matrix) efit <- eBayes(vfit) plotSA(efit, main="Final model: Mean-variance trend") ## ----decidetests------------------------------------------------------------------------ summary(decideTests(efit)) ## ----treat------------------------------------------------------------------------------ tfit <- treat(vfit, lfc=1) dt <- decideTests(tfit) summary(dt) ## ----venn, fig.height=6, fig.width=6, fig.cap="韦æ©å›¾å±•ç¤ºäº†ä»…basalå’ŒLP(左)ã€ä»…basalå’ŒML(å³ï¼‰çš„对比的DEåŸºå› æ•°é‡ï¼Œè¿˜æœ‰ä¸¤ç§å¯¹æ¯”ä¸å…±åŒçš„DEåŸºå› æ•°é‡ï¼ˆä¸ï¼‰ã€‚在任何对比ä¸å‡ä¸å·®å¼‚è¡¨è¾¾çš„åŸºå› æ•°é‡æ ‡äºŽå³ä¸‹ã€‚"---- de.common <- which(dt[,1]!=0 & dt[,2]!=0) length(de.common) head(tfit$genes$SYMBOL[de.common], n=20) vennDiagram(dt[,1:2], circle.col=c("turquoise", "salmon")) write.fit(tfit, dt, file="results.txt") ## ----toptables-------------------------------------------------------------------------- basal.vs.lp <- topTreat(tfit, coef=1, n=Inf) basal.vs.ml <- topTreat(tfit, coef=2, n=Inf) head(basal.vs.lp) head(basal.vs.ml) ## ----MDplot, fig.keep='none'------------------------------------------------------------ plotMD(tfit, column=1, status=dt[,1], main=colnames(tfit)[1], xlim=c(-8,13)) ## ----GlimmaMDplot----------------------------------------------------------------------- glMDPlot(tfit, coef=1, status=dt, main=colnames(tfit)[1], side.main="ENTREZID", counts=lcpm, groups=group, launch=FALSE) ## ----heatmap, fig.height=8, fig.width=5, fig.cap="在basalå’ŒLP的对比ä¸å‰100个DEåŸºå› log-CPM值的çƒå›¾ã€‚ç»è¿‡ç¼©æ”¾è°ƒæ•´åŽï¼Œæ¯ä¸ªåŸºå› (æ¯è¡Œï¼‰çš„表达å‡å€¼ä¸º0ï¼Œå¹¶ä¸”æ ‡å‡†å·®ä¸º1ã€‚ç»™å®šåŸºå› ç›¸å¯¹é«˜è¡¨è¾¾çš„æ ·æœ¬è¢«æ ‡è®°ä¸ºçº¢è‰²ï¼Œç›¸å¯¹ä½Žè¡¨è¾¾çš„æ ·æœ¬è¢«æ ‡è®°ä¸ºè“色。浅色和白色代表ä¸ç‰è¡¨è¾¾æ°´å¹³çš„åŸºå› ã€‚æ ·æœ¬å’ŒåŸºå› å·²é€šè¿‡åˆ†å±‚èšç±»çš„方法é‡æ–°æŽ’åºã€‚图ä¸æ˜¾ç¤ºæœ‰æ ·æœ¬èšç±»çš„æ ‘çŠ¶å›¾ã€‚", message=FALSE---- library(gplots) basal.vs.lp.topgenes <- basal.vs.lp$ENTREZID[1:100] i <- which(v$genes$ENTREZID %in% basal.vs.lp.topgenes) mycol <- colorpanel(1000,"blue","white","red") heatmap.2(lcpm[i,], scale="row", labRow=v$genes$SYMBOL[i], labCol=group, col=mycol, trace="none", density.info="none", margin=c(8,6), lhei=c(2,10), dendrogram="column") ## ----camera----------------------------------------------------------------------------- load(system.file("extdata", "mouse_c2_v5p1.rda", package = "RNAseq123")) idx <- ids2indices(Mm.c2,id=rownames(v)) cam.BasalvsLP <- camera(v,idx,design,contrast=contr.matrix[,1]) head(cam.BasalvsLP,5) cam.BasalvsML <- camera(v,idx,design,contrast=contr.matrix[,2]) head(cam.BasalvsML,5) cam.LPvsML <- camera(v,idx,design,contrast=contr.matrix[,3]) head(cam.LPvsML,5) ## ----barcodeplot, fig.height=6, fig.width=6, fig.cap="`LIM_MAMMARY_LUMINAL_MATURE_UP` (红色æ¡å½¢ï¼Œå›¾è¡¨ä¸Šæ–¹ï¼‰å’Œ`LIM_MAMMARY_LUMINAL_MATURE_DN`(è“色æ¡å½¢ï¼Œå›¾è¡¨ä¸‹æ–¹ï¼‰åŸºå› 集在LPå’ŒML的对比ä¸çš„æ¡ç 图,æ¯ä¸ªåŸºå› 集都有一æ¡å¯Œé›†çº¿å±•ç¤ºäº†ç«–ç›´æ¡å½¢åœ¨å›¾è¡¨æ¯éƒ¨åˆ†çš„相对富集程度。Limç‰äººçš„实验[@Lim:BreastCancerRes:2010]éžå¸¸ç±»ä¼¼äºŽæˆ‘们的,用了相åŒçš„分选方å¼æ¥èŽ·å–ä¸åŒçš„细胞群,åªæ˜¯ä»–们使用的是微阵列而ä¸æ˜¯RNA-seqæ¥æµ‹å®šåŸºå› 表达。需è¦æ³¨æ„çš„æ˜¯ï¼Œä¸Šè°ƒåŸºå› é›†å‘ç”Ÿä¸‹è°ƒè€Œä¸‹è°ƒåŸºå› é›†å‘生上调的逆相关性æ¥è‡ªäºŽå¯¹æ¯”的设定方å¼ï¼ˆLP相比于ML),如果将其对调,方å‘性将会å»åˆã€‚"---- barcodeplot(efit$t[,3], index=idx$LIM_MAMMARY_LUMINAL_MATURE_UP, index2=idx$LIM_MAMMARY_LUMINAL_MATURE_DN, main="LPvsML") ## ----softwareinfo----------------------------------------------------------------------- sessionInfo()