## ----, message=F, warning=F---------------------------------------------- library("SISPA") load("SISPA_data.Rda") ## ----, message=F, warning=F---------------------------------------------- library(GSVA) library(changepoint) library(data.table) library(plyr) library(ggplot2) ## ----, message=F, warning=F---------------------------------------------- gsva_results <- callGSVA(ExpressionSet,GeneSet) head(gsva_results) ## ----echo=F, message=F, warning=F---------------------------------------- #callGSVA function code callGSVA = function(x,y) { if(missing(x)){ stop("input expression data missing!") } if(missing(y)){ stop("input gene set missing!") } #y <- list(as.character(y[,1])) gsva.results <- gsva(x, y, method="zscore", rnaseq=FALSE,verbose=FALSE) tr_gsva.results <- t(gsva.results) #label column names tr_result_zscore <- cbind(samples = rownames(tr_gsva.results), tr_gsva.results) colnames(tr_result_zscore) <- c("samples","zscore") rownames(tr_result_zscore) <- NULL return ( as.data.frame(tr_result_zscore)) } ## ----echo=F, message=F, warning=F---------------------------------------- ######### START: sub-functions used by the cptSamples main function ######### ## Sorts the data frame by a column index # x : data frame # i : numeric column index of the data frame to sort it by # b : sorting order, ascending (FALSE) or descending (TRUE) sortData = function(x,i,b) { if(missing(x)){ stop("input data is missing!") } if(missing(b)){ b=FALSE } if(missing(i)){ i=1 } return( x[ order(x[,i], decreasing=b), ] ) } ## Identify the changepoints for the data # x : data within which you wish to identify the changepoints # i : numeric column index of the data frame to sort it by # b : sorting order, ascending (FALSE) or descending (TRUE) perDiffcpt = function(x,cpt_data,cpt_method,cpt_max){ max <- length(x) if(max >= cpt_max){ if(cpt_data == "mean"){ changepoints <- cpt.mean(x,method=cpt_method,Q=cpt_max) } else{ changepoints <- cpt.var(x,method=cpt_method,Q=cpt_max) } }else{ if(cpt_data == "mean"){ changepoints <- cpt.mean(x,method=cpt_method,Q=5) } else{ changepoints <- cpt.var(x,method=cpt_method,Q=5) } } return ( cpts(changepoints) ) } ## Add the estimated changepoint locations to the data frame # x : A data frame containing the data used for change point estimation # y : vector containing the changepoint locations for the data supplied cptAdd = function(x,y){ level=character() l = 1 for(i in 1:length(y) ) { if(i==1){ for(j in 1:y[i]){ level[j]=l } }else{ a <- (y[i-1])+1 for(j in a:y[i]){ level[j]=l } } if(i==length(y)){ l=1000 a <- y[i]+1 for(j in a:nrow(x)){ level[j]=1000 } } l=l+1 } return(as.numeric(level)) } ## Locate all values for the estimated change point locations # x : A data frame containing the data values used for change point estimation # y : Vector containing the changepoint locations for the data supplied # z : Column index of the data values used for changepoint estimation cptLocate_all = function(x,y,z){ cutoff_all <- character() for(i in 1:length(y) ) { cutoff_all[i] <- x[y[i],][z] } cpts_plot_cutoff_all <- cutoff_all[!is.na(cutoff_all)] cpts_plot_cutoff_all <- as.numeric(cpts_plot_cutoff_all) cpts_plot_cutoff_all <- round(cpts_plot_cutoff_all,digits=2) return(cpts_plot_cutoff_all) } ## Plot the single or multiple changepoints identified within the data # x : A vector containing the data within which changepoints were identified # y : A vector containing the identified number of changepoints on the data # LABEL : Y-axis label cptsPlot <- function(x,y,LABEL){ if(missing(x)){ stop("data is missing!") } if(missing(y)){ stop("changepoint cutoffs are missing!") } x <- sort(x,FALSE) y <- sort(y,TRUE) cptplot <- plot(x, type='p', lty=3, lwd=1, main="", xlab="Gene Index", ylab=LABEL, cex=1.5, pch=1, xlim=c(0,length(x))) for(i in 1:length(y)) { if(i==1) { abline(h=y[i],lty=2,col='red') text(y[i],y[i],y[i], cex=1.0, pos=4, col="red") }else{ if(y[i]<0){ abline(h=y[i],lty=2,col='blue') text(y[i],y[i],y[i], cex=1.0, pos=4, col="blue") } else{ abline(h=y[i],lty=2,col='green3') text(y[i],y[i],y[i], cex=1.0, pos=4, col="green3") } } } return(cptplot) } ######### END: sub-functions used by the cptSamples main function ######### ## ----message=F, warning=F, echo=F---------------------------------------- cptSamples <- function (x,dir,cpt_data,cpt_method,cpt_max){ #read sample and sample zscores st <- x[,c(1,2)] #correct for empty cells or NAs st_rmv_na <- st[!is.na(st[ncol(st)]), ] #for upward direction if(dir == "up"){ colnames(st_rmv_na)[ncol(st_rmv_na)]<-"zscore" Y_LABEL <- "Z-Score" st_rmv_na$zscore <- as.numeric(as.character(st_rmv_na[,2])) st_rmv_na_sort = sortData(st_rmv_na,ncol(st_rmv_na),'TRUE' ) cpts = perDiffcpt(st_rmv_na_sort$zscore,cpt_data,cpt_method,cpt_max) }else{ #for downward direction st_rmv_na$NegOneValue_zscore <- as.numeric(as.character(st_rmv_na[,2])) * -1 colnames(st_rmv_na)[ncol(st_rmv_na)]<-"NegOneValue_zscore" colnames(st_rmv_na)[ncol(st_rmv_na)-1]<-"zscore" Y_LABEL <- "-1 x (Z-Score)" st_rmv_na_sort = sortData(st_rmv_na,ncol(st_rmv_na),'TRUE' ) cpts = perDiffcpt(st_rmv_na_sort$NegOneValue_zscore,cpt_data,cpt_method,cpt_max) } #check for the number of changepoints identified if(length(cpts)<1){ warning("No changepoints identified in the data set!") }else{ #add estimated changepoint locations to the data changepoints <- cptAdd(st_rmv_na_sort,cpts) } #append changepoints to the original data frame cpt_out <- data.frame(st_rmv_na_sort,changepoints) if(dir == "up"){ all_cutoffs <- cptLocate_all(cpt_out,cpts,2) }else{ all_cutoffs <- cptLocate_all(cpt_out,cpts,3) } cpt_out_sort <- sortData(cpt_out,ncol(cpt_out)-1,'FALSE' ) #plot all changepoints identified if(dir == "up"){ #cptsPlot(cpt_out_sort$zscore,all_cutoffs,Y_LABEL) }else{ #cptsPlot(cpt_out_sort$NegOneValue_zscore,all_cutoffs,Y_LABEL) } #generate the file with all changepoints #format the data file for reading usability #any changepoint other than 1 is designated as '0' #plot changepoints within the data identified cpt_out$sample_groups <- cpt_out$changepoints cpt_out [, ncol(cpt_out) ][ cpt_out [ ,ncol(cpt_out) ] >1 ] <- "0" cpt_out [, ncol(cpt_out)-1 ][ cpt_out [ ,ncol(cpt_out)-1 ] == 1000 ] <- "NA" #write.table(cpt_out,file=paste(dd,DATA_FILE,sep="/"),sep=",",row.names=FALSE) if(dir == "up"){ #don't remove anything }else{ cpt_out <- cpt_out[,-c(3)] } return(cpt_out) } ## ----echo=F, warning=F, message=F---------------------------------------- waterfallplot = function(x){ if(missing(x)){ stop("input data is missing!") } output_file <- "WATERFALL_PLOT.pdf" x <- x[,-ncol(x)] colnames(x)[1] <- "samples" #create a start column x[,ncol(x)+1] <- 0 #select and modify the data for readability read_data_sort <- sortData(x,2,FALSE) read_data_sort$changepoints[is.na(read_data_sort$changepoints)] <- 1000 #arrange the columns arrange_read_data_sort <- data.frame(read_data_sort[,1],read_data_sort[,4],read_data_sort[,2],read_data_sort[,3]) colnames(arrange_read_data_sort)<-c("samples","Start","End","changepoints") arrange_read_data_sort[,1] <- factor(arrange_read_data_sort[,1], levels=arrange_read_data_sort[,1]) arrange_read_data_sort$id <- seq_along(arrange_read_data_sort$End) arrange_read_data_sort$type <- ifelse(arrange_read_data_sort$changepoints==1, "cpt1", "other") arrange_read_data_sort <- arrange_read_data_sort[,c(5,1,6,2,3)] #generate the waterfall plot wf <- ggplot(arrange_read_data_sort, aes(colour=type,samples,fill=type)) + scale_fill_manual(values=c("#FF8000","white")) + scale_colour_manual(values=c("#FF8000","grey")) + geom_rect(aes(x=samples,xmin=id-0.35,xmax=id+0.35,ymin=End,ymax=Start)) wf <- wf + labs(x="", y="", fill="") wf <- wf + theme(legend.position = "none") wf <- wf + theme(axis.text.y = element_text(colour="black",size=12.0)) wf <- wf + theme(axis.text.x = element_blank()) #wf <- wf + theme(axis.title.y = element_text(colour="black",face="bold",size=12.0)) wf <- wf + theme(panel.background = element_rect(fill="NA")) wf <- wf + theme(panel.border = element_rect(colour = "black", fill="NA")) wf <- wf + theme(panel.grid.major.y = element_line(colour="NA")) wf <- wf + theme(panel.grid.minor = element_line(colour = "NA")) wf <- wf + theme(axis.ticks.x = element_blank()) wf <- wf + theme(axis.ticks.y=element_line(colour="black")) wf <- wf + ylab("SISPA Profile Score") wf <- wf + xlab("Samples\n") return(wf) } ## ----, echo=F, warning=F, message=F-------------------------------------- freqplot = function(x){ count_data <- count(x,"sample_groups") freq <- barplot(count_data$freq,col=c("grey","orange"),ylab="# of Samples",names.arg=c("Inactive Profile","Active Profile"),cex.names=1.0) freq <- box() return(freq) } ## ----, message=F, warning=F, fig.height=6, fig.width=6, fig.align="center"---- cpt_on_samples <- cptSamples(gsva_results,dir="up",cpt_data="var",cpt_method="BinSeg",cpt_max=60) ## ----message=F, warning=F------------------------------------------------ head(cpt_on_samples) ## ----, eval=F, warning=F, message=F, fig.height=6, fig.width=6, fig.align="center"---- ## waterfallplot(cpt_on_samples) ## ----, eval=F, warning=F, message=F, fig.height=6, fig.width=6, fig.align="center"---- ## freqplot(cpt_on_samples)