## ----, message=F, warning=F----------------------------------------------
library("SISPA")
load("SISPA_data.Rda")

## ----, message=F, warning=F----------------------------------------------
library(GSVA)
library(changepoint)
library(data.table)
library(plyr)
library(ggplot2)

## ----, message=F, warning=F----------------------------------------------
gsva_results <- callGSVA(ExpressionSet,GeneSet)
head(gsva_results)

## ----echo=F, message=F, warning=F----------------------------------------
#callGSVA function code
callGSVA = function(x,y) {
  if(missing(x)){
    stop("input expression data missing!")
  }
  if(missing(y)){
    stop("input gene set missing!")
  }
  #y <- list(as.character(y[,1]))
  gsva.results <- gsva(x, y, method="zscore", rnaseq=FALSE,verbose=FALSE)
  tr_gsva.results <- t(gsva.results)
  #label column names
  tr_result_zscore <- cbind(samples = rownames(tr_gsva.results), tr_gsva.results)
  colnames(tr_result_zscore) <- c("samples","zscore")
  rownames(tr_result_zscore) <- NULL
  return ( as.data.frame(tr_result_zscore))
}

## ----echo=F, message=F, warning=F----------------------------------------
 ######### START: sub-functions used by the cptSamples main function #########
  ## Sorts the data frame by a column index
  # x : data frame 
  # i : numeric column index of the data frame to sort it by
  # b : sorting order, ascending (FALSE) or descending (TRUE)
  sortData = function(x,i,b) {
    if(missing(x)){
      stop("input data is missing!")
    }
    if(missing(b)){
      b=FALSE
    }
    if(missing(i)){
     i=1
      }
    return( x[ order(x[,i], decreasing=b), ] )
  }
  ## Identify the changepoints for the data
  # x : data within which you wish to identify the changepoints
  # i : numeric column index of the data frame to sort it by
  # b : sorting order, ascending (FALSE) or descending (TRUE)
  perDiffcpt = function(x,cpt_data,cpt_method,cpt_max){
   max <- length(x)
   if(max >= cpt_max){
     if(cpt_data == "mean"){
       changepoints <- cpt.mean(x,method=cpt_method,Q=cpt_max)
       } else{
         changepoints <- cpt.var(x,method=cpt_method,Q=cpt_max)
         }
     }else{
       if(cpt_data == "mean"){
         changepoints <- cpt.mean(x,method=cpt_method,Q=5)
         } else{
           changepoints <- cpt.var(x,method=cpt_method,Q=5)
           }
       }
   return ( cpts(changepoints) )
   }
  ## Add the estimated changepoint locations to the data frame
  # x : A data frame containing the data used for change point estimation
  # y : vector containing the changepoint locations for the data supplied
  cptAdd = function(x,y){
    level=character()
    l = 1
    for(i in 1:length(y) ) {
      if(i==1){
        for(j in 1:y[i]){
          level[j]=l
        }
      }else{
        a <- (y[i-1])+1
        for(j in a:y[i]){
          level[j]=l
        }
      }
      if(i==length(y)){
        l=1000
        a <- y[i]+1
        for(j in a:nrow(x)){
          level[j]=1000
        }
      }
      
      l=l+1
    }
    return(as.numeric(level))
  }
  ## Locate all values for the estimated change point locations
  # x : A data frame containing the data values used for change point estimation
  # y : Vector containing the changepoint locations for the data supplied
  # z : Column index of the data values used for changepoint estimation
  cptLocate_all = function(x,y,z){
    cutoff_all <- character()  
    for(i in 1:length(y) ) {
      cutoff_all[i] <- x[y[i],][z]
    }
    cpts_plot_cutoff_all <- cutoff_all[!is.na(cutoff_all)]
    cpts_plot_cutoff_all <- as.numeric(cpts_plot_cutoff_all)
    cpts_plot_cutoff_all <- round(cpts_plot_cutoff_all,digits=2)
    return(cpts_plot_cutoff_all)
  }
  ## Plot the single or multiple changepoints identified within the data
  # x : A vector containing the data within which changepoints were identified
  # y : A vector containing the identified number of changepoints on the data
  # LABEL : Y-axis label
  cptsPlot <- function(x,y,LABEL){
    if(missing(x)){
      stop("data is missing!")
    }
    if(missing(y)){
      stop("changepoint cutoffs are missing!")
    }
    x <- sort(x,FALSE)
    y <- sort(y,TRUE)
    
    cptplot <- plot(x, type='p', lty=3, lwd=1, main="", xlab="Gene Index", ylab=LABEL, cex=1.5, pch=1, xlim=c(0,length(x)))
    
    for(i in 1:length(y)) {
      if(i==1) {
        abline(h=y[i],lty=2,col='red')
        text(y[i],y[i],y[i], cex=1.0, pos=4, col="red")
      }else{
        if(y[i]<0){
          abline(h=y[i],lty=2,col='blue')
          text(y[i],y[i],y[i], cex=1.0, pos=4, col="blue")
        } else{
          abline(h=y[i],lty=2,col='green3')
          text(y[i],y[i],y[i], cex=1.0, pos=4, col="green3")
        }
      }
    }

    return(cptplot)
  }
  ######### END: sub-functions used by the cptSamples main function #########

## ----message=F, warning=F, echo=F----------------------------------------
cptSamples <- function (x,dir,cpt_data,cpt_method,cpt_max){
  #read sample and sample zscores
  st <- x[,c(1,2)]
  #correct for empty cells or NAs
  st_rmv_na <- st[!is.na(st[ncol(st)]), ]
  #for upward direction
  if(dir == "up"){
    colnames(st_rmv_na)[ncol(st_rmv_na)]<-"zscore"
    Y_LABEL <- "Z-Score"
    st_rmv_na$zscore <- as.numeric(as.character(st_rmv_na[,2]))
    st_rmv_na_sort = sortData(st_rmv_na,ncol(st_rmv_na),'TRUE' )
    cpts = perDiffcpt(st_rmv_na_sort$zscore,cpt_data,cpt_method,cpt_max)
  }else{ #for downward direction
    st_rmv_na$NegOneValue_zscore <- as.numeric(as.character(st_rmv_na[,2])) * -1
    colnames(st_rmv_na)[ncol(st_rmv_na)]<-"NegOneValue_zscore"
    colnames(st_rmv_na)[ncol(st_rmv_na)-1]<-"zscore"
    Y_LABEL <- "-1 x (Z-Score)"
    st_rmv_na_sort = sortData(st_rmv_na,ncol(st_rmv_na),'TRUE' )
    cpts = perDiffcpt(st_rmv_na_sort$NegOneValue_zscore,cpt_data,cpt_method,cpt_max)
  }
  #check for the number of changepoints identified
  if(length(cpts)<1){
    warning("No changepoints identified in the data set!")
  }else{  
    #add estimated changepoint locations to the data
    changepoints <- cptAdd(st_rmv_na_sort,cpts)
  }
  #append changepoints to the original data frame
  cpt_out <- data.frame(st_rmv_na_sort,changepoints)
  if(dir == "up"){
      all_cutoffs <- cptLocate_all(cpt_out,cpts,2)
    }else{
      all_cutoffs <- cptLocate_all(cpt_out,cpts,3)
    }
  cpt_out_sort <- sortData(cpt_out,ncol(cpt_out)-1,'FALSE' )
  
    #plot all changepoints identified
    if(dir == "up"){
      #cptsPlot(cpt_out_sort$zscore,all_cutoffs,Y_LABEL)
    }else{
      #cptsPlot(cpt_out_sort$NegOneValue_zscore,all_cutoffs,Y_LABEL)
    }
  
    #generate the file with all changepoints
    #format the data file for reading usability
    #any changepoint other than 1 is designated as '0'
    #plot changepoints within the data identified
    cpt_out$sample_groups <- cpt_out$changepoints
    cpt_out [, ncol(cpt_out) ][ cpt_out [ ,ncol(cpt_out) ] >1 ] <- "0"
    cpt_out [, ncol(cpt_out)-1 ][ cpt_out [ ,ncol(cpt_out)-1 ] == 1000 ] <- "NA"
    #write.table(cpt_out,file=paste(dd,DATA_FILE,sep="/"),sep=",",row.names=FALSE)
    if(dir == "up"){
      #don't remove anything
    }else{
      cpt_out <- cpt_out[,-c(3)]
    }
  
  return(cpt_out)
}

## ----echo=F, warning=F, message=F----------------------------------------
waterfallplot = function(x){
  if(missing(x)){
    stop("input data is missing!")
  }
  output_file <- "WATERFALL_PLOT.pdf"
  x <- x[,-ncol(x)]
  colnames(x)[1] <- "samples"
  #create a start column
  x[,ncol(x)+1] <- 0
  #select and modify the data for readability
  read_data_sort <- sortData(x,2,FALSE)
  read_data_sort$changepoints[is.na(read_data_sort$changepoints)] <- 1000
    #arrange the columns
    arrange_read_data_sort <- data.frame(read_data_sort[,1],read_data_sort[,4],read_data_sort[,2],read_data_sort[,3])
    colnames(arrange_read_data_sort)<-c("samples","Start","End","changepoints")
    arrange_read_data_sort[,1] <- factor(arrange_read_data_sort[,1], levels=arrange_read_data_sort[,1])
    arrange_read_data_sort$id <- seq_along(arrange_read_data_sort$End)
    arrange_read_data_sort$type <- ifelse(arrange_read_data_sort$changepoints==1, "cpt1", "other")
    arrange_read_data_sort <- arrange_read_data_sort[,c(5,1,6,2,3)]
    #generate the waterfall plot
    wf <- ggplot(arrange_read_data_sort, aes(colour=type,samples,fill=type)) + scale_fill_manual(values=c("#FF8000","white")) + scale_colour_manual(values=c("#FF8000","grey")) + geom_rect(aes(x=samples,xmin=id-0.35,xmax=id+0.35,ymin=End,ymax=Start))  
    wf <- wf + labs(x="", y="", fill="")
    wf <- wf + theme(legend.position = "none")
    wf <- wf + theme(axis.text.y = element_text(colour="black",size=12.0))
    wf <- wf + theme(axis.text.x = element_blank())
    #wf <- wf + theme(axis.title.y = element_text(colour="black",face="bold",size=12.0))
    wf <- wf + theme(panel.background = element_rect(fill="NA"))
    wf <- wf + theme(panel.border = element_rect(colour = "black", fill="NA"))
    wf <- wf + theme(panel.grid.major.y = element_line(colour="NA"))
    wf <- wf + theme(panel.grid.minor = element_line(colour = "NA"))
    wf <- wf + theme(axis.ticks.x = element_blank())
    wf <- wf + theme(axis.ticks.y=element_line(colour="black"))
    wf <- wf + ylab("SISPA Profile Score")
    wf <- wf + xlab("Samples\n")
    return(wf)
}

## ----, echo=F, warning=F, message=F--------------------------------------
freqplot = function(x){
  count_data <- count(x,"sample_groups")
  freq <- barplot(count_data$freq,col=c("grey","orange"),ylab="# of Samples",names.arg=c("Inactive Profile","Active Profile"),cex.names=1.0)
  freq <- box()
  return(freq)
}

## ----, message=F, warning=F, fig.height=6, fig.width=6, fig.align="center"----
cpt_on_samples <- cptSamples(gsva_results,dir="up",cpt_data="var",cpt_method="BinSeg",cpt_max=60)

## ----message=F, warning=F------------------------------------------------
head(cpt_on_samples)

## ----, eval=F, warning=F, message=F, fig.height=6, fig.width=6, fig.align="center"----
## waterfallplot(cpt_on_samples)

## ----, eval=F, warning=F, message=F, fig.height=6, fig.width=6, fig.align="center"----
## freqplot(cpt_on_samples)