## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## ----knitr-options, echo=FALSE, message=FALSE, warning=FALSE------------------
library(knitr)
opts_chunk$set(fig.align = 'center', fig.width = 6, fig.height = 6, dev = 'png')

## ----eval=FALSE---------------------------------------------------------------
#  library(BiocManager)
#  BiocManager::install('SGCP')

## ----message=FALSE, warning=FALSE---------------------------------------------
library(SGCP)

## ----message=FALSE, warning=FALSE---------------------------------------------
library(SummarizedExperiment)
data(cheng)
cheng
print("gene expression...")
print("rownames and colnames correspond to gene Entrez ids and sample names")
head(assay(cheng))

print(" \n gene ids...")
print("rownames are the gene Entrez ids")
head(rowData(cheng))

## ----message=TRUE, warning=FALSE----------------------------------------------
message("expData")
expData <- assay(cheng)
head(expData)
dim(expData)


message(" \n geneID")
geneID <- rowData(cheng)
geneID <- geneID$ENTREZID
head(geneID)
length(geneID)

library(org.Hs.eg.db)
annotation_db <- "org.Hs.eg.db"


## ----message=TRUE, warning=FALSE----------------------------------------------
# sgcp <- ezSGCP(expData = expData, geneID = geneID, annotation_db = annotation_db,
#               eff.egs = FALSE , saveOrig = FALSE, sil = TRUE, hm = NULL)
data(sgcp)
summary(sgcp, show.data=TRUE)

## ----message=TRUE, warning=FALSE----------------------------------------------

##  starting network construction step...
## normalization...
## Gaussian kernel...
## it may take time...
## TOM...
##  it may take time...
## network is created, done!...

##  starting network clustering step...
## calculating normalized Laplacian 
##  it may take time...
## calculating eigenvalues/vectors 
##  it may take time...
## n_egvec is 100
## number of clusters for relativeGap method is 2
## number of clusters for secondOrderGap method is 5
## number of clusters for additiveGap method is 3

##  Gene Ontology Validation...
## method relativeGap....
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...

##  method secondOrderGap...
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...

##  method additiveGap....
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...

##  method relativeGap is selected using GO validation and k is 2
## calculating the Silhouette index 
##  it may take time...
## network clustering is done...

## starting initial gene ontology enrichment step...
## GOenrichment for cluster 2
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...
## GOenrichment for cluster 1
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...
## gene ontology done!..

## starting semi-labeling stage...
## cutoff value is 9.28130152105493e-05
## semiLabeling done!..

## starting semi-supervised step...
## creating train and test sets based on remarkable and unremarkable genes...
## number of remarkable genes is 1165
## number of unremarkable genes is 335
## performing knn...
##  it may take time
## Loading required package: ggplot2
## Loading required package: lattice
##             Length Class      Mode     
## learn       2      -none-     list     
## k           1      -none-     numeric  
## theDots     0      -none-     list     
## xNames      4      -none-     character
## problemType 1      -none-     character
## tuneValue   1      data.frame list     
## obsLevels   2      -none-     character
## param       0      -none-     list     
## 24-nearest neighbor model
## Training set outcome distribution:

##   1   2 
## 498 667 

## class assignment for unremarkable genes...
## top class labels, and bottom number of predicted genes
## prediction
##   1   2 
## 130 205 
## semi-supervised done!..

## starting final gene ontology enrichment step...
## GOenrichment for cluster 2
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...
## GOenrichment for cluster 1
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...
## gene ontology done!..

## ezSGCP done!..

## ----message=FALSE, warning=FALSE---------------------------------------------
message("PCA of expression data without label")
SGCP_ezPLOT(sgcp = sgcp, expreData = expData, silhouette_index = TRUE, keep = FALSE)

## ----message=TRUE, warning=FALSE----------------------------------------------
rm(sgcp)

## ----message=TRUE, warning=FALSE----------------------------------------------
message("Plotting PCA of expression data")
pl <- SGCP_plot_pca(m = expData, clusLabs = NULL, tit = "PCA plot", ps = .5)
print(pl)

## ----message=TRUE, warning=TRUE-----------------------------------------------
resAdja <- adjacencyMatrix(expData = expData, hm = NULL)
resAdja[0:10, 0:5]

## ----message=TRUE, warning=FALSE----------------------------------------------
#message("Plotting adjacency heatmap")
#pl <- SGCP_plot_heatMap(m = resAdja, tit = "Adjacency Heatmap", 
#                    xname = "genes", yname = "genes")
#print(pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
# resClus <- clustering(adjaMat = resAdja, geneID = geneID, annotation_db = annotation_db,
#                       eff.egs = FALSE , saveOrig = FALSE, sil = TRUE)

data(resClus)
summary(resClus)
# lets drop noisy genes from expData

geneID <- resClus$geneID
if(length(resClus$dropped.indices)>0 ){ expData <- expData[-resClus$dropped.indices, ]}

# removing the adjacency matrix
rm(resAdja)

## ----message=TRUE, warning=FALSE----------------------------------------------
## calculating normalized Laplacian 
##  it may take time...
## calculating eigenvalues/vectors 
##  it may take time...
## n_egvec is 100
## number of clusters for relativeGap method is 2
## number of clusters for secondOrderGap method is 5
## number of clusters for additiveGap method is 3

##  Gene Ontology Validation...
## method relativeGap....
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...

##  method secondOrderGap...
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...

##  method additiveGap....
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...

##  method relativeGap is selected using GO validation and k is 2
## calculating the Silhouette index 
##  it may take time...
## network clustering is done...

## ----message=TRUE, warning=FALSE----------------------------------------------
message("Plotting PCA of expression data with label")

# pl <- SGCP_plot_pca(m = expData, clusLabs = NULL, tit = "PCA plot without label", ps = .5)
# print(pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
message("Plptting PCA of transformed data without label")
# pl <- SGCP_plot_pca(m = resClus$Y, clusLabs = NULL, tit = "PCA plot without label", ps = .5)
# print(pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
message("Plotting PCA of transformed data with label")
# pl <- SGCP_plot_pca(m = resClus$Y, clusLabs = resClus$clusterLabels, tit = "PCA plot with label", ps = .5)
# print(pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
if(resClus$cv == "cvGO" || resClus$cv == "cvConductance"){
    message("plotting relativeGap, secondOrderGap, additiveGap methods ...")
    print(resClus$clusterNumberPlot$relativeGap)
}

## ----message=TRUE, warning=FALSE----------------------------------------------
if(resClus$cv == "cvGO" || resClus$cv == "cvConductance"){
    message("plotting relativeGap, secondOrderGap, additiveGap methods ...")
    print(resClus$clusterNumberPlot$secondOrderGap)
}

## ----message=TRUE, warning=FALSE----------------------------------------------
if(resClus$cv == "cvGO" || resClus$cv == "cvConductance"){
    message("plotting relativeGap, secondOrderGap, additiveGap methods ...")
    print(resClus$clusterNumberPlot$additiveGap)
}

## ----message=TRUE, warning=FALSE----------------------------------------------
# checking if conductance index is calculated
if(resClus$cv == "cvGO" || resClus$cv == "cvConductance" ){
    message("plotting cluster conductance index...")
    pl <- SGCP_plot_conductance(conduct = resClus$conductance, 
                        tit = "Cluster Conductance Index", 
                        xname = "clusterLabel", yname = "conductance index")
    print(pl)}


## ----message=TRUE, warning=FALSE----------------------------------------------
# checking if silhouette index is calculated
if(resClus$cv == "cvGO" || resClus$cv == "cvConductance" ){
    message("plotting gene silhouette index...")
    pl <- SGCP_plot_silhouette(resClus$silhouette, tit = "Gene Silhouette Index",
                            xname = "genes", yname = "silhouette index")
    print(pl)}

## ----message=TRUE, warning=TRUE-----------------------------------------------

# resInitialGO <- geneOntology(geneUniv = geneID, clusLab = resClus$clusterLabels, 
#                              annotation_db = annotation_db)

data(resInitialGO)

head(resInitialGO$GOresults)

## ----message=TRUE, warning=TRUE-----------------------------------------------
## GOenrichment for cluster 1
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...
## GOenrichment for cluster 2
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...
## gene ontology done!..

## ----message=TRUE, warning=FALSE----------------------------------------------
message("plotting initial GO p-values jitters...")
pl <- SGCP_plot_jitter(df = resInitialGO$GOresults, 
                    tit = "Initial GO p-values",
                    xname = "cluster", yname = "-log10 p-value", ps = 3)
print(pl)


## ----message=TRUE, warning=FALSE----------------------------------------------
message("plotting initial GO p-values density")
pl <- SGCP_plot_density(df = resInitialGO$GOresults, 
                    tit = "Initial GO p-values Density",
                    xname = "cluster", yname = "-log10 p-value")

print(pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
message("plotting initial GO p-values mean")
pl <- SGCP_plot_bar(df = resInitialGO$GOresults, tit = "Cluster Performance",
                    xname = "cluster", yname = "mean -log10 p-value")
print(pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
message("plotting initial GO p-values pie chart...")
pl <- SGCP_plot_pie(df = resInitialGO$GOresults, tit = "Initial GO Analysis",
                xname = "cluster", yname = "count", posx = 1.8)
print(pl)

## ----message=TRUE, warning=TRUE-----------------------------------------------
resSemiLabel <- semiLabeling(geneID = geneID, 
                            df_GO = resInitialGO$GOresults, 
                            GOgenes = resInitialGO$FinalGOTermGenes)

print(head(resSemiLabel$geneLabel))
table(resSemiLabel$geneLabel$label)

## ----message=TRUE, warning=TRUE-----------------------------------------------
resSemiSupervised <- semiSupervised(specExp = resClus$Y, 
                                geneLab = resSemiLabel$geneLabel,
                                model = "knn", kn = NULL)
print(head(resSemiSupervised$FinalLabeling))
print(table(resSemiSupervised$FinalLabeling$FinalLabel))

## ----message=TRUE, warning=FALSE----------------------------------------------
# message("Plotting PCA of transformed data with label")
# pl <- SGCP_plot_pca(m = expData, 
#                clusLabs = resSemiSupervised$FinalLabeling$FinalLabel, 
#                tit = "PCA plot with label", ps = .5)
print(pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
# message("Plotting PCA of transformed data with label")
# pl <- SGCP_plot_pca(m = resClus$Y, 
#                clusLabs = resSemiSupervised$FinalLabeling$FinalLabel, 
#                tit = "PCA plot with label", ps = .5)
#print(pl)

## ----message=TRUE, warning=TRUE-----------------------------------------------
# resFinalGO <- geneOntology(geneUniv = geneID, clusLab = resSemiSupervised$FinalLabeling$FinalLabel, 
#                              annotation_db = annotation_db)

data(resFinalGO)
head(resFinalGO$GOresults)


## ----message=TRUE, warning=TRUE-----------------------------------------------
## GOenrichment for cluster 1
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...
## GOenrichment for cluster 2
## calling GOstats for overBP...
## identifying  genes for each GOTerm...
## calling GOstats for overCC...
## identifying  genes for each GOTerm...
## calling GOstats for overMF...
## identifying  genes for each GOTerm...
## calling GOstats for underBP...
## identifying  genes for each GOTerm...
## calling GOstats for underCC...
## identifying  genes for each GOTerm...
## calling GOstats for underMF...
## identifying  genes for each GOTerm...
## gene ontology done!..


## ----message=TRUE, warning=FALSE----------------------------------------------
message("plotting final GO p-values jitters...")
pl <- SGCP_plot_jitter(df = resFinalGO$GOresults, tit = "Final GO p-values",
                    xname = "module", yname = "-log10 p-value", ps = 3)
print(pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
message("plotting final GO p-values density...")
pl <- SGCP_plot_density(df = resFinalGO$GOresults, 
                    tit = "Final GO p-values Density",
                    xname = "module", yname = "-log10 p-value")
print(pl)
rm(pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
message("plotting final GO p-values mean...")
pl <- SGCP_plot_bar(df = resFinalGO$GOresults, tit = "Module Performance",
                xname = "module", yname = "mean -log10 p-value")
print(pl)
rm(pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
message("plotting final GO p-values pie xhart...")
pl <- SGCP_plot_pie(df = resFinalGO$GOresults, tit = "Final GO Analysis",
                xname = "module", yname = "count", posx = 1.9)
print(pl)
rm(pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
rm(resClus, resInitialGO, resSemiLabel, resSemiSupervised, resFinalGO, pl)

## ----message=TRUE, warning=FALSE----------------------------------------------
sI <- sessionInfo()
sI