## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    fig.width = 7,
    fig.height = 5,
    out.width = "80%",
    fig.align = "center",
    crop = NULL # suppress "The magick package is required to crop" issue
)
library(BiocStyle)

## ----libs, message=FALSE------------------------------------------------------
library(monaLisa)
library(JASPAR2020)
library(TFBSTools)
library(BSgenome.Mmusculus.UCSC.mm10)
library(Biostrings)
library(SummarizedExperiment)
library(ComplexHeatmap)
library(circlize)

## ----loadData-----------------------------------------------------------------
# load GRanges object with logFC and peaks
gr_path <- system.file("extdata", "atac_liver_vs_lung.rds", 
                       package = "monaLisa")
gr <- readRDS(gr_path)

## ----predictorMatrix, warning=FALSE-------------------------------------------
# get PFMs (vertebrate TFs from Jaspar)
pfms <- getMatrixSet(JASPAR2020, list(matrixtype = "PFM", 
                                      tax_group = "vertebrates"))

# randomly sample 300 PFMs for illustration purposes (for quick runtime)
set.seed(4563)
pfms <- pfms[sample(length(pfms), size = 300)]

# convert PFMs to PWMs
pwms <- toPWM(pfms)

# get TFBS on given GRanges (peaks)
# suppress warnings generated by matchPWM due to the presence of Ns 
# in the sequences
peakSeq <- getSeq(BSgenome.Mmusculus.UCSC.mm10, gr)
suppressWarnings({
  hits <- findMotifHits(query = pwms, subject = peakSeq, min.score = 10.0,
                        BPPARAM = BiocParallel::SerialParam())
})

# get TFBS matrix
TFBSmatrix <- unclass(table(factor(seqnames(hits), levels = seqlevels(hits)),
                            factor(hits$pwmname, levels = name(pwms))))
TFBSmatrix[1:6, 1:6]

# remove TF motifs with 0 binding sites in all regions
zero_TF <- colSums(TFBSmatrix) == 0
sum(zero_TF)
TFBSmatrix <- TFBSmatrix[, !zero_TF]

# calculate G+C and CpG obs/expected
fMono <- oligonucleotideFrequency(peakSeq, width = 1L, as.prob = TRUE)
fDi <- oligonucleotideFrequency(peakSeq, width = 2L, as.prob = TRUE)
fracGC <- fMono[, "G"] + fMono[, "C"]
oeCpG <- (fDi[, "CG"] + 0.01) / (fMono[, "G"] * fMono[, "C"] + 0.01)

# add GC and oeCpG to predictor matrix
TFBSmatrix <- cbind(fracGC, oeCpG, TFBSmatrix)
TFBSmatrix[1:6, 1:6]

## ----stabSelTFs---------------------------------------------------------------
## randLassoStabSel() is stochastic, so if we run with parallelization
##   (`mc.cores` argument), we must select a random number generator that can
##   provide multiple streams of random numbers used in the `parallel` package
##   and set its seed for reproducible results
# RNGkind("L'Ecuyer-CMRG")
# set.seed(123)
# se <- randLassoStabSel(x = TFBSmatrix, y = gr$logFC_liver_vs_lung, 
#                        cutoff = 0.8, mc.preschedule = TRUE, 
#                        mc.set.seed = TRUE, mc.cores = 2L)

# if not running in parallel mode, it is enough to use set.seed() before 
#   the call to ensure reproducibility (`mc.sores = 1`)
set.seed(123)
se <- randLassoStabSel(x = TFBSmatrix, y = gr$logFC_liver_vs_lung, 
                       cutoff = 0.8)
se

# selected TFs
colnames(se)[se$selected]

## ----plotStabilityPaths-------------------------------------------------------
plotStabilityPaths(se)

## ----plotSelProbs, fig.width=8, fig.height=5----------------------------------
plotSelectionProb(se, directional = TRUE)

## ----TFBScor_selected, fig.width=10, fig.height=7-----------------------------
# subset the selected TFs
sel <- colnames(se)[se$selected]
se_sub <- se[, sel]

# exclude oeCpG and fracGC
excl <- colnames(se_sub) %in% c("oeCpG", "fracGC")
se_sub <- se_sub[, !excl]

# correlation matrix 
TFBSmatrixCorSel <- cor(TFBSmatrix[, colnames(se_sub)], method = "pearson")

# heatmap
pfmsSel <- pfms[match(colnames(TFBSmatrixCorSel), name(pfms))]
maxwidth <- max(sapply(TFBSTools::Matrix(pfmsSel), ncol))
seqlogoGrobs <- lapply(pfmsSel, seqLogoGrob, xmax = maxwidth)

hmSeqlogo <- rowAnnotation(logo = annoSeqlogo(seqlogoGrobs, which = "row"),
                           annotation_width = unit(2, "inch"), 
                           show_annotation_name = FALSE
)

colAnn <- HeatmapAnnotation(AUC = se_sub$selAUC, selProb = se_sub$selProb,
                            show_legend = TRUE, 
                            show_annotation_name = TRUE,
                            col = list(
                              AUC = colorRamp2(c(0, 1), 
                                               c("white", "brown")),
                              selProb = colorRamp2(c(0, 1), 
                                                   c("white", "steelblue")))
)

Heatmap(TFBSmatrixCorSel, 
        show_row_names = TRUE, 
        show_column_names = TRUE, 
        name = "Pear. Cor.", column_title = "Selected TFs",
        col = colorRamp2(c(-1, 0, 1), c("blue", "white", "red")), 
        right_annotation = hmSeqlogo,
        top_annotation = colAnn)

## ----topPeaks-----------------------------------------------------------------
TF <- sel[2]
TF

i <- which(assay(se, "x")[, TF] > 0) # peaks that contain TF hits...
nm <- names(sort(abs(gr$logFC_liver_vs_lung[i]), 
                 decreasing = TRUE)) # ... order by |logFC|

gr[nm]

## ----session------------------------------------------------------------------
sessionInfo()