## ----setup, echo=FALSE, results="hide"----------------------------------------
knitr::opts_chunk$set(tidy = FALSE,
                      cache = FALSE,
                      dev = "png",
                      message = FALSE, error = FALSE, warning = TRUE)

## ---- echo=FALSE, results="hide", warning=FALSE-------------------------------
suppressPackageStartupMessages({
    library(GenomicRanges)
    library(GenomicAlignments)
    library(rtracklayer)
    library(ggplot2)
    library(tidyr)
    library(ComplexHeatmap)
    library(BindingSiteFinder)
    library(forcats)
    library(dplyr)
})

## ----BiocManager, eval=FALSE--------------------------------------------------
#  if (!require("BiocManager"))
#      install.packages("BiocManager")
#  BiocManager::install("BindingSiteFinder")

## -----------------------------------------------------------------------------
library(rtracklayer)
csFile <- system.file("extdata", "PureCLIP_crosslink_sites_example.bed", 
                      package="BindingSiteFinder")
cs = import(con = csFile, format = "BED")
cs

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
library(ggplot2)
quants = quantile(cs$score, probs = seq(0,1, by = 0.05))
csFilter = cs[cs$score >= quants[2]]

ggplot(data = data.frame(score = cs$score), aes(x = log2(score))) +
    geom_histogram(binwidth = 0.5) +
    geom_vline(xintercept = log2(quants[2])) +
    theme_classic() +
    xlab("PureCLIP score (log2)") +
    ylab("Count")

## -----------------------------------------------------------------------------
# Load clip signal files and define meta data object
files <- system.file("extdata", package="BindingSiteFinder")
clipFilesP <- list.files(files, pattern = "plus.bw$", full.names = TRUE)
clipFilesM <- list.files(files, pattern = "minus.bw$", full.names = TRUE)

meta = data.frame(
  id = c(1,2,3,4),
  condition = factor(c("WT", "WT", "KD", "KD"), 
                     levels = c("KD", "WT")), 
  clPlus = clipFilesP, clMinus = clipFilesM)
meta

## ---- eval=FALSE--------------------------------------------------------------
#  library(BindingSiteFinder)
#  bds = BSFDataSetFromBigWig(ranges = csFilter, meta = meta, silent = TRUE)

## -----------------------------------------------------------------------------
exampleFile <- list.files(files, pattern = ".rda$", full.names = TRUE)
load(exampleFile)
bds

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
supportRatioPlot(bds, bsWidths = seq(from = 3, to = 19, by = 2))

## -----------------------------------------------------------------------------
bds1 <- makeBindingSites(object = bds, bsSize = 3, minWidth = 3,
                         minCrosslinks = 2, minClSites = 1)

bds2 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3,
                         minCrosslinks = 2, minClSites = 1)

bds3 <- makeBindingSites(object = bds, bsSize = 19, minWidth = 3,
                         minCrosslinks = 2, minClSites = 1)

bds4 <- makeBindingSites(object = bds, bsSize = 29, minWidth = 3,
                         minCrosslinks = 2, minClSites = 1)
l = list(`1. bsSize = 3` = bds1, `2. bsSize = 9` = bds2, 
         `3. bsSize = 19` = bds3, `4. bsSize = 29` = bds4)

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
rangeCoveragePlot(l, width = 20) 

## -----------------------------------------------------------------------------
bds1 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3,
                         minCrosslinks = 2, minClSites = 1)

bds2 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3,
                         minCrosslinks = 2, minClSites = 2)

bds3 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3,
                         minCrosslinks = 2, minClSites = 3)

bds4 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3,
                         minCrosslinks = 2, minClSites = 4)

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
l = list(`1. minClSites = 1` = bds1, `2. minClSites = 2` = bds2, 
         `3. minClSites = 3` = bds3, `4. minClSites = 4` = bds4)
mergeSummaryPlot(l, select = "minClSites")

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
l = list(`1. minClSites = 1` = bds1, `2. minClSites = 2` = bds2, 
         `3. minClSites = 3` = bds3, `4. minClSites = 4` = bds4)
rangeCoveragePlot(l, width = 20)

## -----------------------------------------------------------------------------
bds10 <- makeBindingSites(object = bds, bsSize = 3, minWidth = 3,
                         minCrosslinks = 1, minClSites = 1)

bds20 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3,
                         minCrosslinks = 3, minClSites = 1)

bds30 <- makeBindingSites(object = bds, bsSize = 19, minWidth = 3,
                         minCrosslinks = 6, minClSites = 1)

bds40 <- makeBindingSites(object = bds, bsSize = 29, minWidth = 3,
                         minCrosslinks = 10, minClSites = 1)

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
reproducibiliyCutoffPlot(bds1, max.range = 20, cutoff = c(0.05))
reproducibiliyCutoffPlot(bds1, max.range = 20, cutoff = c(0.2))

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
reproducibiliyCutoffPlot(bds1, max.range = 20, cutoff = c(0.05, 0.1))
reproducibiliyCutoffPlot(bds1, max.range = 20, cutoff = c(0.1, 0.05))

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
s1 = reproducibilityFilter(bds1, cutoff = c(0.05), n.reps = c(3), 
                           returnType = "data.frame")
library(ComplexHeatmap)
m1 = make_comb_mat(s1)
UpSet(m1)

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
s2 = reproducibilityFilter(bds1, cutoff = c(0.1, 0.05), n.reps = c(1, 2),
                           returnType = "data.frame")
m2 = make_comb_mat(s2)
UpSet(m2)

## -----------------------------------------------------------------------------
bdsFinal = reproducibilityFilter(bds1, cutoff = c(0.1, 0.05), n.reps = c(1, 2))
getRanges(bdsFinal)

## -----------------------------------------------------------------------------
bdsFinal = annotateWithScore(bdsFinal, getRanges(bds))
bindingSites = getRanges(bdsFinal)
bindingSites

## ---- eval=FALSE--------------------------------------------------------------
#  library(GenomicFeatures)
#  # Make annotation database from gff3 file
#  annoFile = "./gencode.v37.annotation.chr22.header.gff3"
#  annoDb = makeTxDbFromGFF(file = annoFile, format = "gff3")
#  annoInfo = import(annoFile, format = "gff3")
#  # Get genes as GRanges
#  gns = genes(annoDb)
#  idx = match(gns$gene_id, annoInfo$gene_id)
#  elementMetadata(gns) = cbind(elementMetadata(gns),
#                               elementMetadata(annoInfo)[idx,])

## -----------------------------------------------------------------------------
# Load GRanges with genes
geneFile <- list.files(files, pattern = "gns.rds$", full.names = TRUE)
load(geneFile)
gns

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
# count the number of annotation overlaps
mcols(bindingSites)$geneOl = factor(countOverlaps(bindingSites, gns))
df = as.data.frame(mcols(bindingSites))
# 
ggplot(df, aes(x = geneOl)) +
    geom_bar() +
    scale_y_log10() +
    geom_text(stat='count', aes(label=..count..), vjust=-0.3) +
    labs(
        title = "Overlapping gene annotation problem",
        caption = "Bar plot that show how many times a binding site
    overlaps with an annotation of a different gene.",
    x = "Number of overlaps", 
    y = "Count (log10)"
    ) + theme_bw()

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
# show which gene types cause the most annoation overlaps. Split binding sites
# by the number of overlaps and remove those binding sites that do not overlap
# with any annotation (intergenic)
library(forcats)
splitSites = split(bindingSites, bindingSites$geneOl)
df = as(lapply(splitSites, function(x){subsetByOverlaps(gns,x)}), "GRangesList")
df = df[2:length(df)]
df = lapply(1:length(df), function(x) {
    data.frame(olClass = names(df[x]),
               geneType = factor(df[[x]]$gene_type))})
df = do.call(rbind,df)
df = df %>% mutate(geneType = fct_lump_n(geneType, 4)) 
# 
ggplot(df, aes(x = olClass, fill = geneType)) +
    geom_bar(position = "fill") + 
    scale_fill_brewer(palette = "Set2") +
    scale_y_continuous(labels = scales::percent) +
    labs(
        title = "Overlapping gene annotation causes",
        caption = "Percentage bar plot that show the fraction
    for each gene type and overlap number.",
    fill = "Gene type",
    x = "Number of overlaps", 
    y = "Percent"
    ) + theme_bw()

## -----------------------------------------------------------------------------
# Remove all binding sites that overlap multiple gene annotations and retain
# only protein coding genes and binding sites.
bindingSites = subset(bindingSites, geneOl == 1)
targetGenes = subsetByOverlaps(gns, bindingSites)
targetGenes = subset(targetGenes, gene_type == "protein_coding")

## ---- eval=FALSE--------------------------------------------------------------
#  # Count the overlaps of each binding site fore each part of the transcript.
#  cdseq = cds(annoDb)
#  intrns = unlist(intronsByTranscript(annoDb))
#  utrs3 = unlist(threeUTRsByTranscript(annoDb))
#  utrs5 = unlist(fiveUTRsByTranscript(annoDb))
#  regions = list(CDS = cdseq, Intron = intrns, UTR3 = utrs3, UTR5 = utrs5)

## -----------------------------------------------------------------------------
# Load list with transcript regions
regionFile <- list.files(files, pattern = "regions.rds$", full.names = TRUE)
load(regionFile)

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
# Count the overlaps of each binding site fore each part of the transcript. 
cdseq = regions$CDS %>% countOverlaps(bindingSites,.)
intrns = regions$Intron %>% countOverlaps(bindingSites,.)
utrs3 = regions$UTR3 %>% countOverlaps(bindingSites,.)
utrs5 = regions$UTR5 %>% countOverlaps(bindingSites,.)
countDf = data.frame(CDS = cdseq, Intron = intrns, UTR3 = utrs3, UTR5 = utrs5)
# Counting how many times an annotation is not present.
df = data.frame(olClass = apply(countDf,1,function(x) length(x[x == 0]))) 
df = data.frame(type = rev(names(table(df))), count = as.vector(table(df))) 
ggplot(df, aes(x = type, y = count)) +
    geom_col() +
    scale_y_log10() +
    geom_text(aes(label = count), vjust=-0.3) +
    labs(
        title = "Overlapping transcript annotation crashes",
        caption = "Bar plot that show how many times a binding site 
        overlaps with an annotation of a different transcript part",
        x = "Number of overlaps", 
        y = "Count (log10)"
    ) + theme_bw()

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
# Count the number of binding sites within each transcript region, ignoring the 
# problem of overlapping annotations (counting binding sites multiple times).
plotCountDf = countDf
plotCountDf[plotCountDf > 1] = 1
df = plotCountDf %>% 
    pivot_longer(everything()) %>% 
    group_by(name) %>% 
    summarize(count = sum(value), .groups = "drop") %>%
    mutate(name = as.factor(name)) %>%
    mutate(name = fct_relevel(name, "Intron", "UTR3", "CDS", "UTR5"))

ggplot(df, aes(x = name, y = count)) + 
    geom_col() +
    scale_y_log10() +
    xlab("Number of overlaps") +
    ylab("Count") +
    geom_text(aes(label = count), vjust=-0.3) +
    labs(
        title = "Binding sites per genomic region",
        caption = "Bar plot that shows the number of
    binding sites per genomic region.",
    x = "Genomic region", 
    y = "Count (log10)"
    ) + theme_bw()

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
# Show how many annotation overlaps are caused by what transcript types.
m = make_comb_mat(plotCountDf)
UpSet(m)

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
# Solve the overlapping transcript types problem with majority votes and 
# hierarchical rule. 
rule = c("Intron", "UTR3", "CDS", "UTR5")
# Prepare dataframe for counting (reorder by rule, add intergenic counts)
countDf = countDf[, rule] %>% 
    as.matrix() %>%
    cbind.data.frame(., Intergenic = ifelse(rowSums(countDf) == 0, 1, 0) ) 
# Apply majority vote scheme
regionName = colnames(countDf)
region = apply(countDf, 1, function(x){regionName[which.max(x)]})
mcols(bindingSites)$region = region
df = data.frame(Region = region)
# 
ggplot(df, aes(x = fct_infreq(Region))) +
    geom_bar() +
    scale_y_log10() +
    geom_text(stat='count', aes(label=..count..), vjust=-0.3) +
    labs(
        title = "Binding sites per genomic region",
        caption = "Bar plot that shows the number of binding sites per
    genomic region.",
    x = "Genomic region", 
    y = "Count (log10)"
    ) + theme_bw()

## -----------------------------------------------------------------------------
# Sum up the length of each transcript region with overlapping binding sites.
cdsLen = regions$CDS %>% 
    subsetByOverlaps(., bindingSites) %>% 
    width() %>% 
    sum()
intrnsLen = regions$Intron %>% 
    subsetByOverlaps(., bindingSites) %>%
    width() %>% 
    sum()
utrs3Len = regions$UTR3 %>% 
    subsetByOverlaps(., bindingSites) %>% 
    width() %>% 
    sum()
utrs5Len = regions$UTR5 %>% 
    subsetByOverlaps(., bindingSites) %>% 
    width() %>%
    sum()
lenSum = c(cdsLen, intrnsLen, utrs3Len, utrs5Len)

## ---- fig.retina = 1, dpi = 100-----------------------------------------------
# Compute the relative enrichment per transcript region.
bindingSites = subset(bindingSites, region != "Intergenic")
df = data.frame(region = names(table(bindingSites$region)),
                count = as.vector(table(bindingSites$region)))
df$regionLength = lenSum
df$normalizedCount = df$count / df$regionLength
# 
ggplot(df, aes(x = region, y = normalizedCount)) +
    geom_col(width = 0.5) +
    labs(
        title = "Relative enrichment per genomic region ",
        caption = "Bar plot of region count, divided by the summed length.",
        x = "Genomic region", 
        y = "Relative enrichment",
        fill = "Genomic region"
    ) + theme_bw()

## -----------------------------------------------------------------------------
sessionInfo()