## ----setup, echo=FALSE-----------------------------------------------------
knitr::opts_chunk$set(collapse=TRUE, comment = "#>")
suppressPackageStartupMessages(library(universalmotif))
suppressPackageStartupMessages(library(Biostrings))
suppressMessages(suppressPackageStartupMessages(library(MotifDb)))
data(ArabidopsisPromoters)
data(ArabidopsisMotif)

## --------------------------------------------------------------------------
library(universalmotif)

motif <- create_motif("CWWWWCC", nsites = 6)
sequences <- DNAStringSet(rep(c("CAAAACC", "CTTTTCC"), 3))
motif.k2 <- add_multifreq(motif, sequences, add.k = 2)

# Or:
# motif.k2 <- create_motif(sequences, add.multifreq = 2)

motif.k2

## --------------------------------------------------------------------------
library(universalmotif)
data(ArabidopsisMotif)
data(ArabidopsisPromoters)

enrich_motifs(ArabidopsisMotif, ArabidopsisPromoters, shuffle.k = 3,
              verbose = 0, progress = FALSE, RC = TRUE)

## --------------------------------------------------------------------------
library(universalmotif)
data(ArabidopsisMotif)
data(ArabidopsisPromoters)

enrich_motifs(ArabidopsisMotif, ArabidopsisPromoters, shuffle.k = 3,
              search.mode = "positional", verbose = 0, progress = FALSE,
              RC = TRUE)

## --------------------------------------------------------------------------
library(universalmotif)

m <- matrix(c(0.10,0.27,0.23,0.19,0.29,0.28,0.51,0.12,0.34,0.26,
              0.36,0.29,0.51,0.38,0.23,0.16,0.17,0.21,0.23,0.36,
              0.45,0.05,0.02,0.13,0.27,0.38,0.26,0.38,0.12,0.31,
              0.09,0.40,0.24,0.30,0.21,0.19,0.05,0.30,0.31,0.08),
            byrow=TRUE,nrow=4)
motif <- create_motif(m, alphabet = "DNA", type = "PWM")
motif

## --------------------------------------------------------------------------
data(ArabidopsisPromoters)

res <- scan_sequences(motif, ArabidopsisPromoters, verbose = 0,
                      progress = FALSE, threshold = 0.6,
                      threshold.type = "logodds")
head(res)

## --------------------------------------------------------------------------
## The first match was CTCTTTATTC, with a score of 3.931 (max possible = 6.536)

library(Biostrings)

bkg <- colMeans(oligonucleotideFrequency(ArabidopsisPromoters, 1,
                                         as.prob = TRUE))
bkg

## --------------------------------------------------------------------------
hit.prob <- bkg["A"]^1 * bkg["C"]^3 * bkg["G"]^0 * bkg["T"]^6
hit.prob <- unname(hit.prob)
hit.prob

## --------------------------------------------------------------------------
pvals <- motif_pvalue(motif, res$score, progress = FALSE, bkg.probs = bkg)
res2 <- data.frame(motif=res$motif, match=res$match,
                   pval=pvals)[order(pvals), ]
knitr::kable(head(res2), digits = 22, row.names = FALSE, format = "markdown")

## --------------------------------------------------------------------------
scores <- c(-6, -3, 0, 3, 6)
k <- c(2, 4, 6, 8, 10)
out <- data.frame(k = c(2, 4, 6, 8, 10),
                  score.minus6 = rep(0, 5),
                  score.minus3 = rep(0, 5),
                  score.0 = rep(0, 5),
                  score.3 = rep(0, 5),
                  score.6 = rep(0, 5))

for (i in seq_along(scores)) {
  for (j in seq_along(k)) {
    out[j, i + 1] <- motif_pvalue(motif, scores[i], k = k[j], progress = FALSE,
                                  bkg.probs = bkg)
  }
}

knitr::kable(out, format = "markdown", digits = 10)

## --------------------------------------------------------------------------
r <- motif_pvalue(motif, pvalue = c(0.01, 0.001, 0.0001, 0.00001),
                  progress = FALSE, bkg.probs = bkg, k = 10)

## --------------------------------------------------------------------------
r2 <- motif_pvalue(motif, score = r, progress = FALSE, bkg.probs = bkg, k = 10)

res <- data.frame(pval=c(0.01, 0.001, 0.0001, 0.00001), score = r,
                  pval.calc = r2)
knitr::kable(res, format = "markdown", digits = 22)

## --------------------------------------------------------------------------
library(universalmotif)
library(MotifDb)

motifs <- convert_motifs(MotifDb)
motifs <- filter_motifs(motifs, altname = c("M0003_1.02", "M0004_1.02"))
summarise_motifs(motifs)
try_all <- function(motifs, ...) {
  scores <- numeric(8)
  methods <- c("MPCC", "PCC", "MEUCL", "EUCL", "MSW", "SW", "MKL", "KL")
  for (i in 1:8) {
    scores[i] <- compare_motifs(motifs, method = methods[i],
                                progress = FALSE, ...)[1, 2]
  }
  res <- data.frame(method = c("MPCC", "PCC", "MEUCL", "EUCL",
                                "MSW", "SW", "MKL", "KL"),
                     score = scores,
                     max = c(1, NA, sqrt(2), NA, 2, NA, NA, NA),
                     type = c("similarity", "similarity", "distance",
                              "distance", "similarity", "similarity",
                              "distance", "distance"))
  knitr::kable(res, format = "markdown")
}
try_all(motifs)

## --------------------------------------------------------------------------
try_all(motifs, tryRC = FALSE)

## --------------------------------------------------------------------------
try_all(motifs, use.type = "ICM")

## --------------------------------------------------------------------------
try_all(motifs, normalise.scores = TRUE)

## --------------------------------------------------------------------------
view_motifs(motifs)

## --------------------------------------------------------------------------
view_motifs(motifs, min.overlap = 99)
try_all(motifs, min.overlap = 99)
try_all(motifs, min.overlap = 99, normalise.scores = TRUE)

## --------------------------------------------------------------------------
try_all(motifs, min.overlap = 1)
view_motifs(motifs, min.overlap = 1)

## --------------------------------------------------------------------------
motifs2 <- filter_motifs(MotifDb, altname = c("M0100_1.02", "M0104_1.02"))
view_motifs(motifs2, min.mean.ic = 0)
try_all(motifs2, min.mean.ic = 0)
view_motifs(motifs2, min.mean.ic = 0.7)
try_all(motifs2, min.mean.ic = 0.7)

## --------------------------------------------------------------------------
library(universalmotif)
library(MotifDb)

motifs <- convert_motifs(MotifDb)
motifs <- filter_motifs(motifs, organism = "Athaliana")[1:100]
tree <- motif_tree(motifs, layout = "daylight", linecol = "family",
                   progress = FALSE)

## Make some changes to the tree in regular ggplot2 fashion:
# tree <- tree + ...

tree

## ----eval=FALSE------------------------------------------------------------
#  library(universalmotif)
#  
#  ## 1. Once per session: via `options`
#  
#  options(meme.bin = "/path/to/meme/bin/meme")
#  
#  run_meme(...)
#  
#  ## 2. Once per run: via `run_meme`
#  
#  run_meme(..., bin = "/path/to/meme/bin/meme")

## ----eval=FALSE------------------------------------------------------------
#  library(universalmotif)
#  data(ArabidopsisPromoters)
#  
#  ## 1. Read sequences from disk (in fasta format):
#  
#  library(Biostrings)
#  
#  # The following `read*` functions are available in Biostrings:
#  # DNA: readDNAStringSet
#  # DNA with quality scores: readQualityScaledDNAStringSet
#  # RNA: readRNAStringSet
#  # amino acid: readAAStringSet
#  # any: readBStringSet
#  
#  sequences <- readDNAStringSet("/path/to/sequences.fasta")
#  
#  run_meme(sequences, ...)
#  
#  ## 2. Extract from a `BSgenome` object:
#  
#  library(GenomicFeatures)
#  library(TxDb.Athaliana.BioMart.plantsmart28)
#  library(BSgenome.Athaliana.TAIR.TAIR9)
#  
#  # Let us retrieve the same promoter sequences from ArabidopsisPromoters:
#  gene.names <- names(ArabidopsisPromoters)
#  
#  # First get the transcript coordinates from the relevant `TxDb` object:
#  transcripts <- transcriptsBy(TxDb.Athaliana.BioMart.plantsmart28,
#                               by = "gene")[gene.names]
#  
#  # There are multiple transcripts per gene, we only care for the first one
#  # in each:
#  
#  transcripts <- lapply(transcripts, function(x) x[1])
#  transcripts <- unlist(GRangesList(transcripts))
#  
#  # Then the actual sequences:
#  
#  # Unfortunately this is a case where the chromosome names do not match
#  # between the two databases
#  
#  seqlevels(TxDb.Athaliana.BioMart.plantsmart28)
#  #> [1] "1"  "2"  "3"  "4"  "5"  "Mt" "Pt"
#  seqlevels(BSgenome.Athaliana.TAIR.TAIR9)
#  #> [1] "Chr1" "Chr2" "Chr3" "Chr4" "Chr5" "ChrM" "ChrC"
#  
#  # So we must first rename the chromosomes in `transcripts`:
#  seqlevels(transcripts) <- seqlevels(BSgenome.Athaliana.TAIR.TAIR9)
#  
#  # Finally we can extract the sequences
#  promoters <- getPromoterSeq(transcripts,
#                              BSgenome.Athaliana.TAIR.TAIR9,
#                              upstream = 0, downstream = 1000)
#  
#  run_meme(promoters, ...)

## ----eval=FALSE------------------------------------------------------------
#  run_meme(sequences, output = "/path/to/desired/output/folder")

## ----sessionInfo, echo=FALSE-----------------------------------------------
sessionInfo()