## ----eval = TRUE, echo = TRUE, message = FALSE-----------------------------
library(ORFik)
library(GenomicFeatures)

## ----eval = TRUE, echo = TRUE----------------------------------------------
txdbFile <- system.file("extdata", "hg19_knownGene_sample.sqlite", 
                        package = "GenomicFeatures")
txdb <- loadDb(txdbFile)
fiveUTRs <- fiveUTRsByTranscript(txdb, use.names = TRUE)
fiveUTRs

## ----eval = TRUE, echo = TRUE, message = FALSE-----------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {

  # Extract sequences of fiveUTRs.
  tx_seqs <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, fiveUTRs) 
  
  # Find all ORFs on those transcripts and get their genomic coordinates
  fiveUTR_ORFs <- findMapORFs(fiveUTRs, tx_seqs)
  fiveUTR_ORFs
}

## ----eval = TRUE, echo = TRUE----------------------------------------------
# path to example CageSeq data from hg19 heart sample
cageData <- system.file("extdata", "cage-seq-heart.bed.bgz", 
                        package = "ORFik")
# get new Transcription Start Sites using CageSeq dataset
newFiveUTRs <- reassignTSSbyCage(fiveUTRs, cageData)
newFiveUTRs

## ----eval = TRUE, echo = TRUE----------------------------------------------
bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
footprints <- GenomicAlignments::readGAlignments(bam_file)

## ----eval = TRUE, echo = TRUE----------------------------------------------
table(qwidth(footprints))

## ----eval = TRUE, echo = TRUE----------------------------------------------
footprints <- footprints[qwidth(footprints) == 29]
footprintsGR <- granges(footprints, use.mcols = TRUE)
footprintsGR

## ----eval = TRUE, echo = TRUE----------------------------------------------
footprintsGR <- resize(footprintsGR, 1)
footprintsGR

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE------------
gtf_file <- system.file("extdata", "annotations.gtf", package = "ORFik")
txdb <- GenomicFeatures::makeTxDbFromGFF(gtf_file, format = "gtf")
cds <- GenomicFeatures::cdsBy(txdb, by = "tx", use.names = TRUE)
cds[1]

## ----eval = TRUE, echo = TRUE, warning = FALSE-----------------------------
txNames <- txNamesWithLeaders(txdb)
windows <- getStartStopWindows(txdb, txNames)
windows

## ----eval = TRUE, echo = TRUE, warning = FALSE-----------------------------
hitMapStart <- metaWindow(footprintsGR, windows$start)
hitMapStop <- metaWindow(footprintsGR, windows$stop)

## ----eval = TRUE, echo = TRUE, warning = FALSE-----------------------------
if (requireNamespace("ggplot2")) {
  library(ggplot2)
  ggplot(hitMapStart, aes(x = factor(position), y = avg_counts, fill = factor(frame))) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    labs(title = paste0("Length 29 over START of canonical CDS")) +
    xlab("\nshift from first START nucleotide [bp]") +
    ylab("Averaged counts") +
    guides(fill = FALSE)
}

## ----eval = TRUE, echo = TRUE, warning = FALSE-----------------------------
if (requireNamespace("ggplot2")) {
  ggplot(hitMapStop, aes(x = factor(position), y = avg_counts, fill = factor(frame))) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    labs(title = paste0("Length 29 over STOP of canonical CDS")) +
    xlab("\nshift from last STOP nucleotide [bp]") +
    ylab("Averaged counts") +
    guides(fill = FALSE)
}

## ----eval = TRUE, echo = TRUE, warning = FALSE-----------------------------
shifts <- detectRibosomeShifts(footprints, txdb, stop = TRUE)
shifts

## ----eval = TRUE, echo = TRUE, warning = FALSE-----------------------------
shiftedFootprints <- shiftFootprints(
  footprints, shifts$fragment_length, shifts$offsets_start)
shiftedFootprints

## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE------------
 if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  library(GenomicFeatures)

  # Extract sequences of fiveUTRs.
  fiveUTRs <- fiveUTRs[1:10]
  faFile <- BSgenome.Hsapiens.UCSC.hg19::Hsapiens
  tx_seqs <- extractTranscriptSeqs(faFile, fiveUTRs)

  # Find all ORFs on those transcripts and get their genomic coordinates
  fiveUTR_ORFs <- findMapORFs(fiveUTRs, tx_seqs)
  unlistedORFs <- unlistGrl(fiveUTR_ORFs)
  # group GRanges by ORFs instead of Transcripts, use 4 first ORFs
  fiveUTR_ORFs <- groupGRangesBy(unlistedORFs, unlistedORFs$names)[1:4]

  # make some toy ribo seq and rna seq data
  starts <- unlist(ORFik:::firstExonPerGroup(fiveUTR_ORFs), use.names = FALSE)
  RFP <- promoters(starts, upstream = 0, downstream = 1)
  score(RFP) <- rep(29, length(RFP)) # the original read widths

  # set RNA seq to duplicate transcripts
  RNA <- unlist(exonsBy(txdb, by = "tx", use.names = TRUE), use.names = TRUE)
  # transcript database
  txdb <- loadDb(txdbFile)
  dt <- computeFeatures(fiveUTR_ORFs, RFP, RNA, txdb, faFile,
                  orfFeatures =  TRUE)
  print(dt)
}

## ----eval = TRUE, echo = TRUE----------------------------------------------
  # In this example we will find kozak score of cds' 

  if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {

    cds <- cdsBy(txdb, by = "tx", use.names = TRUE)[1:10]

    faFile <- BSgenome.Hsapiens.UCSC.hg19::Hsapiens

    kozakSequenceScore(cds, faFile, species = "human")

    # A few species are pre supported, if not, make your own input pfm.

    # here is an example where the human pfm is sent in again, even though
    # it is already supported.

    pfm <- t(matrix(as.integer(c(29,26,28,26,22,35,62,39,28,24,27,17,
                                 21,26,24,16,28,32,5,23,35,12,42,21,
                                 25,24,22,33,22,19,28,17,27,47,16,34,
                                 25,24,26,25,28,14,5,21,10,17,15,28)),
                    ncol = 4))

   kozakSequenceScore(cds, faFile, species = pfm)

  }


## ----eval = TRUE, echo = TRUE----------------------------------------------

if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # the orfs are now grouped by orfs. If we want to go back to transcripts we do:
  unlisted_ranges <- unlistGrl(fiveUTR_ORFs)
  unlisted_ranges
  test_ranges <- groupGRangesBy(unlisted_ranges, names(unlisted_ranges))
  
  # test_ranges is now grouped by transcript, but we want them grouped by ORFs:
  # we use the orfs exon column called ($names) to group, it is made by ORFik.
  unlisted_ranges <- unlistGrl(test_ranges)
  test_ranges <- groupGRangesBy(unlisted_ranges, unlisted_ranges$names)
}

## ----eval = TRUE, echo = TRUE----------------------------------------------

if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # lets use the fiveUTR_ORFs
  #1. Group by ORFs
  unlisted_ranges <- unlistGrl(fiveUTR_ORFs)
  ORFs <- groupGRangesBy(unlisted_ranges, unlisted_ranges$names)
  length(ORFs)
  #2. Remove widths < 60
  ORFs <- ORFs[widthPerGroup(ORFs) >= 60]
  length(ORFs)
  #3. Keep only ORFs with at least 2 exons
  ORFs <- ORFs[numExonsPerGroup(ORFs) > 1]
  length(ORFs)
  
  #4. Keep only positive ORFs
  ORFs <- ORFs[strandPerGroup(ORFs) == "+"]
  # all remaining ORFs where on positive strand, so no change
  length(ORFs)
}

## ----eval = TRUE, echo = TRUE----------------------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
  # let's use the ORFs from the previous examples
  #1. Find the start and stop sites
  startSites(fiveUTR_ORFs, asGR = TRUE, keep.names = TRUE, is.sorted = TRUE)
  stopSites(fiveUTR_ORFs, asGR = TRUE, keep.names = TRUE, is.sorted = TRUE)
  
  #2. Lets find the start and stop codons,
  # this takes care of potential 1 base exons etc. 
  starts <- startCodons(fiveUTR_ORFs, is.sorted = TRUE)
  starts
  stops <- stopCodons(fiveUTR_ORFs, is.sorted = TRUE)
  stops
  
  #3. Lets get the bases of the start and stop codons from the fasta file
  # It's very important to check that ORFs are sorted here, else you could get 
  # the end of the ORF instead of the beginning etc.
  txSeqsFromFa(starts, faFile, is.sorted = TRUE)
  txSeqsFromFa(stops, faFile, is.sorted = TRUE)
  }

## ----eval = TRUE, echo = TRUE----------------------------------------------
  library(Biostrings)
  library(S4Vectors)
  seqs <- "ATGAAATGAAGTAAATCAAAACAT" # strand with ORFs in both directions
  # positive strands
  pos <- findORFs(seqs, startCodon = "ATG", minimumLength = 0)
  # negative strands
  neg <- findORFs(reverseComplement(DNAStringSet(seqs)),
                  startCodon = "ATG", minimumLength = 0)
  # make GRanges since we want strand information
  pos <- GRanges(pos, strand = "+")
  neg <- GRanges(neg, strand = "-")
  # as GRanges
  res <- c(pos, neg)
  # or merge together and make GRangesList
  res <- split(res, seq.int(1, length(pos) + length(neg)))
  res

## ----eval = TRUE, echo = TRUE----------------------------------------------
  res[strandBool(res)]