<!--
%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{mapping}
-->

```{r style, echo = FALSE}
BiocStyle::markdown()
```

<!-- --- -->
<!-- title: "Mapping proteins to their genomic coordinates" -->
<!-- author: "Laurent Gatto - <lg390@cam.ac.uk>" -->
<!-- output: -->
<!--   html_document: -->
<!--     toc: true -->
<!--     theme: united -->
<!-- --- -->

Mapping proteins to their genomic coordinates
=============================================

[Laurent Gatto](http://cpu.sysbiol.cam.ac.uk/)

```{r, env, message=FALSE, echo=FALSE, cache=FALSE, warning=FALSE}
library("Gviz")
library("Pbase")
library("Biostrings")
library("biomaRt")
library("BSgenome.Hsapiens.UCSC.hg19")
library("ggplot2")
```

## Introduction

The aim of this document is to document to mapping of proteins and the
tandem mass spectrometry derived peptides they have been inferred from
to genomic locations. 

```{r schema, echo=FALSE, fig.cap='Mapping proteins to a genome reference.', fig.align='center'}
par(mar = rep(0, 4),
    oma = rep(0, 4))

xlim <- c(0, 10)
ylim <-c(5.05, 10.25)

plot(0, type = "n", axes = FALSE,
     xlab = "", ylab = "",
     xlim = xlim,    
     ylim = ylim)

y <- 10

g <- c(0, 10)

exons <- list(c(0, 2),
              c(4.5, 5),
              c(6, 7.1),
              c(9, 10))

trans <- list(c(1, 2, 3, 4),
              c(1, 3, 4),
              c(1, 4))

prot <- list(exons[[1]] + c(0.5, 0), ## 5'UTR
             exons[[3]],
             exons[[4]] + c(0, -0.5)) ## 3'UTR

peps <- list(c(1, 1.29),
             c(6.1, 6.3), c(6.85, 7),
             c(9.1, 9.25))


rect(g[1], y-0.25, g[2], y+0.25, col = "black")
abline(h = y, lwd = 2)
text(g[1], y+0.25, expression(g[s]), pos = 3)
text(g[2], y+0.25, expression(g[e]), pos = 3)
text(0-0.26, y, "G", pos = 3)

abline(v = c(unlist(exons),
           unlist(prot),
           unlist(peps)),
       lty = "dotted",
       col = "lightgrey")

y <- y - 1
for (i in 1:length(trans)) {
    tr <- trans[[i]]
    abline(h = y, lty = "dotted")
    ## text(0-0.26, y, expression(T[x]), pos = 3)
    text(0-0.26, y,
         substitute(paste("T", list(x)), list(x = i)),
         pos = 3)
    for (j in 1:length(tr)) {
        e1 <- exons[[tr[j]]][1]
        e2 <- exons[[tr[j]]][2]
        rect(e1, y-0.25, e2, y+0.25, col = "grey")
        if (i == 1) {
            text(e1, y+0.25, expression(e[s]^i), pos = 3)
            text(e2, y+0.25, expression(e[e]^i), pos = 3)
        }
        text(mean(c(e1, e2)), y, paste0("i = ", tr[j]), cex = .7)
    }
    y <- y - .6
}

y <- y - .4

abline(h = y, lty = "dotted")
text(0-0.26, y, expression(P), pos = 3)
for (i in 1:length(prot)) {
    pr <- prot[[i]]
    rect(pr[1], y - 0.25,
         pr[2], y + 0.25,
         col = "steelblue")
    text(pr[1], y + 0.25, expression(p[s]^j), pos = 3)
    text(pr[2], y + 0.25, expression(p[e]^j), pos = 3)
    text(mean(c(pr[1], pr[2])), y, paste0("j = ", i), cex = .7)
}
y <- y - .75

## concat protein
## center
x <- mean(xlim)
protlen <- sum(sapply(prot, diff))
X0 <- x0 <- x - protlen/2 ## left start

abline(h = y, lty = "dotted")
text(0-0.26, y, expression(P), pos = 3)

for (i in 1:length(prot)) {
    pr <- prot[[i]]
    .pr1 <- x0
    .pr2 <- x0 + (pr[2] - pr[1])
    rect(.pr1, y - 0.25,
         .pr2, y + 0.25,
         col = "steelblue")
    segments(.pr1, y + 0.25, pr[1], y + .75 - .25, lty = "dotted")
    segments(.pr2, y + 0.25, pr[2], y + .75 - .25, lty = "dotted")
    text(mean(c(.pr1,.pr2)), y, paste0("j = ", i), cex = .7)
    x0 <- .pr2
}

text(X0, y, expression(1), pos = 2)
text(X0 + protlen, y, expression(L[P]), pos = 4)

## pos and length
relpeps <- list(c(0.5, 0.29),
                ## 1.% is length of exon 1
                c(1.5 + 0.1, 0.2),
                c(1.5 + 0.85, 0.15),
                ## 1.1 is length of exon 2
                c(1.5 + 1.1 + 0.1, 0.15))

for (i in 1:length(relpeps)) {
    rp <- relpeps[[i]]
    pep <- peps[[i]]
    rect(X0 + rp[1], y - 0.25,
         X0 + rp[1] + rp[2], y + 0.25,
         col = "#FFA50450", lwd = 0)
    segments(X0 + rp[1], y - 0.25,
             pep[1], y - .75 + .25, lty = "dotted")
    segments(X0 + rp[1] + rp[2], y - 0.25,
             pep[2], y - .75 + .25, lty = "dotted")
}
y <- y - .75


abline(h = y, lty = "dotted")
text(0-0.26, y, expression(Pi), pos = 3)
for (i in 1:length(peps)) {
    pep <- peps[[i]]
    rect(pep[1], y - 0.25,
         pep[2], y + 0.25,
         col = "#FFA504FF")
    text(pep[1], y + 0.25, expression(pi[s]^k), pos = 3, cex = .7)
    text(pep[2], y + 0.25, expression(pi[e]^k), pos = 3, cex = .7)
    text(mean(c(pep[1], pep[2])), y-0.35, paste0("k = ", i), cex = .8)
}
```

## Proteins and genome data

```{r, echo=FALSE}
data(p)
sp <- seqnames(p)[5]
```

We will use a small object from `Pbase` to illustrate how to retrieve
genome coordinates and map a protein back to genomic coordinates. In
the remainder of this vignette, we will concentrate on a spectific
protein, namely `r sp`.

```{r, p, message=FALSE}
library("Pbase")
data(p)
p
seqnames(p)
dat <- data.frame(i = 1:length(p),
                  npeps = elementLengths(pfeatures(p)),
                  protln = width(aa(p)))
dat
sp <- seqnames(p)[5]
```

### Querying with `biomaRt`

The input information consists of a UniProt identifier and a set of
peptides positions along the protein. The first step is to map the
protein accession number to a gene identifier (here, we will use
Ensembl) and to obtain genome coordinates.

Multiple solutiona are offered to use. Below, we use the `biomaRt`
Bioconductor package.

```{r bm, cache=TRUE, message=FALSE}
library("biomaRt")
ens <- useMart("ensembl", "hsapiens_gene_ensembl")

bm <- select(ens, keys = sp,
             keytype = "uniprot_swissprot_accession",
             columns = c(
                 "ensembl_gene_id",
                 "ensembl_transcript_id",
                 "ensembl_peptide_id",
                 "ensembl_exon_id",  
                 "description",     
                 "chromosome_name",
                 "strand",
                 "exon_chrom_start",
                 "exon_chrom_end",
                 "5_utr_start",
                 "5_utr_end",
                 "3_utr_start",
                 "3_utr_end",
                 "start_position",     
                 "end_position"))

bm

```

We will only consider the gene located on the X chromosome and ignore
the
[sequence](http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/patches.shtml)
[patch](http://www.ncbi.nlm.nih.gov/nuccore/NW_003871101.3?report=genbank)
and conveniently store the transcript and protein identifiers for
later use.

```{r}
bm <- bm[bm$chromosome_name == "X", ]
tr <- unique(bm$ensembl_transcript_id)
pr <- unique(bm$ensembl_peptide_id)
```

The information retrieved gives us various information, in particular
the Ensembl gene identifer `r bm$ensembl_gene_id[1]` and its starting
and ending position `r bm$start_position[1]` and `r bm$end_position[1]`
on chromosome `r bm$chromosome_name[1]`.

### Genome visualisation with `Gviz`

While the above defines all necessary genomic coordinates, and as we
will make use of the `Gviz` plotting infrastructure, it is more
straightforward to fetch the above information via `biomaRt` using
specialised `Gviz` infrastructure.

```{r gviz, cache=TRUE, message=FALSE}
library("Gviz")
options(ucscChromosomeNames=FALSE)

bmTrack <- BiomartGeneRegionTrack(start = min(bm$start_position),
                                  end = max(bm$end_position),
                                  chromosome = "chrX",
                                  genome = "hg19")

## see also the filters param and getBM to
## filter on biotype == "protein_coding"

bmTracks <- split(bmTrack, transcript(bmTrack))

grTrack <- bmTracks[[which(names(bmTracks) == tr)]]
names(grTrack) <- tr
```

We can convince ourselves that the manual retrieved data in `bm` and
the data in the `BiomartRegionTrack` subsequently subset into a
`GeneRegionTrack` match.

```{r gvizdfr}
as(grTrack, "data.frame")
```

Below, we create additional ideogram and axis tracks and produce a
visualisation of our genomic coordinates of interest.

```{r gvizfig, fig.align='center', cache=TRUE}
ideoTrack <- IdeogramTrack(genome = "hg19",
                           chromosome = "chrX")
axisTrack <- GenomeAxisTrack()
seqlevels(ranges(grTrack)) <- 
    chromosome(grTrack) <-
        chromosome(ideoTrack)
plotTracks(list(ideoTrack, axisTrack, grTrack),
           add53 = TRUE, add35 = TRUE)
```

### Using `TranscriptDb` instances

Finally, on could also use `TranscriptDb` objects to create the
`GeneRegionTrack`. Below, we use the UCSC annotation (which is
directly available from Bioconductor; Ensembl TranscriptDb instances
could be generated manually - **TODO** document this) to extract our
regions of interest.

```{r, txdb, cache=TRUE, message=FALSE}
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txTr <- GeneRegionTrack(txdb, chromosome = "chrX",
                        start = bm$start_position[1],
                        end = bm$end_position[1])

head(as(txTr, "data.frame"))
```

## Peptide features

```{r, pp0, echo=FALSE}
pp <- p[sp]
n <- elementLengths(pranges(pp))
pepstring <- paste(unique(pcols(pp)[[1]]$pepseq), collapse = ", ")
```

The peptides that have been experimentally observed are available as
ranges (coordinates) along the protein sequences. For example, below,
we see `r n` peptides (`r pepstring`) have been identified for our
protein of interest `r sp`.

```{r, pepsfig, fig.align='center'}
pp <- p[sp]
pranges(pp)
plot(pp)
```

The aim of this document is to document the mapping of peptides,
i.e. ranges along a protein sequence to ranges along the genome
reference. In other words, our aim is the convert protein coordinates
to genome coordinates.

## Mapping peptides back to the genome

Additional features of interest (in our case peptides) can be added to
the genomic visualisation using dedicated annotation tracks, that can
be constructed as shown below. 

### Comparing protein and translated DNA sequences

The first check that we want to implement is to verify that we can
regenerate the protein amino acid sequence from the genome regions
that we have extracted. We start by subsetting only the actual protein
coding regions, i.e ignoring the 5' and 3' untranslated regions of our
Genome Region track.

```{r, protranges}
(prng <- ranges(grTrack[feature(grTrack) == "protein_coding",]))
```

We also need the actual genome sequence (so far, we have only dealt
with regions and features). There are multiple ways to obtain genome
sequences with R and Bioconductor. As we have been working with the
Ensembl reference genome, we could simply
[download](ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/)
the whole genome of only the chromosome X and load it as an
`AAStringSet` and extract the protein coding regions of interest at
the coordinates defined by the ranges in `prng`.

```{r, extractat, eval=FALSE}
library("Biostrings")
chrx <- readDNAStringSet("./Homo_sapiens.GRCh37.75.dna.chromosome.X.fa.gz")
seq <- extractAt(chrx[[1]], ranges(prng))
pptr <- translate(unlist(seq))
```

It is also possible to use readily package genomes to do this. Above,
we have use the `TxDb.Hsapiens.UCSC.hg19.knownGene` package defining
transcripts. There is a similar package for the human UCSC genome
build, namely `BSgenome.Hsapiens.UCSC.hg19`. However, as we have
focused on Ensembl data annotation, we will first need to convert our
Ensembl transcript (or protein) identifier `r tr` (`r pr`) into the
UCSC equivalent. (**TODO** Use also `Homo.sapiens` annotation package.)

```{r, protfromgenome}
ucsc <- select(ens, key = pr,
               keytype = "ensembl_peptide_id",
               columns = "ucsc")
ucsc

library("BSgenome.Hsapiens.UCSC.hg19")
prng2 <- ranges(txTr[transcript(txTr) %in% ucsc])
prng2 <- prng2[prng2$feature == "CDS"]

seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19, prng2)
pptr <- translate(unlist(seq))
## remove stop codon
pptr <- pptr[1:417]
```

```{r, align}
writePairwiseAlignments(pairwiseAlignment(pp[[1]], pptr))
```

### Calculating new coordinates

To calculate the genomic coordinates of peptides we

1) Calculate the contiguous exon coordinates on the cDNA sequence
   `prex`.

2) Calculate the peptide corrdinates on the cDNA sequence `peprgn2`
   from the original peptide ranges along the protein sequence.

3) We calculate the indices of the exons in which all the peptides
   start and end on the cDNA sequence `start_ex` and `end_ex`.

4) We infer the coordinates on the genome from the actual peptide
   starting/ending position and exon index on the cDNA sequence, the
   exons coordinates on the cDNA sequences and the matching exons
   coordinates on the genome and store these in an `IRanges` object.

```{r, mapping}
## exons ranges along the protein
j <- cumsum(width(seq))
i <- cumsum(c(1, width(seq)))[1:length(j)]
prex <- IRanges(start = i, end = j)

peprng <- pranges(pp)[[1]]
## peptide positions on cdna
peprng2 <- IRanges(start = 1 + (start(peprng)-1) * 3,
                   width = width(peprng) * 3)

## find exon and position in prex 
start_ex <- subjectHits(findOverlaps(start(peprng2), prex))
end_ex <- subjectHits(findOverlaps(end(peprng2), prex))

getPos <- function(p, i, prtex = prex, nclex = prng2) {
    ## position in cdna
    ## exon index
    start(prng2[i]) + (p - start(prtex[i]))
}

peptides_on_genome <- IRanges(start = getPos(start(peprng2), start_ex),
                              end = getPos(end(peprng2), end_ex))

```

### Plotting

Based on the new peptide genomic coordinates, it is now
straightforward to create a new `AnnotationTrack` and add it the the
track visualisation.

```{r, pepcoords, fig.align='center'}
pepTr <- AnnotationTrack(start = start(peptides_on_genome),
                         end = end(peptides_on_genome),
                         chr = "chrX", genome = "hg19",
                         strand = "*",
                         id = pcols(pp)[[1]]$pepseq,
                         name = "pfeatures",
                         col = "steelblue")


plotTracks(list(ideoTrack, axisTrack, grTrack, pepTr),
           groupAnnotation = "id",
           just.group = "below",
           fontsize.group = 9,
           add53 = TRUE, add35 = TRUE)

```

### DetailsAnnotationTrack

Finally, we customise the figure by adding a track with the $MS^2$
spectra. The raw data used to search the protein database an create
`p` are available as an `MSnExp` object. 

```{r, msmsspectra, fig.align='center'}
data(pms)

library("ggplot2")
details <- function(identifier, ...) {
    p <- plot(pms[[as.numeric(identifier)]], full=TRUE, plot=FALSE) + ggtitle("") 
    p <- p + theme_bw() + theme(axis.text.y = element_blank(),
                                axis.text.x = element_blank()) + 
                                labs(x = NULL, y = NULL)
                                        
    print(p, newpage=FALSE)
}

deTrack <- AnnotationTrack(start = start(peptides_on_genome),
                           end = end(peptides_on_genome),
                           genome = "hg19", chromosom = "chrX",
                           id = pcols(pp)[[1]]$acquisitionnum,
                           name = "MS2 spectra",
                           stacking = "squish", fun = details)

plotTracks(list(ideoTrack, axisTrack, deTrack, grTrack),
           add53 = TRUE, add35 = TRUE)
```

**TODO** Check spectra. Describe how data tracks can be used to
  overlay additional information, such as quantitation data,
  identification scores, coverage, ...

## Session information

```{r, si}
sessionInfo()
```