\pdfminorversion=4 % tell pdflatex to generate PDF in version 1.4
\documentclass[article, shortnames, nojss]{jss}
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{enumerate}
\usepackage{multirow}
\DeclareMathOperator*{\argmax}{arg\,max}
% \VignetteIndexEntry{Getting TCGA datasets}
% \VignetteKeyword{singular enrichment analysis}
% \VignetteKeyword{over representation analysis}
% \VignetteKeyword{gene set enrichment analysis}
% \VignetteKeyword{functional class scoring}
% \VignetteKeyword{big omics data}

% \VignetteDepends{MIGSA}
% \VignetteDepends{MIGSAdata}

\author{Juan C Rodriguez\\CONICET\\Universidad Cat\'{o}lica de C\'{o}rdoba\\
Universidad Nacional de C\'{o}rdoba
\And Crist\'{o}bal Fresno\\Instituto Nacional de Medicina Gen\'{o}mica
\AND Andrea S Llera\\CONICET\\Fundaci\'{o}n Instituto 
Leloir \And Elmer A Fern\'{a}ndez\\CONICET\\Universidad Cat\'{o}lica de
C\'{o}rdoba\\Universidad Nacional de C\'{o}rdoba}

\title{\pkg{MIGSA}: Getting TCGA datasets}

%% for pretty printing and a nice hypersummary also set:
%% comma-separated
\Plainauthor{Juan C Rodriguez, Crist\'{o}bal Fresno, Andrea S Llera, Elmer A 
    Fern\'{a}ndez}
%% without formatting
\Plaintitle{MIGSA: Getting TCGA datasets}
%% a short title (if necessary)
\Shorttitle{\pkg{MIGSA}: Getting TCGA datasets}

%% an abstract and keywords
\Abstract{
In this vignette we are going to show how we got the RData 
\textit{tcgaMAdata.RData} which can be loaded via the \pkg{MIGSAdata} package 
using data(tcgaMAdata) and \textit{tcgaRNAseqData.RData} which can be 
loaded using data(tcgaRNAseqData).
}
\Keywords{singular enrichment analysis, over representation analysis, gene set 
enrichment analysis, functional class scoring, big omics data}

%% without formatting
\Plainkeywords{singular enrichment analysis, over representation analysis, 
gene set enrichment analysis, functional class scoring, big omics data}
%% at least one keyword must be supplied 

%% The address of (at least) one author should be given  
%% in the following format:
\Address{
Juan C Rodriguez \& Elmer A Fern\'{a}ndez\\
Bioscience Data Mining Group\\ 
Facultad de Ingenier\'{i}a\\ 
Universidad Cat\'{o}lica de C\'{o}rdoba - CONICET\\ 
X5016DHK C\'{o}rdoba, Argentina\\ 
E-mail: \email{jcrodriguez@bdmg.com.ar, efernandez@bdmg.com.ar}\\
URL: \url{http://www.bdmg.com.ar/}\\
}

\begin{document}
\SweaveOpts{concordance=TRUE}
% \SweaveOpts{concordance=FALSE}

% \begin{titlepage}
% \title{
%     \textbf{IFA:} \\
%     Integrative Functional Analysis User's Guide
% }
% 
% \author{
%Juan C. Rodriguez\\
%\texttt{jcrodriguez@bdmg.com.ar}
%\and
%Elmer A Fern\'{a}ndez\\
%\texttt{efernandez@bdmg.com.ar}
% }
% % \textit{}
% \date{September 19, 2016}
% \end{titlepage}
% 
% \begin{center}
% \huge{
%   \textbf{IFA:} \\
%   Integrative Functional Analysis User's Guide
% }
% 
% \vspace{8pt}
% 
% \large{
%   Juan C. Rodriguez$^{1,2}$ \quad Elmer A Fern\'{a}ndez$^{1,3}$ \\
%     \texttt{jcrodriguez@bdmg.com.ar efernandez@bdmg.com.ar} \\
% %    \texttt{\{jcrodriguez, efernandez\}@bdmg.com.ar} \\
%     \vspace{2pt}
%     \small{
%     $^{1}$ UA AREA CS. AGR. ING. BIO. Y S, Universidad Cat\'{o}lica de 
% C\'{o}rdoba, CONICET, C\'{o}rdoba, Argentina \\
%     $^{2}$ Facultad de Matem\'{a}tica, Astronom\'{i}a y F\'{i}sica, 
% Universidad Nacional de C\'{o}rdoba, C\'{o}rdoba, Argentina \\
%     $^{3}$ Facultad de Ciencias Exactas, F\'{i}sicas y Naturales, 
% Universidad Nacional de C\'{o}rdoba, C\'{o}rdoba, Argentina \\
%     }
%     \vspace{8pt}
%     First edition September 19, 2016 \\
%     Last revised \today \\
%     \vspace{8cm}
%     \noindent\fbox{
%             \parbox{\textwidth}{
%                 This free open-source software implements academic 
% research by the
%                 authors and co-workers. If you use it, please support 
% the project by
%                 citing the journal article listed in Section
%                 \ref{citeSection}.
%             }
%     }
% }
% \end{center}
% \clearpage

\section{Getting the data}
\label{gettingData}
From the TCGA data portal the breast invasive carcinoma (BRCA) microarray and 
RNAseq datasets present at the date were downloaded. PAM50 subtypes Basal vs. 
Luminal A were evaluated. With these subjects, \textit{tcgaMAdata.RData} and 
\textit{tcgaRNAseqData.RData} were built.

\subsection{Basal-like subjects}
Basal-like TCGA subjects identifiers used:

<<basalSubjects>>=
library(MIGSAdata);
data(tcgaMAdata);

names(tcgaMAdata$subtypes)[ tcgaMAdata$subtypes == "Basal" ];
@

\subsection{Luminal A subjects}
Luminal A TCGA subjects identifiers used:

<<lumaSubjects>>=
library(MIGSAdata);
data(tcgaMAdata);

names(tcgaMAdata$subtypes)[ tcgaMAdata$subtypes == "LumA" ];
@

\section{Getting the data with TCGAbiolinks R package}
All the subject's data mentioned in section \ref{gettingData} was downloaded by 
means of the \pkg{TCGAbiolinks} R package, however, at the present this 
library had been greatly refactored, causing that this code does not work 
unless some files are present in your hard drive, these files are available 
upon request as they weigh too much. Below we show the code used to get both 
RDatas.

<<tcgabiolinksCode, eval=FALSE>>=
## Not run:

library(TCGAbiolinks);

R.Version()$version.string;
# [1] "R version 3.2.3 (2015-12-10)"
packageVersion("TCGAbiolinks");
# [1] â€˜1.0.10â€™

query <- TCGAquery(tumor="BRCA");
matSamples <- TCGAquery_integrate(query);

# subjects in both microarray and RNAseq data
matSamples["AgilentG4502A_07_3", "IlluminaHiSeq_RNASeq"];
# [1] 495

# we filter only microarray data
geneExprSubjects <- TCGAquery(tumor="BRCA", platform="AgilentG4502A_07_3", 
    level=3);

# we filter only RNAseq data
rnaSeqSubjects <- TCGAquery(tumor="BRCA", platform="IlluminaHiSeq_RNASeq", 
    level=3);

geneExprbarcodes <- geneExprSubjects$barcode;
geneExprbarcodes <- strsplit(geneExprbarcodes, ",");
geneExprbarcodes <- Reduce(union, geneExprbarcodes);

rnaSeqbarcodes <- rnaSeqSubjects$barcode;
rnaSeqbarcodes <- strsplit(rnaSeqbarcodes, ",");
rnaSeqbarcodes <- Reduce(union, rnaSeqbarcodes);

commonSubjects <- intersect(geneExprbarcodes, rnaSeqbarcodes);
rm(geneExprbarcodes); rm(rnaSeqbarcodes);
length(commonSubjects);
# [1] 547

# we filter microarray and RNAseq data (but just common subjects)
geneExprSubjects <- TCGAquery(tumor="BRCA", platform="AgilentG4502A_07_3", 
    samples=commonSubjects, level=3);
rnaSeqSubjects <- TCGAquery(tumor="BRCA", platform="IlluminaHiSeq_RNASeq", 
    samples=commonSubjects, level=3);


#### this lines are the ones which are not working any more (TCGAdownload)
# TCGAdownload(geneExprSubjects, path="geneExpr/", samples=commonSubjects);
# TCGAdownload(rnaSeqSubjects,   path="rnaSeq/",   samples=commonSubjects, 
#     type="gene.quantification");

## However, we can provide you necessary files to skip the TCGAdownload step.

## type is any of:
# RNASeq:             exon.quantification
#                     spljxn.quantification
#                     gene.quantification
# genome_wide_snp_6:  hg18.seg
#                     hg19.seg,nocnv_hg18.seg
#                     nocnv_hg19.seg

geneExpr <- TCGAprepare(geneExprSubjects, dir="geneExpr/");
rnaSeq <- TCGAprepare(rnaSeqSubjects, dir="rnaSeq/", 
    type="gene.quantification");

library(SummarizedExperiment);

assays(geneExpr);
# names(1): raw_counts

# It would be a better way of conversion
geneExpr <- head(assay(geneExpr, "raw_counts"), n=nrow(geneExpr));

assays(rnaSeq);
# names(3): raw_counts median_length_normalized RPKM
rnaSeq_raw <- head(assay(rnaSeq, "raw_counts"), n=nrow(rnaSeq));
rnaSeq_medianNorm <- head(assay(rnaSeq, "median_length_normalized"), 
    n=nrow(rnaSeq));
rnaSeq_rpkm <- head(assay(rnaSeq, "RPKM"), n=nrow(rnaSeq));

## checking if we have the same subjects in every experiment
stopifnot(all(colnames(geneExpr) %in% colnames(rnaSeq_raw)));
stopifnot(all(colnames(rnaSeq_raw) %in% colnames(rnaSeq_medianNorm)));
stopifnot(all(colnames(rnaSeq_medianNorm) %in% colnames(rnaSeq_rpkm)));
stopifnot(all(colnames(rnaSeq_rpkm) %in% colnames(geneExpr)));

mapping <- do.call(rbind, strsplit(rownames(rnaSeq_raw), "|", fixed=!F));
colnames(mapping) <- c("Symbol", "Entrez");

#### Now let's get subjects subtypes

library(genefu);

rnaSeq <- rnaSeq_rpkm;
rm(rnaSeq_rpkm);

## Also request this file!
pam50Annot <- read.csv("pam50_annotation.txt",sep="\t");

library(limma);
dim(geneExpr);
geneExpr <- avereps(geneExpr);
dim(geneExpr);

rownames(rnaSeq) <- mapping[, "Symbol" ];
dim(rnaSeq);
rnaSeq <- rnaSeq[ mapping[, "Symbol" ] != "?" , ];
dim(rnaSeq);
rnaSeq <- avereps(rnaSeq);
dim(rnaSeq);

geneExpr <- geneExpr[as.character(pam50Annot$GeneName),, drop=F];
dim(geneExpr);
rnaSeq <- rnaSeq[as.character(pam50Annot$GeneName),, drop=F];
dim(rnaSeq);
rnaSeq <- log(rnaSeq);

pam50Annot <- pam50Annot[,c("GeneName", "EntrezGene")];
colnames(pam50Annot) <- c("probe", "EntrezGene.ID");
pam50Annot$probe <- as.character(pam50Annot$probe);

## get subtypes
dataset <- apply(geneExpr, 1, as.numeric);
rownames(dataset) <- colnames(geneExpr);
subtypesGeneExpr <- intrinsic.cluster.predict(sbt.model=pam50.scale, 
    data=dataset, annot=pam50Annot, do.mapping=!F, do.prediction.strength=!F, 
    verbose=!F);

## get subtypes
dataset <- apply(rnaSeq, 1, as.numeric);
rownames(dataset) <- colnames(rnaSeq);
subtypesRnaSeq <- intrinsic.cluster.predict(sbt.model=pam50.scale, 
    data=dataset, annot=pam50Annot, do.mapping=!F, do.prediction.strength=!F, 
    verbose=!F);

table(subtypesGeneExpr$subtype);
#  Basal   Her2   LumA   LumB Normal 
#    101     77    150    157     62

table(subtypesRnaSeq$subtype);
#  Basal   Her2   LumA   LumB Normal
#    101     81    165    137     63

subtypesGeneExpr <- subtypesGeneExpr$subtype;
subtypesRnaSeq <- subtypesRnaSeq$subtype[names(subtypesGeneExpr)];

## how many subjects got the same subtype between microarray and RNAseq data
concSubtypes <- table(subtypesGeneExpr, subtypesRnaSeq);
concSubtypes;
#          Basal Her2 LumA LumB Normal
#   Basal     95    2    1    2      1
#   Her2       0   72    0    4      1
#   LumA       1    0  142    4      3
#   LumB       3    7   19  127      1
#   Normal     2    0    3    0     57
sum(diag(concSubtypes)) / sum(concSubtypes);
# [1] 0.9012797 # 90% of concordant subjects

stopifnot(all(names(subtypesGeneExpr) == names(subtypesRnaSeq)));

## I am going to use the subjects that got the same classification in both
subtypes <- subtypesGeneExpr[subtypesGeneExpr == subtypesRnaSeq];
length(subtypes);
# [1] 493

#### Now just translate GeneSymbols to EntrezGene IDs

## Also request this file!
annotAgi <- read.csv("AgilentG4502A_07_3.csv", sep="|");
geneExprSymbol <- rownames(geneExpr);
# we first search into Agilent annotation file
geneExprEntrez <- annotAgi[ match(geneExprSymbol, annotAgi[, "Symbol"]), 
    "Entrez" ];
sum(is.na(geneExprEntrez));
# [1] 796
# then we look into the mapping given by RNASeq TCGA data
geneExprEntrez[ is.na(geneExprEntrez) ] <- mapping[ match(geneExprSymbol[ 
    is.na(geneExprEntrez) ], mapping[, "Symbol"]), "Entrez" ];
sum(is.na(geneExprEntrez));
# [1] 772

geneExpr <- geneExpr[ !is.na(geneExprEntrez), ];
rownames(geneExpr) <- geneExprEntrez[ !is.na(geneExprEntrez) ];
dim(geneExpr);
geneExpr <- avereps(geneExpr);
dim(geneExpr);

rownames(rnaSeq) <- do.call(rbind, strsplit(rownames(rnaSeq), "|", 
    fixed=!F))[,2];
dim(rnaSeq);
rnaSeq <- avereps(rnaSeq);
dim(rnaSeq);

load("rnaSeq_raw.RData");
rownames(rnaSeq_raw) <- do.call(rbind, strsplit(rownames(rnaSeq_raw), "|", 
    fixed=!F))[,2];
dim(rnaSeq_raw);
rnaSeq_raw <- avereps(rnaSeq_raw);
dim(rnaSeq_raw);

#### And keep only Basal and Luminal A subjects
rnaSeq_raw <- rnaSeq_raw[, names(subtypes)[subtypes %in% c("Basal", "LumA")] ];
geneExpr <- geneExpr[, names(subtypes)[subtypes %in% c("Basal", "LumA")] ];

subtypes <- subtypes[subtypes %in% c("Basal", "LumA")];

## And these are the two data objects used.
tcgaRNAseqData <- list(rnaSeq=rnaSeq_raw, subtypes=subtypes);
tcgaMAdata <- list(geneExpr=geneExpr, subtypes=subtypes);
## End(Not run)
@

\section*{Session Info}
<<Session Info, echo=true>>=
sessionInfo()
@

% \bibliography{MIGSA}

\end{document}