\name{getGeneExp} \alias{getGeneExp} \title{getGeneExp: count the number of reads and calculate the RPKM for each gene} \description{ This function is used to count the number of reads and calculate the RPKM for each gene. It takes uniquely mapped reads from RNA-seq data for a sample with a gene annotation file as input. So users should map the reads (obtained from sequencing library of the sample) to the corresponding genome in advance. } \usage{ getGeneExp(mapResultBatch, fileFormat="bed", readLength=32, strandInfo=FALSE, refFlat, output=paste(mapResultBatch[1],".exp",sep=""), min.overlapPercent=1) } \arguments{ \item{mapResultBatch}{a vector containing uniquely mapping result files for a sample. \cr \emph{Note}: The sample can have multiple technical replicates.} \item{fileFormat}{file format: \code{"bed"} or \code{"eland"}. \cr example of \code{"bed"} format: \code{chr12 7 38 readID 2 +} \cr example of \code{"eland"} format: \code{readID chr12.fa 7 U2 F} \cr \emph{Note}: The field separator character is \code{TAB}. And the files must follow the format as one of the examples. } \item{readLength}{the length of the reads (only used if \code{fileFormat="eland"}).} \item{strandInfo}{whether the strand information was retained during the cloning of the cDNAs. \itemize{ \item \code{"TRUE" }: retained, \item \code{"FALSE"}: not retained. } } \item{refFlat}{gene annotation file in UCSC refFlat format. \cr See \url{http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat}. } \item{output}{the output file.} \item{min.overlapPercent}{the minimum percentage of the overlapping length for a read and an exon over the length of the read itself, for counting this read from the exon. should be <=1. \cr \code{0}: at least \code{1} bp overlap between a read and an exon. } } \note{ This function sums up the numbers of reads coming from all exons of a specific gene (according to the known gene annotation) as the gene expression value. The exons may include the 5'-UTR, protein coding region, and 3'-UTR of a gene. All introns are ignored for a gene for the sequenced reads are from the spliced transcript library. If a read falls in an exon (usually, a read is shorter than an exon), the read count for this exon plus 1. If a read is crossing the boundary of an exon, users can tune the parameter \code{min.overlapPercent}, which is the minimum percentage of the overlapping length for a read and an exon over the length of the read itself, for counting this read from the exon. The method use \emph{the average length} of all the isoforms as a gene length when calculating the RPKM values. This is different from Mortazavi,A. et al. (2008). } \references{ Mortazavi,A. et al. (2008) Mapping and quantifying mammalian transcriptomes by RNA-seq. \emph{Nat. Methods}, \bold{5}, 621-628. } \seealso{ \code{\link{DEGexp}}, \code{\link{DEGseq}}, \code{\link{readGeneExp}}, \code{\link{kidneyChr21.bed}}, \code{\link{liverChr21.bed}}, \code{\link{refFlatChr21}}. } \examples{ ## If the data files are collected in a zip archive, the following ## commands will first extract them to the temporary directory. kidneyR1L1 <- system.file("data", "kidneyChr21.bed.txt", package="DEGseq") refFlat <- system.file("data", "refFlatChr21.txt", package="DEGseq") if((kidneyR1L1 == "")||(refFlat == "")){ zipFile <- system.file("data", "Rdata.zip", package="DEGseq") if(zipFile != ""){ unzip(zipFile, c("kidneyChr21.bed.txt", "refFlatChr21.txt"), exdir=tempdir()) kidneyR1L1 <- file.path(tempdir(), "kidneyChr21.bed.txt") refFlat <- file.path(tempdir(), "refFlatChr21.txt") } } mapResultBatch <- c(kidneyR1L1) output <- file.path(tempdir(), "kidneyChr21.bed.exp") getGeneExp(mapResultBatch, refFlat=refFlat, output=output) exp <- readGeneExp(file=output, geneCol=1, valCol=c(2,3), label=c("raw count", "RPKM")) exp[30:35,] cat("output: ", output, "\n") } \keyword{methods}