## ----install_h5vcData, eval=FALSE---------------------------------------- # source("http://bioconductor.org/biocLite.R") # biocLite("h5vcData") ## ----loadPackages-------------------------------------------------------- suppressPackageStartupMessages(library(h5vc)) suppressPackageStartupMessages(library(rhdf5)) tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" ) ## ----listTallyFile------------------------------------------------------- h5ls(tallyFile) ## ----getSampleData------------------------------------------------------- sampleData <- getSampleData( tallyFile, "/ExampleStudy/16" ) sampleData ## ----setSampleData------------------------------------------------------- sampleData$ClinicalVariable <- rnorm(nrow(sampleData)) setSampleData( tallyFile, "/ExampleStudy/16", sampleData ) sampleData ## ----h5readBlockExample-------------------------------------------------- data <- h5readBlock( filename = tallyFile, group = "/ExampleStudy/16", names = c( "Coverages", "Counts" ), range = c(29000000,29001000) ) str(data) ## ----h5dapplyExample----------------------------------------------------- coverages <- h5dapply( filename = tallyFile, group = "/ExampleStudy/16", names = c( "Coverages" ), dims = c(3), range = c(29000000,29005000), blocksize = 500, FUN = binnedCoverage, sampleData ) options(scipen=10) coverages <- do.call( rbind, coverages ) rownames(coverages) <- NULL #remove block-ids used as row-names coverages ## ----callVariantsExample------------------------------------------------- variants <- h5dapply( filename = tallyFile, group = "/ExampleStudy/16", names = c( "Coverages", "Counts", "Deletions", "Reference" ), range = c(29950000,30000000), blocksize = 10000, FUN = callVariantsPaired, sampledata = sampleData, cl = vcConfParams(returnDataPoints = TRUE) ) variants <- do.call( rbind, variants ) variants$AF <- (variants$caseCountFwd + variants$caseCountRev) / (variants$caseCoverageFwd + variants$caseCoverageRev) variants <- variants[variants$AF > 0.2,] rownames(variants) <- NULL # remove rownames to save some space on output :D variants ## ----mismatchPlotExample, fig.width=10.5, fig.height=8.5, dpi=300, out.width="750px"---- windowsize <- 35 position <- variants$Start[1] data <- h5readBlock( filename = tallyFile, group = paste( "/ExampleStudy", variants$Chrom[1], sep="/" ), names = c("Coverages","Counts","Deletions", "Reference"), range = c( position - windowsize, position + windowsize) ) patient <- sampleData$Patient[sampleData$Sample == variants$Sample[1]] samples <- sampleData$Sample[sampleData$Patient == patient] p <- mismatchPlot( data = data, sampledata = sampleData, samples = samples, windowsize = windowsize, position = position ) print(p) ## ----mismatchPlotExamplesTheming, fig.width=10.5, fig.height=8.5, dpi=300, out.width="750px", fig.show='asis'---- print(p + theme(strip.text.y = element_text(family="serif", size=16, angle=0))) ## ----loadFiles, eval = .Platform$OS.type == "unix", results="hide"------- suppressPackageStartupMessages(library("h5vc")) suppressPackageStartupMessages(library("rhdf5")) suppressPackageStartupMessages(library("Rsamtools")) files <- c("NRAS.AML.bam", "NRAS.Control.bam") bamFiles <- file.path( system.file("extdata", package = "h5vcData"), files) ## ----chromDim, eval = (.Platform$OS.type == "unix")---------------------- chromdim <- sapply( scanBamHeader(bamFiles), function(x) x$targets ) colnames(chromdim) <- files head(chromdim) ## ----hdf5SetUp, eval = .Platform$OS.type == "unix"----------------------- chrom <- "1" chromlength <- chromdim[chrom,1] study <- "/NRAS" tallyFile <- file.path( tempdir(), "NRAS.tally.hfs5" ) if( file.exists(tallyFile) ){ file.remove(tallyFile) } if( prepareTallyFile( tallyFile, study, chrom, chromlength, nsamples = 2 ) ){ h5ls(tallyFile) }else{ message( paste( "Preparation of:", tallyFile, "failed" ) ) } ## ----sampleDataSetUp, eval = .Platform$OS.type == "unix"----------------- sampleData <- data.frame( Sample = c("AML","Control"), Column = c(1,2), Patient = c("Pt1","Pt1"), Type = c("Case","Control"), stringsAsFactors = FALSE ) group <- paste( study, chrom, sep = "/" ) setSampleData( tallyFile, group, sampleData ) getSampleData( tallyFile, group ) ## ----tallyingTheBamFiles, eval = .Platform$OS.type == "unix"------------- suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg19)) startpos <- 115247090 endpos <- 115259515 theData <- applyTallies( bamFiles, chrom = chrom, start = startpos, stop = endpos, ncycles = 10, reference = BSgenome.Hsapiens.UCSC.hg19[["chr1"]][startpos:endpos] ) # the first and last 10 sequencing cycles are called unreliable str(theData) ## ----writingToTallyFile, eval = .Platform$OS.type == "unix"-------------- h5write( theData$Counts, tallyFile, paste( group, "Counts", sep = "/" ), index = list( NULL, NULL, NULL, startpos:endpos ) ) h5write( theData$Coverages, tallyFile, paste( group, "Coverages", sep = "/" ), index = list( NULL, NULL, startpos:endpos ) ) h5write( theData$Deletions, tallyFile, paste( group, "Deletions", sep = "/" ), index = list( NULL, NULL, startpos:endpos ) ) h5write( theData$Reference, tallyFile, paste( group, "Reference", sep = "/" ), index = list( startpos:endpos ) ) h5ls(tallyFile) ## ----extractingData, eval = .Platform$OS.type == "unix"------------------ data <- h5readBlock( filename = tallyFile, group = group, range = c(startpos, endpos) ) str(data) ## ----callingVariants, eval = .Platform$OS.type == "unix"----------------- vars <- h5dapply( filename = tallyFile, group = group, blocksize = 1000, FUN = callVariantsPaired, sampledata = getSampleData(tallyFile,group), cl = vcConfParams(returnDataPoints=TRUE), range = c(startpos, endpos) ) vars <- do.call(rbind, vars) vars ## ----plottingVariant, eval = .Platform$OS.type == "unix"----------------- position <- vars$End[1] windowsize <- 50 data <- h5readBlock( filename = tallyFile, group = group, range = c(position - windowsize, position + windowsize) ) p <- mismatchPlot( data, getSampleData(tallyFile,group), samples = c("AML","Control"),windowsize=windowsize,position=position ) ## ----plottingVariantEx, fig.width=10.5, fig.height=7, dpi=300, out.width="750px", eval = .Platform$OS.type == "unix"---- print(p) ## ------------------------------------------------------------------------ tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" ) data( "example.variants", package = "h5vcData" ) #example variant calls head(variantCalls) ms = mutationSpectrum( variantCalls, tallyFile, "/ExampleStudy" ) head(ms) ## ----plottingMS, fig.width=10.5, fig.height=7, dpi=300, out.width="750px"---- plotMutationSpectrum(ms) + theme( strip.text.y = element_text(angle=0, size=10), axis.text.x = element_text(size = 7), axis.text.y = element_text(size = 10)) + scale_y_continuous(breaks = c(0,5,10,15)) ## ----parallelTally, eval = .Platform$OS.type == "unix"------------------- suppressPackageStartupMessages(library(BiocParallel)) serial.time <- system.time(theData <- applyTallies(bamFiles, chrom = chrom, start = startpos, stop = endpos, ncycles = 10)) multicore.time <- system.time(theData <- applyTallies(bamFiles, chrom = chrom, start = startpos, stop = endpos, ncycles = 10, BPPARAM = MulticoreParam())) serial.time multicore.time ## ------------------------------------------------------------------------ suppressPackageStartupMessages(library(BiocParallel)) system.time( coverages_serial <- h5dapply( filename = tallyFile, group = "/ExampleStudy/16", names = c( "Coverages" ), dims = c(3), range = c(29000000,29050000), blocksize = 1000, FUN = binnedCoverage, sampleData ) ) system.time( coverages_parallel <- h5dapply( filename = tallyFile, group = "/ExampleStudy/16", names = c( "Coverages" ), dims = c(3), range = c(29000000,29050000), blocksize = 1000, FUN = binnedCoverage, sampleData, BPPARAM = MulticoreParam() ) ) ## ------------------------------------------------------------------------ #instantiating a batch tally param for tallying the NRAS example files files <- c("NRAS.AML.bam", "NRAS.Control.bam") startpos <- 115247090 endpos <- 115259515 bamFiles <- file.path( system.file("extdata", package = "h5vcData"), files) btp <- batchTallyParam(bamFiles = bamFiles, destination = "tally.file.name.hfs5", group = "/NRAS/1", chrom="1", start=startpos, stop=endpos) btp ## ----batchjobsconf, eval=FALSE------------------------------------------- # cluster.functions <- makeClusterFunctionsLSF("/home/pyl/batchjobs/lsf.tmpl") # mail.start <- "first" # mail.done <- "last" # mail.error <- "all" # mail.from <- "" # mail.to <- "" # mail.control <- list(smtpServer="smtp.embl.de") ## ----eval=FALSE---------------------------------------------------------- # ## Default resources can be set in your .BatchJobs.R by defining the variable # ## 'default.resources' as a named list. # # ## remove everthing in [] if your cluster does not support arrayjobs # #BSUB-J <%= job.name %>[1-<%= arrayjobs %>] # name of the job / array jobs # #BSUB-o <%= log.file %> # output is sent to logfile, stdout + stderr by default # #BSUB-n <%= resources$ncpus %> # Number of CPUs on the node # #BSUB-q <%= resources$queue %> # Job queue # #BSUB-W <%= resources$walltime %> # Walltime in minutes # #BSUB-M <%= resources$memory %> # Memory requirements in Kbytes # # # we merge R output with stdout from LSF, which gets then logged via -o option # R CMD BATCH --no-save --no-restore "<%= rscript %>" /dev/stdout ## ----eval=FALSE---------------------------------------------------------- # library("BiocParallel") # library("BatchJobs") # cf <- makeClusterFunctionsLSF("/home/pyl/batchjobs/lsf.tmpl") # bjp <- BatchJobsParam( cluster.functions=cf, resources = list("queue" = "medium_priority", "memory"="4000", "ncpus"="4", walltime="00:30") ) # bplapply(1:10, sqrt) # bplapply(1:10, sqrt, BPPARAM=bjp) ## ----eval=FALSE---------------------------------------------------------- # batchTallies(confList = btp)