## ----setup, include = FALSE------------------------------------------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", crop = NULL ## Related to https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016656.html ) ## ----vignetteSetup, echo=FALSE, message=FALSE, warning = FALSE-------------------------------------------------------- ## Bib setup library("RefManageR") ## Write bibliography information bib <- c( R = citation(), BiocStyle = citation("BiocStyle")[1], knitr = citation("knitr")[1], RefManageR = citation("RefManageR")[1], rmarkdown = citation("rmarkdown")[1], sessioninfo = citation("sessioninfo")[1], testthat = citation("testthat")[1], DeconvoBuddies = citation("DeconvoBuddies")[1] ) ## ----"install", eval = FALSE------------------------------------------------------------------------------------------ # if (!requireNamespace("BiocManager", quietly = TRUE)) { # install.packages("BiocManager") # } # # BiocManager::install("DeconvoBuddies") # # ## Check that you have a valid Bioconductor installation # BiocManager::valid() ## ----"load_packages", message=FALSE, warning=FALSE-------------------------------------------------------------------- ## Packages for different types of RNA-seq data structures in R library("SummarizedExperiment") library("SingleCellExperiment") library("Biobase") ## For downloading data library("spatialLIBD") ## For running deconvolution library("BisqueRNA") ## Other helper packages for this vignette library("dplyr") library("tidyr") library("tibble") library("ggplot2") ## Our main package library("DeconvoBuddies") ## ----"load rse_gene"-------------------------------------------------------------------------------------------------- ## use fetch deconvo data to load rse_gene rse_gene <- fetch_deconvo_data("rse_gene") rse_gene # lobstr::obj_size(rse_gene) # 41.16 MB ## Use gene "Symbol" as identifiers for the genes in rownames(rse_gene) rownames(rse_gene) <- rowData(rse_gene)$Symbol ## bulk RNA seq samples were sequenced with different library types, ## and RNA extractions table(rse_gene$library_type, rse_gene$library_prep) ## ----"load snRNA-seq"------------------------------------------------------------------------------------------------- ## Use spatialLIBD to fetch the snRNA-seq dataset used in this project sce_path_zip <- fetch_deconvo_data("sce") ## unzip and load the data sce_path <- unzip(sce_path_zip, exdir = tempdir()) sce <- HDF5Array::loadHDF5SummarizedExperiment( file.path(tempdir(), "sce_DLPFC_annotated") ) # lobstr::obj_size(sce) # 172.28 MB ## exclude Ambiguous cell type sce <- sce[, sce$cellType_broad_hc != "Ambiguous"] sce$cellType_broad_hc <- droplevels(sce$cellType_broad_hc) ## Check the number of genes by the number of nuclei that we ## have to work with: dim(sce) ## Check the broad cell type distribution table(sce$cellType_broad_hc) ## We're going to subset to the first 5k genes to save memory ## just for this example. You wouldn't do this on a full analysis. sce <- sce[seq_len(5000), ] ## ----"access RNAScope proportions"------------------------------------------------------------------------------------ # Access the RNAScope proportion data.frame data("RNAScope_prop") head(RNAScope_prop) ## plot the RNAScope compositions plot_composition_bar( prop_long = RNAScope_prop, sample_col = "SAMPLE_ID", x_col = "SAMPLE_ID", add_text = FALSE ) + facet_wrap(~Combo, nrow = 2, scales = "free_x") ## ----"Run Mean Ratio"------------------------------------------------------------------------------------------------- # calculate the Mean Ratio of genes for each cell type marker_stats <- get_mean_ratio(sce, cellType_col = "cellType_broad_hc", gene_ensembl = "gene_id", gene_name = "gene_name" ) # check the top gene ranked gene for each cell type marker_stats |> group_by(cellType.target) |> slice(1) ## ----"plot marker genes"---------------------------------------------------------------------------------------------- # plot expression across cell type the top 4 Excit marker genes plot_marker_express(sce, stats = marker_stats, cell_type = "Excit", cellType_col = "cellType_broad_hc", rank_col = "MeanRatio.rank", anno_col = "MeanRatio.anno", gene_col = "gene" ) ## ----"marker_list"---------------------------------------------------------------------------------------------------- # select top 25 marker genes for each cell type, that are also in rse_gene marker_genes <- marker_stats |> filter(MeanRatio.rank <= 25 & gene %in% rownames(rse_gene)) # check how many genes for each cell type (some genes are not in both datasets) marker_genes |> count(cellType.target) # create a vector of marker genes to subset data before deconvolution marker_genes <- marker_genes |> pull(gene) ## ----"prep data as ExpressionSet"------------------------------------------------------------------------------------- ## convert bulk data to Expression set, sub-setting to marker genes ## include sample ID exp_set_bulk <- Biobase::ExpressionSet( assayData = assays(rse_gene[marker_genes, ])$counts, phenoData = AnnotatedDataFrame( as.data.frame(colData(rse_gene))[c("SAMPLE_ID")] ) ) ## convert snRNA-seq data to Expression set, sub-setting to marker genes ## include cell type and donor information exp_set_sce <- Biobase::ExpressionSet( assayData = as.matrix(assays(sce[marker_genes, ])$counts), phenoData = AnnotatedDataFrame( as.data.frame(colData(sce))[, c("cellType_broad_hc", "BrNum")] ) ) ## check for nuclei with 0 marker expression zero_cell_filter <- colSums(exprs(exp_set_sce)) != 0 message("Exclude ", sum(!zero_cell_filter), " cells") exp_set_sce <- exp_set_sce[, zero_cell_filter] ## ----"run Bisque"----------------------------------------------------------------------------------------------------- ## Run Bisque with bulk and single cell ExpressionSet inputs est_prop <- ReferenceBasedDecomposition( bulk.eset = exp_set_bulk, sc.eset = exp_set_sce, cell.types = "cellType_broad_hc", subject.names = "BrNum", use.overlap = FALSE ) ## ----"deconvo output"------------------------------------------------------------------------------------------------- ## Examine the output from Bisque, transpose to make it easier to work with est_prop$bulk.props <- t(est_prop$bulk.props) ## sample x cell type matrix head(est_prop$bulk.props) ## ----"composition plots"---------------------------------------------------------------------------------------------- ## add Phenotype data to proportion estimates pd <- colData(rse_gene) |> as.data.frame() |> select(SAMPLE_ID, Sample, library_combo) ## make proportion estimates long so they are ggplot2 friendly prop_long <- est_prop$bulk.props |> as.data.frame() |> tibble::rownames_to_column("SAMPLE_ID") |> tidyr::pivot_longer(!SAMPLE_ID, names_to = "cell_type", values_to = "prop") |> left_join(pd) ## create composition bar plots ## for all library preparations by sample n=110 ## Remove the SAMPLE_ID names since they are very long using ggplot2::theme() plot_composition_bar( prop_long = prop_long, sample_col = "SAMPLE_ID", x_col = "SAMPLE_ID", add_text = FALSE ) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) ## Average by brain donor plot_composition_bar( prop_long = prop_long, sample_col = "SAMPLE_ID", x_col = "Sample", add_text = FALSE ) ## Each brain donor has up to 6 unique RNA library type and RNA extraction ## combinations table(prop_long$Sample) / length(unique(prop_long$cell_type)) ## Here are the 6 "SAMPLE_ID" values for brain donor with ID "Br8667_mid" unique(prop_long$SAMPLE_ID[prop_long$Sample == "Br8667_mid"]) ## ----"compare prop"--------------------------------------------------------------------------------------------------- ## Combine Oligo and OPC into OligoOPC prop_long_opc <- prop_long |> mutate(cell_type = gsub("Oligo|OPC", "OligoOPC", cell_type)) |> group_by(SAMPLE_ID, Sample, library_combo, cell_type) |> summarize(prop = sum(prop)) |> ungroup() prop_long_opc |> count(cell_type) ## Join RNAScope/IF and Bisque cell type proportions prop_compare <- prop_long_opc |> inner_join( RNAScope_prop |> select(Sample, cell_type, prop_RNAScope = prop, prop_sn), by = c("Sample", "cell_type") ) ## ----"proportion scatter"--------------------------------------------------------------------------------------------- ## compute correlation with RNAScope/IF proportions cor(prop_compare$prop, prop_compare$prop_RNAScope) ## Scatter plot with RNAScope/IF proportions prop_compare |> ggplot(aes(x = prop_RNAScope, y = prop, color = cell_type, shape = library_combo)) + geom_point() + geom_abline() ## correlation with snRNA-seq proportion cor(prop_compare$prop, prop_compare$prop_sn) ## Scatter plot with RNAScope/IF proportions prop_compare |> ggplot(aes(x = prop_sn, y = prop, color = cell_type, shape = library_combo)) + geom_point() + geom_abline() ## ----"prep and run hspe"---------------------------------------------------------------------------------------------- if (FALSE) { ## Install hspe # if (!requireNamespace("hspe", quietly = TRUE)) { # ## Install version 0.1 which is the one listed on the main documentation # ## at https://github.com/gjhunt/hspe/tree/main?tab=readme-ov-file#software # remotes::install_url("https://github.com/gjhunt/hspe/raw/main/hspe_0.1.tar.gz") # ## Alternatively, install from the latest version on GitHub with: # # remotes::install_github("gjhunt/hspe", subdir = "lib_hspe") # # ## As of 2024-08-23, it's been 3 years since files were last modified # ## at https://github.com/gjhunt/hspe/tree/main/lib_hspe. # } # library("hspe") # pseudobulk the sce data by sample + cell type sce_pb <- spatialLIBD::registration_pseudobulk(sce, var_registration = "cellType_broad_hc", var_sample_id = "Sample" ) ## extract the gene expression from the bulk rse_gene mixture_samples <- t(assays(rse_gene)$logcounts) mixture_samples[1:5, 1:5] ## create a vector of indexes of the different cell types pure_samples <- rafalib::splitit(sce_pb$cellType_broad_hc) ## extract the the pseudobulked logcounts reference_samples <- t(assays(sce_pb)$logcounts) reference_samples[1:5, 1:5] ## check the number of genes match in the bulk (mixture) and single cell (reference) ncol(mixture_samples) == ncol(reference_samples) ## run hspe est_prop_hspe <- hspe( Y = mixture_samples, reference = reference_samples, pure_samples = pure_samples, markers = marker_genes, seed = 10524 ) } ## ----reproduce3, echo=FALSE------------------------------------------------------------------------------------------- ## Session info library("sessioninfo") options(width = 120) session_info() ## ----vignetteBiblio, results = "asis", echo = FALSE, warning = FALSE, message = FALSE--------------------------------- ## Print bibliography PrintBibliography(bib, .opts = list(hyperlink = "to.doc", style = "html"))