## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    warning = FALSE,
    message = FALSE,
    comment = "#>"
)

options(
  rmarkdown.html_vignette.check_title = FALSE
)

## ----setup--------------------------------------------------------------------
library(tidytof)
library(dplyr)
library(ggplot2)
library(forcats)

## ----eval = FALSE-------------------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE)) {
#      install.packages("BiocManager")
#  }
#  
#  BiocManager::install("HDCytoData")

## ----message = FALSE, warning = FALSE-----------------------------------------
levine <-
    HDCytoData::Levine_32dim_flowSet() |>
    as_tof_tbl() |>
    # a bit of data cleaning
    dplyr::mutate(population_id = as.character(population_id)) |>
    dplyr::rename_with(
        .fn = \(x) stringr::str_to_lower(stringr::str_remove(x, "\\|.+"))
    ) |>
    dplyr::mutate(dplyr::across(c(file_number, population_id), as.character)) |>
    # arcsinh transformation
    tof_preprocess(
        channel_cols =
            c(-time, -cell_length, -event_number, -file_number, -population_id)
    )

## -----------------------------------------------------------------------------
# convert 5 counts to asinh value with a cofactor of 5
threshold <- asinh(5 / 5)

levine |>
    tof_assess_channels(
        negative_threshold = threshold,
        negative_proportion_flag = 0.975
    )

## -----------------------------------------------------------------------------
levine |>
    tof_plot_cells_density(marker_col = cd14)

## -----------------------------------------------------------------------------
levine |>
    tof_assess_flow_rate(
        time_col = time,
        num_timesteps = 200,
        # flag timepoints in which flow rates are high or low at a signicance level
        # of p = 0.01
        alpha_threshold = 0.01,
        # plot the number of cells in each timestep, and whether or not the
        # rates were flagged as too high or too low
        visualize = TRUE
    )

## -----------------------------------------------------------------------------
levine |>
    tof_assess_flow_rate(
        time_col = time,
        # analyze two files in the levine dataset separately
        group_cols = file_number,
        alpha_threshold = 0.01,
        visualize = TRUE
    )

## -----------------------------------------------------------------------------
set.seed(2020L)

# simulate large population of cells that truly belong in their assigned cluster
sim_data_base <-
    dplyr::tibble(
        cd45 = c(rnorm(n = 600), rnorm(n = 500, mean = -4)),
        cd38 =
            c(
                rnorm(n = 100, sd = 0.5),
                rnorm(n = 500, mean = -3),
                rnorm(n = 500, mean = 8)
            ),
        cd34 =
            c(
                rnorm(n = 100, sd = 0.2, mean = -10),
                rnorm(n = 500, mean = 4),
                rnorm(n = 500, mean = 60)
            ),
        cd19 = c(rnorm(n = 100, sd = 0.3, mean = 10), rnorm(n = 1000)),
        cluster_id = c(rep("a", 100), rep("b", 500), rep("c", 500)),
        dataset = "non-outlier"
    )

# simulate outlier cells that do not belong in their assigned cluster
sim_data_outlier <-
    dplyr::tibble(
        cd45 = c(rnorm(n = 10), rnorm(50, mean = 3), rnorm(n = 50, mean = -12)),
        cd38 =
            c(
                rnorm(n = 10, sd = 0.5),
                rnorm(n = 50, mean = -10),
                rnorm(n = 50, mean = 10)
            ),
        cd34 =
            c(
                rnorm(n = 10, sd = 0.2, mean = -15),
                rnorm(n = 50, mean = 15),
                rnorm(n = 50, mean = 70)
            ),
        cd19 = c(rnorm(n = 10, sd = 0.3, mean = 19), rnorm(n = 100)),
        cluster_id = c(rep("a", 10), rep("b", 50), rep("c", 50)),
        dataset = "outlier"
    )

# bind simulated data together
sim_data <- bind_rows(sim_data_base, sim_data_outlier)

## -----------------------------------------------------------------------------
sim_data |>
    tof_plot_cells_embedding(color_col = cluster_id)

## -----------------------------------------------------------------------------
sim_data |>
    tof_plot_cells_embedding(color_col = dataset)

## -----------------------------------------------------------------------------
sim_data |>
    tof_assess_clusters_distance(
        cluster_col = cluster_id,
        # flag cells with a mahalanobis distance z-score over 3
        z_threshold = 3,
        augment = TRUE
    ) |>
    # visualize result as above
    dplyr::select(-dplyr::starts_with(".mahala"), -z_score) |>
    dplyr::mutate(flagged_cell = as.character(flagged_cell)) |>
    tof_plot_cells_embedding(color_col = flagged_cell) +
    scale_fill_manual(values = tof_generate_palette(num_colors = 2))

## -----------------------------------------------------------------------------
sim_data <-
    dplyr::tibble(
        cd45 =
            c(
                rnorm(n = 1000, sd = 2),
                rnorm(n = 1000, mean = 2),
                rnorm(n = 1000, mean = -2)
            ),
        cd38 =
            c(
                rnorm(n = 1000, sd = 2),
                rnorm(n = 1000, mean = 2),
                rnorm(n = 1000, mean = -2)
            ),
        cd34 =
            c(
                rnorm(n = 1000, sd = 2),
                rnorm(n = 1000, mean = 2),
                rnorm(n = 1000, mean = -2)
            ),
        cd19 =
            c(
                rnorm(n = 1000, sd = 2),
                rnorm(n = 1000, mean = 2),
                rnorm(n = 1000, mean = -2)
            ),
        cluster_id = c(rep("a", 1000), rep("b", 1000), rep("c", 1000))
    )

## -----------------------------------------------------------------------------
sim_data |>
    tof_reduce_dimensions(method = "pca") |>
    tof_plot_cells_embedding(
        embedding_cols = c(.pc1, .pc2),
        color_col = cluster_id
    )

## -----------------------------------------------------------------------------
set.seed(17L)

entropy_result <-
    sim_data |>
    # cluster into 2 clusters
    tof_cluster(
        num_clusters = 2,
        method = "kmeans"
    ) |>
    # calculate the entropy of all cells
    tof_assess_clusters_entropy(
        cluster_col = .kmeans_cluster,
        marker_cols = starts_with("cd"),
        entropy_quantile = 0.8,
        augment = TRUE
    )

# plot the clusters in PCA space
entropy_result |>
    select(-starts_with(".mahala"), -flagged_cell) |>
    tof_reduce_dimensions(pca_cols = starts_with("cd"), method = "pca") |>
    tof_plot_cells_embedding(embedding_cols = c(.pc1, .pc2), color_col = .kmeans_cluster)

# show the entropy values for each cell
entropy_result |>
    select(-starts_with(".mahala"), -flagged_cell) |>
    tof_reduce_dimensions(pca_cols = starts_with("cd"), method = "pca") |>
    tof_plot_cells_embedding(embedding_cols = c(.pc1, .pc2), color_col = entropy) +
    scale_fill_viridis_c()

## -----------------------------------------------------------------------------
entropy_result |>
    select(-starts_with(".mahala")) |>
    tof_reduce_dimensions(pca_cols = starts_with("cd"), method = "pca") |>
    tof_plot_cells_embedding(embedding_cols = c(.pc1, .pc2), color_col = flagged_cell) +
    scale_fill_viridis_d()

## -----------------------------------------------------------------------------
entropy_result |>
    ggplot(aes(x = entropy, fill = cluster_id)) +
    geom_density(alpha = 0.4) +
    theme_bw()

## -----------------------------------------------------------------------------
clusters_to_keep <-
    levine |>
    dplyr::count(population_id) |>
    dplyr::slice_max(order_by = n, n = 5L) |>
    dplyr::arrange(n) |>
    pull(population_id)

smallest_cluster <- clusters_to_keep[1]
largest_cluster <- clusters_to_keep[[length(clusters_to_keep)]]

small_levine <-
    levine |>
    dplyr::filter(population_id %in% clusters_to_keep)

## -----------------------------------------------------------------------------
# perform the perturbation
small_levine <-
    small_levine |>
    dplyr::mutate(
        new_population_id =
            dplyr::if_else(
                population_id %in% smallest_cluster,
                sample(
                    clusters_to_keep[-which(clusters_to_keep %in% smallest_cluster)],
                    size = nrow(small_levine),
                    replace = TRUE
                ),
                population_id
            )
    )

# perform the entropy assessment
entropy_levine <-
    small_levine |>
    tof_assess_clusters_entropy(
        cluster_col = new_population_id,
        marker_cols = starts_with("cd"),
        augment = TRUE
    )

## -----------------------------------------------------------------------------
entropy_levine |>
    mutate(population_id = fct_reorder(population_id, entropy)) |>
    tof_plot_cells_density(
        marker_col = entropy,
        group_col = population_id,
        use_ggridges = TRUE,
        scale = 0.1
    ) +
    ggplot2::theme(legend.position = "none") +
    ggplot2::labs(x = "Entropy", y = "Cluster ID")

## -----------------------------------------------------------------------------
entropy_levine |>
    mutate(flagged_cell = entropy > quantile(entropy, prob = 0.9)) |>
    dplyr::count(population_id, flagged_cell) |>
    group_by(population_id) |>
    mutate(prop = n / sum(n)) |>
    ungroup() |>
    dplyr::filter(flagged_cell)

## -----------------------------------------------------------------------------
sessionInfo()