## ----setup, message = FALSE, warning = FALSE, comment = NA--------------------
knitr::opts_chunk$set(message = FALSE, warning = FALSE, comment = NA,
                      fig.width = 6.25, fig.height = 5)
library(ANCOMBC)
library(tidyverse)

## ----getPackage, eval=FALSE---------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  BiocManager::install("ANCOMBC")

## ----load, eval=FALSE---------------------------------------------------------
#  library(ANCOMBC)

## -----------------------------------------------------------------------------
data(atlas1006, package = "microbiome")
tse = mia::makeTreeSummarizedExperimentFromPhyloseq(atlas1006)

# Subset to baseline
tse = tse[, tse$time == 0]

# Re-code the bmi group
tse$bmi = recode(tse$bmi_group,
                 obese = "obese",
                 severeobese = "obese",
                 morbidobese = "obese")
# Subset to lean, overweight, and obese subjects
tse = tse[, tse$bmi %in% c("lean", "overweight", "obese")]

# Note that by default, levels of a categorical variable in R are sorted 
# alphabetically. In this case, the reference level for `bmi` will be 
# `lean`. To manually change the reference level, for instance, setting `obese`
# as the reference level, use:
tse$bmi = factor(tse$bmi, levels = c("obese", "overweight", "lean"))
# You can verify the change by checking:
# levels(sample_data(tse)$bmi)

# Create the region variable
tse$region = recode(as.character(tse$nationality),
                    Scandinavia = "NE", UKIE = "NE", SouthEurope = "SE", 
                    CentralEurope = "CE", EasternEurope = "EE",
                    .missing = "unknown")

# Discard "EE" as it contains only 1 subject
# Discard subjects with missing values of region
tse = tse[, ! tse$region %in% c("EE", "unknown")]

print(tse)

## -----------------------------------------------------------------------------
set.seed(123)
out = ancom(data = tse, assay_name = "counts", 
            tax_level = "Family", phyloseq = NULL, 
            p_adj_method = "holm", prv_cut = 0.10, lib_cut = 1000, 
            main_var = "bmi", adj_formula = "age + region", 
            rand_formula = NULL, lme_control = NULL, struc_zero = TRUE,
            neg_lb = TRUE, alpha = 0.05, n_cl = 2)

res = out$res

# Similarly, if the main variable of interest is continuous, such as age, the
# ancom model can be specified as
# out = ancom(data = tse, assay_name = "counts",
#             tax_level = "Family", phyloseq = NULL,
#             p_adj_method = "holm", prv_cut = 0.10, lib_cut = 1000,
#             main_var = "age", adj_formula = "bmi + region",
#             rand_formula = NULL, lme_control = NULL, struc_zero = FALSE,
#             neg_lb = FALSE, alpha = 0.05, n_cl = 2)

# ancom also supports importing data in phyloseq format
# tse_alt = agglomerateByRank(tse, "Family")
# pseq = makePhyloseqFromTreeSummarizedExperiment(tse_alt)
# out = ancom(data = NULL, assay_name = NULL,
#             tax_level = "Family", phyloseq = pseq,
#             p_adj_method = "holm", prv_cut = 0.10, lib_cut = 1000,
#             main_var = "bmi", adj_formula = "age + region",
#             rand_formula = NULL, lme_control = NULL, struc_zero = TRUE,
#             neg_lb = TRUE, alpha = 0.05, n_cl = 2)

## -----------------------------------------------------------------------------
q_val = out$q_data
beta_val = out$beta_data
# Only consider the effect sizes with the corresponding q-value less than alpha
beta_val = beta_val * (q_val < 0.05) 
# Choose the maximum of beta's as the effect size
beta_pos = apply(abs(beta_val), 2, which.max) 
beta_max = vapply(seq_along(beta_pos), function(i) 
    beta_val[beta_pos[i], i], FUN.VALUE = double(1))
# Number of taxa except structural zeros
n_taxa = ifelse(is.null(out$zero_ind), 
                nrow(tse), 
                sum(apply(out$zero_ind[, -1], 1, sum) == 0))
# Cutoff values for declaring differentially abundant taxa
cut_off = 0.7 * (n_taxa - 1)

df_fig_w = res %>%
  dplyr::mutate(beta = beta_max,
                direct = case_when(
                  detected_0.7 == TRUE & beta > 0 ~ "Positive",
                  detected_0.7 == TRUE & beta <= 0 ~ "Negative",
                  TRUE ~ "Not Significant"
                  )) %>%
  dplyr::arrange(W)
df_fig_w$taxon = factor(df_fig_w$taxon, levels = df_fig_w$taxon)
df_fig_w$W = replace(df_fig_w$W, is.infinite(df_fig_w$W), n_taxa - 1)
df_fig_w$direct = factor(df_fig_w$direct, 
                         levels = c("Negative", "Positive", "Not Significant"))

p_w = df_fig_w %>%
  ggplot(aes(x = taxon, y = W, color = direct)) +
  geom_point(size = 2, alpha = 0.6) +
  labs(x = "Taxon", y = "W") +
  scale_color_discrete(name = NULL) + 
  geom_hline(yintercept = cut_off, linetype = "dotted", 
             color = "blue", size = 1.5) +
  geom_text(aes(x = 2, y = cut_off + 0.5, label = "W[0.7]"), 
            size = 5, vjust = -0.5, hjust = 0, color = "orange", parse = TRUE) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major = element_blank())
p_w

## -----------------------------------------------------------------------------
data(dietswap, package = "microbiome")
tse = mia::makeTreeSummarizedExperimentFromPhyloseq(dietswap)
print(tse)

## -----------------------------------------------------------------------------
set.seed(123)
out = ancom(data = tse, assay_name = "counts", 
            tax_level = "Family", phyloseq = NULL, 
            p_adj_method = "holm", prv_cut = 0.10, lib_cut = 1000, 
            main_var = "group",
            adj_formula = "nationality + timepoint", 
            rand_formula = "(timepoint | subject)", 
            lme_control = lme4::lmerControl(), 
            struc_zero = TRUE, neg_lb = TRUE, alpha = 0.05, n_cl = 2)

res = out$res

## -----------------------------------------------------------------------------
q_val = out$q_data
beta_val = out$beta_data
# Only consider the effect sizes with the corresponding q-value less than alpha
beta_val = beta_val * (q_val < 0.05) 
# Choose the maximum of beta's as the effect size
beta_pos = apply(abs(beta_val), 2, which.max) 
beta_max = vapply(seq_along(beta_pos), function(i) beta_val[beta_pos[i], i],
                  FUN.VALUE = double(1))
# Number of taxa except structural zeros
n_taxa = ifelse(is.null(out$zero_ind), 
                nrow(tse), 
                sum(apply(out$zero_ind[, -1], 1, sum) == 0))
# Cutoff values for declaring differentially abundant taxa
cut_off = 0.7 * (n_taxa - 1)

df_fig_w = res %>%
  dplyr::mutate(beta = beta_max,
                direct = case_when(
                  detected_0.7 == TRUE & beta > 0 ~ "Positive",
                  detected_0.7 == TRUE & beta <= 0 ~ "Negative",
                  TRUE ~ "Not Significant"
                  )) %>%
  dplyr::arrange(W)
df_fig_w$taxon = factor(df_fig_w$taxon, levels = df_fig_w$taxon)
df_fig_w$W = replace(df_fig_w$W, is.infinite(df_fig_w$W), n_taxa - 1)
df_fig_w$direct = factor(df_fig_w$direct, 
                     levels = c("Negative", "Positive", "Not Significant"))

p_w = df_fig_w %>%
  ggplot(aes(x = taxon, y = W, color = direct)) +
  geom_point(size = 2, alpha = 0.6) +
  labs(x = "Taxon", y = "W") +
  scale_color_discrete(name = NULL) + 
  geom_hline(yintercept = cut_off, linetype = "dotted", 
             color = "blue", size = 1.5) +
  geom_text(aes(x = 2, y = cut_off + 0.5, label = "W[0.7]"), 
            size = 5, vjust = -0.5, hjust = 0, color = "orange", parse = TRUE) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major = element_blank())
p_w

## ----sessionInfo, message = FALSE, warning = FALSE, comment = NA--------------
sessionInfo()