In this vignette we will share a few extra details regarding the usage of the functions in CCAFE.
For the vignette with basic usage and examples see the CCAFE Vignette.
Will create a simulated dataset to explore the issues discussed in this vignette.
This dataset will have:
library(tidyverse)
library(CCAFE)
library(DescTools)
library(cowplot)
# need to create simulated AFs from autosomes and sex chromosomes
# simulate 100 variants from each chromosome
# will simulate from a sample of 5000 XX and 5000 XY case individuals
# then add 5000 of each in control
set.seed(2020)
N_XX <- 5000
N_XY <- 5000
N <- N_XX + N_XY
for(chr in 1:24) {
refs <- runif(n = 100, min = 0, max = 1)
if(chr < 23) {
genos <- t(sapply(refs, function(x) {
rmultinom(1, N, c(x^2, 2*x*(1-x), (1-x)^2))
}))
afs_toadd <- apply(genos, 1, function(x) ((x[1]*2)+x[2])/(2*N))
} else {
if(chr == 23) { # simulate X chromosome variants
XX_alleles <- (sapply(refs, function(x) {
sum(rbinom(N_XX*2, 1, x))
}))
XY_alleles <- (sapply(refs, function(x) {
sum(rbinom(N_XY, 1, x))
}))
total_alleles <- XX_alleles + XY_alleles
afs_toadd <- total_alleles/(2*N_XX + N_XY)
} else { # simulate Y chromosome variants
XY_alleles <- (sapply(refs, function(x) {
sum(rbinom(N_XY, 1, x))
}))
afs_toadd <- XY_alleles/(N_XY)
}
}
if(chr == 1) {
simDat <- data.frame(chr = chr, pos = c(1:100), case_sim_af = afs_toadd)
} else {
simDat <- rbind(simDat, data.frame(chr = chr, pos = c(1:100), case_sim_af = afs_toadd))
}
}
simDat$control_sim_af <- 0
# simulate control AFs
for(chr in 1:24) {
refs <- simDat[simDat$chr == chr,]$case_sim_af
refs <- refs - runif(n = length(refs), min = 0, max = .025)
refs[refs < 0] <- 0
refs[refs > 1] <- 1
if(chr < 23) {
genos <- t(sapply(refs, function(x) {
rmultinom(1, N, c(x^2, 2*x*(1-x), (1-x)^2))
}))
afs_toadd <- apply(genos, 1, function(x) ((x[1]*2)+x[2])/(2*N))
} else {
if(chr == 23) { # simulate X chromosome variants
XX_alleles <- (sapply(refs, function(x) {
sum(rbinom(N_XX*2, 1, x))
}))
XY_alleles <- (sapply(refs, function(x) {
sum(rbinom(N_XY, 1, x))
}))
total_alleles <- XX_alleles + XY_alleles
afs_toadd <- total_alleles/(2*N_XX + N_XY)
} else { # simulate Y chromosome variants
XY_alleles <- (sapply(refs, function(x) {
sum(rbinom(N_XY, 1, x))
}))
afs_toadd <- XY_alleles/(N_XY)
}
}
simDat[simDat$chr == chr, ]$control_sim_af <- afs_toadd
}
simDat$total_sim_af <- (simDat$case_sim_af*10000 + simDat$control_sim_af*10000)/(10000 + 10000)
# need to add OR and SE
OR <- rep(0, nrow(simDat))
SE <- rep(0, nrow(simDat))
for(i in 1:nrow(simDat)) {
AF1 = simDat[i, ]$case_sim_af
AF2 = simDat[i,]$control_sim_af
if(simDat[i,]$chr < 23) {
# first calculate the 2x2 tables of allele counts
a = AF1 * 2 * 10000
b = (1-AF1) * 2 * 10000
c = AF2 * 2 * 10000
d = (1-AF2) * 2 * 10000
} else {
if(simDat[i,]$chr == 23) {
a = AF1 * (2*5000 + 5000)
b = (1-AF1) * (2*5000 + 5000)
c = AF2 * (2*5000 + 5000)
d = (1-AF2) * (2*5000 + 5000)
} else {
a = AF1 * 5000
b = (1-AF1) * 5000
c = AF2 * 5000
d = (1-AF2) * 5000
}
}
OR[i] <- (a*d)/(b*c)
SE[i] <- sqrt(1/a + 1/b + 1/c + 1/d)
}
simDat$OR <- OR
simDat$SE <- SE
simDat[simDat$chr == 23, ]$chr <- "X"
simDat[simDat$chr == 24, ]$chr <- "Y"
simDat <- simDat %>% rowwise() %>%
mutate(is_minor_case = ifelse(case_sim_af <= 0.5, 1, 0),
is_minor_control = ifelse(control_sim_af <= 0.5, 1, 0),
is_minor_total = ifelse(total_sim_af <= 0.5, 1, 0)) %>%
mutate(case_sim_maf = ifelse(is_minor_case, case_sim_af, 1-case_sim_af),
control_sim_maf = ifelse(is_minor_control, control_sim_af, 1-control_sim_af),
total_sim_maf = ifelse(is_minor_total, total_sim_af, 1-total_sim_af))
head(simDat)
#> # A tibble: 6 × 13
#> # Rowwise:
#> chr pos case_sim_af control_sim_af total_sim_af OR SE is_minor_case
#> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 0.649 0.636 0.642 1.06 0.0209 0
#> 2 1 2 0.394 0.389 0.392 1.02 0.0205 1
#> 3 1 3 0.615 0.591 0.603 1.11 0.0204 0
#> 4 1 4 0.476 0.456 0.466 1.09 0.0201 1
#> 5 1 5 0.138 0.117 0.128 1.20 0.0300 1
#> 6 1 6 0.0678 0.0452 0.0566 1.54 0.0441 1
#> # ℹ 5 more variables: is_minor_control <dbl>, is_minor_total <dbl>,
#> # case_sim_maf <dbl>, control_sim_maf <dbl>, total_sim_maf <dbl>
A key assumption made by the implementation of CaseControl_SE is that the frequency being estimated is that of the minor allele. As such, the returned value is always the minor allele and bounded [0,0.5], regardless of which allele was used for calculation of the input OR and SE. This can create complications in data harmonization between multiple datasets, because the minor allele may not always be the same between the two datasets. We note that care should be taken when using this method to reconstruct case and control AFs for use in secondary analyses as the alleles being compared may not be retained.
nrow(simDat[simDat$is_minor_case != simDat$is_minor_control, ])
#> [1] 32
There are 32 variants for which the minor allele is a different allele in cases and controls.
diff_alleles <- simDat[simDat$is_minor_case != simDat$is_minor_control, ]
diff_alleles
#> # A tibble: 32 × 13
#> # Rowwise:
#> chr pos case_sim_af control_sim_af total_sim_af OR SE
#> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 95 0.507 0.490 0.498 1.07 0.0200
#> 2 2 28 0.504 0.493 0.499 1.05 0.0200
#> 3 4 47 0.520 0.491 0.505 1.12 0.0200
#> 4 6 7 0.504 0.495 0.500 1.04 0.0200
#> 5 6 81 0.501 0.496 0.498 1.02 0.0200
#> 6 7 57 0.504 0.498 0.501 1.02 0.0200
#> 7 7 88 0.519 0.495 0.507 1.10 0.0200
#> 8 8 14 0.503 0.485 0.494 1.07 0.0200
#> 9 8 23 0.522 0.499 0.511 1.10 0.0200
#> 10 8 98 0.501 0.481 0.491 1.08 0.0200
#> # ℹ 22 more rows
#> # ℹ 6 more variables: is_minor_case <dbl>, is_minor_control <dbl>,
#> # is_minor_total <dbl>, case_sim_maf <dbl>, control_sim_maf <dbl>,
#> # total_sim_maf <dbl>
This will results in different alleles being reported from CaseControl_SE, which, if used for subsequent analyses like case case GWAS (CC-GWAS) would result in the user comparing different alleles between datasets/groups.
In analyses for the manuscript we assessed using the total sample MAF compared to the total sample AF. Here we noted that converting the total AF to MAF first (and using that as input) introduces variability due to rounding.
Here we will show the effects of converting to MAF in 3 scenarios:
simDat <- simDat[is.finite(simDat$OR), ]
simDat <- as.data.frame(simDat)
res_AF <- CaseControl_AF(data = simDat,
N_case = 10000,
N_control = 10000,
OR_colname = "OR",
AF_total_colname = "total_sim_af")
res_MAF <- CaseControl_AF(data = simDat,
N_case = 10000,
N_control = 10000,
OR_colname = "OR",
AF_total_colname = "total_sim_maf")
p_1 <- ggplot(res_AF, aes(x = case_sim_af, y = AF_case)) +
geom_point() +
geom_abline(color = "red", linetype = "dashed") +
theme_bw() +
xlab("True AF") +
ylab("Estimated AF") +
geom_text(x = .175, y = .9, label = paste0("CCC=", round(DescTools::CCC(res_AF$case_sim_af, res_AF$AF_case, na.rm = TRUE)$rho.c$est, 4))) +
ggtitle("1. No Conversion")
#> Warning in sqrt(((1 - ((r)^2)) * (p)^2 * (1 - ((p)^2))/(r)^2 + (2 * (p)^3 * :
#> NaNs produced
res_AF$MAF_case <- sapply(res_AF$AF_case, function(x) ifelse(x > 0.5, 1-x, x))
res_AF <- as.data.frame(res_AF)
p_2 <- ggplot(res_AF, aes(x = case_sim_maf, y = MAF_case)) +
geom_point() +
geom_abline(color = "red", linetype = "dashed") +
theme_bw() +
xlab("True MAF") +
ylab("Estimated MAF") +
geom_text(x = .1, y = .45, label = paste0("CCC=", round(DescTools::CCC(res_AF$case_sim_maf, res_AF$MAF_case, na.rm = TRUE)$rho.c$est, 4))) +
ggtitle("2. Convert AFTER")
#> Warning in sqrt(((1 - ((r)^2)) * (p)^2 * (1 - ((p)^2))/(r)^2 + (2 * (p)^3 * :
#> NaNs produced
p_3 <- ggplot(res_MAF, aes(x = case_sim_maf, y = AF_case)) +
geom_point() +
geom_abline(color = "red", linetype = "dashed") +
theme_bw() +
xlab("True MAF") +
ylab("Estimated MAF") +
geom_text(x = .1, y = .48, label = paste0("CCC=", round(DescTools::CCC(res_MAF$case_sim_maf, res_MAF$AF_case, na.rm = TRUE)$rho.c$est, 4))) +
ggtitle("3. Convert BEFORE")
cowplot::plot_grid(p_1, p_2, p_3, ncol = 3)
#> Warning: Removed 5 rows containing missing values or values outside the scale range
#> (`geom_point()`).
#> Warning: Removed 5 rows containing missing values or values outside the scale range
#> (`geom_point()`).
#> Removed 5 rows containing missing values or values outside the scale range
#> (`geom_point()`).
Here it’s clear to see that as long as the user does not convert to MAF before estimating the
case and control AFs, then we get perfect reconstruction of the AFs. Conversely, converting the
total sample AF to MAF first introduces variability around the estimates (seen in scenario 3).
Overall, the estimates are still highly accurate, but when possible we recommend not converting
AFs to MAF before inputting to CaseControl_AF.
Another potential problem that users may encounter is the presence of sex chromosomes when using CaseControl_SE.
The derivation for the framework underlying CaseControl_SE utilizes the total allele number (AN) for cases and controls. The original proposed method in ReACt used the equation AN = 2*N, where N is the sample size. However, this is only true in all samples for autosomes. This equation will also hold true for the X chromosome in an all biological female (XX) sample. However, in any sample in which there are both XX and XY individuals, this equation will not give the correct total AN for the sex chromosomes.
We have added the ability to estimate the case and control MAF for the sex chromosomes by employing the following equations for the AN in the original framework:
\[ AN_X = 2*N_{XX individuals} + N_{XY individuals} \] \[ AN_Y = N_{XY individuals} \] \[ AN_{autosomes} = 2*N\] Failing to use the correct AN resulted in MAF estimates that did not follow the same trend as the autosomes. We require chromosome data with the input for CaseControl_SE to check for the presence of variants on sex chromosomes. Notably, in order to estimate the MAFs for these sex chromosomes, users must know and provide as input the number of XX and XY individuals per case and control sample. While it is reasonable to expect most studies to publish this demographic information, it may not always be available. In this case, users can set the flag ‘remove_sex_chromosomes’ to TRUE and the method will return the MAFs for only autosomal variants. We will demonstrate below what the results may look like if the sex chromosomes are not properly accounted for.
The implementation of CaseControl_AF directly uses sample size and total AF, rather than total AN. Thus this method is not sensitive to the XX and XY specific samples sizes. We will also show this below.
# to simulate a dataset in which the sex chromosomes are not properly accounted for, we will falsely
# rename the x and y chromosomes to autosomes
simDat_nosex <- simDat
simDat_nosex[simDat_nosex$chr == "X", ]$chr <- 1
simDat_nosex[simDat_nosex$chr == "Y", ]$chr <- 2
af_res_sex <- CaseControl_AF(data = simDat,
N_case = 10000,
N_control = 10000,
OR_colname = "OR",
AF_total_colname = "total_sim_af")
se_res <- CaseControl_SE(data = simDat_nosex,
N_case = 10000,
N_control = 10000,
OR_colname = "OR",
SE_colname = "SE",
chromosome_colname = "chr",
position_colname = "pos",
sex_chromosomes = FALSE,
do_correction = FALSE)
se_res_sex <- CaseControl_SE(data = simDat,
N_case = 10000,
N_control = 10000,
OR_colname = "OR",
SE_colname = "SE",
chromosome_colname = "chr",
position_colname = "pos",
sex_chromosomes = TRUE,
remove_sex_chromosomes = FALSE,
do_correction = FALSE,
N_XX_case = 5000,
N_XX_control = 5000,
N_XY_case = 5000,
N_XY_control = 5000)
Examine the following scenarios:
p_a <- ggplot(af_res_sex, aes(x = case_sim_af, y = AF_case)) +
geom_point() +
geom_abline(color = "red", linetype = "dashed") +
theme_bw() +
xlab("True AF") +
ylab("Estimated AF") +
ggtitle("1. CaseControl_AF")
p_b <- ggplot(se_res, aes(x = case_sim_maf, y = MAF_case)) +
geom_point() +
geom_abline(color = "red", linetype = "dashed") +
theme_bw() +
xlab("True MAF") +
ylab("Estimated MAF") +
ggtitle("2. CaseControl_SE")
p_c <- ggplot(se_res_sex, aes(x = case_sim_maf, y = MAF_case)) +
geom_point() +
geom_abline(color = "red", linetype = "dashed") +
theme_bw() +
xlab("True MAF") +
ylab("Estimated MAF") +
ggtitle("3. CaseControl_SE")
plot_grid(p_a, p_b, p_c, ncol =3)
#> Warning: Removed 5 rows containing missing values or values outside the scale range
#> (`geom_point()`).
sessionInfo()
#> R Under development (unstable) (2025-03-13 r87965)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.21-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] cowplot_1.1.3 DescTools_0.99.60
#> [3] VariantAnnotation_1.53.1 Rsamtools_2.23.1
#> [5] Biostrings_2.75.4 XVector_0.47.2
#> [7] SummarizedExperiment_1.37.0 Biobase_2.67.0
#> [9] GenomicRanges_1.59.1 GenomeInfoDb_1.43.4
#> [11] IRanges_2.41.3 S4Vectors_0.45.4
#> [13] MatrixGenerics_1.19.1 matrixStats_1.5.0
#> [15] BiocGenerics_0.53.6 generics_0.1.3
#> [17] lubridate_1.9.4 forcats_1.0.0
#> [19] stringr_1.5.1 dplyr_1.1.4
#> [21] purrr_1.0.4 readr_2.1.5
#> [23] tidyr_1.3.1 tibble_3.2.1
#> [25] ggplot2_3.5.1 tidyverse_2.0.0
#> [27] CCAFE_0.99.7 BiocStyle_2.35.0
#>
#> loaded via a namespace (and not attached):
#> [1] DBI_1.2.3 bitops_1.0-9 gld_2.6.7
#> [4] readxl_1.4.5 rlang_1.1.5 magrittr_2.0.3
#> [7] e1071_1.7-16 compiler_4.6.0 RSQLite_2.3.9
#> [10] GenomicFeatures_1.59.1 png_0.1-8 vctrs_0.6.5
#> [13] pkgconfig_2.0.3 crayon_1.5.3 fastmap_1.2.0
#> [16] magick_2.8.6 labeling_0.4.3 utf8_1.2.4
#> [19] rmarkdown_2.29 tzdb_0.5.0 haven_2.5.4
#> [22] UCSC.utils_1.3.1 tinytex_0.56 bit_4.6.0
#> [25] xfun_0.51 cachem_1.1.0 jsonlite_2.0.0
#> [28] blob_1.2.4 DelayedArray_0.33.6 BiocParallel_1.41.5
#> [31] parallel_4.6.0 R6_2.6.1 bslib_0.9.0
#> [34] stringi_1.8.7 rtracklayer_1.67.1 boot_1.3-31
#> [37] cellranger_1.1.0 jquerylib_0.1.4 Rcpp_1.0.14
#> [40] bookdown_0.42 knitr_1.50 Matrix_1.7-3
#> [43] timechange_0.3.0 tidyselect_1.2.1 rstudioapi_0.17.1
#> [46] abind_1.4-8 yaml_2.3.10 codetools_0.2-20
#> [49] curl_6.2.2 lattice_0.22-6 withr_3.0.2
#> [52] KEGGREST_1.47.0 evaluate_1.0.3 proxy_0.4-27
#> [55] pillar_1.10.1 BiocManager_1.30.25 RCurl_1.98-1.17
#> [58] hms_1.1.3 rootSolve_1.8.2.4 munsell_0.5.1
#> [61] scales_1.3.0 class_7.3-23 glue_1.8.0
#> [64] lmom_3.2 tools_4.6.0 BiocIO_1.17.1
#> [67] data.table_1.17.0 BSgenome_1.75.1 GenomicAlignments_1.43.0
#> [70] Exact_3.3 fs_1.6.5 mvtnorm_1.3-3
#> [73] XML_3.99-0.18 grid_4.6.0 AnnotationDbi_1.69.0
#> [76] colorspace_2.1-1 GenomeInfoDbData_1.2.14 restfulr_0.0.15
#> [79] cli_3.6.4 expm_1.0-0 S4Arrays_1.7.3
#> [82] gtable_0.3.6 sass_0.4.9 digest_0.6.37
#> [85] SparseArray_1.7.7 rjson_0.2.23 farver_2.1.2
#> [88] memoise_2.0.1 htmltools_0.5.8.1 lifecycle_1.0.4
#> [91] httr_1.4.7 MASS_7.3-65 bit64_4.6.0-1