## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", out.width = "100%" ) ## ----echo=TRUE, message=FALSE------------------------------------------------- library(tidyverse) library(CCAFE) library(DescTools) library(cowplot) ## ----Simulations-------------------------------------------------------------- # need to create simulated AFs from autosomes and sex chromosomes # simulate 100 variants from each chromosome # will simulate from a sample of 5000 XX and 5000 XY case individuals # then add 5000 of each in control set.seed(2020) N_XX <- 5000 N_XY <- 5000 N <- N_XX + N_XY for(chr in 1:24) { refs <- runif(n = 100, min = 0, max = 1) if(chr < 23) { genos <- t(sapply(refs, function(x) { rmultinom(1, N, c(x^2, 2*x*(1-x), (1-x)^2)) })) afs_toadd <- apply(genos, 1, function(x) ((x[1]*2)+x[2])/(2*N)) } else { if(chr == 23) { # simulate X chromosome variants XX_alleles <- (sapply(refs, function(x) { sum(rbinom(N_XX*2, 1, x)) })) XY_alleles <- (sapply(refs, function(x) { sum(rbinom(N_XY, 1, x)) })) total_alleles <- XX_alleles + XY_alleles afs_toadd <- total_alleles/(2*N_XX + N_XY) } else { # simulate Y chromosome variants XY_alleles <- (sapply(refs, function(x) { sum(rbinom(N_XY, 1, x)) })) afs_toadd <- XY_alleles/(N_XY) } } if(chr == 1) { simDat <- data.frame(chr = chr, pos = c(1:100), case_sim_af = afs_toadd) } else { simDat <- rbind(simDat, data.frame(chr = chr, pos = c(1:100), case_sim_af = afs_toadd)) } } simDat$control_sim_af <- 0 # simulate control AFs for(chr in 1:24) { refs <- simDat[simDat$chr == chr,]$case_sim_af refs <- refs - runif(n = length(refs), min = 0, max = .025) refs[refs < 0] <- 0 refs[refs > 1] <- 1 if(chr < 23) { genos <- t(sapply(refs, function(x) { rmultinom(1, N, c(x^2, 2*x*(1-x), (1-x)^2)) })) afs_toadd <- apply(genos, 1, function(x) ((x[1]*2)+x[2])/(2*N)) } else { if(chr == 23) { # simulate X chromosome variants XX_alleles <- (sapply(refs, function(x) { sum(rbinom(N_XX*2, 1, x)) })) XY_alleles <- (sapply(refs, function(x) { sum(rbinom(N_XY, 1, x)) })) total_alleles <- XX_alleles + XY_alleles afs_toadd <- total_alleles/(2*N_XX + N_XY) } else { # simulate Y chromosome variants XY_alleles <- (sapply(refs, function(x) { sum(rbinom(N_XY, 1, x)) })) afs_toadd <- XY_alleles/(N_XY) } } simDat[simDat$chr == chr, ]$control_sim_af <- afs_toadd } simDat$total_sim_af <- (simDat$case_sim_af*10000 + simDat$control_sim_af*10000)/(10000 + 10000) # need to add OR and SE OR <- rep(0, nrow(simDat)) SE <- rep(0, nrow(simDat)) for(i in 1:nrow(simDat)) { AF1 = simDat[i, ]$case_sim_af AF2 = simDat[i,]$control_sim_af if(simDat[i,]$chr < 23) { # first calculate the 2x2 tables of allele counts a = AF1 * 2 * 10000 b = (1-AF1) * 2 * 10000 c = AF2 * 2 * 10000 d = (1-AF2) * 2 * 10000 } else { if(simDat[i,]$chr == 23) { a = AF1 * (2*5000 + 5000) b = (1-AF1) * (2*5000 + 5000) c = AF2 * (2*5000 + 5000) d = (1-AF2) * (2*5000 + 5000) } else { a = AF1 * 5000 b = (1-AF1) * 5000 c = AF2 * 5000 d = (1-AF2) * 5000 } } OR[i] <- (a*d)/(b*c) SE[i] <- sqrt(1/a + 1/b + 1/c + 1/d) } simDat$OR <- OR simDat$SE <- SE simDat[simDat$chr == 23, ]$chr <- "X" simDat[simDat$chr == 24, ]$chr <- "Y" simDat <- simDat %>% rowwise() %>% mutate(is_minor_case = ifelse(case_sim_af <= 0.5, 1, 0), is_minor_control = ifelse(control_sim_af <= 0.5, 1, 0), is_minor_total = ifelse(total_sim_af <= 0.5, 1, 0)) %>% mutate(case_sim_maf = ifelse(is_minor_case, case_sim_af, 1-case_sim_af), control_sim_maf = ifelse(is_minor_control, control_sim_af, 1-control_sim_af), total_sim_maf = ifelse(is_minor_total, total_sim_af, 1-total_sim_af)) head(simDat) ## ----Any mismatched alleles--------------------------------------------------- nrow(simDat[simDat$is_minor_case != simDat$is_minor_control, ]) ## ----Show mismatched alleles-------------------------------------------------- diff_alleles <- simDat[simDat$is_minor_case != simDat$is_minor_control, ] diff_alleles ## ----Calculate AFs------------------------------------------------------------ simDat <- simDat[is.finite(simDat$OR), ] simDat <- as.data.frame(simDat) res_AF <- CaseControl_AF(data = simDat, N_case = 10000, N_control = 10000, OR_colname = "OR", AF_total_colname = "total_sim_af") res_MAF <- CaseControl_AF(data = simDat, N_case = 10000, N_control = 10000, OR_colname = "OR", AF_total_colname = "total_sim_maf") ## ----Plot_AF_results, message=FALSE------------------------------------------- p_1 <- ggplot(res_AF, aes(x = case_sim_af, y = AF_case)) + geom_point() + geom_abline(color = "red", linetype = "dashed") + theme_bw() + xlab("True AF") + ylab("Estimated AF") + geom_text(x = .175, y = .9, label = paste0("CCC=", round(DescTools::CCC(res_AF$case_sim_af, res_AF$AF_case, na.rm = TRUE)$rho.c$est, 4))) + ggtitle("1. No Conversion") res_AF$MAF_case <- sapply(res_AF$AF_case, function(x) ifelse(x > 0.5, 1-x, x)) res_AF <- as.data.frame(res_AF) p_2 <- ggplot(res_AF, aes(x = case_sim_maf, y = MAF_case)) + geom_point() + geom_abline(color = "red", linetype = "dashed") + theme_bw() + xlab("True MAF") + ylab("Estimated MAF") + geom_text(x = .1, y = .45, label = paste0("CCC=", round(DescTools::CCC(res_AF$case_sim_maf, res_AF$MAF_case, na.rm = TRUE)$rho.c$est, 4))) + ggtitle("2. Convert AFTER") p_3 <- ggplot(res_MAF, aes(x = case_sim_maf, y = AF_case)) + geom_point() + geom_abline(color = "red", linetype = "dashed") + theme_bw() + xlab("True MAF") + ylab("Estimated MAF") + geom_text(x = .1, y = .48, label = paste0("CCC=", round(DescTools::CCC(res_MAF$case_sim_maf, res_MAF$AF_case, na.rm = TRUE)$rho.c$est, 4))) + ggtitle("3. Convert BEFORE") cowplot::plot_grid(p_1, p_2, p_3, ncol = 3) ## ----ceate unaccounted for sex chr data--------------------------------------- # to simulate a dataset in which the sex chromosomes are not properly accounted for, we will falsely # rename the x and y chromosomes to autosomes simDat_nosex <- simDat simDat_nosex[simDat_nosex$chr == "X", ]$chr <- 1 simDat_nosex[simDat_nosex$chr == "Y", ]$chr <- 2 ## ----Get estimated AF/MAF----------------------------------------------------- af_res_sex <- CaseControl_AF(data = simDat, N_case = 10000, N_control = 10000, OR_colname = "OR", AF_total_colname = "total_sim_af") se_res <- CaseControl_SE(data = simDat_nosex, N_case = 10000, N_control = 10000, OR_colname = "OR", SE_colname = "SE", chromosome_colname = "chr", position_colname = "pos", sex_chromosomes = FALSE, do_correction = FALSE) se_res_sex <- CaseControl_SE(data = simDat, N_case = 10000, N_control = 10000, OR_colname = "OR", SE_colname = "SE", chromosome_colname = "chr", position_colname = "pos", sex_chromosomes = TRUE, remove_sex_chromosomes = FALSE, do_correction = FALSE, N_XX_case = 5000, N_XX_control = 5000, N_XY_case = 5000, N_XY_control = 5000) ## ----plot_sex_chromosome_results, message=FALSE------------------------------- p_a <- ggplot(af_res_sex, aes(x = case_sim_af, y = AF_case)) + geom_point() + geom_abline(color = "red", linetype = "dashed") + theme_bw() + xlab("True AF") + ylab("Estimated AF") + ggtitle("1. CaseControl_AF") p_b <- ggplot(se_res, aes(x = case_sim_maf, y = MAF_case)) + geom_point() + geom_abline(color = "red", linetype = "dashed") + theme_bw() + xlab("True MAF") + ylab("Estimated MAF") + ggtitle("2. CaseControl_SE") p_c <- ggplot(se_res_sex, aes(x = case_sim_maf, y = MAF_case)) + geom_point() + geom_abline(color = "red", linetype = "dashed") + theme_bw() + xlab("True MAF") + ylab("Estimated MAF") + ggtitle("3. CaseControl_SE") plot_grid(p_a, p_b, p_c, ncol =3) ## ----------------------------------------------------------------------------- sessionInfo()