## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  out.width = "100%"
)

## ----echo=TRUE, message=FALSE-------------------------------------------------
library(tidyverse)
library(CCAFE)
library(DescTools)
library(cowplot)

## ----Simulations--------------------------------------------------------------
# need to create simulated AFs from autosomes and sex chromosomes

# simulate 100 variants from each chromosome

# will simulate from a sample of 5000 XX and 5000 XY case individuals
# then add 5000 of each in control

set.seed(2020)

N_XX <- 5000
N_XY <- 5000
N <- N_XX + N_XY

for(chr in 1:24) {
  refs <- runif(n = 100, min = 0, max = 1)
  if(chr < 23) {
    genos <- t(sapply(refs, function(x) {
      rmultinom(1, N, c(x^2, 2*x*(1-x), (1-x)^2))
    }))
    afs_toadd <- apply(genos, 1, function(x) ((x[1]*2)+x[2])/(2*N))
  } else {
    if(chr == 23) { # simulate X chromosome variants
      XX_alleles <- (sapply(refs, function(x) {
        sum(rbinom(N_XX*2, 1, x))
      }))
      XY_alleles <- (sapply(refs, function(x) {
        sum(rbinom(N_XY, 1, x))
      }))
      
      total_alleles <- XX_alleles + XY_alleles
      afs_toadd <- total_alleles/(2*N_XX + N_XY)
    } else { # simulate Y chromosome variants
      XY_alleles <- (sapply(refs, function(x) {
        sum(rbinom(N_XY, 1, x))
      }))
      
      afs_toadd <- XY_alleles/(N_XY)
    }
  }
  if(chr == 1) {
    simDat <- data.frame(chr = chr, pos = c(1:100), case_sim_af = afs_toadd)
  } else {
    simDat <- rbind(simDat, data.frame(chr = chr, pos = c(1:100), case_sim_af = afs_toadd))
  }
}

simDat$control_sim_af <- 0

# simulate control AFs

for(chr in 1:24) {
  refs <- simDat[simDat$chr == chr,]$case_sim_af
  refs <- refs - runif(n = length(refs), min = 0, max = .025)
  refs[refs < 0] <- 0
  refs[refs > 1] <- 1
  
  if(chr < 23) {
    genos <- t(sapply(refs, function(x) {
      rmultinom(1, N, c(x^2, 2*x*(1-x), (1-x)^2))
    }))
    afs_toadd <- apply(genos, 1, function(x) ((x[1]*2)+x[2])/(2*N))
  } else {
    if(chr == 23) { # simulate X chromosome variants
      XX_alleles <- (sapply(refs, function(x) {
        sum(rbinom(N_XX*2, 1, x))
      }))
      XY_alleles <- (sapply(refs, function(x) {
        sum(rbinom(N_XY, 1, x))
      }))
      
      total_alleles <- XX_alleles + XY_alleles
      afs_toadd <- total_alleles/(2*N_XX + N_XY)
    } else { # simulate Y chromosome variants
      XY_alleles <- (sapply(refs, function(x) {
        sum(rbinom(N_XY, 1, x))
      }))
      
      afs_toadd <- XY_alleles/(N_XY)
    }
  }
  simDat[simDat$chr == chr, ]$control_sim_af <- afs_toadd
}

simDat$total_sim_af <- (simDat$case_sim_af*10000 + simDat$control_sim_af*10000)/(10000 + 10000)

# need to add OR and SE
OR <- rep(0, nrow(simDat))
SE <- rep(0, nrow(simDat))

for(i in 1:nrow(simDat)) {
  AF1 = simDat[i, ]$case_sim_af
  AF2 = simDat[i,]$control_sim_af
  if(simDat[i,]$chr < 23) {
    # first calculate the 2x2 tables of allele counts
    a = AF1 * 2 * 10000 
    b = (1-AF1) * 2 * 10000
    c = AF2 * 2 * 10000
    d = (1-AF2) * 2 * 10000
  } else {
    if(simDat[i,]$chr == 23) {
      a = AF1 * (2*5000 + 5000)
      b = (1-AF1) * (2*5000 + 5000)
      c = AF2 * (2*5000 + 5000)
      d = (1-AF2) * (2*5000 + 5000)
    } else {
      a = AF1 * 5000
      b = (1-AF1) * 5000
      c = AF2 * 5000
      d = (1-AF2) * 5000
    }
  }
  
  OR[i] <- (a*d)/(b*c)
  SE[i] <- sqrt(1/a + 1/b + 1/c + 1/d)
}

simDat$OR <- OR
simDat$SE <- SE

simDat[simDat$chr == 23, ]$chr <- "X"
simDat[simDat$chr == 24, ]$chr <- "Y"

simDat <- simDat %>% rowwise() %>%
  mutate(is_minor_case = ifelse(case_sim_af <= 0.5, 1, 0),
         is_minor_control = ifelse(control_sim_af <= 0.5, 1, 0),
         is_minor_total = ifelse(total_sim_af <= 0.5, 1, 0)) %>%
  mutate(case_sim_maf = ifelse(is_minor_case, case_sim_af, 1-case_sim_af),
         control_sim_maf = ifelse(is_minor_control, control_sim_af, 1-control_sim_af),
         total_sim_maf = ifelse(is_minor_total, total_sim_af, 1-total_sim_af))

head(simDat)

## ----Any mismatched alleles---------------------------------------------------
nrow(simDat[simDat$is_minor_case != simDat$is_minor_control, ])

## ----Show mismatched alleles--------------------------------------------------
diff_alleles <- simDat[simDat$is_minor_case != simDat$is_minor_control, ]
diff_alleles

## ----Calculate AFs------------------------------------------------------------
simDat <- simDat[is.finite(simDat$OR), ]
simDat <- as.data.frame(simDat)

res_AF <- CaseControl_AF(data = simDat,
                         N_case = 10000,
                         N_control = 10000,
                         OR_colname = "OR",
                         AF_total_colname = "total_sim_af")

res_MAF <- CaseControl_AF(data = simDat,
                         N_case = 10000,
                         N_control = 10000,
                         OR_colname = "OR",
                         AF_total_colname = "total_sim_maf")

## ----Plot_AF_results, message=FALSE-------------------------------------------
p_1 <- ggplot(res_AF, aes(x = case_sim_af, y = AF_case)) +
  geom_point() +
  geom_abline(color = "red", linetype = "dashed") +
  theme_bw() +
  xlab("True AF") +
  ylab("Estimated AF") +
  geom_text(x = .175, y = .9, label = paste0("CCC=", round(DescTools::CCC(res_AF$case_sim_af, res_AF$AF_case, na.rm = TRUE)$rho.c$est, 4))) +
  ggtitle("1. No Conversion")

res_AF$MAF_case <- sapply(res_AF$AF_case, function(x) ifelse(x > 0.5, 1-x, x))
res_AF <- as.data.frame(res_AF)
p_2 <- ggplot(res_AF, aes(x = case_sim_maf, y = MAF_case)) +
  geom_point() +
  geom_abline(color = "red", linetype = "dashed") +
  theme_bw() +
  xlab("True MAF") +
  ylab("Estimated MAF") +
  geom_text(x = .1, y = .45, label = paste0("CCC=", round(DescTools::CCC(res_AF$case_sim_maf, res_AF$MAF_case, na.rm = TRUE)$rho.c$est, 4))) +
  ggtitle("2. Convert AFTER")

p_3 <- ggplot(res_MAF, aes(x = case_sim_maf, y = AF_case)) +
  geom_point() +
  geom_abline(color = "red", linetype = "dashed") +
  theme_bw() +
  xlab("True MAF") +
  ylab("Estimated MAF") +
  geom_text(x = .1, y = .48, label = paste0("CCC=", round(DescTools::CCC(res_MAF$case_sim_maf, res_MAF$AF_case, na.rm = TRUE)$rho.c$est, 4))) +
  ggtitle("3. Convert BEFORE")

cowplot::plot_grid(p_1, p_2, p_3, ncol = 3)

## ----ceate unaccounted for sex chr data---------------------------------------
# to simulate a dataset in which the sex chromosomes are not properly accounted for, we will falsely
# rename the x and y chromosomes to autosomes
simDat_nosex <- simDat
simDat_nosex[simDat_nosex$chr == "X", ]$chr <- 1
simDat_nosex[simDat_nosex$chr == "Y", ]$chr <- 2

## ----Get estimated AF/MAF-----------------------------------------------------
af_res_sex <- CaseControl_AF(data = simDat,
                           N_case = 10000,
                           N_control = 10000,
                           OR_colname = "OR",
                           AF_total_colname = "total_sim_af")

se_res <- CaseControl_SE(data = simDat_nosex,
                                N_case = 10000,
                                N_control = 10000,
                                OR_colname = "OR",
                                SE_colname = "SE",
                                chromosome_colname = "chr",
                                position_colname = "pos",
                                sex_chromosomes = FALSE,
                                do_correction = FALSE)

se_res_sex <- CaseControl_SE(data = simDat,
                             N_case = 10000,
                             N_control = 10000,
                             OR_colname = "OR",
                             SE_colname = "SE",
                             chromosome_colname = "chr",
                             position_colname = "pos",
                             sex_chromosomes = TRUE,
                             remove_sex_chromosomes = FALSE,
                             do_correction = FALSE,
                             N_XX_case = 5000,
                             N_XX_control = 5000,
                             N_XY_case = 5000,
                             N_XY_control = 5000)

## ----plot_sex_chromosome_results, message=FALSE-------------------------------
p_a <- ggplot(af_res_sex, aes(x = case_sim_af, y = AF_case)) +
  geom_point() +
  geom_abline(color = "red", linetype = "dashed") +
  theme_bw() +
  xlab("True AF") +
  ylab("Estimated AF") +
  ggtitle("1. CaseControl_AF")

p_b <- ggplot(se_res, aes(x = case_sim_maf, y = MAF_case)) +
  geom_point() +
  geom_abline(color = "red", linetype = "dashed") +
  theme_bw() +
  xlab("True MAF") +
  ylab("Estimated MAF") +
  ggtitle("2. CaseControl_SE") 

p_c <- ggplot(se_res_sex, aes(x = case_sim_maf, y = MAF_case)) +
  geom_point() +
  geom_abline(color = "red", linetype = "dashed") +
  theme_bw() +
  xlab("True MAF") +
  ylab("Estimated MAF") +
  ggtitle("3. CaseControl_SE") 

plot_grid(p_a, p_b, p_c, ncol =3)

## -----------------------------------------------------------------------------
sessionInfo()