### R code from vignette source 'vignettes/SeqArray/inst/doc/SeqArrayTutorial.Rnw' ################################################### ### code chunk number 1: SeqArrayTutorial.Rnw:90-92 ################################################### # load the R package SeqArray library(SeqArray) ################################################### ### code chunk number 2: SeqArrayTutorial.Rnw:96-101 ################################################### gds.fn <- seqExampleFileName("gds") # or gds.fn <- "C:/YourFolder/Your_GDS_File.gds" gds.fn seqSummary(gds.fn) ################################################### ### code chunk number 3: SeqArrayTutorial.Rnw:105-110 ################################################### # open a GDS file genofile <- seqOpen(gds.fn) # display the contents of the GDS file in a hierarchical structure genofile ################################################### ### code chunk number 4: SeqArrayTutorial.Rnw:114-119 ################################################### # display all contents of the GDS file print(genofile, all=TRUE) # close the GDS file seqClose(genofile) ################################################### ### code chunk number 5: SeqArrayTutorial.Rnw:173-180 ################################################### # the VCF file, using the example in the SeqArray package vcf.fn <- seqExampleFileName("vcf") # or vcf.fn <- "C:/YourFolder/Your_VCF_File.vcf" vcf.fn # parse the header seqVCF.Header(vcf.fn) ################################################### ### code chunk number 6: SeqArrayTutorial.Rnw:184-187 ################################################### # convert, save in "tmp.gds" seqVCF2GDS(vcf.fn, "tmp.gds") seqSummary("tmp.gds") ################################################### ### code chunk number 7: SeqArrayTutorial.Rnw:190-191 ################################################### unlink("tmp.gds") ################################################### ### code chunk number 8: SeqArrayTutorial.Rnw:200-221 ################################################### # the file of GDS gds.fn <- seqExampleFileName("gds") # or gds.fn <- "C:/YourFolder/Your_GDS_File.gds" # open a GDS file genofile <- seqOpen(gds.fn) # convert seqGDS2VCF(genofile, "tmp.vcf.gz") # read z <- readLines("tmp.vcf.gz", n=20) for (i in z) cat(substr(i, 1, 76), " ...\n", sep="") # output BN,GP,AA,HM2 in INFO (the variables are in this order), no FORMAT seqGDS2VCF(genofile, "tmp2.vcf.gz", info.var=c("BN","GP","AA","HM2"), fmt.var=character(0)) # read z <- readLines("tmp2.vcf.gz", n=15) for (i in z) cat(substr(i, 1, 56), " ...\n", sep="") # close the GDS file seqClose(genofile) ################################################### ### code chunk number 9: SeqArrayTutorial.Rnw:243-245 ################################################### # delete temporary files unlink(c("tmp.vcf.gz", "tmp1.vcf.gz", "tmp2.vcf.gz")) ################################################### ### code chunk number 10: SeqArrayTutorial.Rnw:254-278 ################################################### # the file of VCF vcf.fn <- seqExampleFileName("vcf") # or vcf.fn <- "C:/YourFolder/Your_VCF_File.vcf" # convert seqVCF2GDS(vcf.fn, "tmp.gds", verbose=FALSE) # make sure that open with "readonly=FALSE" genofile <- seqOpen("tmp.gds", readonly=FALSE) # display the original structure genofile # delete "HM2", "HM3", "AA", "OR" and "DP" seqDelete(genofile, info.varname=c("HM2", "HM3", "AA", "OR"), format.varname="DP") # display genofile # close the GDS file seqClose(genofile) # clean up the fragments, reduce the file size cleanup.gds("tmp.gds") ################################################### ### code chunk number 11: SeqArrayTutorial.Rnw:281-282 ################################################### unlink("tmp.gds") ################################################### ### code chunk number 12: SeqArrayTutorial.Rnw:295-298 ################################################### # open a GDS file gds.fn <- seqExampleFileName("gds") genofile <- seqOpen(gds.fn) ################################################### ### code chunk number 13: SeqArrayTutorial.Rnw:302-316 ################################################### # take out sample id head(samp.id <- seqGetData(genofile, "sample.id")) # take out variant id head(variant.id <- seqGetData(genofile, "variant.id")) # get "chromosome" table(seqGetData(genofile, "chromosome")) # get "allele" head(seqGetData(genofile, "allele")) # get "annotation/info/GP" head(seqGetData(genofile, "annotation/info/GP")) ################################################### ### code chunk number 14: SeqArrayTutorial.Rnw:320-327 ################################################### # set sample and variant filters seqSetFilter(genofile, sample.id=samp.id[c(2,4,6)]) set.seed(100) seqSetFilter(genofile, variant.id=sample(variant.id, 4)) # get "allele" seqGetData(genofile, "allele") ################################################### ### code chunk number 15: SeqArrayTutorial.Rnw:331-333 ################################################### # get genotypic data seqGetData(genofile, "genotype") ################################################### ### code chunk number 16: SeqArrayTutorial.Rnw:337-339 ################################################### # get "annotation/info/AA", a variable-length dataset seqGetData(genofile, "annotation/info/AA") ################################################### ### code chunk number 17: SeqArrayTutorial.Rnw:342-344 ################################################### # get "annotation/format/DP", a variable-length dataset seqGetData(genofile, "annotation/format/DP") ################################################### ### code chunk number 18: SeqArrayTutorial.Rnw:352-355 ################################################### # set sample and variant filters set.seed(100) seqSetFilter(genofile, sample.id=samp.id[c(2,4,6)], variant.id=sample(variant.id, 4)) ################################################### ### code chunk number 19: SeqArrayTutorial.Rnw:357-360 ################################################### # read multiple variables variant by variant seqApply(genofile, c(geno="genotype", id="annotation/id"), FUN=function(x) print(x), margin="by.variant", as.is="none") ################################################### ### code chunk number 20: SeqArrayTutorial.Rnw:363-370 ################################################### # remove the sample and variant filters seqSetFilter(genofile) # get the numbers of alleles per variant z <- seqApply(genofile, "allele", FUN=function(x) length(unlist(strsplit(x,","))), as.is="integer") table(z) ################################################### ### code chunk number 21: SeqArrayTutorial.Rnw:374-375 ################################################### HM3 <- seqGetData(genofile, "annotation/info/HM3") ################################################### ### code chunk number 22: SeqArrayTutorial.Rnw:377-384 ################################################### # Now HM3 is a global variable # print out RS id if the frequency of reference allele is less than 0.5% and it is HM3 seqApply(genofile, c(geno="genotype", id="annotation/id"), FUN = function(index, x) { p <- mean(x$geno == 0, na.rm=TRUE) # the frequency of reference allele if ((p < 0.005) & HM3[index]) print(x$id) }, as.is="none", var.index=TRUE, margin="by.variant") ################################################### ### code chunk number 23: SeqArrayTutorial.Rnw:392-397 ################################################### # calculate the frequency of reference allele, afreq <- seqApply(genofile, "genotype", FUN=function(x) mean(x==0, na.rm=TRUE), as.is="double", margin="by.variant") length(afreq) summary(afreq) ################################################### ### code chunk number 24: SeqArrayTutorial.Rnw:401-406 (eval = FALSE) ################################################### ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ################################################### ### code chunk number 25: SeqArrayTutorial.Rnw:408-409 ################################################### cl <- NULL ################################################### ### code chunk number 26: SeqArrayTutorial.Rnw:411-419 ################################################### # run in parallel afreq <- seqParallel(cl, genofile, FUN = function(gdsfile) { seqApply(gdsfile, "genotype", as.is="double", FUN=function(x) mean(x==0, na.rm=TRUE)) }, split = "by.variant") length(afreq) summary(afreq) ################################################### ### code chunk number 27: SeqArrayTutorial.Rnw:422-424 (eval = FALSE) ################################################### ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ################################################### ### code chunk number 28: SeqArrayTutorial.Rnw:427-428 ################################################### seqClose(genofile) ################################################### ### code chunk number 29: SeqArrayTutorial.Rnw:444-447 ################################################### # open a GDS file gds.fn <- seqExampleFileName("gds") genofile <- seqOpen(gds.fn) ################################################### ### code chunk number 30: SeqArrayTutorial.Rnw:530-548 ################################################### m.variant <- local({ # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] # apply the function marginally m <- seqApply(genofile, "genotype", function(x) { sum(is.na(x)) }, margin="by.variant", as.is="integer") # output m / (ploidy * dm[1]) }) length(m.variant) summary(m.variant) ################################################### ### code chunk number 31: SeqArrayTutorial.Rnw:552-573 ################################################### m.sample <- local({ # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] # need a global variable (only available in the bracket of "local") n <- integer(dm[1]) # apply the function marginally seqApply(genofile, "genotype", function(x) { n <<- n + colSums(is.na(x)) }, margin="by.variant", as.is="none") # output n / (ploidy * dm[2]) }) length(m.sample) summary(m.sample) ################################################### ### code chunk number 32: SeqArrayTutorial.Rnw:579-584 (eval = FALSE) ################################################### ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ################################################### ### code chunk number 33: SeqArrayTutorial.Rnw:586-587 ################################################### cl <- NULL ################################################### ### code chunk number 34: SeqArrayTutorial.Rnw:591-611 ################################################### # run in parallel m.variant <- local({ # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] m <- seqParallel(cl, genofile, FUN = function(gdsfile) { # apply the function marginally seqApply(gdsfile, "genotype", function(x) { sum(is.na(x)) }, margin="by.variant", as.is="integer") }, split = "by.variant") # output m / (ploidy * dm[1]) }) summary(m.variant) ################################################### ### code chunk number 35: SeqArrayTutorial.Rnw:615-646 ################################################### m.sample <- local({ # run in parallel m <- seqParallel(cl, genofile, FUN = function(gdsfile) local({ # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(gdsfile, "genotype")$seldim # need a global variable (only available in the bracket of "local") n <- integer(dm[1]) # apply the function marginally seqApply(gdsfile, "genotype", function(x) { n <<- n + colSums(is.na(x)) }, margin="by.variant", as.is="none") # output n }), .combine = "+", # sum all variables "n" of different processes split = "by.variant") # get ploidy, the number of samples and variants z <- seqSummary(genofile, "genotype") # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- z$seldim # ploidy ploidy <- z$dim[1] # output m / (ploidy * dm[2]) }) summary(m.sample) ################################################### ### code chunk number 36: SeqArrayTutorial.Rnw:650-652 (eval = FALSE) ################################################### ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ################################################### ### code chunk number 37: SeqArrayTutorial.Rnw:661-667 ################################################### # apply the function variant by variant afreq <- seqApply(genofile, "genotype", function(x) { mean(x==0, na.rm=TRUE) }, as.is="double", margin="by.variant") length(afreq) summary(afreq) ################################################### ### code chunk number 38: SeqArrayTutorial.Rnw:673-678 (eval = FALSE) ################################################### ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ################################################### ### code chunk number 39: SeqArrayTutorial.Rnw:680-681 ################################################### cl <- NULL ################################################### ### code chunk number 40: SeqArrayTutorial.Rnw:683-691 ################################################### # run in parallel afreq <- seqParallel(cl, genofile, FUN = function(gdsfile) { seqApply(gdsfile, "genotype", as.is="double", FUN=function(x) mean(x==0, na.rm=TRUE)) }, split = "by.variant") length(afreq) summary(afreq) ################################################### ### code chunk number 41: SeqArrayTutorial.Rnw:694-696 (eval = FALSE) ################################################### ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ################################################### ### code chunk number 42: SeqArrayTutorial.Rnw:709-737 ################################################### # genetic correlation matrix genmat <- local({ # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(genofile, "genotype")$seldim # need a global variable (only available in the bracket of "local") s <- matrix(0.0, nrow=dm[1], ncol=dm[1]) # apply the function variant by variant seqApply(genofile, "genotype", function(x) { g <- (x==0) # indicator of reference allele p <- mean(g, na.rm=TRUE) # allele frequency g2 <- colSums(g) - 2*p # genotypes adjusted by allele frequency m <- (g2 %o% g2) / (p*(1-p)) # genetic correlation matrix m[!is.finite(m)] <- 0 # correct missing values s <<- s + m # output to the global variable "s" }, margin="by.variant", as.is="none") # output, scaled by the trace of matrix "s" over the number of samples s / (sum(diag(s)) / dm[1]) }) # eigen-decomposition eig <- eigen(genmat) # eigenvalues head(eig$value) ################################################### ### code chunk number 43: SeqArrayTutorial.Rnw:739-741 ################################################### # eigenvectors plot(eig$vectors[,1], eig$vectors[,2], xlab="PC 1", ylab="PC 2") ################################################### ### code chunk number 44: SeqArrayTutorial.Rnw:748-753 (eval = FALSE) ################################################### ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ################################################### ### code chunk number 45: SeqArrayTutorial.Rnw:755-756 ################################################### cl <- NULL ################################################### ### code chunk number 46: SeqArrayTutorial.Rnw:758-790 ################################################### genmat <- seqParallel(cl, genofile, FUN = function(gdsfile) local({ # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(gdsfile, "genotype")$seldim # need a global variable (only available in the bracket of "local") s <- matrix(0.0, nrow=dm[1], ncol=dm[1]) # apply the function variant by variant seqApply(gdsfile, "genotype", function(x) { g <- (x==0) # indicator of reference allele p <- mean(g, na.rm=TRUE) # allele frequency g2 <- colSums(g) - 2*p # genotypes adjusted by allele frequency m <- (g2 %o% g2) / (p*(1-p)) # genetic correlation matrix m[!is.finite(m)] <- 0 # correct missing values s <<- s + m # output to the global variable "s" }, margin="by.variant", as.is="none") # output s }), .combine = "+", # sum all variables "s" of different processes split = "by.variant") # finally, scaled by the trace of matrix "s" over the number of samples dm <- seqSummary(genofile, "genotype")$seldim genmat <- genmat / (sum(diag(genmat)) / dm[1]) # eigen-decomposition eig <- eigen(genmat) # eigenvalues head(eig$value) ################################################### ### code chunk number 47: SeqArrayTutorial.Rnw:793-795 (eval = FALSE) ################################################### ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ################################################### ### code chunk number 48: SeqArrayTutorial.Rnw:809-834 ################################################### coeff <- local({ # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(genofile, "genotype")$seldim # need global variables (only available in the bracket of "local") n <- integer(dm[1]) s <- double(dm[1]) # apply the function variant by variant seqApply(genofile, "genotype", function(x) { p <- mean(x==0, na.rm=TRUE) # allele frequency g <- colSums(x==0) # genotypes: # of reference allele d <- (g*g - g*(1 + 2*p) + 2*p*p) / (2*p*(1-p)) n <<- n + is.finite(d) # output to the global variable "n" d[!is.finite(d)] <- 0 s <<- s + d # output to the global variable "s" }, margin="by.variant", as.is="none") # output s / n }) length(coeff) summary(coeff) ################################################### ### code chunk number 49: SeqArrayTutorial.Rnw:840-845 (eval = FALSE) ################################################### ## # load the "parallel" package ## library(parallel) ## ## # Use option cl.core to choose an appropriate cluster size (or # of cores) ## cl <- makeCluster(getOption("cl.cores", 2)) ################################################### ### code chunk number 50: SeqArrayTutorial.Rnw:847-848 ################################################### cl <- NULL ################################################### ### code chunk number 51: SeqArrayTutorial.Rnw:850-880 ################################################### coeff <- seqParallel(cl, genofile, FUN = function(gdsfile) local({ # get the number of samples and variants # dm[1] -- # of selected samples, dm[2] -- # of selected variants dm <- seqSummary(gdsfile, "genotype")$seldim # need global variables (only available in the bracket of "local") n <- integer(dm[1]) s <- double(dm[1]) # apply the function variant by variant seqApply(gdsfile, "genotype", function(x) { p <- mean(x==0, na.rm=TRUE) # allele frequency g <- colSums(x==0) # genotypes: # of reference allele d <- (g*g - g*(1 + 2*p) + 2*p*p) / (2*p*(1-p)) n <<- n + is.finite(d) # output to the global variable "n" d[!is.finite(d)] <- 0 s <<- s + d # output to the global variable "s" }, margin="by.variant", as.is="none") # output list(s=s, n=n) }), # sum all variables "s" and "n" of different processes .combine = function(x1,x2) { list(s = x1$s + x2$s, n = x1$n + x2$n) }, split = "by.variant") # finally, average! coeff <- coeff$s / coeff$n summary(coeff) ################################################### ### code chunk number 52: SeqArrayTutorial.Rnw:883-885 (eval = FALSE) ################################################### ## # Since we created the cluster manually, we should stop it: ## stopCluster(cl) ################################################### ### code chunk number 53: SeqArrayTutorial.Rnw:888-890 ################################################### # close the GDS file seqClose(genofile)