\name{findBAFvariance}
\alias{findBAFvariance}
\alias{sdByScanChromWindow}
\alias{meanSdByChromWindow}
\alias{medianSdOverAutosomes}

\title{Find chromosomal areas with high BAlleleFreq (or LogRRatio) standard deviation}

\description{
  \code{sdByScanChromWindow} uses a sliding window algorithm to
  calculate the standard deviation of the BAlleleFreq (or LogRRatio) values for a user
  specified number of bins across each chromosome of each scan.

  \code{medianSdOverAutosomes} calculates the median of the
  BAlleleFreq (or LogRRatio) standard deviation over all autosomes for each scan.

  \code{meanSdByChromWindow} calculates the mean and standard
  deviation of the BAlleleFreq standard deviation in each window in each
  chromosome over all scans.
  
  \code{findBAFvariance} flags chromosomal areas with high BAlleleFreq
  standard deviation using previously calculated means and standard
  deviations over scans, typically results from
  \code{meanSdByChromWindow}.
}

\usage{
sdByScanChromWindow(intenData, genoData=NULL, var="BAlleleFreq", nbins=NULL,
  snp.exclude=NULL, return.mean=FALSE, incl.miss=TRUE, incl.het=TRUE, incl.hom=FALSE)

medianSdOverAutosomes(sd.by.scan.chrom.window)

meanSdByChromWindow(sd.by.scan.chrom.window, sex)

findBAFvariance(sd.by.chrom.window, sd.by.scan.chrom.window,
  sex, sd.threshold)
}

\arguments{
  \item{intenData}{A \code{\link{IntensityData}} object}
  \item{genoData}{A \code{\link{GenotypeData}} object.  May be omitted
  if \code{incl.miss}, \code{incl.het}, and \code{incl.hom} are all
  \code{TRUE}, as there is no need to distinguish between genotype calls in
  that case.}
  \item{var}{The variable for which to calculate standard deviations,
  typically "BAlleleFreq" (the default) or "LogRRatio".}
  \item{nbins}{A vector with integers corresponding to the number of
    bins for each chromosome.  The values all must be even integers.}
  \item{snp.exclude}{ An integer vector containing the snpIDs of SNPs to
    be excluded. }
  \item{return.mean}{a logical.  If \code{TRUE}, return mean as well as
	standard deviation.}
  \item{incl.miss}{a logical. If \code{TRUE}, include SNPs with missing
    genotype calls.}
  \item{incl.het}{a logical. If \code{TRUE}, include SNPs called as heterozygotes.}
  \item{incl.hom}{a logical. If \code{TRUE}, include SNPs called as
  homozygotes.  This is typically \code{FALSE} (the default) for
  BAlleleFreq calculations.}

  \item{sd.by.scan.chrom.window}{A list of matrices of standard deviation for each
    chromosome, with dimensions of number of scans x number of windows.
    This is typically the output of \code{sdByScanChromWindow}.}
  
  \item{sd.by.chrom.window}{A list of matrices of the means and standard
  deviations over scans, as generated by \code{meanSdByChromWindow}.}
  
  \item{sex}{A character vector of sex ("M"/"F") for the scans.}

  \item{sd.threshold}{A value specifying the threshold for the number of standard deviations above the mean at which to flag.}
  
}

\details{
  \code{sdByScanChromWindow} calculates the standard deviation of
  BAlleleFreq (or LogRRatio) values across chromosomes 1-22 and chromosome X for a
  specified number of 'bins' in each chromosome as passed to the
  function in the 'nbins' argument.  The standard deviation is
  calculated using windows of width equal to 2 bins, and moves along the
  chromosome by an offset of 1 bin (or half a window).  Thus, there will
  be a total of \code{nbins-1} windows per chromosome.  If
  \code{nbins=NULL} (the default), there will be 2 bins (one window) for
  each chromosome.  

  \code{medianSdOverAutosomes} calculates the median over autosomes of
  BAlleleFreq (or LogRRatio) standard deviations calculated
  for sliding windows within each chromosome of each scan. The
  standard deviations should be a list with one element for
  each chromosome, and each element consisting of a matrix with scans as rows.

  \code{meanSdByChromWindow} calculates the mean and standard
  deviation over scans of BAlleleFreq standard deviations calculated
  for sliding windows within each chromosome of each scan. The
  BAlleleFreq standard deviations should be a list with one element for
  each chromosome, and each element consisting of a matrix containing
  the BAlleleFreq standard deviation for the i'th scan in the j'th
  bin. This is typically created using the
  \code{sdByScanChromWindow} function. For the X chromosome the
  calculations are separated out by sex.
  
  \code{findBAFvariance} determines which chromosomes of which scans
  have regions which are at least a given number of SDs from the mean,
  using BAlleleFreq means and standard deviations calculated from
  sliding windows over each chromosome by scan.
}

\value{
  \code{sdByScanChromWindow} returns a list of matrices containing standard deviations.
  There is a matrix for each chromosome, with each matrix having
  dimensions of number of scans x number of windows.  If
  \code{return.mean=TRUE}, two lists of matrices are returned, one with
  standard deviations and one with means.

  \code{medianSdOverAutosomes} returns a data frame with columns "scanID" and
  "med.sd" containing the median standard deviations over all
  autosomes for each scan.
  
  \code{meanSdByChromWindow} returns a list of matrices, one for
  each chromosome. Each matrix contains two columns called "Mean" and
  "SD", containing the mean and SD of the BAlleleFreq standard deviations
  over scans for each bin. For the X chromosome the matrix has four
  columns "Female Mean", "Male Mean", "Female SD" and "Male SD".

  \code{findBAFvariance} returns a matrix with columns "scanID",
  "chromosome", "bin", and "sex" containing those scan by chromosome
  combinations with BAlleleFreq standard deviations greater than those
  specified by \code{sd.threshold}.
}

\author{Caitlin McHugh, Cathy Laurie}

\seealso{\code{\link{IntensityData}}, \code{\link{GenotypeData}},
  \code{\link{BAFfromClusterMeans}}, \code{\link{BAFfromGenotypes}}
}

\examples{
# Load the example scan annotation and wrap it in a ScanAnnotationDataFrame
library(GWASdata)
data(illumina_scan_annot)
scanAnnot <- ScanAnnotationDataFrame(illumina_scan_annot)

# Open the example intensity file and pair it with the scan annotation
blfile <- system.file("extdata", "illumina_bl.nc", package="GWASdata")
blnc <- NcdfIntensityReader(blfile)
blData <- IntensityData(blnc, scanAnnot=scanAnnot)

# Open the example genotype file and pair it with the same scan annotation
genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata")
genonc <- NcdfGenotypeReader(genofile)
genoData <- GenotypeData(genonc, scanAnnot=scanAnnot)

# Sliding-window standard deviations of BAlleleFreq (the default var),
# using 8 bins, i.e. 7 overlapping windows, per chromosome
nbins <- rep(8, 3) # need bins for chromosomes 21,22,23
baf.sd <- sdByScanChromWindow(blData, genoData, nbins=nbins)

# Close the data objects; the remaining steps use only baf.sd and scanAnnot
close(blData)
close(genoData)
# Median standard deviation over autosomes for each scan
med.res <- medianSdOverAutosomes(baf.sd)

# Mean and SD over scans for each window; needs the sex of each scan
sex <- scanAnnot$sex
sd.res <- meanSdByChromWindow(baf.sd, sex)

# Flag windows whose standard deviation is more than 2 SDs above the mean
var.res <- findBAFvariance(sd.res, baf.sd, sex, sd.threshold=2)
}

\keyword{manip}