diff --git a/.Rbuildignore b/.Rbuildignore index ed8149a..3e05a38 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,8 +1,7 @@ -^Meta$ -^doc$ +^README\.md$ ^\.travis\.yml$ -^.*\.git$ +^\.git$ +^\.gitignore$ ^data-raw$ -README\.md -^\.Rprofile$ -^benchmarks$ +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index 9689a6a..530f808 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ +inst/doc +doc Meta -.DS_Store diff --git a/.travis.yml b/.travis.yml index 35adafd..17fa1d7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,24 +1,9 @@ -# Sample .travis.yml for R projects +# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r -language: r +language: R r: - - 3.4 -sudo: false + - release + - devel cache: packages -bioc_required: true -warnings_are_errors: false - -env: - global: - - CRAN: http://cran.rstudio.com - -r_github_packages: - - jimhester/covr - -notifications: - email: - on_success: change - on_failure: change - after_success: - - Rscript -e 'covr::coveralls()' + - Rscript -e 'covr::coveralls()' diff --git a/DESCRIPTION b/DESCRIPTION index db70a7d..6c6fffa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,43 +1,49 @@ Package: methylSig -Type: Package -Title: a whole genome DNA methylation analysis pipeline -Version: 0.5.2 -Date: 2019-01-26 -Authors@R: c( - person("Yongseok", "Park", email = "yongpark@pitt.edu", role = c("aut")), - person("Raymond G.", "Cavalcante", email = "rcavalca@umich.edu", role = c("aut", "cre"))) -Description: MethylSig is a method for testing for differentially methylated - cytosines (DMCs) or regions (DMRs) in whole-genome bisulfite sequencing (bis- - seq) or reduced representation bisulfite sequencing (RRBS) experiments. - MethylSig uses a beta binomial model to test for significant differences - between groups of samples. Several options exist for either site-specific - or sliding window tests, combining strands, and for variance estimation. 
- It allows annotating the resulting regions to multiple genome features, and - visualizing the results for chosen genomic regions along with supporting genomic - information. -biocViews: DNAMethylation, DifferentialMethylation, Epigenetics, Regression, MethylSeq -License: GPL-3 -BugReports: https://github.com/sartorlab/methylSig/issues +Title: MethylSig: Differential Methylation Testing for WGBS and RRBS Data +Version: 0.99.0 +Date: 2020-03-20 +Authors@R: + c(person(given = "Yongseok", + family = "Park", + role = "aut", + email = "yongpark@pitt.edu"), + person(given = "Raymond G.", + family = "Cavalcante", + role = c("aut", "cre"), + email = "rcavalca@umich.edu")) +Description: MethylSig is a package for testing for + differentially methylated cytosines (DMCs) or regions (DMRs) in + whole-genome bisulfite sequencing (WGBS) or reduced representation + bisulfite sequencing (RRBS) experiments. MethylSig uses a beta + binomial model to test for significant differences between groups of + samples. Several options exist for either site-specific or sliding + window tests, and variance estimation. 
Depends: - R (>= 3.4) + R (>= 3.6) Imports: - annotatr, - BiocGenerics, - boot, bsseq, DelayedArray, + DelayedMatrixStats, DSS, + IRanges, GenomeInfoDb, GenomicRanges, methods, parallel, + stats, S4Vectors Suggests: BiocStyle, - gplots, - testthat, + bsseqData, knitr, rmarkdown, - rtracklayer + testthat (>= 2.1.0), + covr +License: GPL-3 +BugReports: https://github.com/sartorlab/methylSig/issues +biocViews: DNAMethylation, DifferentialMethylation, Epigenetics, + Regression, MethylSeq +Encoding: UTF-8 +LazyData: true VignetteBuilder: knitr -RoxygenNote: 6.1.1 +RoxygenNote: 7.0.2 diff --git a/NAMESPACE b/NAMESPACE index 52b5958..6f9bdbd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,21 +1,26 @@ # Generated by roxygen2: do not edit by hand -export(binomialDiffCalc) -export(methylSig.tfbsEnrichTest) -export(methylSigAnnotation) -export(methylSigCalc) -export(methylSigDSS) -export(methylSigReadData) -export(methylSigTile) -export(methylSig_weightFunc) -import(BiocGenerics) +export(diff_binomial) +export(diff_dss_fit) +export(diff_dss_test) +export(diff_methylsig) +export(filter_loci_by_coverage) +export(filter_loci_by_group_coverage) +export(filter_loci_by_location) +export(tile_by_regions) +export(tile_by_windows) import(DSS) import(DelayedArray) +import(DelayedMatrixStats) import(GenomeInfoDb) import(GenomicRanges) +import(IRanges) import(S4Vectors) -import(annotatr) import(bsseq) -import(methods) -importFrom(boot,corr) -importFrom(parallel,mclapply) +import(parallel) +importFrom(methods,is) +importFrom(stats,as.formula) +importFrom(stats,formula) +importFrom(stats,p.adjust) +importFrom(stats,pchisq) +importFrom(stats,pt) diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..f0931a4 --- /dev/null +++ b/NEWS @@ -0,0 +1,16 @@ +Changes in 0.99.0 (2020-03-23) ++ Refactor functions and workflow from pre-0.99.0 releases + o methylSigReadData() replaced with the functions: + o bsseq::read.bismark() + o filter_loci_by_coverage() + o filter_loci_by_location() + o 
methylSigTile() replaced with the functions: + o tile_by_regions() + o tile_by_windows() + o Differential testing should be preceded with: + o filter_loci_by_group_coverage() + o binomialDiffCalc() is replaced by diff_binomial() + o methylSigCalc() is replaced by diff_methylsig() + o methylSigDSS() is replaced by diff_dss_fit() and diff_dss_test() ++ See "Using methylSig" vignette for full example. ++ See "Updating methylSig Code" vignette for how to retrofit pre-0.99.0 code. diff --git a/NEWS.md b/NEWS.md deleted file mode 100644 index ab83d38..0000000 --- a/NEWS.md +++ /dev/null @@ -1,37 +0,0 @@ -# CHANGES IN VERSION 0.5.2 - - - Reintroduce Seqinfo via the `assembly` parameter of `methylSigReadData` - - If `NA`, allow user to continue, but warn that tiling and annotation cannot work unless the user manually assigns `Seqinfo` to the `BSseq` object. - - If `assembly` is unsupported in `GenomeInfoDb::fetchExtendedChromInfoFromUCSC`, act as if it was set to `NA`. - - If `assembly` is supported, use it. - - `seqinfo` persists through tiling, tests for different methylation, and annotation. - - Update built-in data based on `Seqinfo` fixes. - -# CHANGES IN VERSION 0.5.1 - - - Minor bugfix in methylSigTile() function. Cannot reproduce everywhere, but sometimes when a chromosome has no entries in the meth object, the tiling function failed. Now check for entries on the chromosome. - -# CHANGES IN VERSION 0.5.0 - - NOTE: This version of methylSig only works for R <= 3.4.4 and Bioc <= 3.6. A feature change in the bsseq Bioconductor package in Bioc >= 3.7 does not allow BSseq-class objects whose GRanges are not points, and this breaks the tiling functionality of methylSig. - - ## NEW FEATURES - - - Annotations are now done with the annotatr Bioconductor package. - - ## USER-LEVEL CHANGES - - - methylSig v0.5.0 reuses Bioconductor classes rather than the home-spun classes of earlier versions. This will improve maintainability greatly. 
- - The methylSigReadData() function now is a wrapper for the bsseq::read.bismark() function, obviating the need to transform the input data in anyway. The output is a BSseq-class object. - - As before, filtering for common SNPs (hg19 only), minCount, and maxCount are available. Destranding also remains. - - The result of any of the tests for differential methylation are now GenomicRanges-class objects. - - Built-in example data is now known as sample_data. - - ## BUG FIXES - - - Fixed a mistake in methylSig.tfbsEnrichTest() that mistakenly referred referred to tfbsInfo parameter as tfbs. - - ## REMOVED FEATURES - - - Removed plotting functions for retooling. - - Do not use seqinfo for any objects. Instead, when tiling a genome, find the maximum length of each existing chromosome, add 1000, and use that as the input to GenomicRanges::tileGenome(). diff --git a/R/annotations.R b/R/annotations.R deleted file mode 100644 index 9d9207c..0000000 --- a/R/annotations.R +++ /dev/null @@ -1,52 +0,0 @@ -#' Wrapper function for annotatr annotations -#' -#' Annotate the result of \code{methylSigCalc} to \code{annotatr} annotations, while also categorizing regions tested as hyper-methylated in either group or "No DM". -#' -#' @section Annotation structure: -#' Annotations from \code{annotatr} are embedded in the \code{myDiff} \code{GRanges} object as a column of \code{GRanges} (named \code{annot}). Of note is that a region annotated to multiple annotations will appear in as many rows, one for each annotation. A convenience column, \code{locus_id}, clearly indicates when a locus is multiply annotated. The easiest way to see all tested loci (CpGs or regions) and their annotations as a flat table is to coerce the result with \code{as.data.frame}. The \code{mcols} of \code{annot} are: -#' \describe{ -#' \item{id}{ A unique ID for the annotation. 
} -#' \item{tx_id}{ Either a UCSC knownGene transcript ID (genic annotations) or a Ensembl transcript ID (lncRNA annotations), } -#' \item{gene_id}{ Entrez ID. } -#' \item{symbol}{ The gene symbol from the \code{org.*.eg.db} mapping from the Entrez ID. } -#' \item{type}{ A code of the form \code{[genome]_[type]_[name]} indicating the annotation type. } -#' } -#' -#' @param myDiff A \code{GRanges} object resulting from \code{methylSigCalc} that contains all CpG sites that are tested for differential methylation. -#' @param dmcList A \code{logical} of the same length as \code{myDiff} defining the DMCs or DMRs. -#' @param annotations A \code{GRanges} object resulting from \code{annotatr::build_annotations()} to be used for annotating \code{myDiff} in conjunction with \code{dmcList}. See the documentation for \code{annotatr::build_annotations()} for guidance on how to chose different annotations, or use custom annotations. -#' -#' @return A \code{GRanges} object whose \code{mcols} include all those in \code{myDiff} in addition to: -#' \describe{ -#' \item{locus_ids}{ A unique identifier for the tested locus from \code{methylSigCalc}. } -#' \item{dm_status}{ A \code{character} indicating which group the CpG / region is hyper-methylated in (based on the \code{levels} of the \code{comparison} column in the \code{pData} used in \code{methylSigCalc}), or "No DM" if it is \code{FALSE} in \code{dmcList}. } -#' \item{annot}{ A \code{GRanges} containing annotation information for the CpG or region. See the "Annotation structure" section. 
} -#' } -#' -#' @examples -#' # Annotate the msig_cpgs results -#' utils::data(sample_data, package = 'methylSig') -#' # This includes the cpg_annots object to save time -#' -#' # Use the genome of msig_cpgs and build annotations for CpG features built with -#' # cpg_annots = annotatr::build_annotations(genome = 'hg19', annotations = 'hg19_cpg_islands') -#' -#' # Decide what counts as differentially methylated -#' dmcList = msig_cpgs$fdr < 0.05 & abs(msig_cpgs$meth.diff) > 25 -#' -#' myDiff_annotated = methylSigAnnotation(myDiff = msig_cpgs, dmcList = dmcList, annotations = cpg_annots) -#' -#' @export -methylSigAnnotation <- function(myDiff, dmcList, annotations) { - # Create a dm_status column based on dmcList and myDiff$hyper.direction - myDiff$dm_status = myDiff$hyper.direction - myDiff$dm_status[!dmcList] = 'No DM' - - # Create a locus_id to enable users to quickly see when a tested locus is annotated to multiple things - myDiff$locus_id = seq_along(myDiff) - - # Do the annotations - annotated_dmcs = annotatr::annotate_regions(regions = myDiff, annotations = annotations, ignore.strand = TRUE) - - return(annotated_dmcs) -} diff --git a/R/diff_binomial.R b/R/diff_binomial.R new file mode 100644 index 0000000..0287ebe --- /dev/null +++ b/R/diff_binomial.R @@ -0,0 +1,156 @@ +#' Differential methylation analysis using binomial model +#' +#' This function calculates differential methylation statistics using a binomial-based approach. See `Warning' message below. +#' +#' This function uses a binomial-based model to calculate differential methylation statistics. It is nearly identical to the \code{methylKit::calculateDiffMeth} function in the \code{methylKit} R package except that only the likelihood ratio test and \code{p.adjust(..., method='BH')} are used to calculate significance levels. It is significantly faster than \code{methylKit::calculateDiffMeth} function. +#' +#' @param bs A \code{BSseq-class} object to calculate differential methylation statistics. 
See \code{bsseq::read.bismark} for how to read in methylation data. +#' @param group_column a \code{character} string indicating the column of \code{pData(bs)} to use for determining group membership. +#' @param comparison_groups a named \code{character} vector indicating the \code{case} and \code{control} factors of \code{group_column} for the comparison. +#' +#' @return A \code{GRanges} object containing the following \code{mcols}: +#' \describe{ +#' \item{meth_case:}{ Methylation estimate for case. } +#' \item{meth_control:}{ Methylation estimate for control. } +#' \item{meth_diff:}{ The difference \code{meth_case - meth_control}. } +#' \item{direction:}{ The group for which the locus is hyper-methylated. Note, this is not subject to significance thresholds. } +#' \item{pvalue:}{ The p-value from the Chi-Square test of the log likelihood ratio with 1 degree of freedom. } +#' \item{fdr:}{ The Benjamini-Hochberg adjusted p-values using \code{p.adjust(method = 'BH')}. } +#' \item{log_lik_ratio:}{ The log likelihood ratio. } +#' } +#' +#' @section Warning: This function does not take into account the variability among samples in each group being compared. 
+#' +#' @examples +#' data(BS.cancer.ex, package = 'bsseqData') +#' +#' bs = filter_loci_by_group_coverage( +#' bs = BS.cancer.ex, +#' group_column = 'Type', +#' c('cancer' = 2, 'normal' = 2)) +#' +#' small_test = bs[1:50] +#' +#' diff_gr = diff_binomial( +#' bs = small_test, +#' group_column = 'Type', +#' comparison_groups = c('case' = 'cancer', 'control' = 'normal')) +#' +#' @export +diff_binomial = function( + bs, + group_column, + comparison_groups) { + + ##################################### + + # Check missing + if (missing(bs)) { + stop('Must pass bs as a BSseq object.') + } + if (missing(group_column)) { + stop('Must pass group_column as a character string.') + } + if (missing(comparison_groups)) { + stop('Must pass comparison_groups as a named character vector with names "case" and "control".') + } + + ##################################### + + # Check types + if (!is(bs, 'BSseq')) { + stop('bs must be class BSseq.') + } + if (!(is(group_column, 'character') && length(group_column) == 1)) { + stop('group_column must be a character string.') + } + if (!is(comparison_groups, 'character')) { + stop('comparison_groups must be a named character vector.') + } + + ##################################### + + # Check valid group_column name + if (!(group_column %in% colnames(pData(bs)))) { + stop(sprintf('group_column: %s not in column names of pData(bs): %s', + group_column, paste(colnames(pData(bs)), collapse = ', '))) + } + + # Check valid comparison_groups values in group_column of pData(bs) + if (!all(comparison_groups %in% pData(bs)[, group_column])) { + stop(sprintf('Not all comparison_groups are in group_column: %s', + paste(setdiff(comparison_groups, pData(bs)[, group_column]), collapse = ', ') )) + } + + # Check valid comparison_groups names + if (!all(c('case','control') %in% names(comparison_groups))) { + stop('comparison_groups vector must be a named vector with names "case" and "control".') + } + + ##################################### + + case = 
comparison_groups['case'] + control = comparison_groups['control'] + + # Rows of pdata and columns of bs + pdata = bsseq::pData(bs) + case_idx = which(pdata[, group_column] == case) + control_idx = which(pdata[, group_column] == control) + + ##################################### + + gr = granges(bs) + + cov_mat = as.matrix(bsseq::getCoverage(bs, type = 'Cov')) + meth_mat = as.matrix(bsseq::getCoverage(bs, type = 'M')) + + # Determine which sites are valid to test according to min.per.group + cov_mat = as.matrix(bsseq::getCoverage(bs, type = 'Cov')) + meth_mat = as.matrix(bsseq::getCoverage(bs, type = 'M')) + + # Setup required quantities for the log_lik_ratio calculation. drop = FALSE keeps single-sample groups as matrices for rowSums(); pooled totals use only the case and control samples. + unmeth_reads_control = rowSums(cov_mat[, control_idx, drop = FALSE] - meth_mat[, control_idx, drop = FALSE], na.rm = TRUE) + unmeth_reads_case = rowSums(cov_mat[, case_idx, drop = FALSE] - meth_mat[, case_idx, drop = FALSE], na.rm = TRUE) + unmeth_reads = unmeth_reads_control + unmeth_reads_case + + meth_reads_control = rowSums(meth_mat[, control_idx, drop = FALSE], na.rm = TRUE) + meth_reads_case = rowSums(meth_mat[, case_idx, drop = FALSE], na.rm = TRUE) + meth_reads = meth_reads_control + meth_reads_case + + cov_control = rowSums(cov_mat[, control_idx, drop = FALSE], na.rm = TRUE) + cov_case = rowSums(cov_mat[, case_idx, drop = FALSE], na.rm = TRUE) + cov = cov_control + cov_case + + log_lik_ratio = 2 * (meth_reads_control * log(meth_reads_control / cov_control + 1e-100) + + unmeth_reads_control * log(unmeth_reads_control / cov_control + 1e-100) + + meth_reads_case * log(meth_reads_case / cov_case + 1e-100) + + unmeth_reads_case * log(unmeth_reads_case / cov_case + 1e-100) + - meth_reads * log(meth_reads / cov + 1e-100) + - unmeth_reads * log(unmeth_reads / cov + 1e-100) + ) + + meth_control = round((meth_reads_control / cov_control) * 100, 2) + meth_case = round((meth_reads_case / cov_case) * 100, 2) + meth_diff = round(meth_case - meth_control, 2) + + direction = ifelse(meth_diff >= 0, case, control) + + pvalue = stats::pchisq(log_lik_ratio, 1, lower.tail=FALSE) + fdr = stats::p.adjust(pvalue, method = 'BH') 
+ + results = data.frame( + 'meth_case' = meth_case, + 'meth_control' = meth_control, + 'meth_diff' = meth_diff, + 'direction' = direction, + 'pvalue' = pvalue, + 'fdr' = fdr, + 'log_lik_ratio' = log_lik_ratio, + stringsAsFactors = FALSE + ) + + result_gr = granges(bs) + mcols(result_gr) = results + + return(result_gr) +} diff --git a/R/diff_dss_fit.R b/R/diff_dss_fit.R new file mode 100644 index 0000000..b893750 --- /dev/null +++ b/R/diff_dss_fit.R @@ -0,0 +1,87 @@ +#' Performs model fit for general experimental design +#' +#' This function is a wrapper for \code{DSS::DMLfit.multiFactor}. +#' +#' @param bs a \code{BSseq} object to calculate differential methylation statistics. +#' @param design a \code{data.frame} or \code{DataFrame} for experimental design. Should contain as many rows as there are columns (samples) in \code{bs}, and the order of the rows should match the columns of \code{bs}. If omitted, will default to \code{pData(bs)}. +#' @param formula a formula for the linear model. It should refer to column names from \code{design}. NOTE: The intercept is included by default if omitted. One can omit the intercept with a formula such as \code{'~ 0 + group'}. For clarity, it helps to include the intercept explicitly as in \code{'~ 1 + group'}. +#' +#' @return A \code{list} object with: +#' \describe{ +#' \item{gr:}{ a \code{GRanges} object with loci fit. } +#' \item{design:}{ the \code{data.frame} input as the experimental design. } +#' \item{formula:}{ the \code{formula} representing the model. Can be \code{character} or \code{formula}. } +#' \item{X:}{ the design \code{matrix} used in regression based on the \code{design} and \code{formula}. This should be consulted to determine the appropriate contrast to use in \code{diff_dss_test()}. } +#' \item{fit:}{ a \code{list} with model fitting results. It has components \code{beta}, the estimated coefficients, and \code{var.beta} the estimated variance/covariance matrix for \code{beta}. 
} +#' } +#' +#' @examples +#' data(BS.cancer.ex, package = 'bsseqData') +#' +#' bs = filter_loci_by_group_coverage( +#' bs = BS.cancer.ex, +#' group_column = 'Type', +#' c('cancer' = 2, 'normal' = 2)) +#' +#' small_test = bs[1:50] +#' +#' diff_fit = diff_dss_fit( +#' bs = small_test, +#' design = bsseq::pData(bs), +#' formula = '~ Type') +#' +#' @export +diff_dss_fit = function( + bs, + design, + formula) { + + ##################################### + + # Check missing + if (missing(bs)) { + stop('Must pass bs as a BSseq object.') + } + if (missing(formula)) { + stop('Must pass formula as a character string or formula.') + } + + ##################################### + + # Check types + if (!is(bs, 'BSseq')) { + stop('bs must be class BSseq.') + } + if (!missing(design)) { + if (!(any( + is(design, 'data.frame'), + is(design, 'DataFrame') + ))) { + stop('design must be a data.frame or DataFrame') + } + } + if (!(is(formula, 'character') || is(formula, 'formula'))) { + stop('formula must be a character string or a formula.') + } + + ##################################### + + if(missing(design)) { + message('Missing design, defaulting to pData(bs)...') + design = pData(bs) + } + + if(is(formula, 'character')) { + formula = stats::as.formula(formula) + } + + ##################################### + + dss_fit = DSS::DMLfit.multiFactor( + BSobj = bs, + design = design, + formula = formula, + smoothing = FALSE) + + return(dss_fit) +} diff --git a/R/diff_dss_test.R b/R/diff_dss_test.R new file mode 100644 index 0000000..4eef854 --- /dev/null +++ b/R/diff_dss_test.R @@ -0,0 +1,207 @@ +#' Calculates differential methylation statistics under general experimental design +#' +#' This function is a wrapper for \code{DSS::DMLtest.multiFactor} with the added feature of reporting methylation rates alongside the test results via the \code{methylation_group_column} and \code{methylation_groups} parameters. See documentation below. 
+#' +#' @param bs a \code{BSseq}, the same used to create \code{diff_fit}. +#' @param diff_fit a \code{list} object output by \code{diff_dss_fit()}. +#' @param contrast a contrast matrix for hypothesis testing. The number of rows should match the number of columns of the design matrix \code{diff_fit$X}. Consult \code{diff_fit$X} to ensure the contrast corresponds to the intended test. +#' @param methylation_group_column Optionally, a column from \code{diff_fit$design} by which to group samples and capture methylation rates. This column can be a \code{character}, \code{factor}, or \code{numeric}. In the case of \code{numeric} the samples are grouped according to the top and bottom 25 percentiles of the covariate, and the mean methylation for each group is calculated. If not a \code{numeric}, use the \code{methylation_groups} parameter to specify case and control. +#' @param methylation_groups Optionally, a named \code{character} vector indicating the \code{case} and \code{control} factors of \code{methylation_group_column} by which to group samples and capture methylation rates. If specified, must also specify \code{methylation_group_column}. +#' +#' @return A \code{GRanges} object containing the following \code{mcols}: +#' \describe{ +#' \item{stat:}{ The test statistic. } +#' \item{pvalue:}{ The p-value. } +#' \item{fdr:}{ The Benjamini-Hochberg adjusted p-values using \code{p.adjust(method = 'BH')}. 
} +#' } +#' +#' @examples +#' data(BS.cancer.ex, package = 'bsseqData') +#' +#' bs = filter_loci_by_group_coverage( +#' bs = BS.cancer.ex, +#' group_column = 'Type', +#' c('cancer' = 2, 'normal' = 2)) +#' +#' small_test = bs[1:50] +#' +#' diff_fit = diff_dss_fit( +#' bs = small_test, +#' design = bsseq::pData(bs), +#' formula = '~ Type') +#' +#' result = diff_dss_test( +#' bs = small_test, +#' diff_fit = diff_fit, +#' contrast = matrix(c(0,1), ncol = 1) +#' ) +#' +#' result_with_meth = diff_dss_test( +#' bs = small_test, +#' diff_fit = diff_fit, +#' contrast = matrix(c(0,1), ncol = 1), +#' methylation_group_column = 'Type', +#' methylation_groups = c('case' = 'cancer', 'control' = 'normal') +#' ) +#' +#' @export +diff_dss_test = function( + bs, + diff_fit, + contrast, + methylation_group_column = NA, + methylation_groups = NA) { + + ##################################### + + # Check missing + if (missing(bs)) { + stop('Must pass bs as a BSseq object.') + } + if (missing(diff_fit)) { + stop('Must pass diff_fit, the result of diff_dss_fit().') + } + if (missing(contrast)) { + stop('Must pass contrast as a matrix.') + } + + ##################################### + + # Check validity of bs + if (!is(bs, 'BSseq')) { + stop('bs must be class BSseq.') + } + + # Check validity of diff_fit + if (!is(diff_fit, 'list')) { + stop('diff_fit must be a list.') + } else { + if (!all(c('gr', 'design', 'formula', 'X', 'fit') %in% names(diff_fit))) { + stop('diff_fit must be a list returned from diff_dss_fit() with elements gr, design, formula, X, fit.') + } + } + + # Check validity of methylation_group_column + if (!is.na(methylation_group_column)) { + if (!(is(methylation_group_column, 'character') && length(methylation_group_column) == 1)) { + stop('methylation_group_column must be a character string.') + } + + if (!(methylation_group_column %in% colnames(diff_fit$design))) { + stop(sprintf('methylation_group_column: %s not in column names of diff_fit$design: %s', + 
methylation_group_column, paste(colnames(diff_fit$design), collapse = ', '))) + } + } + + # Check validity of methylation_groups + if (!all(is.na(methylation_groups))) { + + if (is.na(methylation_group_column)) { + stop('If methylation_groups is specified, so must methylation_group_column.') + } + + if (!is(methylation_groups, 'character')) { + stop('methylation_groups must be a named character vector.') + } + + if (!all(c('case','control') %in% names(methylation_groups))) { + stop('methylation_groups must be a named vector with names "case" and "control".') + } + + if (!all(methylation_groups %in% diff_fit$design[, methylation_group_column])) { + stop(sprintf('Not all methylation_groups are in methylation_group_column: %s', + paste(setdiff(methylation_groups, diff_fit$design[, methylation_group_column]), collapse = ', ') )) + } + } + + ##################################### + + result = DSS::DMLtest.multiFactor( + DMLfit = diff_fit, + Contrast = contrast) + + result_gr = diff_fit$gr + mcols(result_gr) = result[ ,c('stat','pvals','fdrs')] + colnames(mcols(result_gr)) = c('stat','pvalue','fdr') + + ##################################### + + # If a methylation_group_column is given, retrieve methylation rates + if (!is.na(methylation_group_column)) { + + # Assign correct case_idx and control_idx based on whether the + # methylation_group_column is character/factor or numeric + pdata = diff_fit$design + + if (is(pdata[, methylation_group_column], 'character') || is(pdata[, methylation_group_column], 'factor')) { + + case = methylation_groups['case'] + control = methylation_groups['control'] + + case_idx = which(pdata[, methylation_group_column] == case) + control_idx = which(pdata[, methylation_group_column] == control) + + } else if (is(pdata[, methylation_group_column], 'numeric')) { + + # Order of return is 25%, 75% + # So we want <= quantiles[1] and >= quantiles[2] + quantiles = stats::quantile( + x = pdata[, methylation_group_column], + probs = c(0.25, 0.75), + na.rm = 
TRUE + ) + + case_idx = which(pdata[, methylation_group_column] <= quantiles[1]) + control_idx = which(pdata[, methylation_group_column] >= quantiles[2]) + + } + + # Subset bs by what was fit so the methylation rates align with result_gr + result_bs = subsetByOverlaps(bs, diff_fit$gr) + + cov_reads_mat = bsseq::getCoverage(result_bs, type = 'Cov') + meth_reads_mat = bsseq::getCoverage(result_bs, type = 'M') + + # Compute case, control, and methylation difference + case_meth = (DelayedMatrixStats::rowSums2( + x = meth_reads_mat, + cols = case_idx, + value = TRUE, na.rm = TRUE) / DelayedMatrixStats::rowSums2( + x = cov_reads_mat, + cols = case_idx, + value = TRUE, na.rm = TRUE)) + + control_meth = (DelayedMatrixStats::rowSums2( + x = meth_reads_mat, + cols = control_idx, + value = TRUE, na.rm = TRUE) / DelayedMatrixStats::rowSums2( + x = cov_reads_mat, + cols = control_idx, + value = TRUE, na.rm = TRUE)) + meth_diff = case_meth - control_meth + + result_gr$case_meth = round(case_meth * 100, 2) + result_gr$control_meth = round(control_meth * 100, 2) + result_gr$meth_diff = round(meth_diff * 100, 2) + + col_order = c( + 'case_meth', + 'control_meth', + 'meth_diff', + 'stat', + 'pvalue', + 'fdr' + ) + mcols(result_gr) = mcols(result_gr)[, col_order] + + } + + # Remove NA tests and indicate how many failed as in diff_methylsig() + na_idx = is.na(result_gr$stat) + if (any(na_idx)) { + result_gr = result_gr[!na_idx] + message(sprintf('%s loci were dropped due to insufficient degrees of freedom.', sum(na_idx))) + } + + return(result_gr) +} diff --git a/R/diff_methylsig.R b/R/diff_methylsig.R new file mode 100644 index 0000000..f6b1feb --- /dev/null +++ b/R/diff_methylsig.R @@ -0,0 +1,476 @@ +.weight_function <- function(u) (1-u^2)^3 + +.derivative_phi <- function(phi, local_c, local_t, mu, weight) { + derivative = 0 + indicator_c = local_c > 0 + indicator_t = local_t > 0 + indicator_ct = local_c + local_t > 0 + + if(nrow(local_c) == 1) { + derivative = + sum( indicator_c * ( mu * (digamma((mu * phi) + local_c + 1e-100) - digamma(mu 
* phi + 1e-100)) ) ) + + sum( indicator_t * ((1 - mu) * (digamma( ((1 - mu) * phi) + local_t + 1e-100) - digamma( ((1-mu) * phi) + 1e-100))) ) - + sum( indicator_ct * (digamma(phi + local_c + local_t + 1e-100) - digamma(phi)) ) + } else { + for(g in 1:ncol(local_c)) { + derivative = derivative + + sum( indicator_c[,g] * (weight * mu[,g] * (digamma(mu[,g] * phi + local_c[,g] + 1e-100) - digamma(mu[,g] * phi + 1e-100))) ) + + sum( indicator_t[,g] * (weight * (1 - mu[,g]) * (digamma((1 - mu[,g]) * phi + local_t[,g] + 1e-100) - digamma((1 - mu[,g]) * phi + 1e-100))) ) - + sum( indicator_ct[,g] * (weight * (digamma(phi + local_c[,g] + local_t[,g] + 1e-100) - digamma(phi))) ) + } + } + + derivative +} + +.derivative_mu <- function(mu, local_c, local_t, phi, weight) { + derivative = 0 + indicator_c = local_c > 0 + indicator_t = local_t > 0 + + if(nrow(local_c) == 1) { + derivative = + sum( indicator_c * (digamma(mu * phi + local_c + 1e-100) - digamma(mu * phi + 1e-100)) ) - + sum( indicator_t * (digamma((1 - mu) * phi + local_t + 1e-100) - digamma((1 - mu) * phi + 1e-100)) ) + } else { + for(g in 1:ncol(local_c)) { + derivative = derivative + + sum( indicator_c[,g] * (weight * (digamma(mu * phi + local_c[,g]+ 1e-100) - digamma(mu * phi + 1e-100))) ) - + sum( indicator_t[,g] * (weight * (digamma((1 - mu) * phi + local_t[,g] + 1e-100) - digamma((1 - mu) * phi + 1e-100))) ) + } + } + + derivative +} + +.log_likelihood <- function(mu, phi, local_c, local_t, weight) { + llik = 0 + indicator_c = local_c > 0 + indicator_t = local_t > 0 + + if(nrow(local_c) == 1) { + llik = llik + + sum( indicator_c * (lgamma(mu * phi + local_c + 1e-100) - lgamma(mu * phi + 1e-100)) ) + + sum( indicator_t * (lgamma((1 - mu) * phi + local_t + 1e-100) - lgamma((1 - mu) * phi + 1e-100)) ) + } else { + for(g in 1:ncol(local_c)) { + llik = llik + + sum( indicator_c[,g] * (weight * (lgamma(mu * phi + local_c[,g] + 1e-100) - lgamma(mu * phi + 1e-100))) ) + + sum( indicator_t[,g] * (weight * (lgamma((1 - 
mu) * phi + local_t[,g] + 1e-100) - lgamma((1 - mu) * phi + 1e-100))) ) + } + } + + 2*llik +} + +#' Calculates differential methylation statistics using a Beta-binomial approach. +#' +#' The function calculates differential methylation statistics between two groups of samples using a beta-binomial approach to calculate differential methylation statistics, accounting for variation among samples within each group. The function can be applied to a \code{BSseq} object subjected to \code{filter_loci_by_coverage()}, \code{filter_loci_by_location()}, \code{filter_loci_by_group_coverage()} or any combination thereof. Moreover, the function can be applied to a \code{BSseq} object which has been tiled with \code{tile_by_regions()} or \code{tile_by_windows()}. +#' +#' @param bs a \code{BSseq} object. +#' @param group_column a \code{character} string indicating the column of \code{pData(bs)} to use for determining group membership. +#' @param comparison_groups a named \code{character} vector indicating the \code{case} and \code{control} factors of \code{group_column} for the comparison. +#' @param disp_groups a named \code{logical} vector indicating whether to use \code{case}, \code{control}, or both to estimate the dispersion. +#' @param local_window_size an \code{integer} indicating the size of the window for use in determining local information to improve mean and dispersion parameter estimations. In addition to the distance constraint, a maximum of 5 loci upstream and downstream of the locus are used. The default is \code{0}, indicating no local information is used. +#' @param local_weight_function a weight kernel function. The default is the tri-weight kernel function defined as \code{function(u) = (1-u^2)^3}. The domain of any given weight function should be [-1,1], and the range should be [0,1]. +#' @param t_approx a \code{logical} value indicating whether to use squared t approximation for the likelihood ratio statistics. 
Chi-square approximation (\code{t_approx = FALSE}) is recommended when the sample size is large. Default is \code{TRUE}. +#' @param n_cores an \code{integer} denoting how many cores should be used for differential methylation calculations. +#' +#' @return A \code{GRanges} object containing the following \code{mcols}: +#' \describe{ +#' \item{meth_case:}{ Methylation estimate for case. } +#' \item{meth_control:}{ Methylation estimate for control. } +#' \item{meth_diff:}{ The difference \code{meth_case - meth_control}. } +#' \item{direction:}{ The group for which the locus is hyper-methylated. Note, this is not subject to significance thresholds. } +#' \item{pvalue:}{ The p-value from the t-test (\code{t_approx = TRUE}) or the Chi-Square test (\code{t_approx = FALSE}). } +#' \item{fdr:}{ The Benjamini-Hochberg adjusted p-values using \code{p.adjust(method = 'BH')}. } +#' \item{disp_est:}{ The dispersion estimate. } +#' \item{log_lik_ratio:}{ The log likelihood ratio. } +#' \item{df:}{ Degrees of freedom used when \code{t_approx = TRUE}.
} +#' } +#' +#' @examples +#' data(BS.cancer.ex, package = 'bsseqData') +#' +#' bs = filter_loci_by_group_coverage( +#' bs = BS.cancer.ex, +#' group_column = 'Type', +#' c('cancer' = 2, 'normal' = 2)) +#' +#' small_test = bs[1:50] +#' +#' diff_gr = diff_methylsig( +#' bs = small_test, +#' group_column = 'Type', +#' comparison_groups = c('case' = 'cancer', 'control' = 'normal'), +#' disp_groups = c('case' = TRUE, 'control' = TRUE), +#' local_window_size = 0, +#' t_approx = TRUE, +#' n_cores = 1) +#' +#' @export +diff_methylsig = function( + bs, + group_column, + comparison_groups, + disp_groups, + local_window_size = 0, + local_weight_function, + t_approx = TRUE, + n_cores = 1) { + + # Constants + min_disp = 1e-6 + min_inverse_disp = 0.001 + max_inverse_disp = max(1/max(min_disp, 1e-6), min_inverse_disp) + min_meth = 0 + max_meth = 1 + + ##################################### + + # Check missing + if (missing(bs)) { + stop('Must pass bs as a BSseq object.') + } + if (missing(group_column)) { + stop('Must pass group_column as a character string.') + } + if (missing(comparison_groups)) { + stop('Must pass comparison_groups as a named character vector with names "case" and "control".') + } + if (missing(disp_groups)) { + stop('Must pass disp_groups as a logical vector.') + } + + # Use .weight_function by default, but not in the function definition because + # this introduces some strange exporting issues. The user really shouldn't + # have to think about this at all. 
+ if (missing(local_weight_function)) { + local_weight_function = .weight_function + } + + ##################################### + + # Check types + if (!is(bs, 'BSseq')) { + stop('bs must be class BSseq.') + } + if (!(is(group_column, 'character') && length(group_column) == 1)) { + stop('group_column must be a character string.') + } + if (!is(comparison_groups, 'character')) { + stop('comparison_groups must be a named character vector.') + } + if (!is(disp_groups, 'logical')) { + stop('disp_groups must be a named logical vector.') + } + if (!(is(local_window_size, 'numeric') && length(local_window_size) == 1)) { + stop('local_window_size must be an integer.') + } + if (!is(local_weight_function, 'function')) { + stop('local_weight_function must be a function.') + } + if (!is(t_approx, 'logical')) { + stop('t_approx must be TRUE/FALSE.') + } + if (!(is(n_cores, 'numeric') && length(n_cores) == 1)) { + stop('n_cores must be an integer.') + } + + ##################################### + + # Check valid group_column name + if (!(group_column %in% colnames(pData(bs)))) { + stop(sprintf('group_column: %s not in column names of pData(bs): %s', + group_column, paste(colnames(pData(bs)), collapse = ', '))) + } + + # Check valid comparison_groups values in group_column of pData(bs) + if (!all(comparison_groups %in% pData(bs)[, group_column])) { + stop(sprintf('Not all comparison_groups are in group_column: %s', + paste(setdiff(comparison_groups, pData(bs)[, group_column]), collapse = ', ') )) + } + + # Check valid comparison_groups names + if (!all(c('case','control') %in% names(comparison_groups))) { + stop('comparison_groups vector must be a named vector with names "case" and "control".') + } + + # Check valid disp_groups names + if (!all(c('case','control') %in% names(disp_groups))) { + stop('disp_groups vector must be a named vector with names "case" and "control".') + } + + # Check valid disp_groups values + if (!any(disp_groups)) { + stop('disp_groups must be a named 
logical vector with at least one TRUE value corresponding to group name for case or control.') + } + + # Check for invalid local_window_size > 0 && regions state + # Cannot use local information on region-resolution data, that's the point of tiling + if (local_window_size > 0 && median(width(bs)) > 2) { + stop('Cannot use local information on region-resolution data. Detected local_window_size > 0 and median width of loci > 2') + } + + ##################################### + + case = comparison_groups['case'] + control = comparison_groups['control'] + + # Rows of pdata and columns of bs + pdata = bsseq::pData(bs) + case_idx = which(pdata[, group_column] == case) + control_idx = which(pdata[, group_column] == control) + + if(all(disp_groups)) { + disp_groups_idx = c(case_idx, control_idx) + } else if (disp_groups['case'] & !disp_groups['control']) { + disp_groups_idx = case_idx + } else if (!disp_groups['case'] & disp_groups['control']) { + disp_groups_idx = control_idx + } + + ##################################### + + num_loci = length(bs) + gr = granges(bs) + + cov_mat = as.matrix(bsseq::getCoverage(bs, type = 'Cov')) + meth_mat = as.matrix(bsseq::getCoverage(bs, type = 'M')) + + # Estimate meth per locus within each group. The same value is used for all samples within the same group. + # Note, the approach is to sum reads over all samples per group per locus (drop = FALSE keeps a matrix for single-sample groups or a single locus) + meth_est = matrix(0, ncol = ncol(bs), nrow = nrow(bs)) + meth_est[, case_idx] = base::rowSums(meth_mat[, case_idx, drop = FALSE]) / (base::rowSums(cov_mat[, case_idx, drop = FALSE]) + 1e-100) + meth_est[, control_idx] = base::rowSums(meth_mat[, control_idx, drop = FALSE]) / (base::rowSums(cov_mat[, control_idx, drop = FALSE]) + 1e-100) + + ##################################### + + result = do.call(rbind, parallel::mclapply(seq_along(gr), function(locus_idx){ + + ### Deal with local information (or not) + if(local_window_size != 0) { + # NOTE: It is much faster to work with subsets of the result of start() + # than it is to work with subsets of GRanges.
+ + # Get the indices which are within the local_window_size, but also limit to 5 CpGs on either side + # NOTE, local information is only used with cytosine/CpG resolution data so start() is valid. + # If regions were allowed, we would have to pay attention to which side we're on and use start()/end() + local_loci_idx = intersect( + which(abs(start(gr)[locus_idx] - start(gr)) < local_window_size), + max(1, locus_idx - 5):min(num_loci, locus_idx + 5)) + + if(length(local_loci_idx) == 1) { + # Do not use local information when there is only one local locus + local_loci_idx = locus_idx + local_weights = 1 + + # Collect Cov and M matrices for all the loci in the window + # Rows are loci and columns are samples + local_cov = matrix(cov_mat[local_loci_idx, ], nrow = 1) + local_meth = matrix(meth_mat[local_loci_idx, ], nrow = 1) + local_unmeth = local_cov - local_meth + + # Collect the correct rows of meth_est + local_meth_est = matrix(meth_est[local_loci_idx, ], nrow = 1) + } else { + # We need to scale the loci in the window onto the interval [-1, 1] because + # that is the domain of the local_weight_function. 
+ # This is a vector of the distances of the local loci to the loci of interest (domain) + local_loci_norm = (start(gr)[local_loci_idx] - start(gr)[locus_idx]) / (local_window_size + 1) + + # Calculate the weights + # Each is a vector of values of the weight function (range) + local_weights = local_weight_function(local_loci_norm) + + # Collect Cov and M matrices for all the loci in the window + # Rows are loci and columns are samples + local_cov = cov_mat[local_loci_idx, ] + local_meth = meth_mat[local_loci_idx, ] + local_unmeth = local_cov - local_meth + + # Collect the correct rows of meth_est + local_meth_est = meth_est[local_loci_idx, ] + } + } else { + # Do not use local information when the local_window_size is 0 + local_loci_idx = locus_idx + local_weights = 1 + + # Collect Cov and M matrices for all the loci in the window + # Rows are loci and columns are samples + local_cov = matrix(cov_mat[local_loci_idx, ], nrow = 1) + local_meth = matrix(meth_mat[local_loci_idx, ], nrow = 1) + local_unmeth = local_cov - local_meth + + # Collect the correct rows of meth_est + local_meth_est = matrix(meth_est[local_loci_idx, ], nrow = 1) + } + + ##################################### + + ### Compute the degrees of freedom for the locus + if(all(disp_groups)) { + df_subtract = 2 + } else { + df_subtract = 1 + } + df = pmax(rowSums(local_cov[, disp_groups_idx, drop = FALSE] > 0) - df_subtract, 0) + # Compute the degrees of freedom to be used in the test for differential methylation + df = sum(df * local_weights) + + ##################################### + + if(df > 1) { + ### Common disp_groups calculation + # This returns a singleton numeric + if(.derivative_phi( + phi = max_inverse_disp, + local_c = local_meth[, disp_groups_idx, drop = FALSE], + local_t = local_unmeth[, disp_groups_idx, drop = FALSE], + mu = local_meth_est[, disp_groups_idx, drop = FALSE], + weight = local_weights) >= 0) { + + disp_est = max_inverse_disp + } else if(.derivative_phi( + phi = 
min_inverse_disp, + local_c = local_meth[, disp_groups_idx, drop = FALSE], + local_t = local_unmeth[, disp_groups_idx, drop = FALSE], + mu = local_meth_est[, disp_groups_idx, drop = FALSE], + weight = local_weights) <= 0){ + + disp_est = min_inverse_disp + } else { + disp_est = stats::uniroot( + f = .derivative_phi, + interval = c(min_inverse_disp, max_inverse_disp), + local_meth[, disp_groups_idx, drop = FALSE], + local_unmeth[, disp_groups_idx, drop = FALSE], + local_meth_est[, disp_groups_idx, drop = FALSE], + local_weights)$root + } + + ##################################### + + ### Common group means calculation + # This returns a numeric vector (control, case, control + case) with the mu est + group_meth_est_list = list(control_idx, case_idx, c(control_idx, case_idx)) + group_meth_est = rep(0, length(group_meth_est_list)) + for(group_idx in seq_along(group_meth_est_list)) { + if(sum(local_meth[, group_meth_est_list[[group_idx]], drop = FALSE]) == 0) { + # If there are no local C reads, methylation is 0 + group_meth_est[group_idx] = 0 + } else if (sum(local_unmeth[, group_meth_est_list[[group_idx]], drop = FALSE]) == 0) { + # If there are no local T reads, methylation is 1 + group_meth_est[group_idx] = 1 + } else { + # Otherwise, do something fancier + group_meth_est[group_idx] = stats::uniroot( + f = .derivative_mu, + interval = c(min_meth, max_meth), + local_meth[, group_meth_est_list[[group_idx]], drop = FALSE], + local_unmeth[, group_meth_est_list[[group_idx]], drop = FALSE], + disp_est, + local_weights)$root + } + } + + ##################################### + + ### log Likelihood ratio calculation + log_lik_ratio = + .log_likelihood( + mu = group_meth_est[1], + phi = disp_est, + local_c = local_meth[, control_idx, drop = FALSE], + local_t = local_unmeth[, control_idx, drop = FALSE], + weight = local_weights) + + .log_likelihood( + mu = group_meth_est[2], + phi = disp_est, + local_c = local_meth[, case_idx, drop = FALSE], + local_t = local_unmeth[, 
case_idx, drop = FALSE], + weight = local_weights) - + .log_likelihood( + mu = group_meth_est[3], + phi = disp_est, + local_c = local_meth[, c(control_idx, case_idx), drop = FALSE], + local_t = local_unmeth[, c(control_idx, case_idx), drop = FALSE], + weight = local_weights) + + ##################################### + + locus_data = c( + disp_est = disp_est, + log_lik_ratio = log_lik_ratio, + meth_control = group_meth_est[1]*100, + meth_case = group_meth_est[2]*100, + meth_all = group_meth_est[3]*100, + df = df + 2) + } else { + # Not enough degrees of freedom, return NAs, these will be removed + # with a message to the user with how many + locus_data = c( + disp_est = NA, + log_lik_ratio = NA, + meth_control = NA, + meth_case = NA, + meth_all = NA, + df = df) + } + + return(locus_data) + }, mc.cores = n_cores)) + + ##################################### + + # Build GRanges version of result + result_gr = gr + mcols(result_gr) = result + + # Calculate pvalue + if(t_approx) { + result_gr$pvalue = stats::pt(-sqrt(pmax(result_gr$log_lik_ratio, 0)), result_gr$df) * 2 + } else { + result_gr$pvalue = stats::pchisq(pmax(result_gr$log_lik_ratio, 0), 1, lower.tail = F) + } + + # Calculate meth_diff and set very small differences to 0 + result_gr$meth_diff = (result_gr$meth_case - result_gr$meth_control) + result_gr$meth_diff[abs(result_gr$meth_diff) < 0.01] = 0 + + # Correct for multiple testing + result_gr$fdr = stats::p.adjust(result_gr$pvalue, method = 'BH') + + # Assign direction of hypermethylation (NOTE, this is not "significance") + result_gr$direction = ifelse(result_gr$meth_diff >= 0, case, control) + + ##################################### + + # Order output columns and attach to GRanges + col_order = c( + 'meth_case', + 'meth_control', + 'meth_diff', + 'direction', + 'pvalue', + 'fdr', + 'disp_est', + 'log_lik_ratio', + 'df' + ) + mcols(result_gr) = mcols(result_gr)[, col_order] + + ##################################### + + # Check for NA results and indicate how 
many loci were dropped because of + # a lack of available degrees of freedom (the NA branch is taken for every df <= 1, not only df == 1) + insufficient_df = result_gr$df <= 1 + + if(sum(insufficient_df) > 0) { + result_gr = result_gr[!insufficient_df] + message(sprintf('%s loci were dropped due to insufficient degrees of freedom (df <= 1).', sum(insufficient_df))) + } + + return(result_gr) +} diff --git a/R/filter_loci_by_coverage.R b/R/filter_loci_by_coverage.R new file mode 100644 index 0000000..db26837 --- /dev/null +++ b/R/filter_loci_by_coverage.R @@ -0,0 +1,59 @@ +#' Filter BSseq object by coverage +#' +#' Used after \code{bsseq::read.bismark} to mark loci in samples below \code{min_count} or above \code{max_count} to 0. These loci will then be removed prior to differential analysis by \code{filter_loci_by_group_coverage()} if there are not a sufficient number of samples with appropriate coverage. +#' +#' @param bs a \code{BSseq} object resulting from \code{bsseq::read.bismark} or constructed manually by the user. +#' @param min_count an \code{integer} giving the minimum coverage required at a locus. +#' @param max_count an \code{integer} giving the maximum coverage allowed at a locus. +#' +#' @return A \code{BSseq} object with samples/loci in the coverage and methylation matrix set to 0 where the coverage was less than \code{min_count} or greater than \code{max_count}. The number of samples and loci are conserved. +#' +#' @examples +#' bis_cov_file1 = system.file('extdata', 'bis_cov1.cov', package = 'methylSig') +#' bis_cov_file2 = system.file('extdata', 'bis_cov2.cov', package = 'methylSig') +#' test = bsseq::read.bismark( +#' files = c(bis_cov_file1, bis_cov_file2), +#' colData = data.frame(row.names = c('test1','test2')), +#' rmZeroCov = FALSE, +#' strandCollapse = FALSE +#' ) +#' test = filter_loci_by_coverage(bs = test, min_count = 10, max_count = 500) +#' @export +filter_loci_by_coverage = function(bs, min_count = 5, max_count = 500) { + + if (!is(bs, 'BSseq')) { + stop('bs must be class BSseq.
See bsseq::read.bismark() or bsseq::BSseq().') + } + if (!is(min_count, 'numeric')) { + stop('min_count must be an integer.') + } + if (!is(max_count, 'numeric')) { + stop('max_count must be an integer.') + } + if (!(min_count < max_count)) { + stop('min_count not less than max_count') + } + + ##################################### + + cov = bsseq::getCoverage(bs, type = 'Cov') + meth = bsseq::getCoverage(bs, type = 'M') + + idx = cov < min_count + cov[idx] = 0 + meth[idx] = 0 + + idx = cov > max_count + cov[idx] = 0 + meth[idx] = 0 + + bs = bsseq::BSseq( + Cov = cov, + M = meth, + gr = GenomicRanges::granges(bs), + pData = pData(bs), + sampleNames = sampleNames(bs) + ) + + return(bs) +} diff --git a/R/filter_loci_by_group_coverage.R b/R/filter_loci_by_group_coverage.R new file mode 100644 index 0000000..110c5bc --- /dev/null +++ b/R/filter_loci_by_group_coverage.R @@ -0,0 +1,101 @@ +#' Filter loci by the number of covered samples per group +#' +#' A function to retain only those loci where every listed group has at least a minimum number of samples with non-zero coverage. +#' +#' @param bs a \code{BSseq} object. +#' @param group_column a \code{character} string indicating the column of \code{pData(bs)} to use for determining group membership. +#' @param min_samples_per_group a named \code{integer} vector indicating the minimum number of samples with non-zero coverage required for maintaining a locus. +#' +#' @return A \code{BSseq} object with only those loci where every group has at least \code{min_samples_per_group} samples with non-zero coverage.
+#' +#' @examples +#' data(BS.cancer.ex, package = 'bsseqData') +#' +#' filter_loci_by_group_coverage( +#' bs = BS.cancer.ex, +#' group_column = 'Type', +#' min_samples_per_group = c('cancer' = 3, 'normal' = 3) +#' ) +#' +#' @export +filter_loci_by_group_coverage = function(bs, group_column, min_samples_per_group) { + + # Check missing + if (missing(bs)) { + stop('Must pass bs as a BSseq object.') + } + if (missing(group_column)) { + stop('Must pass group_column as a character string.') + } + if (missing(min_samples_per_group)) { + stop('Must pass min_samples_per_group as a named integer vector.') + } + + ##################################### + + # Check types + if (!is(bs, 'BSseq')) { + stop('bs must be class BSseq.') + } + if (!(is(group_column, 'character') && length(group_column) == 1)) { + stop('group_column must be a character string.') + } + if (!is(min_samples_per_group, 'numeric')) { + stop('min_samples_per_group must be a named integer vector.') + } + + ##################################### + + # Check valid group_column name + if (!(group_column %in% colnames(pData(bs)))) { + stop(sprintf('group_column: %s not in column names of pData(bs): %s', + group_column, paste(colnames(pData(bs)), collapse = ', '))) + } + + # Check valid factor names in group_column of pData(bs) + if (!(all(names(min_samples_per_group) %in% pData(bs)[, group_column]))) { + stop(sprintf('Not all names of min_samples_per_group are in group_column: %s', + paste(setdiff(names(min_samples_per_group), pData(bs)[, group_column]), collapse = ', ') )) + } + + ##################################### + + # Extract sample names belonging to each group given. NOTE, min_sample_per_group + # allows users to give more than two groups, and will require all group + # minimums are satisfied. 
Likely, it will most often be the case that + # there will only be two groups + group_samples = lapply(names(min_samples_per_group), function(f){ + rownames(pData(bs)[pData(bs)[, group_column] == f, ]) + }) + names(group_samples) = names(min_samples_per_group) + + # Previous filter_ functions have set sample/loci coverages not meeting + # the filtering requirements to 0, use this fact. + # NOTE, coercion to DelayedArray::DelayedArray because it seems that for + # tiled data (or smaller data), a DelayedArray isn't returned by getCoverage() + logical_cov_mat = DelayedArray::DelayedArray(bsseq::getCoverage(bs, type = 'Cov') > 0) + + # rowSums of the logical matrix subsetted on the group columns should equal + # or exceed the corresponding group's min_samples_per_group + keep_group_loci = lapply(names(group_samples), function(group) { + DelayedMatrixStats::rowSums2( + x = logical_cov_mat, + cols = group_samples[[group]], + value = TRUE, na.rm = TRUE) >= min_samples_per_group[group] + }) + names(keep_group_loci) = names(min_samples_per_group) + + # Entry-wise and() will give loci matching all group thresholds + keep_loci = Reduce(`&`, keep_group_loci) + + # Check that there are some loci to keep. Say which groups were too strict (all groups if none is individually empty). + if(!any(keep_loci)) { + zero_groups = sapply(keep_group_loci, sum, USE.NAMES = TRUE) == 0 + + stop(sprintf('Thresholds for the following groups were too strict: %s. + Relax thresholds for these groups and try filtering again.', + paste(names(zero_groups)[zero_groups | !any(zero_groups)], collapse = ', '))) + } + + return(bs[keep_loci]) +} diff --git a/R/filter_loci_by_location.R b/R/filter_loci_by_location.R new file mode 100644 index 0000000..bbd57be --- /dev/null +++ b/R/filter_loci_by_location.R @@ -0,0 +1,53 @@ +#' Remove loci by overlap with a \code{GRanges} object +#' +#' A function to remove loci from a \code{BSseq} object based on intersection with loci in a \code{GRanges} object. +#' +#' @param bs a \code{BSseq} object. +#' @param gr a \code{GRanges} object.
+#' +#' @return A \code{BSseq} object with loci intersecting \code{gr} removed. +#' +#' @examples +#' data(bsseq_stranded, package = 'methylSig') +#' regions = GenomicRanges::GRanges( +#' seqnames = c('chr1','chr1','chr1','chr1'), +#' ranges = IRanges::IRanges( +#' start = c(5,25,45,70), +#' end = c(15,40,55,80) +#' ) +#' ) +#' filtered = filter_loci_by_location(bs = bsseq_stranded, gr = regions) +#' +#' @export +filter_loci_by_location = function(bs, gr) { + + # Missing checks + if (missing(bs)) { + stop('Must pass bs as a BSseq object.') + } + if (missing(gr)) { + stop('Must pass gr as a GRanges object.') + } + + # Type checks + if (!is(bs, 'BSseq')) { + stop('bs must be class BSseq.') + } + if (!is(gr, 'GRanges')) { + stop('gr must be class GRanges.') + } + + ##################################### + + overlaps = GenomicRanges::findOverlaps(bs, gr) + + keep_idx = setdiff(seq_along(bs), unique(S4Vectors::queryHits(overlaps))) + + if(length(keep_idx) == 0) { + stop('All loci in bs were removed by gr, leaving no loci for downstream analysis.') + } + + bs = bs[keep_idx] + + return(bs) +} diff --git a/R/methylSig-data.R b/R/methylSig-data.R new file mode 100644 index 0000000..5df46a8 --- /dev/null +++ b/R/methylSig-data.R @@ -0,0 +1,39 @@ +#' BSseq object read from stranded coverage files +#' +#' Data contains 11 methylation loci and 2 samples +#' +#' @format A BSseq object +#' @source data-raw/02-create_bsseq_rda.R +#' @examples +#' data(bsseq_stranded, package = 'methylSig') +'bsseq_stranded' + +#' BSseq object read from destranded coverage files +#' +#' Data contains 6 methylation loci and 2 samples +#' +#' @format A BSseq object +#' @source data-raw/02-create_bsseq_rda.R +#' @examples +#' data(bsseq_destranded, package = 'methylSig') +'bsseq_destranded' + +#' BSseq object with loci on multiple chromosomes +#' +#' Data contains 4 methylation loci for 2 samples on 2 chromosomes +#' +#' @format A BSseq object +#' @source data-raw/02-create_bsseq_rda.R +#' @examples 
+#' data(bsseq_multichrom, package = 'methylSig') +'bsseq_multichrom' + +#' GRanges object with collapsed promoters on chr21 and chr22 +#' +#' Data contains 1466 promoters for use in the vignette +#' +#' @format A GRanges object +#' @source data-raw/02-create_bsseq_rda.R +#' @examples +#' data(promoters_gr, package = 'methylSig') +'promoters_gr' diff --git a/R/methylSig-package.R b/R/methylSig-package.R new file mode 100644 index 0000000..c428996 --- /dev/null +++ b/R/methylSig-package.R @@ -0,0 +1,49 @@ +#' MethylSig: Differential Methylation Testing for WGBS and RRBS Data +#' +#' MethylSig is a package for testing for +#' differentially methylated cytosines (DMCs) or regions (DMRs) in +#' whole-genome bisulfite sequencing (WGBS) or reduced representation +#' bisulfite sequencing (RRBS) experiments. MethylSig uses a beta +#' binomial model to test for significant differences between groups of +#' samples. Several options exist for either site-specific or sliding +#' window tests, and variance estimation. +#' +#' @section methylSig functions: +#' filter_loci_by_coverage() +#' filter_loci_by_snps() +#' tile_by_regions() +#' tile_by_windows() +#' filter_loci_by_group_coverage() +#' diff_binomial() +#' diff_methylsig() +#' diff_methylsig_dss() +#' annotate_diff() +#' visualize_diff() +#' region_enrichment_diff() +#' +#' @import bsseq +#' @import DelayedArray +#' @import DelayedMatrixStats +#' @import DSS +#' @import GenomeInfoDb +#' @import GenomicRanges +#' @import IRanges +#' @import parallel +#' @import S4Vectors +#' @importFrom methods is +#' @importFrom stats as.formula +#' @importFrom stats formula +#' @importFrom stats pchisq +#' @importFrom stats pt +#' @importFrom stats p.adjust +#' +#' @docType package +#' @name methylSig +#' @keywords internal +"_PACKAGE" + +# The following block is used by usethis to automatically manage +# roxygen namespace tags. Modify with care! 
+## usethis namespace: start +## usethis namespace: end +NULL diff --git a/R/methylSig_data_doc.R b/R/methylSig_data_doc.R deleted file mode 100644 index 6e12f84..0000000 --- a/R/methylSig_data_doc.R +++ /dev/null @@ -1,32 +0,0 @@ -#' CpG Index for hg19 -#' -#' A GenomicsRanges object giving the coordinates (in hg19) of all C > T SNPs. -#' Start coordinates are 0-based and end coordinates are 1-based. Starting from -#' 1000 Genomes Data we used \code{bcftools filter} with \code{-i 'AF[0]>0.05'} -#' to pull all sites with alternate frequency greater than 0.05. We then used -#' \code{grep -P '(C\tT)'} and \code{grep -P '(VT=SNP)'} to collect all C > T -#' SNPs. -#' -#' @docType data -#' @keywords datasets -#' @name CT_SNPs_hg19 -#' @format A GenomicRanges object of length 1,321,463 -#' @source \url{ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.vcf.gz} -NULL - -#' Sample data -#' -#' The \code{sample_data} object contains the following items: -#' \describe{ -#' \item{meth}{ A \code{BSseq-class} object containing 6 samples total, with three in each group. Genome is hg19. } -#' \item{tiled_meth}{ A tiled version of the \code{BSseq-class} object called \code{meth}. Tiles are 1000bp. Genome is hg19. } -#' \item{msig_cpgs}{ A \code{GRanges-class} object containing the results of \code{methylSigCalc} on \code{data}. } -#' \item{msig_tiles}{ A \code{GRanges-class} object containing the results of \code{methylSigCalc} on \code{tiled_meth}. } -#' \item{tfbs}{ A \code{GRanges-class} object representing transcription factor binding sites. For use in \code{methylSig.tfbsEnrichTest}. Genome is hg19. } -#' } -#' -#' @docType data -#' @keywords datasets -#' @name sample_data -#' @format A mixture of BSseq and GRanges class objects. See documentation for test_data. 
-NULL diff --git a/R/methylSig_pkg_doc.R b/R/methylSig_pkg_doc.R deleted file mode 100644 index d45ed11..0000000 --- a/R/methylSig_pkg_doc.R +++ /dev/null @@ -1,23 +0,0 @@ -#' methylSig: a whole genome DNA methylation analysis pipeline -#' -#' MethylSig is a method for testing differential methylated cytosines (DMCs) or regions (DMRs) in whole-genome bisulfite sequencing (bis-seq) or reduced representation bisulfite sequencing (RRBS) experiments. MethylSig uses a beta binomial model to test for significant differences between groups of samples. Several options exist for either site-specific or sliding window tests, combining strands, and for variance estimation. It allows annotating the resulting regions to multiple genome features, and visualizing the results for chosen genomic regions. -#' -#' @author Yongseok Park \email{yongpark@@pitt.edu}, Raymond Cavalcante \email{rcavalca@@umich.edu}, and Maureen A. Sartor -#' @references https://www.github.com/sartorlab/methylSig -#' -#' @import annotatr -#' @import BiocGenerics -#' @importFrom boot corr -#' @import bsseq -#' @import DSS -#' @import DelayedArray -#' @import GenomeInfoDb -#' @import GenomicRanges -#' @import methods -#' @importFrom parallel mclapply -#' @import S4Vectors -#' -#' @docType package -#' @name methylSig-package -#' @aliases methylSig -NULL diff --git a/R/plots.R b/R/plots.R deleted file mode 100644 index 1e09c08..0000000 --- a/R/plots.R +++ /dev/null @@ -1,118 +0,0 @@ -# #' Plot DM status distribution in annotations -# #' -# #' Generates a bar plot showing the distribution of differential methylation status of loci in selected annotations. -# #' -# #' @param myAnnots A \code{GRanges} object resulting from running \code{methylSigAnnotation} on the result of \code{methylSigCalc}. -# #' @param annotation_order A character vector which orders and subsets the annotations for the plot. -# #' @param status_order A character vector which orders and subsets the DM status. 
-# #' @param position One of \code{fill}, \code{stack}, \code{dodge}. See \code{ggplot2} documentation for details. -# #' @param plot_title The plot label. -# #' @param legend_title The legend label. -# #' @param x_label The x-axis label. -# #' @param y_label The y-axis label. -# #' -# #' @return A \code{ggplot2} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}. -# #' -# #' @examples -# #' # Annotate the msig_cpgs results -# #' utils::data(sample_data, package = 'methylSig') -# #' -# #' # Use the genome of msig_cpgs and build annotations for CpG features -# #' genome = GenomeInfoDb::genome(msig_cpgs) -# #' annots = annotatr::build_annotations(genome = genome, annotations = paste(genome, c('cpgs'), sep='_')) -# #' -# #' # Decide what counts as differentially methylated -# #' dmcList = msig_cpgs$fdr < 0.05 & abs(msig_cpgs$meth.diff) > 25 -# #' -# #' # Annotate -# #' myDiff_annotated = methylSigAnnotation(myDiff = msig_cpgs, dmcList = dmcList, annotations = annots) -# #' -# #' # Set the order vectors -# #' cpg_order = c('hg19_cpg_islands','hg19_cpg_shores','hg19_cpg_shelves','hg19_cpg_inter') -# #' dm_order = c('DR','DS','No DM') -# #' -# #' methylSigPlotStatus(myAnnots = myDiff_annotated, annotation_order = cpg_order, status_order = dm_order, -# #' position = 'fill', plot_title = 'DM Status in CpG Annots.', legend_title = 'Annotations', -# #' x_label = 'DM Status', y_label = 'Proportion') -# #' -# #' @export -# methylSigPlotStatus = function(myAnnots, annotation_order = NULL, status_order = NULL, position = 'fill', -# plot_title = 'DM Status', legend_title = 'Annotations', x_label = 'DM Status', y_label = 'Proportion') { -# -# plot = annotatr::plot_categorical( -# annotated_regions = myAnnots, -# x = 'dm_status', -# fill = 'annot.type', -# x_order = status_order, -# fill_order = annotation_order, -# position = position, -# plot_title = plot_title, -# legend_title = legend_title, -# x_label = x_label, -# y_label = y_label) -# -# 
return(plot) -# } -# -# #' Data visualization function -# #' -# #' Generates data visualization plot of methylation data for a specified genomic interval. -# #' -# #' This function offers a unique two-tiered visualization of the methylation data depending on the zoom level. For narrow regions (<1mbp) where at most 500 CpG sites have data reads, users can visualize sample-specific coverage levels and percent methylation at each site, together with group averages, significance levels and a number of genomic annotations. -# #' -# #' @param myAnnots A \code{GRanges} object resulting from running \code{methylSigAnnotation} on the result of \code{methylSigCalc}. -# #' @param annotation_order A character vector which orders and subsets the annotations for the plot. -# #' @param status_order A character vector which orders and subsets the DM status. -# #' @param bin_width A vector of two numbers (from, to) to specify the region to visualize on chromosome `chr'. -# #' @param plot_title The plot label. -# #' @param legend_title The legend label. -# #' @param x_label The x-axis label. -# #' @param y_label The y-axis label. -# #' @param legend_facet_label Label explaining the meaning of the gray bars in the resulting histogram. -# #' @param legend_cum_label Label explaining the meaning of the red outlined bars in the resulting histogram. -# #' -# #' @return A \code{ggplot2} object which can be viewed by calling it, or saved with \code{ggplot2::ggsave}. 
-# #' -# #' @examples -# #' # Annotate the msig_cpgs results -# #' utils::data(data, package = 'methylSig') -# #' -# #' # Use the genome of msig_cpgs and build annotations for CpG features -# #' genome = GenomeInfoDb::genome(msig_cpgs) -# #' annots = annotatr::build_annotations(genome = genome, annotations = paste(genome, c('cpgs'), sep='_')) -# #' -# #' # Decide what counts as differentially methylated -# #' dmcList = msig_cpgs$fdr < 0.05 & abs(msig_cpgs$meth.diff) > 25 -# #' -# #' # Annotate -# #' myDiff_annotated = methylSigAnnotation(myDiff = msig_cpgs, dmcList = dmcList, annotations = annots) -# #' -# #' # Set the order vectors -# #' cpg_order = c('hg19_cpg_islands','hg19_cpg_shores','hg19_cpg_shelves','hg19_cpg_inter') -# #' status_order = c('DR','DS','No DM') -# #' -# #' methylSigPlotDiff(myAnnots = myDiff_annotated, annotation_order = cpg_order, status_order = status_order, -# #' bin_width = 10, plot_title = 'Meth. Diff. in CpG Annots.', x_label = 'DM Status', y_label = 'Proportion', -# #' legend_facet_label = 'Meth. Diff. in Annotation', legend_cum_label = 'Meth. Diff. Overall') -# #' -# #' @export -# methylSigPlotDiff = function(myAnnots, annotation_order = NULL, status_order = NULL, bin_width = 10, -# plot_title = 'Methylation Differences', x_label = 'Methylation Difference', y_label = 'Density', -# legend_facet_label = 'Meth. Diff. in Annotation', legend_cum_label = 'Meth. Diff. 
Overall') { -# -# facet_order = list(annotation_order, status_order) -# -# plot = annotatr::plot_numerical( -# annotated_regions = myAnnots, -# x = 'meth.diff', -# facet = c('annot.type', 'dm_status'), -# facet_order = facet_order, -# bin_width = bin_width, -# plot_title = plot_title, -# x_label = x_label, -# y_label = y_label, -# legend_facet_label = legend_facet_label, -# legend_cum_label = legend_cum_label) -# -# return(plot) -# } diff --git a/R/read.R b/R/read.R deleted file mode 100644 index bf4188e..0000000 --- a/R/read.R +++ /dev/null @@ -1,155 +0,0 @@ -#' Read methylation score files to make a 'BSseq' object. -#' -#' This function reads files created by the Bismark Methylation Extractor, and outputs a \code{BSseq} object. -#' -#' @param fileList Files to be read. These can be \code{cov} or \code{cytosine_reports} from the Bismark Methylation Extractor. See \code{fileType} for details. -#' @param pData A \code{data.frame} containing phenotype information for the samples in \code{fileList}. The \code{row.names} attribute of the \code{data.frame} should match the \code{Sample_Names}. See example below. -#' @param assembly The genome assembly used for alignment. e.g. \code{hg19}, \code{mm10}, etc. -#' @param destranded A logical value indicating whether to destrand the reverse to forward strand. If TRUE, the reads from both will be combined. Default is TRUE. -#' @param maxCount A number indicating the maximum coverage count to be included. -#' @param minCount A number indicating the minimum coverage count to be included. -#' @param filterSNPs A logical value indicating whether or not to filter out C > T SNPs based on the 1000 Genomes Project. NOTE: Only supported when \code{assembly = 'hg19'}. -#' @param num.cores Number of cores to be used in reading files. Default is 1. -#' @param fileType The format of the input file. Either \code{cov} or \code{cytosineReport}. One of the outputs of the Bismark Methylation Extractor. 
-#' @param verbose A logical value indicating whether \code{bsseq::read.bismark} shoud print progress. Default TRUE. -#' -#' @return A \code{BSseq-class} object. -#' -#' @seealso \code{\link{methylSigCalc}} -#' -#' @examples -#' files = c( -#' system.file('extdata', 'MDAMB_231_1DR.txt.gz', package='methylSig'), -#' system.file('extdata', 'MDAMB_231_1DS.txt.gz', package='methylSig'), -#' system.file('extdata', 'MDAMB_231_2DR.txt.gz', package='methylSig'), -#' system.file('extdata', 'MDAMB_231_2DS.txt.gz', package='methylSig'), -#' system.file('extdata', 'MDAMB_231_3DR.txt.gz', package='methylSig'), -#' system.file('extdata', 'MDAMB_231_3DS.txt.gz', package='methylSig')) -#' -#' sample.ids = basename(files) -#' sample.ids = gsub('.txt.gz', '', sample.ids) -#' -#' pData = data.frame( -#' Sample_Names = sample.ids, -#' DR_vs_DS = relevel(factor(c('DR','DS','DR','DS','DR','DS')), ref = 'DS'), -#' row.names = sample.ids, -#' stringsAsFactors = FALSE) -#' -#' meth = methylSigReadData( -#' fileList = files, -#' pData = pData, -#' assembly = 'hg19', -#' destranded = TRUE, -#' maxCount = 500, -#' minCount = 10, -#' filterSNPs = TRUE, -#' num.cores = 1, -#' fileType = 'cytosineReport') -#' -#' @export -methylSigReadData = function( - fileList, - pData, - assembly = NA, - destranded = TRUE, - maxCount = 500, minCount = 10, - filterSNPs = FALSE, - num.cores = 1, - fileType = c("cov", "cytosineReport"), - verbose = TRUE) { - - # NOTE: The cytosine report is 1-based, and GRanges is also 1-based. The result, - # of bsseq::read.bismark() is 1-based. We are 1-based y'all! 
- - fileType = match.arg(fileType) - - # Read - bs = bsseq::read.bismark( - files = fileList, - sampleNames = rownames(pData), - rmZeroCov = TRUE, - strandCollapse = destranded, - fileType = fileType, - mc.cores = num.cores, - verbose = verbose) - - # Assign Seqinfo to bs - # If the assembly is supported, then downstream functions such as tiling and - # annotation should work without issue, assuming that the annotations for - # the same genome don't somehow have different seqlengths. This shouldn't - # happen on-the-fly because GenomeInfoDb::Seqinfo is used. BUT, if saved - # annotations are used that are old, GenomeInfoDb::Seqinfo could result in - # mismatched lengths between Bioc versions - - # If the assembly is not supported, we have to be careful about how the - # Seqinfo is defined. If we try to define it as the end of each present - # chromosome + some width to allow for tiling, we run the risk of not being - # able to perform custom annotations if the annotation Seqinfo doesn't match. - - # If the Seqinfo is left blank, we cannot tile downstream, warn the user. - if(!is.na(assembly)) { - genome_seqinfo = tryCatch({ - GenomeInfoDb::Seqinfo(genome = assembly) - }, error = function(e) { - warning(sprintf('The assembly %s is not supported by GenomeInfoDb::fetchExtendedChromInfoFromUCSC, and the resulting BSseq object from this function will have the trivial seqinfo attributes. In order to use the methylSigTile function, you must create a custom GenomeInfoDb::Seqinfo and assign it to the result of this function. If you would like to use annotation functions downstream, the seqinfo for custom annotations MUST BE THE SAME as what is assigned to the result this function.', assembly)) - seqinfo(bs) - }) - } else { - warning('Leaving assembly as NA will give the resulting BSseq object from this function the trivial seqinfo attributes. 
In order to use the methylSigTile function downstream, you MUST create a custom GenomeInfoDb::Seqinfo and assign it to the result of this function. If you would like to use annotation functions downstream, the seqinfo for custom annotations MUST BE THE SAME as what is assigned to the result of this function.') - - genome_seqinfo = seqinfo(bs) - } - seqinfo(bs) = merge(seqinfo(bs), genome_seqinfo) - - # Filter C > T (+) or G > A (-) SNPs - # SNPs are 1-based - # In order to avoid seqinfo() incompatibility issues between the BSseq - # object and the CT_SNPs_hg19 object - if(filterSNPs) { - if(!is.na(assembly) && assembly == 'hg19') { - message('Filtering SNPs') - utils::data('CT_SNPs_hg19', envir=environment()) - CT_SNPs_hg19 = get('CT_SNPs_hg19') - - overlaps = GenomicRanges::findOverlaps(GenomicRanges::granges(bs), CT_SNPs_hg19, ignore.strand = T) - snp_invalid_list = S4Vectors::queryHits(overlaps) - - bs = bs[-snp_invalid_list] - } else { - message(sprintf('Skipping SNP filtering, genome %s is not supported.', assembly)) - } - } - - # Filter for maxCount and minCount - # Must change both Cov and M because BSseq has some sanity checks - # where 0 <= M <= Cov - cov = as.matrix(bsseq::getCoverage(BSseq = bs, type = 'Cov')) - m = as.matrix(bsseq::getCoverage(BSseq = bs, type = 'M')) - for(j in 1:ncol(cov)) { - count_idx = which(cov[,j] < minCount | cov[,j] > maxCount) - cov[count_idx, j] = 0 - m[count_idx, j] = 0 - } - - # Rebuild the BSseq object after altering the Cov and M counts - bs = bsseq::BSseq(gr = GenomicRanges::granges(bs), M = m, Cov = cov, pData = pData, rmZeroCov = TRUE) - - bs = sort(bs, ignore.strand = TRUE) - - bs_metadata = list( - files = fileList, - assembly = assembly, - destranded = destranded, - maxCount = maxCount, - minCount = minCount, - filterSNPs = filterSNPs, - fileType = fileType, - tile = FALSE, - tiles = NULL, - win.size = NULL - ) - - S4Vectors::metadata(bs) = bs_metadata - - return(bs) -} diff --git a/R/sysdata.rda b/R/sysdata.rda new 
file mode 100644 index 0000000..20ce765 Binary files /dev/null and b/R/sysdata.rda differ diff --git a/R/test_binomial.R b/R/test_binomial.R deleted file mode 100644 index c575610..0000000 --- a/R/test_binomial.R +++ /dev/null @@ -1,129 +0,0 @@ -#' Differential methylation analysis using binomial model -#' -#' This function calculates differential methylation statistics using a binomial-based approach. See `Warning' message below. -#' -#' This function uses a binomial-based model to calculate differential methylation statistics. It is nearly identical to the \code{methylKit::calculateDiffMeth} function in the \code{methylKit} R package except that only the likelihood ratio test and \code{p.adjust()} with \code{method='BH'} are used to calculate significance levels. It is significantly faster than \code{methylKit::calculateDiffMeth} function. -#' -#' @param meth A \code{BSseq-class} object to calculate differential methylation statistics. See \code{methylSigReadData} for how to read in methylation data. -#' @param comparison The name of the column in \code{pData(meth)} to use for the comparisons, with the correct reference level set. -#' @param min.per.group A vector with two numbers specifying the minimum number of samples required to perform the test for differential methylation. If it is a single number, both groups will use it as the minimum requried number of samples. Default is \code{c(3,3)}. -#' -#' @return \code{GRanges} object containing the differential methylation statistics and locations. \code{p.adjust} with \code{method='BH'} option is used for p-value correction. -#' -#' @section Warning: This function does not take into account the variability among samples in each group being compared. 
-#' -#' @seealso \code{\link{methylSigCalc}} -#' -#' @examples -#' utils::data(sample_data, package = 'methylSig') -#' -#' myDiff = binomialDiffCalc(meth = meth, comparison = 'DR_vs_DS') -#' -#' @keywords differentialMethylation -#' -#' @export -binomialDiffCalc <- function( - meth, - comparison, - min.per.group=c(3,3)) { - - if(length(min.per.group) == 1) { - min.per.group = c(min.per.group,min.per.group) - } - - ##################################### - # Get the group labels, NOTE: THIS ASSUMES CORRECT REFERENCE LEVEL SET - pdata = bsseq::pData(meth) - group2 = levels(pdata[, comparison])[2] - group1 = levels(pdata[, comparison])[1] - - # Determine which rows of pData belong to which group - # / which columns of Cov and M matrices belong to which group - group2_idx = which(pdata[,comparison] == group2) - group1_idx = which(pdata[,comparison] == group1) - - ##################################### - # Determine which sites are valid to test according to min.per.group - all_cov = as.matrix(bsseq::getCoverage(meth, type = 'Cov')) - all_meth = as.matrix(bsseq::getCoverage(meth, type = 'M')) - - # Determine which loci satisfy min.per.group - valid_idx = which( - base::rowSums(all_cov[, group2_idx] > 0) >= min.per.group[2] & base::rowSums(all_cov[, group1_idx] > 0) >= min.per.group[1] - ) - - ##################################### - # Resize all_cov and all_meth to valid_idx - all_cov = all_cov[valid_idx,] - all_meth = all_meth[valid_idx,] - - # Setup required quantities for the logLikRatio calculation - treads = rowSums(all_cov - all_meth, na.rm = TRUE) - treads1 = rowSums(all_cov[,group1_idx] - all_meth[,group1_idx], na.rm = TRUE) - treads2 = rowSums(all_cov[,group2_idx] - all_meth[,group2_idx], na.rm = TRUE) - creads = rowSums(all_meth, na.rm = TRUE) - creads1 = rowSums(all_meth[,group1_idx], na.rm = TRUE) - creads2 = rowSums(all_meth[,group2_idx], na.rm = TRUE) - cov = rowSums(all_cov, na.rm = TRUE) - cov1 = rowSums(all_cov[,group1_idx], na.rm=TRUE) - cov2 = 
rowSums(all_cov[,group2_idx], na.rm=TRUE) - - logLikRatio = 2 * (creads1 * log(creads1 / cov1 + 1e-100) - + treads1 * log(treads1 / cov1 + 1e-100) - + creads2 * log(creads2 / cov2 + 1e-100) - + treads2 * log(treads2 / cov2 + 1e-100) - - creads * log(creads / cov + 1e-100) - - treads * log(treads / cov + 1e-100) - ) - - mu1 = (creads1 / cov1)*100 - mu2 = (creads2 / cov2)*100 - meth.diff = mu2 - mu1 - - hyper.direction = ifelse(meth.diff >= 0, group2, group1) - - pvalue = stats::pchisq(logLikRatio, 1, lower.tail=FALSE) - fdr = stats::p.adjust(pvalue, method = 'BH') - - results = data.frame( - 'logLikRatio' = logLikRatio, - 'mu2' = mu2, - 'mu1' = mu1, - 'meth.diff' = meth.diff, - 'hyper.direction' = hyper.direction, - 'pvalue' = pvalue, - 'fdr' = fdr, - stringsAsFactors = FALSE - ) - - - - # Extract the granges of the meth BSseq object and attach the data.frame - results_gr = GenomicRanges::granges(meth)[valid_idx, ] - GenomicRanges::mcols(results_gr) = results - - colnames(GenomicRanges::mcols(results_gr)) = c( - 'logLikRatio', - paste('meth', group2, sep='.'), - paste('meth', group1, sep='.'), - 'meth.diff', - 'hyper.direction', - 'pvalue', - 'fdr' - ) - - # Add metadata - results_metadata = list( - method = 'binomialDiffCalc', - comparison = comparison, - min.per.group = min.per.group - ) - - S4Vectors::metadata(results_gr) = results_metadata - - ############################################################################ - - seqinfo(results_gr) = merge(seqinfo(results_gr), seqinfo(meth)) - - return(results_gr) -} diff --git a/R/test_methylSig.R b/R/test_methylSig.R deleted file mode 100755 index fab6d8f..0000000 --- a/R/test_methylSig.R +++ /dev/null @@ -1,433 +0,0 @@ -#' Default methylSig Weight Function -#' -#' The default weight function used by methylSigCalc -#' -#' @param u A numeric between 0 and 1 -#' -#' @return A \code{GRanges} object containing the following \code{mcols}: -#' -#' @examples -#' methylSig_weightFunc(0.5) -#' -#' @export 
-methylSig_weightFunc <- function(u) (1-u^2)^3 - -# Called by methylSigCalc -methylSig_derivativePhi <- function(phi, lCreads, lTreads, mu, weight) { - derivative = 0 - indicator_c = lCreads > 0 - indicator_t = lTreads > 0 - indicator_ct = lCreads + lTreads > 0 - - if(nrow(lCreads) == 1) { - derivative = - sum( indicator_c * ( mu * (digamma((mu * phi) + lCreads + 1e-100) - digamma(mu * phi + 1e-100)) ) ) + - sum( indicator_t * ((1 - mu) * (digamma( ((1 - mu) * phi) + lTreads + 1e-100) - digamma( ((1-mu) * phi) + 1e-100))) ) - - sum( indicator_ct * (digamma(phi + lCreads + lTreads + 1e-100) - digamma(phi)) ) - } else { - for(g in 1:ncol(lCreads)) { - derivative = derivative + - sum( indicator_c[,g] * (weight * mu[,g] * (digamma(mu[,g] * phi + lCreads[,g] + 1e-100) - digamma(mu[,g] * phi + 1e-100))) ) + - sum( indicator_t[,g] * (weight * (1 - mu[,g]) * (digamma((1 - mu[,g]) * phi + lTreads[,g] + 1e-100) - digamma((1 - mu[,g]) * phi + 1e-100))) ) - - sum( indicator_ct[,g] * (weight * (digamma(phi + lCreads[,g] + lTreads[,g] + 1e-100) - digamma(phi))) ) - } - } - - derivative -} - -# Called by methylSigCalc -methylSig_derivativeMu <- function(mu, lCreads, lTreads, phi, weight) { - derivative = 0 - indicator_c = lCreads > 0 - indicator_t = lTreads > 0 - - if(nrow(lCreads) == 1) { - derivative = - sum( indicator_c * (digamma(mu * phi + lCreads + 1e-100) - digamma(mu * phi + 1e-100)) ) - - sum( indicator_t * (digamma((1 - mu) * phi + lTreads + 1e-100) - digamma((1 - mu) * phi + 1e-100)) ) - } else { - for(g in 1:ncol(lCreads)) { - derivative = derivative + - sum( indicator_c[,g] * (weight * (digamma(mu * phi + lCreads[,g]+ 1e-100) - digamma(mu * phi + 1e-100))) ) - - sum( indicator_t[,g] * (weight * (digamma((1 - mu) * phi + lTreads[,g] + 1e-100) - digamma((1 - mu) * phi + 1e-100))) ) - } - } - - derivative -} - -# Called by methylSig_dataProcess -methylSig_logLik <- function(mu, phi, lCreads, lTreads, weight) { - llik = 0 - indicator_c = lCreads > 0 - indicator_t = 
lTreads > 0 - - if(nrow(lCreads) == 1) { - llik = llik + - sum( indicator_c * (lgamma(mu * phi + lCreads + 1e-100) - lgamma(mu * phi + 1e-100)) ) + - sum( indicator_t * (lgamma((1 - mu) * phi + lTreads + 1e-100) - lgamma((1 - mu) * phi + 1e-100)) ) - } else { - for(g in 1:ncol(lCreads)) { - llik = llik + - sum( indicator_c[,g] * (weight * (lgamma(mu * phi + lCreads[,g] + 1e-100) - lgamma(mu * phi + 1e-100))) ) + - sum( indicator_t[,g] * (weight * (lgamma((1 - mu) * phi + lTreads[,g] + 1e-100) - lgamma((1 - mu) + 1e-100))) ) - } - } - - 2*llik -} - -#' Calculates differential methylation statistics using a Beta-binomial approach. -#' -#' The function calculates differential methylation statistics between two groups of samples. This is the main function of the methylSig package, and the method most users should use to test for DMCs or DMRs. The function uses a Beta-binomial approach to calculate differential methylation statistics, accounting for variation among samples within each group. -#' -#' The function calculates differential methylation statistics between two groups of samples. The function uses Beta-binomial approach to calculate differential methylation statistics, accounting for variation among samples within each group. Users who wish to tile their data and test for differentially methylated regions (DMRs) instead DMCs should first use the \code{\link{methylSigTile}} function before using this function. -#' -#' @param meth A \code{BSseq-class} object to calculate differential methylation statistics. See \code{methylSigReadData} for how to read in methylation data. -#' @param comparison The name of the column in \code{pData(meth)} to use for the comparisons, with the correct reference level set. -#' @param dispersion One of \code{both}, or either group name. Indicates which set of samples to use to estimate the dispersion parameter. Default is \code{both}. 
-#' @param local.info A \code{logical} value indicating whether to use local information to improve mean and dispersion parameter estimations. Default is \code{FALSE}. -#' @param local.winsize An \code{integer} to specify the distance upstream and downstream of a location to include local information for the mean and dispersion parameter estimations. NOTE: An additional constraint is placed whereby a maximum of 5 loci upstream and downstream of the locus of interest are used. Default is \code{200}. -#' @param min.per.group A vector with two numbers specifying the minimum number of samples required to perform the test for differential methylation. If it is a single number, both groups will use it as the minimum requried number of samples. Default is \code{c(3,3)}. NOTE: The ordering of this parameter with respect to the groups should be \code{c(reference, other)}, where \code{reference} refers to the reference level in the \code{pData(meth)[, comparison]} factor. -#' @param weightFunc A weight kernel function. The input of this function is from -1 to 1. The default is the tri-weight kernel function defined as \code{function(u) = (1-u^2)^3}. Function value and range of parameter for weight function should be from 0 to 1. -#' @param T.approx A \code{logical} value indicating whether to use squared t approximation for the likelihood ratio statistics. Chi-square approximation (\code{T.approx = FALSE}) is recommended when the sample size is large. Default is \code{TRUE}. -#' @param num.cores An integer denoting how many cores should be used for differential methylation calculations. -#' -#' @return A \code{GRanges} object containing the following \code{mcols}: -#' \describe{ -#' \item{phiCommonEst}{ The dispersion estimate. } -#' \item{logLikRatio}{ The log likelihood ratio. } -#' \item{df}{ Degrees of freedom used when \code{T.approx = TRUE}. } -#' \item{muEstC_group1}{ Methylation estimate for group1. 
Groups correspond to the levels in the column used for the comparison in \code{pdata(meth)}. } -#' \item{muEstC_group2}{ Methylation estimate for group2. } -#' \item{meth.diff}{ The difference \code{muEstC_group2 - muEstC_group1}. } -#' \item{hyper.direction}{ The group for which the CpG/region is hyper-methylated. Groups correspond to the levels in the column used for the comparison in \code{pdata(meth)}. } -#' \item{pvalue}{ The p-value from the t-test (\code{T.approx = TRUE}) or the Chi-Square test (\code{T.approx = FALSE}). } -#' \item{fdr}{ The Benjamini-Hochberg adjusted p-values using \code{p.adjust(method = 'BH')}. } -#' } -#' -#' @seealso \code{\link{methylSigReadData}} -#' -#' @examples -#' utils::data(sample_data, package = 'methylSig') -#' -#' result = methylSigCalc( -#' meth = meth, -#' comparison = 'DR_vs_DS', -#' dispersion = 'both', -#' local.info = FALSE, -#' local.winsize = 200, -#' min.per.group = c(3,3), -#' weightFunc = methylSig_weightFunc, -#' T.approx = TRUE, -#' num.cores = 1) -#' -#' @keywords differentialMethylation -#' -#' @export -methylSigCalc = function( - meth, - comparison = NA, - dispersion="both", - local.info=FALSE, local.winsize=200, - min.per.group=c(3,3), - weightFunc=methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) { - - ##################################### - # Constants - if(!local.info) { - local.winsize = 0 - } - - if(length(min.per.group) == 1) { - min.per.group = c(min.per.group,min.per.group) - } - - min.disp = 1e-6 - min.InvDisp = 0.001 - max.InvDisp = max(1/max(min.disp, 1e-6), min.InvDisp) - - minMu = 0 - maxMu = 1 - - ##################################### - # Get the group labels, NOTE: THIS ASSUMES CORRECT REFERENCE LEVEL SET - pdata = bsseq::pData(meth) - group2 = levels(pdata[, comparison])[2] - group1 = levels(pdata[, comparison])[1] - - # Determine which rows of pData belong to which group - # / which columns of Cov and M matrices belong to which group - group2_idx = which(pdata[,comparison] == 
group2) - group1_idx = which(pdata[,comparison] == group1) - - # Determine which sample column indexes to use for dispersion calculation - if(dispersion == 'both') { - disp_groups_idx = c(group2_idx, group1_idx) - } else if (dispersion == group2) { - disp_groups_idx = group2_idx - } else if (dispersion == group1) { - disp_groups_idx = group1_idx - } else { - stop('"dispersion" should be one of "both", the name of group2, or the name of group1') - } - - ##################################### - # Determine which sites are valid to test according to min.per.group - all_cov = as.matrix(bsseq::getCoverage(meth, type = 'Cov')) - all_meth = as.matrix(bsseq::getCoverage(meth, type = 'M')) - - # Estimate mu per locus within each group. The same value is used for all samples within the same group. - muEst = matrix(0, ncol = ncol(meth), nrow = nrow(meth)) - muEst[, group2_idx] = base::rowSums(all_meth[, group2_idx]) / (base::rowSums(all_cov[, group2_idx]) + 1e-100) - muEst[, group1_idx] = base::rowSums(all_meth[, group1_idx]) / (base::rowSums(all_cov[, group1_idx]) + 1e-100) - - # Determine which loci satisfy min.per.group - valid_idx = which( - base::rowSums(all_cov[, group2_idx] > 0) >= min.per.group[2] & base::rowSums(all_cov[, group1_idx] > 0) >= min.per.group[1] - ) - - ##################################### - # Resize all_cov, all_meth, and muEst to valid_idx - # Extract the granges of the meth BSseq object - # These are all used within the mclapply below - all_cov = all_cov[valid_idx,] - all_meth = all_meth[valid_idx,] - muEst = muEst[valid_idx,] - meth_gr = GenomicRanges::granges(meth)[valid_idx,] - - num_loci = length(valid_idx) - - ##################################### - # Go through each valid locus - results = do.call(rbind, mclapply(seq_along(valid_idx), function(locus_idx){ - - if(local.winsize != 0) { - # NOTE: It is much faster to work with subsets of the result of start() - # than it is to work with subsets of GRanges. 
- - # Get the indices which are within the local.winsize, but also limit to 5 CpGs on either side - local_loci_idx = intersect( - which(abs(BiocGenerics::start(meth_gr)[locus_idx] - BiocGenerics::start(meth_gr)) < local.winsize), - max(1, locus_idx - 5):min(num_loci, locus_idx + 5)) - - if(length(local_loci_idx) == 1) { - # Do not use local information - local_loci_idx = locus_idx - local_weights = 1 - - # Collect Cov and M matrices for all the loci in the window - # Rows are loci and columns are samples - local_cov = matrix(all_cov[local_loci_idx, ], nrow = 1) - local_meth = matrix(all_meth[local_loci_idx, ], nrow = 1) - - # Collect the correct rows of muEst - local_muEst = matrix(muEst[local_loci_idx, ], nrow = 1) - } else { - # We need to scale the loci in the window onto the interval [-1, 1] because - # that is the domain of the weightFunc. - # This is a vector of the distances of the local loci to the loci of interest (domain) - local_loci_norm = (BiocGenerics::start(meth_gr)[local_loci_idx] - BiocGenerics::start(meth_gr)[locus_idx]) / (local.winsize + 1) - - # Calculate the weights - # Each is a vector of values of the weight function (range) - local_weights = weightFunc(local_loci_norm) - - # Collect Cov and M matrices for all the loci in the window - # Rows are loci and columns are samples - local_cov = all_cov[local_loci_idx, ] - local_meth = all_meth[local_loci_idx, ] - - # Collect the correct rows of muEst - local_muEst = muEst[local_loci_idx, ] - } - } else { - # Do not use local information - local_loci_idx = locus_idx - local_weights = 1 - - # Collect Cov and M matrices for all the loci in the window - # Rows are loci and columns are samples - local_cov = matrix(all_cov[local_loci_idx, ], nrow = 1) - local_meth = matrix(all_meth[local_loci_idx, ], nrow = 1) - - # Collect the correct rows of muEst - local_muEst = matrix(muEst[local_loci_idx, ], nrow = 1) - } - - # Convert to old methylSig notion of C reads (methylated) and T reads (unmethylated) - # 
Then we can reuse the derivative and log likelihood functions Yongseok implemented. - # These are matrices. Rows are loci and columns are samples - local_creads = local_meth - local_treads = local_cov - local_meth - - # Compute the degrees of freedom for the locus - if(dispersion == 'both') { - df_subtract = 2 - } else { - df_subtract = 1 - } - df = pmax(rowSums(local_cov[, disp_groups_idx, drop = FALSE] > 0) - df_subtract, 0) - # Compute the degrees of freedom to be used in the test for differential methylation - df = sum(df * local_weights) - - if(df > 1) { - ### Common dispersion calculation - # This returns a singleton numeric - if(methylSig_derivativePhi( - phi = max.InvDisp, - lCreads = local_creads[, disp_groups_idx, drop = FALSE], - lTreads = local_treads[, disp_groups_idx, drop = FALSE], - mu = local_muEst[, disp_groups_idx, drop = FALSE], - weight = local_weights) >= 0) { - - phiCommonEst = max.InvDisp - } else if(methylSig_derivativePhi( - phi = min.InvDisp, - lCreads = local_creads[, disp_groups_idx, drop = FALSE], - lTreads = local_treads[, disp_groups_idx, drop = FALSE], - mu = local_muEst[, disp_groups_idx, drop = FALSE], - weight = local_weights) <= 0){ - - phiCommonEst = min.InvDisp - } else { - phiCommonEst = stats::uniroot( - f = methylSig_derivativePhi, - interval = c(min.InvDisp, max.InvDisp), - local_creads[, disp_groups_idx, drop = FALSE], - local_treads[, disp_groups_idx, drop = FALSE], - local_muEst[, disp_groups_idx, drop = FALSE], - local_weights)$root - } - - ### Common group means calculation - # This returns a numeric vector (group1, group2, group1 + group2) with the mu estimate - muEstC_groups = list(group1_idx, group2_idx, c(group1_idx, group2_idx)) - muEstC = rep(0, length(muEstC_groups)) - for(group_idx in seq_along(muEstC_groups)) { - if(sum(local_creads[, muEstC_groups[[group_idx]], drop = FALSE]) == 0) { - # If there are no local C reads, methylation is 0 - muEstC[group_idx] = 0 - } else if (sum(local_treads[, 
muEstC_groups[[group_idx]], drop = FALSE]) == 0) { - # If there are no local T reads, methylation is 1 - muEstC[group_idx] = 1 - } else { - # Otherwise, do something fancier - muEstC[group_idx] = stats::uniroot( - f = methylSig_derivativeMu, - interval = c(minMu, maxMu), - local_creads[, muEstC_groups[[group_idx]], drop = FALSE], - local_treads[, muEstC_groups[[group_idx]], drop = FALSE], - phiCommonEst, - local_weights)$root - } - } - - ### log Likelihood ratio calculation - logLikRatio = - methylSig_logLik( - mu = muEstC[1], - phi = phiCommonEst, - lCreads = local_creads[, group1_idx, drop = FALSE], - lTreads = local_treads[, group1_idx, drop = FALSE], - weight = local_weights) + - methylSig_logLik( - mu = muEstC[2], - phi = phiCommonEst, - lCreads = local_creads[, group2_idx, drop = FALSE], - lTreads = local_treads[, group2_idx, drop = FALSE], - weight = local_weights) - - methylSig_logLik( - mu = muEstC[3], - phi = phiCommonEst, - lCreads = local_creads[, c(group1_idx, group2_idx), drop = FALSE], - lTreads = local_treads[, c(group1_idx, group2_idx), drop = FALSE], - weight = local_weights) - - locus_data = c( - phiCommonEst = phiCommonEst, - logLikRatio = logLikRatio, - muEstC_group1 = muEstC[1]*100, - muEstC_group2 = muEstC[2]*100, - muEstC_group12 = muEstC[3]*100, - df = df + 2) - } else { - # Not enough degrees of freedom, return NAs - locus_data = c( - phiCommonEst = NA, - logLikRatio = NA, - muEstC_group1 = NA, - muEstC_group2 = NA, - muEstC_group12 = NA, - df = NA) - } - - return(locus_data) - }, mc.cores = num.cores)) - - results_gr = meth_gr - S4Vectors::mcols(results_gr) = results - - if(T.approx) { - results_gr$pvalue = stats::pt(-sqrt(pmax(results_gr$logLikRatio, 0)), results_gr$df) * 2 - } else { - results_gr$pvalue = stats::pchisq(pmax(results_gr$logLikRatio, 0), 1, lower.tail = F) - } - - # Set any methylation difference less than 0.01 to 0 - results_gr$meth.diff = (results_gr$muEstC_group2 - results_gr$muEstC_group1) - 
results_gr$meth.diff[abs(results_gr$meth.diff) < 0.01] = 0 - results_gr$meth.diff = as.numeric(results_gr$meth.diff) - - results_gr$fdr = stats::p.adjust(results_gr$pvalue, method = 'BH') - - results_gr$hyper.direction = ifelse(results_gr$meth.diff >= 0, group2, group1) - - col_order = c( - 'phiCommonEst', - 'logLikRatio', - 'df', - 'muEstC_group2', - 'muEstC_group1', - 'meth.diff', - 'hyper.direction', - 'pvalue', - 'fdr' - ) - - S4Vectors::mcols(results_gr) = S4Vectors::mcols(results_gr)[, col_order] - - colnames(GenomicRanges::mcols(results_gr)) = c( - 'variance.est', - 'logLikRatio', - 'df', - paste('meth', group2, sep='.'), - paste('meth', group1, sep='.'), - 'meth.diff', - 'hyper.direction', - 'pvalue', - 'fdr' - ) - - results_metadata = list( - method = 'methylSigCalc', - comparison = comparison, - dispersion = dispersion, - local.info = local.info, - local.winsize = local.winsize, - min.per.group = min.per.group, - weightFunc = weightFunc, - T.approx = T.approx - ) - - S4Vectors::metadata(results_gr) = results_metadata - - ############################################################################ - - seqinfo(results_gr) = merge(seqinfo(results_gr), seqinfo(meth)) - - return(results_gr) -} diff --git a/R/test_multifactor.R b/R/test_multifactor.R deleted file mode 100644 index 9b48a1a..0000000 --- a/R/test_multifactor.R +++ /dev/null @@ -1,208 +0,0 @@ -#' Calculates differential methylation statistics under general experimental design -#' -#' @param meth A \code{BSseq-class} object to calculate differential methylation statistics. See \code{methylSigReadData} for how to read in methylation data. -#' @param design A \code{data.frame} for experimental design. Should contain as many rows as there are columns (samples) in \code{meth}. -#' @param formula A formula for the linear model. It should refer to column names from \code{design}. NOTE: The intercept is included by default if omitted. One can omit the intercept with a formula such as \code{'~ 0 + group'}. 
For clarity, it helps to include the intercept explicitly as in \code{'~ 1 + group'}. -#' @param contrast A contrast matrix for hypothesis testing. The number of rows should match the number of columns \code{design}. -#' @param group.term A string indicating which term in \code{formula} contains group information on which to apply the \code{min.per.group} parameter. Currently assumes that this factor contains ONLY TWO LEVELS. -#' @param min.per.group A vector with two numbers specifying the minimum number of samples required to perform the test for differential methylation. If it is a single number, both groups will use it as the minimum requried number of samples. Default is \code{c(3,3)}. NOTE: The ordering of this parameter with respect to the groups should be \code{c(reference, other)}, where \code{reference} refers to the reference level in the \code{design[, group.term]} factor. -#' -#' @return A \code{GRanges} object containing the following \code{mcols}: -#' \describe{ -#' \item{stat}{ The dispersion estimate. } -#' \item{pvalue}{ The log likelihood ratio. } -#' \item{fdr}{ Degrees of freedom used when \code{T.approx = TRUE}. } -#' \item{mean methylation columns}{ Mean methylation for each factor in each column of design, when there are fewer than 5 factors in the factor. 
} -#' } -#' -#' @seealso \code{\link{methylSigReadData}} -#' -#' @examples -#' utils::data(sample_data, package = 'methylSig') -#' -#' # Example with implicit intercept -#' design1 = data.frame(group = bsseq::pData(meth)$DR_vs_DS) -#' contrast1 = matrix(c(0,1), ncol = 1) -#' result1 = methylSigDSS( -#' meth = meth, -#' design = design1, -#' formula = '~ group', -#' contrast = contrast1, -#' group.term = 'group', -#' min.per.group=c(3,3)) -#' -#' # Example with subject pairing -#' design2 = data.frame( -#' group = bsseq::pData(meth)$DR_vs_DS, -#' subject = factor(c(1,1,2,2,3,3))) -#' contrast2 = matrix(c(0,1,0,0), ncol = 1) -#' result2 = methylSigDSS( -#' meth = meth, -#' design = design2, -#' formula = '~ group + subject', -#' contrast = contrast2, -#' group.term = 'group', -#' min.per.group=c(3,3)) -#' -#' @keywords differentialMethylation -#' -#' @export -methylSigDSS = function( - meth, - design, - formula, - contrast, - group.term, - min.per.group=c(3,3)) { - - ##################################### - - # Determine the formula components so we only return mean methylation information - # for items in the formula intersected with criteria below - # TO DO: Add support for interaction terms - formula_components = unlist(strsplit(formula, '[+]')) - formula_components = gsub(' ', '', formula_components) - formula_components = gsub('~1', '', formula_components) - formula_components = gsub('~', '', formula_components) - formula_components = formula_components[formula_components != '0'] - formula_components = formula_components[formula_components != ''] - - ############################################################################ - # Filter according to min.per.group parameter - if(length(min.per.group) == 1) { - min.per.group = c(min.per.group,min.per.group) - } - - ##################################### - # Get the group labels, NOTE: THIS ASSUMES CORRECT REFERENCE LEVEL SET - group2 = levels(design[, group.term])[2] - group1 = levels(design[, group.term])[1] - - # 
Determine which rows of pData belong to which group - # / which columns of Cov and M matrices belong to which group - group2_idx = which(design[, group.term] == group2) - group1_idx = which(design[, group.term] == group1) - - ##################################### - # Determine which sites are valid to test according to min.per.group - all_cov = as.matrix(bsseq::getCoverage(meth, type = 'Cov')) - - # Determine which loci satisfy min.per.group - valid_idx = which( - base::rowSums(all_cov[, group2_idx] > 0) >= min.per.group[2] & base::rowSums(all_cov[, group1_idx] > 0) >= min.per.group[1] - ) - - meth = meth[valid_idx] - - ############################################################################ - # Test for differential methylation - - dss_fit = DSS::DMLfit.multiFactor(BSobj = meth, design = design, formula = stats::as.formula(formula), smoothing=FALSE) - - # Need to remove any rows with NAs because the contrast will fail - # design; fit: beta, var.beta; formula; gr; X - na_idx = which(is.na(dss_fit$fit$beta[,1])) - if(length(na_idx) > 0) { - dss_fit$fit$beta = dss_fit$fit$beta[-na_idx, ] - dss_fit$fit$var.beta = dss_fit$fit$var.beta[-na_idx, ] - dss_fit$gr = dss_fit$gr[-na_idx] - } - - test_result = DSS::DMLtest.multiFactor(DMLfit = dss_fit, Contrast = contrast) - - # Create the GRanges return object and harmonize column names with methylSigCalc() - results_gr = dss_fit$gr - GenomicRanges::mcols(results_gr) = test_result[,c('stat','pvals','fdrs')] - colnames(GenomicRanges::mcols(results_gr)) = c('stat','pvalue','fdr') - - # Retain metadata from dss_fit, design, formula, and contrast - results_gr_metadata = list( - method = 'methylSigDSS', - design = design, - formula = formula, - contrast = contrast, - beta_fit = dss_fit$fit$beta, - var_beta_fit = dss_fit$fit$var.beta, - X = dss_fit$X - ) - S4Vectors::metadata(results_gr) = results_gr_metadata - - ############################################################################ - # Recover group mean methylation - - # 
Construct methylation means matrix to add as mcols to results_gr - all_meth = as.matrix(bsseq::getCoverage(meth, type = 'M')) - all_cov = as.matrix(bsseq::getCoverage(meth, type = 'Cov')) - perc_meth = (all_meth / all_cov)*100 - if(length(na_idx) > 0) { - perc_meth = perc_meth[-na_idx, ] - } - - ##################################### - # Collect the correct columns of dss_fit$design to use for column groups - - # Want the column to be a factor - factor_cols = sapply(dss_fit$design, class) == 'factor' - # Want the column to have 5 or fewer levels (if not factor returns FALSE) - level_cols = sapply(lapply(dss_fit$design, levels), length) <= 5 - # Which column names are valid (above two criteria plus in formula)? - char_idx = intersect(names(which(factor_cols & level_cols)), formula_components) - - ##################################### - - # Determine the column indices for each factor level in char_idx - col_idxs = lapply(char_idx, function(idx){ - col = dss_fit$design[, idx] - groups = unique(as.character(col)) - tmp = lapply(groups, function(group){ - which(col == group) - }) - names(tmp) = groups - - return(tmp) - }) - names(col_idxs) = char_idx - - ##################################### - - # Take the row means over the selected columns - row_means = lapply(col_idxs, function(group){ - sapply(group, function(idxs){ - base::rowMeans(perc_meth[, idxs], na.rm = TRUE) - }) - }) - - means_df = Reduce(cbind, row_means) - colnames(means_df) = paste('meth', colnames(means_df), sep= '.') - - ##################################### - - # Add meth difference based on group.term - - means_df = cbind( - meth.diff = means_df[, paste('meth', group2, sep='.')] - means_df[, paste('meth', group1, sep='.')], - means_df) - - ##################################### - - # Add the mean methylations to the results_gr - GenomicRanges::mcols(results_gr) = cbind(means_df, GenomicRanges::mcols(results_gr)) - results_gr$hyper.direction = ifelse(results_gr$meth.diff >= 0, group2, group1) - - 
col_order = c( - 'stat', - setdiff(colnames(means_df), 'meth.diff'), - 'meth.diff', - 'hyper.direction', - 'pvalue', - 'fdr' - ) - - GenomicRanges::mcols(results_gr) = GenomicRanges::mcols(results_gr)[, col_order] - - ############################################################################ - - seqinfo(results_gr) = merge(seqinfo(results_gr), seqinfo(meth)) - - return(results_gr) -} diff --git a/R/tfbs_enrichment.R b/R/tfbs_enrichment.R deleted file mode 100644 index acdd693..0000000 --- a/R/tfbs_enrichment.R +++ /dev/null @@ -1,90 +0,0 @@ -#' Perform transcription factor enrichment test among differentially methylated cytosines or regions -#' -#' This function tests for enriched transcription binding sites among differentially methylated sites or regions using a binomial test. -#' -#' Likelihood ratio test is used based on the binomial distribution. -#' -#' @param myDiff \code{GRanges} object resulting from \code{methylSigCalc} that contains all CpG sites that are tested for differential methylation. -#' @param dmcList A \code{logical} of the same length as \code{myDiff} defining the DMCs or DMRs. -#' @param tfbsInfo A \code{GRanges} object of the genomic regions representing peaks. The \code{name} column should indicate which TF the peak is for. -#' -#' @return A \code{data.frame} whose \code{rownames} are inherited from the \code{name} column of the input BED file, and whose columns are: -#' \describe{ -#' \item{n_total_by_tf}{ The number of tested CpGs in a TFBS for a TF. } -#' \item{n_dmc_by_tf}{ The number of DM CpGs in a TFBS for a TF. } -#' \item{N_total}{ The total number of tested CpGs in a TFBS across all the TFs. } -#' \item{N_dmc}{ The total number of DM CpGs in a TFBS across all the TFs. } -#' \item{p_total}{ \code{n_total_by_tf} / \code{N_total}, used in the likelihood calculation. } -#' \item{p_dmc}{ \code{n_dmc_by_tf} / \code{N_dmc}, used in the likelihood calculation. } -#' \item{logLik}{ The log-likelihood based on the binomial distribution. 
} -#' \item{pvalue}{ The p-value from the likelihood ratio test. } -#' } -#' -#' @examples -#' utils::data(sample_data, package = 'methylSig') -#' -#' dmcList = msig_cpgs$fdr < 0.05 & abs(msig_cpgs$meth.diff) > 25 -#' -#' methylSig.tfbsEnrichTest(myDiff = msig_cpgs, dmcList = dmcList, tfbsInfo = tfbs) -#' -#' @export -methylSig.tfbsEnrichTest <- function(myDiff, dmcList, tfbsInfo) { - # NOTE: All notation is relative to the paper - - # Create GRangesList of tfbsInfo based on 'name' - tfbs_by_tf = split(tfbsInfo, f = tfbsInfo$name) - - # Subset myDiff by the dmcList - dmcs = myDiff[dmcList] - - # Overlap tested CpGs and DMCs with TFBSs, respectively - tested_overlaps = GenomicRanges::findOverlaps(tfbsInfo, myDiff) - dmc_overlaps = GenomicRanges::findOverlaps(tfbsInfo, dmcs) - - # Determine the total number of CpGs and DMCs in TFBSs, respectively - N_total = length(unique(S4Vectors::subjectHits(tested_overlaps))) - N_dmc = length(unique(S4Vectors::subjectHits(dmc_overlaps))) - - # Per TF overlaps - by_tf = do.call(rbind, lapply(tfbs_by_tf, function(tf) { - tested_overlaps_tf = findOverlaps(tf, myDiff) - dmc_overlaps_tf = findOverlaps(tf, dmcs) - - if(length(tested_overlaps_tf) == 0) { - return(c('n_total_by_tf' = 0, 'n_dmc_by_tf' = 0)) - } else { - return(c( - 'n_total_by_tf' = length(unique(S4Vectors::subjectHits(tested_overlaps_tf))), - 'n_dmc_by_tf' = length(unique(S4Vectors::subjectHits(dmc_overlaps_tf))) - )) - } - })) - by_tf = data.frame(by_tf) - - n_total_by_tf = by_tf$n_total_by_tf - n_dmc_by_tf = by_tf$n_dmc_by_tf - - p_total = n_total_by_tf / N_total - p_dmc = n_dmc_by_tf / N_dmc - - # Spacing is to help figure out what's grouped together - logLik = 2 * ( n_dmc_by_tf * log( pmax(p_dmc, 1e-100) / p_total ) + (N_dmc - n_dmc_by_tf) * log( pmax(1 - p_dmc, 1e-100) / (1 - p_total) ) ) - - pvalue = stats::pchisq(logLik, 1, lower.tail=FALSE) - - by_tf$N_total = N_total - by_tf$N_dmc = N_dmc - by_tf$p_total = p_total - by_tf$p_dmc = p_dmc - by_tf$logLik = logLik - 
by_tf$pvalue = pvalue - - # Why this step? - by_tf[p_dmc < p_total, 'pvalue'] = 1 - - by_tf = subset(by_tf, !is.na(by_tf$pvalue)) - - by_tf = by_tf[order(by_tf$pvalue), ] - - return(by_tf) -} diff --git a/R/tile.R b/R/tile.R deleted file mode 100644 index 36934e2..0000000 --- a/R/tile.R +++ /dev/null @@ -1,62 +0,0 @@ -#' Obtain tiled methylation data in non-overlapping continuous windows. -#' -#' This function summarizes methylation data within tiles or user-specified regions. For all CpGs within an intersecting genomic region, the coverage and methylation reads are summed. This is used prior to the \code{\link{methylSigCalc}} function when the user prefers to conduct a tiled analysis instead of a base specific analysis for differential methylation. Tiling may provide higher power to detect significant differences, especially for experiments with low coverage. -#' -#' @param meth A \code{BSseq-class} object, as from \code{methylSigReadData}. -#' @param tiles One of \code{NULL}, a \code{data.frame}, or a \code{GRanges} object. If not \code{NULL}, the regions should be non-overlapping. Those CpG sites not belonging to any tile will be removed from tiled data. -#' @param win.size An \code{integer} indicating the desired window size in bps. Default is 200. Used only when \code{tiles = NULL}. -#' -#' @return A \code{BSseq-class} object. -#' -#' @examples -#' utils::data(sample_data, package = 'methylSig') -#' methTile = methylSigTile(meth, tiles = NULL, win.size = 200) -#' -#' @export -methylSigTile <- function(meth, tiles = NULL, win.size = 200) { - if(!is(meth, 'BSseq')) { - stop("'meth' must be a BSseq object.") - } - - if(mean(BiocGenerics::width(meth)) != 1) { - stop("It appears that 'meth' is not CpG resolution. 
Tiling can only be done on CpG resolution data.") - } - - # Check for tiles possibilities - if(is.null(tiles)) { - # If the seqlengths aren't defined, remind the user to create a custom GenomeInfoDb::Seqinfo and assign it to meth - if(any(is.na(GenomeInfoDb::seqlengths(meth)))) { - stop("The seqinfo for 'meth' is ill-defined, with seqlengths being NA. In order to use the methylSigTile function, you should create a custom GenomeInfoDb::Seqinfo and assign it to 'meth'.") - } - - seqlevels_in_use = seqlengths(meth)[seqlevelsInUse(meth)] - tiles = GenomicRanges::tileGenome(seqlevels_in_use, tilewidth = win.size, cut.last.tile.in.chrom = TRUE) - seqinfo(tiles) = merge(seqinfo(tiles), seqinfo(meth)) - } else if (is(tiles, 'data.frame')) { - tiles = GenomicRanges::makeGRangesFromDataFrame(tiles, keep.extra.columns = FALSE) - seqinfo(tiles) = merge(seqinfo(tiles), seqinfo(meth)) - } else if (is(tiles, 'GRanges')) { - tiles = GenomicRanges::granges(tiles) - seqinfo(tiles) = merge(seqinfo(tiles), seqinfo(meth)) - } - - # Subset tiles based on findOverlaps to save some work downstream - overlaps = GenomicRanges::findOverlaps(query = tiles, subject = meth) - tile_idx = S4Vectors::queryHits(overlaps) - tiles = tiles[unique(tile_idx)] - - tiled_M = as.matrix(bsseq::getCoverage(BSseq = meth, regions = tiles, what = "perRegionTotal", type = 'M')) - tiled_M[is.na(tiled_M)] = 0 - tiled_Cov = as.matrix(bsseq::getCoverage(BSseq = meth, regions = tiles, what = "perRegionTotal", type = 'Cov')) - tiled_Cov[is.na(tiled_Cov)] = 0 - - tiled_bsseq = bsseq::BSseq(gr = tiles, M = tiled_M, Cov = tiled_Cov, pData = bsseq::pData(meth), rmZeroCov = TRUE) - - S4Vectors::metadata(tiled_bsseq) = S4Vectors::metadata(meth) - S4Vectors::metadata(tiled_bsseq)$tile = TRUE - S4Vectors::metadata(tiled_bsseq)$tiles = ifelse(is.null(tiles), 'windows', 'custom') - S4Vectors::metadata(tiled_bsseq)$win.size = win.size - S4Vectors::metadata(tiled_bsseq)$cpgs.per.tile = as.numeric(table(tile_idx)) - - 
return(tiled_bsseq) -} diff --git a/R/tile_by_regions.R b/R/tile_by_regions.R new file mode 100644 index 0000000..0aa00cc --- /dev/null +++ b/R/tile_by_regions.R @@ -0,0 +1,61 @@ +#' Group cytosine / CpG level data into regions based on genomic regions +#' +#' An optional function to aggregate cytosine / CpG level data into regions based on a \code{GRanges} set of genomic regions. +#' +#' @param bs a \code{BSseq} object. +#' @param gr a \code{GRanges} object. +#' +#' @return A \code{BSseq} object with loci of regions matching \code{gr}. Coverage and methylation read count matrices are aggregated by the sums of the cytosines / CpGs in the regions per sample. +#' +#' @examples +#' data(bsseq_stranded, package = 'methylSig') +#' regions = GenomicRanges::GRanges( +#' seqnames = c('chr1','chr1','chr1'), +#' ranges = IRanges::IRanges( +#' start = c(5,35,75), +#' end = c(30,70,80) +#' ) +#' ) +#' tiled = tile_by_regions(bs = bsseq_stranded, gr = regions) +#' +#' @export +tile_by_regions = function(bs, gr) { + + if (missing(bs)) { + stop('Must pass bs as a BSseq object.') + } + if (missing(gr)) { + stop('Must pass gr as a GRanges object.') + } + if (!is(bs, 'BSseq')) { + stop('bs must be class BSseq.') + } + if (!is(gr, 'GRanges')) { + stop('gr must be class GRanges.') + } + + ##################################### + + cov = DelayedArray::DelayedArray(bsseq::getCoverage(bs, regions = gr, type = 'Cov', what = 'perRegionTotal')) + + if (all(is.na(cov))) { + stop('No regions overlap between bs and gr') + } + + meth = DelayedArray::DelayedArray(bsseq::getCoverage(bs, regions = gr, type = 'M', what = 'perRegionTotal')) + + # Set all NA entries to 0 so bsseq::BSseq doesn't throw an error + # These will likely end up removed in filter_loci_by_group_coverage() + cov[is.na(cov)] = 0 + meth[is.na(meth)] = 0 + + bs = bsseq::BSseq( + Cov = cov, + M = meth, + gr = gr, + pData = pData(bs), + sampleNames = sampleNames(bs) + ) + + return(bs) +} diff --git a/R/tile_by_windows.R 
b/R/tile_by_windows.R new file mode 100644 index 0000000..7857d33 --- /dev/null +++ b/R/tile_by_windows.R @@ -0,0 +1,49 @@ +#' Group cytosine / CpG level data into regions based on genomic windows +#' +#' An optional function to aggregate cytosine / CpG level data into regions based on a tiling of the genome by \code{win_size}. +#' +#' @param bs a \code{BSseq} object. +#' @param win_size an \code{integer} indicating the size of the tiles. Default is 200bp. +#' +#' @return A \code{BSseq} object with loci consisting of a tiling of the genome by \code{win_size} bp tiles. Coverage and methylation read count matrices are aggregated by the sums of the cytosines / CpGs in the regions per sample. +#' +#' @examples +#' data(bsseq_stranded, package = 'methylSig') +#' +#' tiled = tile_by_windows(bs = bsseq_stranded, win_size = 50) +#' +#' @export +tile_by_windows = function(bs, win_size = 200) { + + if (missing(bs)) { + stop('Must pass bs as a BSseq object.') + } + if (!is(bs, 'BSseq')) { + stop('bs must be class BSseq.') + } + if (!is(win_size, 'numeric')) { + stop('win_size must be an integer') + } + + ##################################### + + # Determine maximum position per chromosome in use, and add win_size + seqlevels_in_use = GenomeInfoDb::seqlevelsInUse(bs) + seqlengths = vapply(seqlevels_in_use, function(chr) { + gr_tmp = granges(bs) + chr_length = max(end(gr_tmp[seqnames(gr_tmp) == chr])) + win_size + return(chr_length) + }, 1) + + gr = GenomicRanges::tileGenome( + seqlengths = seqlengths, + tilewidth = win_size, + cut.last.tile.in.chrom = TRUE) + + bs = tile_by_regions(bs = bs, gr = gr) + + # To avoid downstream issues with seqinfo mismatches reset lengths to NA + seqlengths(bs) = NA + + return(bs) +} diff --git a/README.md b/README.md index 620044b..d643792 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,317 @@ -# methylSig +# methylSig -[![Build Status](https://travis-ci.org/sartorlab/methylSig.svg?branch=master)](https://travis-ci.org/sartorlab/methylSig) 
[![Coverage Status](https://coveralls.io/repos/github/sartorlab/methylSig/badge.svg?branch=master)](https://coveralls.io/github/sartorlab/methylSig?branch=master) + +[![Travis build status](https://travis-ci.org/sartorlab/methylSig.svg?branch=master)](https://travis-ci.org/sartorlab/methylSig) +[![Coveralls test coverage](https://coveralls.io/repos/github/sartorlab/methylSig/badge.svg)](https://coveralls.io/r/sartorlab/methylSig?branch=master) + -A whole genome DNA methylation analysis pipeline. +# Introduction -## Description +DNA methylation plays critical roles in gene regulation and cellular specification without altering DNA sequences. It is one of the best understood and most intensively studied epigenetic marks in mammalian cells. Treatment of DNA with sodium bisulfite deaminates unmethylated cytosines to uracil while methylated cytosines are resistant to this conversion thus allowing for the discrimination between methylated and unmethylated CpG sites. Sodium bisulfite pre-treatment of DNA coupled with next-generation sequencing has allowed DNA methylation to be studied quantitatively and genome-wide at single cytosine site resolution. -MethylSig is a method for testing for differentially methylated cytosines (DMCs) or regions (DMRs) in whole-genome bisulfite sequencing (bis-seq) or reduced representation bisulfite sequencing (RRBS) experiments. MethylSig uses a beta binomial model to test for significant differences between groups of samples. Several options exist for either site-specific or sliding window tests, combining strands, and for variance estimation. It allows annotating the resulting regions to multiple genome features, and visualizing the results for chosen genomic regions along with supporting genomic information. +`methylSig` is a method for testing for differentially methylated cytosines (DMCs) or regions (DMRs) in whole-genome bisulfite sequencing (WGBS) or reduced representation bisulfite sequencing (RRBS) experiments.
`methylSig` uses a beta-binomial model to test for significant differences between groups of samples. Several options exist for either site-specific or sliding window tests, combining strands, and for variance estimation. -## Installation +# Installation -MethylSig is not currently on CRAN or Bioconductor. Installation is easiest with [devtools](http://cran.r-project.org/web/packages/devtools/index.html). +`methylSig` is available on GitHub at <https://github.com/sartorlab/methylSig>, and the easiest way to install it is as follows: -```R -library(devtools) -install_github('sartorlab/methylSig') +```{r install, eval=FALSE} +devtools::install_github('sartorlab/methylSig') ``` -## Citation +# Usage -Yongseok Park, Maria E. Figueroa, Laura S. Rozek, and Maureen A. Sartor, MethylSig: a whole genome DNA methylation analysis pipeline, *Bioinformatics* (2014) 30 (17): 2414-2422, doi:[10.1093/bioinformatics/btu339](http://bioinformatics.oxfordjournals.org/content/30/17/2414) +The basic flow of analysis with `methylSig` is to: + +* Read data +* Optionally filter data by coverage and/or location +* Optionally aggregate data into regions +* Optionally filter data by coverage in a minimum number of samples per group +* Test for differential methylation + +The sections below walk through each step with small test data. + +## Reading Data + +Methylation calls output by either [MethylDackel](https://github.com/dpryan79/MethylDackel#single-cytosine-methylation-metrics-extraction) or [Bismark](https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-coverage-output-looks-like-this-tab-delimited-1-based-genomic-coords) can be read by the `bsseq::read.bismark()` function from the [`bsseq`](https://www.bioconductor.org/packages/release/bioc/html/bsseq.html) R/Bioconductor package.
+ +This function accepts `bedGraph`s from [MethylDackel](https://github.com/dpryan79/MethylDackel#single-cytosine-methylation-metrics-extraction) and either the coverage or genome-wide cytosine reports from [Bismark](https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-coverage-output-looks-like-this-tab-delimited-1-based-genomic-coords). Options to consider when reading data are: + +* `colData`, a `data.frame` or `DataFrame` whose rows are samples and columns are phenotype data. The row ordering should match the ordering of files in `files`. This matrix will be needed for downstream differential methylation testing. +* `strandCollapse`, a `logical` (`TRUE`/`FALSE`) indicating whether or not to collapse +/- CpG data onto the + strand. Note, this can only be `TRUE` when the input type is the genome-wide cytosine report from Bismark. MethylDackel has an option to destrand data when methylation calls are made so that the output is already destranded. In this case, `strandCollapse` should be `FALSE`. + +For all options, see the `bsseq` [reference manual](https://www.bioconductor.org/packages/release/bioc/manuals/bsseq/man/bsseq.pdf), and the [section on reading data](https://www.bioconductor.org/packages/release/bioc/vignettes/bsseq/inst/doc/bsseq.html#4_reading_data) in the package vignette. + +```{r read} +files = c( + system.file('extdata', 'bis_cov1.cov', package='methylSig'), + system.file('extdata', 'bis_cov2.cov', package='methylSig') +) + +bsseq_stranded = bsseq::read.bismark( + files = files, + colData = data.frame(row.names = c('test1','test2')), + rmZeroCov = FALSE, + strandCollapse = FALSE +) +``` + +The result is a `BSseq` object. 
Aspects of the object can be accessed via: + +```{r bsseq_access} +# pData +bsseq::pData(bsseq_stranded) + +# GRanges +GenomicRanges::granges(bsseq_stranded) + +# Coverage matrix +bsseq::getCoverage(bsseq_stranded, type = 'Cov') + +# Methylation matrix +bsseq::getCoverage(bsseq_stranded, type = 'M') +``` + +## Filtering Data + +After data is loaded, it is good practice to filter loci that have too few or too many reads, and C-to-T and G-to-A SNPs which confound bisulfite conversion. + +### By Coverage + +Low coverage loci (typically those with fewer than 5 reads) should be marked because they adversely affect the variance calculation in downstream differential methylation tests. Very high coverage loci (typically those with more than 500 reads) are likely the result of PCR duplication, and should also be marked. + +`MethylSig` marks such sites by setting their coverage and methylation matrix entries to 0 for each sample in which this happens. Prior to testing, these sites can be removed, see below. + +```{r filter_by_coverage} +# Load data for use in the rest of the vignette +data(BS.cancer.ex, package = 'bsseqData') +bs = BS.cancer.ex[1:10000] + +bs = filter_loci_by_coverage(bs, min_count = 5, max_count = 500) +``` + +### By Location + +As noted above, locations with C-to-T and G-to-A SNPs confound bisulfite conversion in WGBS and ERRBS. Filtering them out can be accomplished by constructing a `GRanges` object with their location. For now, we leave locating such SNPs to the user. 
+ +```{r filter_by_location} +# Show locations of bs +GenomicRanges::granges(bs) + +# Construct GRanges object +remove_gr = GenomicRanges::GRanges( + seqnames = c('chr21', 'chr21', 'chr21'), + ranges = IRanges::IRanges( + start = c(9411552, 9411784, 9412099), + end = c(9411552, 9411784, 9412099) + ) +) + +bs = filter_loci_by_location(bs = bs, gr = remove_gr) + +# Show removal +GenomicRanges::granges(bs) +``` + +## Aggregating Data + +One way to increase the power of differential methylation testing is to aggregate the CpG-level data into regions. Regions can take two forms: tiling the entire genome by windows of a certain width or defining a set of regions such as CpG islands or gene promoters. + +### By Tiling the Genome + +Given that CpG methylation is strongly correlated over short genomic distances, a reasonable upper threshold might be 500bp. For the example below, in the interest of speed, we tile by larger windows. + +```{r tile_by_windows} +windowed_bs = tile_by_windows(bs = bs, win_size = 10000) + +# Show tiling +GenomicRanges::granges(windowed_bs) +``` + +### By Pre-defined Regions + +It may be the case that differential methylation is only relevant at promoter regions of genes for a particular project. In this case, aggregation of methylation calls over these regions may increase power, and decrease computation time. + +```{r tile_by_regions} +# Collapsed promoters on chr21 and chr22 +data(promoters_gr, package = 'methylSig') + +promoters_bs = tile_by_regions(bs = bs, gr = promoters_gr) +``` + +## Testing for Differential Methylation + +`MethylSig` offers three tests for differential methylation: + +1. `diff_binomial()` +2. `diff_methylsig()` +3. `diff_dss_fit()` and `diff_dss_test()` + +Each returns a `GRanges` object with tested loci and the corresponding statistics and methylation levels (if applicable). See the documentation for each function for more information (`?diff_binomial`, `?diff_methylsig`, `?diff_dss_fit`, and `?diff_dss_test`). 
+ +### Filtering by Coverage in a Minimum Number of Samples + +Prior to applying any test function, loci without a minimum number of samples having appropriate coverage should be removed to avoid testing loci where one sample dominates the test. + +```{r filter_by_group_coverage} +# Look at the phenotype data for bs +bsseq::pData(bs) + +# Require at least two samples from cancer and two samples from normal +bs = filter_loci_by_group_coverage( + bs = bs, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) +``` + +### Binomial Test + +`diff_binomial()` is a binomial test based on that in the [`methylKit`](https://bioconductor.org/packages/release/bioc/html/methylKit.html) R/Bioconductor package. This was included for benchmarking purposes in the publication. It does not take into account the variability among samples being compared. + +```{r diff_binomial} +# Test cancer versus normal +diff_gr = diff_binomial( + bs = bs, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal')) + +diff_gr +``` + +### MethylSig Test + +The `diff_methylsig()` is a beta-binomial test which takes into account the variability among samples being compared. It can perform group versus group comparisons with no covariates. + +```{r diff_methylsig} +# Test cancer versus normal with dispersion from both groups +diff_gr = diff_methylsig( + bs = bs, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1) + +diff_gr +``` + +### General Models with DSS + +`diff_dss_fit()` and `diff_dss_test()` are tests supporting general models, and are wrappers for functions in the [`DSS`](https://bioconductor.org/packages/release/bioc/html/DSS.html) R/Bioconductor package. We have added the ability to recover group methylation for group comparisons, or top/bottom 25 percentile methylation rates based on a continuous covariate.
+ +The `DSS` style test is in two stages similar to tests in the `edgeR` or `limma` R/Bioconductor packages. The first stage is a fit, and the second stage is a test on a contrast. + +First we add a numerical covariate to the `pData(bs)` so that we can give an example of such a test. + +```{r add_numerical_covariate} +bsseq::pData(bs)$num_covariate = c(84, 96, 93, 10, 18, 9) +``` + +#### Model Fitting + +Fit the simplest group versus group model on just the type. + +```{r diff_dss_fit_simple} +diff_fit_simple = diff_dss_fit( + bs = bs, + design = bsseq::pData(bs), + formula = as.formula('~ Type')) +``` + +Fit a paired model where cancer and normal samples are paired by patient. + +```{r diff_dss_fit_paired} +# Paired-test +diff_fit_paired = diff_dss_fit( + bs = bs, + design = bsseq::pData(bs), + formula = '~ Type + Pair') +``` + +Fit a model on the numerical covariate. + +```{r diff_dss_fit_num} +# Numerical covariate test +diff_fit_num = diff_dss_fit( + bs = bs, + design = bsseq::pData(bs), + formula = '~ num_covariate') +``` + +The result of `diff_dss_fit()` is a `list` with the following structure with elements: + +* `gr`, the `GRanges` of the fit loci. +* `design`, the phenotype matrix passed via the `design` parameter. +* `formula`, the formula used in conjunction with `design` to create the model matrix. +* `X`, the result of `model.matrix` with `design` and `formula`. +* `fit`, the `beta` and `var.beta` matrices. + +#### Building Contrasts + +Prior to calling `diff_dss_test()`, it may help to look at the model matrix used for fitting in order to build the contrast. + +```{r diff_dss_fit_model} +diff_fit_simple$X + +diff_fit_paired$X + +diff_fit_num$X +``` + +The contrast passed to `diff_dss_test()` should be a column vector or a matrix whose rows correspond to the columns of the model matrix above.
See the [DSS user guide](http://bioconductor.org/packages/release/bioc/vignettes/DSS/inst/doc/DSS.html#34_dmldmr_detection_from_general_experimental_design) for more information. + +```{r contrast} +# Test the simplest model for cancer vs normal +# Note, 2 rows corresponds to 2 columns in diff_fit_simple$X +simple_contrast = matrix(c(0,1), ncol = 1) + +# Test the paired model for cancer vs normal +# Note, 4 rows corresponds to 4 columns in diff_fit_paired$X +paired_contrast = matrix(c(0,1,0,0), ncol = 1) + +# Test the numerical covariate +num_contrast = matrix(c(0,1), ncol = 1) +``` + +#### Testing + +The `diff_dss_test()` function enables the recovery of group methylation rates via the optional `methylation_group_column` and `methylation_groups` parameters. + +The simple, group versus group, test. + +```{r diff_dss_test_simple} +diff_simple_gr = diff_dss_test( + bs = bs, + diff_fit = diff_fit_simple, + contrast = simple_contrast, + methylation_group_column = 'Type', + methylation_groups = c('case' = 'cancer', 'control' = 'normal')) + +diff_simple_gr +``` + +The paired test. + +```{r diff_dss_test_paired} +diff_paired_gr = diff_dss_test( + bs = bs, + diff_fit = diff_fit_paired, + contrast = paired_contrast, + methylation_group_column = 'Type', + methylation_groups = c('case' = 'cancer', 'control' = 'normal')) + +diff_paired_gr +``` + +The numerical covariate test. Note, here the `methylation_groups` parameter is omitted because there are no groups. By giving the numerical covariate column, we will group samples by the top/bottom 25 percentile over the covariate, and compute mean methylation within those groups of samples.
+ +```{r diff_dss_test_num} +diff_num_gr = diff_dss_test( + bs = bs, + diff_fit = diff_fit_num, + contrast = num_contrast, + methylation_group_column = 'num_covariate') + +diff_num_gr +``` diff --git a/data-raw/01-create_cov_files.R b/data-raw/01-create_cov_files.R new file mode 100644 index 0000000..cbd8349 --- /dev/null +++ b/data-raw/01-create_cov_files.R @@ -0,0 +1,144 @@ +library(bsseq) +library(GenomicRanges) + +dir.create('inst/extdata', recursive = TRUE) + +################################################################################ + +#---------CG-------------CG-------------CG------------------C--------------CG +#---------GC-------------GC-------------GC------------------G--------------GC + +gr1 = GRanges( + seqnames = rep.int('chr1', 9), + ranges = IRanges( + start = c(10,11,25,26,40,41,60,75,76), + end = c(11,12,26,27,41,42,61,76,77)), + strand = c('+','-','+','-','+','-','+','+','-') +) + +cov1 = matrix( + data = c(5,5,30,70,10,20,40,1000,1500), + ncol = 1 +) + +meth1 = matrix( + data = c(4,4,0,5,9,19,35,900,1400), + ncol = 1 +) + +################################################################################ + +#---------CG-------------CG-------------CG--------CG--------C--------------CG +#---------GC-------------GC-------------GC--------GC--------G--------------GC + +gr2 = GRanges( + seqnames = rep.int('chr1', 11), + ranges = IRanges( + start = c(10,11,25,26,40,41,50,51,60,75,76), + end = c(11,12,26,27,41,42,51,52,61,76,77)), + strand = c('+','-','+','-','+','-','+','-','+','+','-') +) + +cov2 = matrix( + data = c(10,10,50,50,15,35,5,5,20,100,200), + ncol = 1 +) + +meth2 = matrix( + data = c(9,9,1,5,14,34,5,5,15,99,199), + ncol = 1 +) + +################################################################################ + +#---------C--------------C--------------C-------------------C--------------C- + +gr3 = GRanges( + seqnames = rep.int('chr1', 5), + ranges = IRanges( + start = c(10,25,40,60,75), + end = c(12,27,42,62,77)) +) + +cov3 = matrix( + 
data = c(10,100,30,40,2500), + ncol = 1 +) + +meth3 = matrix( + data = c(8,5,28,35,2300), + ncol = 1 +) + +################################################################################ + +#---------C--------------C--------------C---------C---------C--------------C- + +gr4 = GRanges( + seqnames = rep.int('chr1', 6), + ranges = IRanges( + start = c(10,25,40,50,60,75), + end = c(12,27,42,52,62,77)) +) + +cov4 = matrix( + data = c(20,100,50,10,20,300), + ncol = 1 +) + +meth4 = matrix( + data = c(18,6,48,10,15,298), + ncol = 1 +) + +################################################################################ + +df1 = data.frame(gr1) +df1$meth = as.numeric(meth1) +df1$unmeth = as.numeric(cov1 - meth1) +df1$perc = as.numeric(meth1 / cov1) * 100 + +df2 = data.frame(gr2) +df2$meth = as.numeric(meth2) +df2$unmeth = as.numeric(cov2 - meth2) +df2$perc = as.numeric(meth2 / cov2) * 100 + +df3 = data.frame(gr3) +df3$meth = as.numeric(meth3) +df3$unmeth = as.numeric(cov3 - meth3) +df3$perc = as.numeric(meth3 / cov3) * 100 + +df4 = data.frame(gr4) +df4$meth = as.numeric(meth4) +df4$unmeth = as.numeric(cov4 - meth4) +df4$perc = as.numeric(meth4 / cov4) * 100 + +################################################################################ + +cov_cols = c('seqnames','start','end','perc','meth','unmeth') + +bis_cov_file1 = './inst/extdata/bis_cov1.cov' +bis_cov_file2 = './inst/extdata/bis_cov2.cov' +bis_cov_file3 = './inst/extdata/bis_cov3.cov' +bis_cov_file4 = './inst/extdata/bis_cov4.cov' + +bis_cov1 = df1[, cov_cols] +bis_cov2 = df2[, cov_cols] +bis_cov3 = df3[, cov_cols] +bis_cov4 = df4[, cov_cols] + +write.table( + x = bis_cov1, file = bis_cov_file1, + quote = FALSE, sep = '\t', row.names = FALSE, col.names = FALSE) + +write.table( + x = bis_cov2, file = bis_cov_file2, + quote = FALSE, sep = '\t', row.names = FALSE, col.names = FALSE) + +write.table( + x = bis_cov3, file = bis_cov_file3, + quote = FALSE, sep = '\t', row.names = FALSE, col.names = FALSE) + +write.table( + 
x = bis_cov4, file = bis_cov_file4, + quote = FALSE, sep = '\t', row.names = FALSE, col.names = FALSE) diff --git a/data-raw/02-create_bsseq_rda.R b/data-raw/02-create_bsseq_rda.R new file mode 100644 index 0000000..a6085fe --- /dev/null +++ b/data-raw/02-create_bsseq_rda.R @@ -0,0 +1,106 @@ +library(bsseq) +library(GenomicRanges) + +######################################## + +bis_cov_file1 = './inst/extdata/bis_cov1.cov' +bis_cov_file2 = './inst/extdata/bis_cov2.cov' +bis_cov_file3 = './inst/extdata/bis_cov3.cov' +bis_cov_file4 = './inst/extdata/bis_cov4.cov' + +######################################## + +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 coverage +# 5 30 10 0 40 1000 +# 5 70 20 0 1500 +# test2 coverage +# 10 50 15 5 20 100 +# 10 50 35 5 200 +# test1 methylation +# 4 0 9 0 35 900 +# 4 5 19 0 1400 +# test2 methylation +# 9 1 14 5 15 99 +# 9 5 34 5 199 +bsseq_stranded = read.bismark( + files = c(bis_cov_file1, bis_cov_file2), + colData = data.frame(row.names = c('test1','test2')), + rmZeroCov = FALSE, + strandCollapse = FALSE +) + +######################################## + +#---------C--------------C--------------C---------C---------C--------------C- +# test1 coverage +# 10 100 30 0 40 2500 +# test2 coverage +# 20 100 50 10 20 300 +# test1 methylation +# 8 5 28 0 35 2300 +# test2 methylation +# 18 6 48 10 15 298 +bsseq_destranded = read.bismark( + files = c(bis_cov_file3, bis_cov_file4), + colData = data.frame(row.names = c('test3','test4')), + rmZeroCov = FALSE, + strandCollapse = FALSE +) + +######################################## + +cov = matrix(c( + 10,20,30,90, + 40,50,60,100 +), ncol = 2) + +meth = matrix(c( + 10,20,30,90, + 40,50,60,100 +), ncol = 2) + +gr = GRanges( + seqnames = c('chr1','chr1','chr1','chr2'), + ranges = IRanges( + start = c(10, 20, 30, 10), + end = c(10, 20, 30, 10) + ) +) + +#---------C---------C---------C +# test1 coverage / methylation +# 10 20 30 +# test2 coverage / methylation +# 
40 50 60 +#---------C +# test1 coverage / methylation +# 90 +# test2 coverage / methylation +# 100 + +bsseq_multichrom = BSseq( + Cov = cov, + M = meth, + gr = gr, + pData = data.frame(row.names = c('test1','test2')), + sampleNames = c('test1','test2') +) + +######################################## + +# Extract chr21 and chr22 hg19 promoters +library(TxDb.Hsapiens.UCSC.hg19.knownGene) +txdb = TxDb.Hsapiens.UCSC.hg19.knownGene +seqlevels(txdb) = c('chr21', 'chr22') +promoters_gr = promoters(txdb, upstream=1000, downstream=400) +promoters_gr = reduce(promoters_gr) + +######################################## + +usethis::use_data( + bsseq_stranded, + bsseq_destranded, + bsseq_multichrom, + promoters_gr, + overwrite = TRUE) diff --git a/data-raw/03-create_internal_rda.R b/data-raw/03-create_internal_rda.R new file mode 100644 index 0000000..82ff159 --- /dev/null +++ b/data-raw/03-create_internal_rda.R @@ -0,0 +1,775 @@ +library(bsseq) +library(DelayedArray) +library(GenomicRanges) + +################################################################################ + +# Use for tile_by_regions() tests +# Use for filter_loci_by_location() tests +#----[---------]---------[--------------]----[---------]--------------[---------] +gr_tiles1 = GRanges( + seqnames = c('chr1','chr1','chr1','chr1'), + ranges = IRanges( + start = c(5,25,45,70), + end = c(15,40,55,80) + ) +) + +# Use for tiling tests +#----[------------------------]----[----------------------------------]----[----] +gr_tiles2 = GRanges( + seqnames = c('chr1','chr1','chr1'), + ranges = IRanges( + start = c(5,35,75), + end = c(30,70,80) + ) +) + +# Use for tiling tests +# Use for filter_loci_by_location() tests (expect an error from removing all) +#----[--------------------------------------------------------------------------] +gr_tiles3 = GRanges( + seqnames = c('chr1'), + ranges = IRanges( + start = c(5), + end = c(80) + ) +) + +# Use for tiling tests +# Use for filter_loci_by_location() tests 
+#----[--------------]------------------------------------------------------------ +gr_tiles4 = GRanges( + seqnames = c('chr1'), + ranges = IRanges( + start = c(5), + end = c(20) + ) +) + +# Use for tiling tests +# Use for filter_loci_by_location() tests (expect nothing filtered) +#----[---]----------------------------------------------------------------------- +gr_tiles5 = GRanges( + seqnames = c('chr1'), + ranges = IRanges( + start = c(5), + end = c(9) + ) +) + +#[-----------------------][-----------------------][-----------------------][-----------------------][] +seqlengths = c('chr1' = 101) +win25_stranded_gr = GenomicRanges::tileGenome( + seqlengths = seqlengths, + tilewidth = 25, + cut.last.tile.in.chrom = TRUE) + +#[-----------------------][-----------------------][-----------------------][-----------------------] +seqlengths = c('chr1' = 100) +win25_destranded_gr = GenomicRanges::tileGenome( + seqlengths = seqlengths, + tilewidth = 25, + cut.last.tile.in.chrom = TRUE) + +#[-----------------------][-----------------------][---] +#[-----------------------][---] +seqlengths = c('chr1' = 55, 'chr2' = 35) +win25_multichrom_gr = GenomicRanges::tileGenome( + seqlengths = seqlengths, + tilewidth = 25, + cut.last.tile.in.chrom = TRUE) + +################################################################################ +# gr_tiles1 +################################################################################ + +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 coverage +# 5 30 10 0 40 1000 +# 5 70 20 0 1500 +# test2 coverage +# 10 50 15 5 20 100 +# 10 50 35 5 200 +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_regions(bs = bsseq_stranded, gr = gr_tiles1) coverage +# 10 110 0 2500 +# 20 115 10 300 + +stranded_cov1 = 
DelayedArray::DelayedArray(matrix(c( + 10,110,0,2500, + 20,115,10,300 +), ncol = 2)) + +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 coverage +# 5 30 10 0 40 1000 +# 5 70 20 0 1500 +# test2 coverage +# 10 50 15 5 20 100 +# 10 50 35 5 200 +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# filter_loci_by_location(bs = bsseq_stranded, gr = gr_tiles1) coverage +# 20 40 +# 35 20 + +filter_cov1 = DelayedArray::DelayedArray(matrix(c( + 20,40, + 35,20 +), ncol = 2)) + +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 methylation +# 4 0 9 0 35 900 +# 4 5 19 0 1400 +# test2 methylation +# 9 1 14 5 15 99 +# 9 5 34 5 199 +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_regions(bs = bsseq_stranded, gr = gr_tiles1) methylation +# 8 14 0 2300 +# 18 20 10 298 + +stranded_meth1 = DelayedArray::DelayedArray(matrix(c( + 8,14,0,2300, + 18,20,10,298 +), ncol = 2)) + +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 methylation +# 4 0 9 0 35 900 +# 4 5 19 0 1400 +# test2 methylation +# 9 1 14 5 15 99 +# 9 5 34 5 199 +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# filter_loci_by_location(bs = bsseq_stranded, gr = gr_tiles1) methylation +# 19 35 +# 34 15 + +filter_meth1 = DelayedArray::DelayedArray(matrix(c( + 19,35, + 34,15 +), ncol = 2)) + +bsseq_stranded_tiled1 = BSseq( + gr = 
gr_tiles1, + Cov = stranded_cov1, + M = stranded_meth1, + pData = data.frame(row.names = c('test1','test2')), + sampleNames = c('test1','test2') +) + +filter_loc_tiles1 = BSseq( + gr = granges(bsseq_stranded[c(6,9)]), + Cov = filter_cov1, + M = filter_meth1, + pData = data.frame(row.names = c('test1','test2')), + sampleNames = c('test1','test2') +) + +######################################## + +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------C--------------C--------------C---------C---------C--------------C- +# test1 coverage +# 10 100 30 0 40 2500 +# test2 coverage +# 20 100 50 10 20 300 +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_regions(bs = bsseq_destranded, gr = gr_tiles1) coverage +# 10 130 0 2500 +# 20 150 10 300 + +destranded_cov1 = DelayedArray::DelayedArray(matrix(c( + 10,130,0,2500, + 20,150,10,300 +), ncol = 2)) + +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------C--------------C--------------C---------C---------C--------------C- +# test1 methylation +# 8 5 28 0 35 2300 +# test2 methylation +# 18 6 48 10 15 298 +#----[---------]---------[--------------]----[---------]--------------[---------] +#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_regions(bs = bsseq_destranded, gr = gr_tiles1) methylation +# 8 33 0 2300 +# 18 54 10 298 + +destranded_meth1 = DelayedArray::DelayedArray(matrix(c( + 8,33,0,2300, + 18,54,10,298 +), ncol = 2)) + +bsseq_destranded_tiled1 = BSseq( + gr = gr_tiles1, + Cov = destranded_cov1, + M = destranded_meth1, + pData = data.frame(row.names = c('test3','test4')), + sampleNames = c('test3','test4') +) + +################################################################################ +# gr_tiles2 +################################################################################ +
+#----[------------------------]----[----------------------------------]----[----] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 coverage +# 5 30 10 0 40 1000 +# 5 70 20 0 1500 +# test2 coverage +# 10 50 15 5 20 100 +# 10 50 35 5 200 +#----[------------------------]----[----------------------------------]----[----] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_regions(bs = bsseq_stranded, gr = gr_tiles2) coverage +# 110 70 2500 +# 120 80 300 + +stranded_cov2 = DelayedArray::DelayedArray(matrix(c( + 110,70,2500, + 120,80,300 +), ncol = 2)) + +#----[------------------------]----[----------------------------------]----[----] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 methylation +# 4 0 9 0 35 900 +# 4 5 19 0 1400 +# test2 methylation +# 9 1 14 5 15 99 +# 9 5 34 5 199 +#----[------------------------]----[----------------------------------]----[----] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_regions(bs = bsseq_stranded, gr = gr_tiles2) methylation +# 13 63 2300 +# 24 73 298 + +stranded_meth2 = DelayedArray::DelayedArray(matrix(c( + 13,63,2300, + 24,73,298 +), ncol = 2)) + +bsseq_stranded_tiled2 = BSseq( + gr = gr_tiles2, + Cov = stranded_cov2, + M = stranded_meth2, + pData = data.frame(row.names = c('test1','test2')), + sampleNames = c('test1','test2') +) + +######################################## + +#----[------------------------]----[----------------------------------]----[----] +#---------C--------------C--------------C---------C---------C--------------C- +# test1 coverage +# 10 100 30 0 40 2500 +# test2 coverage +# 20 100 50 10 20 300 +#----[------------------------]----[----------------------------------]----[----] +#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_regions(bs = bsseq_destranded, gr = gr_tiles2) coverage +# 110 70 2500 +# 120 80 300 + 
+destranded_cov2 = DelayedArray::DelayedArray(matrix(c( + 110,70,2500, + 120,80,300 +), ncol = 2)) + +#----[------------------------]----[----------------------------------]----[----] +#---------C--------------C--------------C---------C---------C--------------C- +# test1 methylation +# 8 5 28 0 35 2300 +# test2 methylation +# 18 6 48 10 15 298 +#----[------------------------]----[----------------------------------]----[----] +#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_regions(bs = bsseq_destranded, gr = gr_tiles2) methylation +# 13 63 2300 +# 24 73 298 + +destranded_meth2 = DelayedArray::DelayedArray(matrix(c( + 13,63,2300, + 24,73,298 +), ncol = 2)) + +bsseq_destranded_tiled2 = BSseq( + gr = gr_tiles2, + Cov = destranded_cov2, + M = destranded_meth2, + pData = data.frame(row.names = c('test3','test4')), + sampleNames = c('test3','test4') +) + +################################################################################ +# gr_tiles3 +################################################################################ + +#----[--------------------------------------------------------------------------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 coverage +# 5 30 10 0 40 1000 +# 5 70 20 0 1500 +# test2 coverage +# 10 50 15 5 20 100 +# 10 50 35 5 200 +#----[--------------------------------------------------------------------------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_regions(bs = bsseq_stranded, gr = gr_tiles3) coverage +# 2680 +# 500 +stranded_cov3 = DelayedArray::DelayedArray(matrix(c( + 2680, + 500 +), ncol = 2)) + +#----[--------------------------------------------------------------------------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 methylation +# 4 0 9 0 35 900 +# 4 5 19 0 1400 +# test2 methylation +# 9 1 14 5 15 99 +# 9 5 34 5 199 
+#----[--------------------------------------------------------------------------] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_regions(bs = bsseq_stranded, gr = gr_tiles3) methylation +# 2376 +# 395 + +stranded_meth3 = DelayedArray::DelayedArray(matrix(c( + 2376, + 395 +), ncol = 2)) + +bsseq_stranded_tiled3 = BSseq( + gr = gr_tiles3, + Cov = stranded_cov3, + M = stranded_meth3, + pData = data.frame(row.names = c('test1','test2')), + sampleNames = c('test1','test2') +) + +######################################## + +#----[--------------------------------------------------------------------------] +#---------C--------------C--------------C---------C---------C--------------C- +# test1 coverage +# 10 100 30 0 40 2500 +# test2 coverage +# 20 100 50 10 20 300 +#----[--------------------------------------------------------------------------] +#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_regions(bs = bsseq_destranded, gr = gr_tiles3) coverage +# 2680 +# 500 + +destranded_cov3 = DelayedArray::DelayedArray(matrix(c( + 2680, + 500 +), ncol = 2)) + +#----[--------------------------------------------------------------------------] +#---------C--------------C--------------C---------C---------C--------------C- +# test1 methylation +# 8 5 28 0 35 2300 +# test2 methylation +# 18 6 48 10 15 298 +#----[--------------------------------------------------------------------------] +#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_regions(bs = bsseq_destranded, gr = gr_tiles3) methylation +# 2376 +# 395 + +destranded_meth3 = DelayedArray::DelayedArray(matrix(c( + 2376, + 395 +), ncol = 2)) + +bsseq_destranded_tiled3 = BSseq( + gr = gr_tiles3, + Cov = destranded_cov3, + M = destranded_meth3, + pData = data.frame(row.names = c('test3','test4')), + sampleNames = c('test3','test4') +) +
+################################################################################ +# gr_tiles4 +################################################################################ + +#----[--------------]------------------------------------------------------------ +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 coverage +# 5 30 10 0 40 1000 +# 5 70 20 0 1500 +# test2 coverage +# 10 50 15 5 20 100 +# 10 50 35 5 200 +#----[--------------]------------------------------------------------------------ +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_regions(bs = bsseq_stranded, gr = gr_tiles4) coverage +# 10 +# 20 +stranded_cov4 = DelayedArray::DelayedArray(matrix(c( + 10, + 20 +), ncol = 2)) + +filter_cov4 = DelayedArray::DelayedArray(matrix(c( + 30,70,10,20,0,0,40,1000,1500, + 50,50,15,35,5,5,20,100,200 +), ncol = 2)) + +#----[--------------]------------------------------------------------------------ +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 methylation +# 4 0 9 0 35 900 +# 4 5 19 0 1400 +# test2 methylation +# 9 1 14 5 15 99 +# 9 5 34 5 199 +#----[--------------]------------------------------------------------------------ +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_regions(bs = bsseq_stranded, gr = gr_tiles4) methylation +# 8 +# 18 + +stranded_meth4 = DelayedArray::DelayedArray(matrix(c( + 8, + 18 +), ncol = 2)) + +filter_meth4 = DelayedArray::DelayedArray(matrix(c( + 0,5,9,19,0,0,35,900,1400, + 1,5,14,34,5,5,15,99,199 +), ncol = 2)) + +bsseq_stranded_tiled4 = BSseq( + gr = gr_tiles4, + Cov = stranded_cov4, + M = stranded_meth4, + pData = data.frame(row.names = c('test1','test2')), + sampleNames = c('test1','test2') +) + +filter_loc_tiles4 = BSseq( + gr = granges(bsseq_stranded[-c(1,2)]), + Cov = filter_cov4, + M = filter_meth4, + pData = data.frame(row.names = c('test1','test2')), + sampleNames = 
c('test1','test2') +) + +######################################## + +#----[--------------]------------------------------------------------------------ +#---------C--------------C--------------C---------C---------C--------------C- +# test1 coverage +# 10 100 30 0 40 2500 +# test2 coverage +# 20 100 50 10 20 300 +#----[--------------]------------------------------------------------------------ +#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_regions(bs = bsseq_destranded, gr = gr_tiles4) coverage +# 10 +# 20 + +destranded_cov4 = DelayedArray::DelayedArray(matrix(c( + 10, + 20 +), ncol = 2)) + +#----[--------------]------------------------------------------------------------ +#---------C--------------C--------------C---------C---------C--------------C- +# test1 methylation +# 8 5 28 0 35 2300 +# test2 methylation +# 18 6 48 10 15 298 +#----[--------------]------------------------------------------------------------ +#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_regions(bs = bsseq_destranded, gr = gr_tiles4) methylation +# 8 +# 18 + +destranded_meth4 = DelayedArray::DelayedArray(matrix(c( + 8, + 18 +), ncol = 2)) + +bsseq_destranded_tiled4 = BSseq( + gr = gr_tiles4, + Cov = destranded_cov4, + M = destranded_meth4, + pData = data.frame(row.names = c('test3','test4')), + sampleNames = c('test3','test4') +) + +################################################################################ +# gr_tiles5 +################################################################################ + +#----[---]----------------------------------------------------------------------- +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 coverage +# 5 30 10 0 40 1000 +# 5 70 20 0 1500 +# test2 coverage +# 10 50 15 5 20 100 +# 10 50 35 5 200 +#----[---]----------------------------------------------------------------------- 
+#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_regions(bs = bsseq_stranded, gr = gr_tiles5) coverage +# 0 +# 0 +stranded_cov5 = DelayedArray::DelayedArray(matrix(c( + 0, + 0 +), ncol = 2)) + +#----[---]----------------------------------------------------------------------- +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 methylation +# 4 0 9 0 35 900 +# 4 5 19 0 1400 +# test2 methylation +# 9 1 14 5 15 99 +# 9 5 34 5 199 +#----[---]----------------------------------------------------------------------- +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_regions(bs = bsseq_stranded, gr = gr_tiles5) methylation +# 0 +# 0 + +stranded_meth5 = DelayedArray::DelayedArray(matrix(c( + 0, + 0 +), ncol = 2)) + +bsseq_stranded_tiled5 = BSseq( + gr = gr_tiles5, + Cov = stranded_cov5, + M = stranded_meth5, + pData = data.frame(row.names = c('test1','test2')), + sampleNames = c('test1','test2') +) + +######################################## + +#----[---]----------------------------------------------------------------------- +#---------C--------------C--------------C---------C---------C--------------C- +# test1 coverage +# 10 100 30 0 40 2500 +# test2 coverage +# 20 100 50 10 20 300 +#----[---]----------------------------------------------------------------------- +#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_regions(bs = bsseq_destranded, gr = gr_tiles5) coverage +# 0 +# 0 + +destranded_cov5 = DelayedArray::DelayedArray(matrix(c( + 0, + 0 +), ncol = 2)) + +#----[---]----------------------------------------------------------------------- +#---------C--------------C--------------C---------C---------C--------------C- +# test1 methylation +# 8 5 28 0 35 2300 +# test2 methylation +# 18 6 48 10 15 298 +#----[---]----------------------------------------------------------------------- 
+#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_regions(bs = bsseq_destranded, gr = gr_tiles5) methylation +# 0 +# 0 + +destranded_meth5 = DelayedArray::DelayedArray(matrix(c( + 0, + 0 +), ncol = 2)) + +bsseq_destranded_tiled5 = BSseq( + gr = gr_tiles5, + Cov = destranded_cov5, + M = destranded_meth5, + pData = data.frame(row.names = c('test3','test4')), + sampleNames = c('test3','test4') +) + +filter_loc_tiles5 = bsseq_stranded + +################################################################################ +# win25_gr +################################################################################ + +#[-----------------------][-----------------------][-----------------------][-----------------------][] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 coverage +# 5 30 10 0 40 1000 +# 5 70 20 0 1500 +# test2 coverage +# 10 50 15 5 20 100 +# 10 50 35 5 200 +#[-----------------------][-----------------------][-----------------------][-----------------------][] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_windows(bs = bsseq_stranded, win_size = 25) coverage +# 40 100 1040 1500 0 +# 70 105 125 200 0 + +stranded_cov25 = DelayedArray::DelayedArray(matrix(c( + 40,100,1040,1500,0, + 70,105,125,200,0 +), ncol = 2)) + +#[-----------------------][-----------------------][-----------------------][-----------------------][] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# test1 methylation +# 4 0 9 0 35 900 +# 4 5 19 0 1400 +# test2 methylation +# 9 1 14 5 15 99 +# 9 5 34 5 199 +#[-----------------------][-----------------------][-----------------------][-----------------------][] +#---------CG-------------CG-------------CG--------CG--------C--------------CG +# tile_by_windows(bs = bsseq_stranded, win_size = 25) methylation +# 8 33 935 1400 0 +# 19 58 119 199 0 + +stranded_meth25 = DelayedArray::DelayedArray(matrix(c( 
+ 8,33,935,1400,0, + 19,58,119,199,0 +), ncol = 2)) + +bsseq_stranded_win25 = BSseq( + gr = win25_stranded_gr, + Cov = stranded_cov25, + M = stranded_meth25, + pData = data.frame(row.names = c('test1','test2')), + sampleNames = c('test1','test2') +) + +######################################## + +#[-----------------------][-----------------------][-----------------------][-----------------------] +#---------C--------------C--------------C---------C---------C--------------C- +# test1 coverage +# 10 100 30 0 40 2500 +# test2 coverage +# 20 100 50 10 20 300 +#[-----------------------][-----------------------][-----------------------][-----------------------] +#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_windows(bs = bsseq_destranded, win_size = 25) coverage +# 110 30 2540 0 +# 120 60 320 0 + +destranded_cov25 = DelayedArray::DelayedArray(matrix(c( + 110,30,2540,0, + 120,60,320,0 +), ncol = 2)) + +#[-----------------------][-----------------------][-----------------------][-----------------------] +#---------C--------------C--------------C---------C---------C--------------C- +# test1 methylation +# 8 5 28 0 35 2300 +# test2 methylation +# 18 6 48 10 15 298 +#[-----------------------][-----------------------][-----------------------][-----------------------] +#---------C--------------C--------------C---------C---------C--------------C- +# tile_by_windows(bs = bsseq_destranded, win_size = 25) methylation +# 13 28 2335 0 +# 24 58 313 0 + +destranded_meth25 = DelayedArray::DelayedArray(matrix(c( + 13,28,2335,0, + 24,58,313,0 +), ncol = 2)) + +bsseq_destranded_win25 = BSseq( + gr = win25_destranded_gr, + Cov = destranded_cov25, + M = destranded_meth25, + pData = data.frame(row.names = c('test3','test4')), + sampleNames = c('test3','test4') +) + +################################################################################ +# win25_multichrom_gr +################################################################################ + 
+#[-----------------------][-----------------------][---] +#---------C---------C---------C +# test1 coverage / methylation +# 10 20 30 +# test2 coverage / methylation +# 40 50 60 +#[-----------------------][-----------------------][---] +#---------C---------C---------C + +#[-----------------------][---] +#---------C +# test1 coverage / methylation +# 90 +# test2 coverage / methylation +# 100 +#[-----------------------][---] +#---------C + +multichrom_cov25 = DelayedArray::DelayedArray(matrix(c( + 30,30,0,90,0, + 90,60,0,100,0 +), ncol = 2)) + +multichrom_meth25 = multichrom_cov25 + +bsseq_multichrom_win25 = BSseq( + gr = win25_multichrom_gr, + Cov = multichrom_cov25, + M = multichrom_meth25, + pData = data.frame(row.names = c('test1','test2')), + sampleNames = c('test1','test2') +) + +################################################################################ + +usethis::use_data( + gr_tiles1, + gr_tiles2, + gr_tiles3, + gr_tiles4, + gr_tiles5, + win25_stranded_gr, + win25_destranded_gr, + win25_multichrom_gr, + bsseq_stranded_tiled1, + bsseq_destranded_tiled1, + filter_loc_tiles1, + bsseq_stranded_tiled2, + bsseq_destranded_tiled2, + bsseq_stranded_tiled3, + bsseq_destranded_tiled3, + bsseq_stranded_tiled4, + bsseq_destranded_tiled4, + filter_loc_tiles4, + bsseq_stranded_tiled5, + bsseq_destranded_tiled5, + filter_loc_tiles5, + bsseq_stranded_win25, + bsseq_destranded_win25, + bsseq_multichrom_win25, + internal = TRUE, + overwrite = TRUE) diff --git a/data-raw/build_ct_snp_index.R b/data-raw/build_ct_snp_index.R deleted file mode 100644 index 7ea9f19..0000000 --- a/data-raw/build_ct_snp_index.R +++ /dev/null @@ -1,18 +0,0 @@ -# The index of C > T SNPs will be a GenomicRanges object for efficient overlap finding -library(GenomicRanges) - -# Annotate with hg19 chromosome lengths -chr_lengths = read.table('~/latte/Methylation/Data/chromInfo_hg19.txt', header=F, sep='\t', stringsAsFactors=F) - -# Grab the vcf, but only consider the first two columns 
-setwd('~/latte/vcf') -vcf = read.table('filtered_AF_0.05_CT_SNPs.vcf',header=F,sep='\t',quote='',comment.char='', colClasses=c('character','numeric','NULL','NULL','NULL','NULL','NULL','NULL'),stringsAsFactors=F) -colnames(vcf) = c('chrom','start') -vcf$chrom = paste('chr',vcf$chrom,sep='') - -CT_SNPs_hg19 = GRanges(seqnames=vcf$chrom, ranges=IRanges(start=vcf$start, end=vcf$start)) -seqlengths(CT_SNPs_hg19) = chr_lengths[match(names(seqlengths(CT_SNPs_hg19)), chr_lengths[,1]), 2] -genome(CT_SNPs_hg19) = 'hg19' - -save(CT_SNPs_hg19, file='CT_SNPs_hg19.RData') -write.table(vcf, file='CT_SNPs_hg19.txt', sep='\t', row.names=F, quote=F) diff --git a/data-raw/build_ct_snp_index.sh b/data-raw/build_ct_snp_index.sh deleted file mode 100644 index 94326b7..0000000 --- a/data-raw/build_ct_snp_index.sh +++ /dev/null @@ -1,21 +0,0 @@ -# All paths are relative to working environment on our lab's compute cluster -cd ~/latte/vcf/ - -# Shell script to pull out C > T SNPs from ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.vcf -wget ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.vcf.gz - -# Filter the sites.vcf so that AF[0]>0.05 -# NOTE: If AF > 0.5, then the 'alternative' allele is actually the main one -~/latte/apps/bcftools-1.2/bin/bcftools filter -i 'AF[0]>0.05' --output-type=v --output=filtered_AF_0.05.vcf ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.vcf - -# Grab only C > T (forward strand) and G > A (reverse strand) -grep -P '(C\tT)' filtered_AF_0.05.vcf > filtered_AF_0.05_CT_forward.vcf -grep -P '(G\tA)' filtered_AF_0.05.vcf > filtered_AF_0.05_CT_reverse.vcf - -# Concatenate the two files -cat filtered_AF_0.05_CT_forward.vcf filtered_AF_0.05_CT_reverse.vcf > filtered_AF_0.05_CT.vcf - -# Grab only the SNP variants -grep -P '(VT=SNP)' filtered_AF_0.05_CT.vcf > filtered_AF_0.05_CT_SNPs.vcf - -# NOTE VCFs are 1-based diff --git a/data-raw/build_test_data.R 
b/data-raw/build_test_data.R deleted file mode 100644 index 351fc30..0000000 --- a/data-raw/build_test_data.R +++ /dev/null @@ -1,74 +0,0 @@ -# In /bfx/home/rcavalca/epicore/pipeline_testing/test_errbs_1.2.3/00-methCall -# do the following in bash - -# for file in *cytosine_report.txt -# do -# file_base=`basename ${file} '_cytosine_report.txt'` -# echo ${file_base} -# grep chr21 ${file} | head -200000 | gzip > ${file_base}_chr21_cytosine_report.txt.gz -# done -# -# for file in `ls *cytosine_report*` -# do -# file_base=`basename ${file} '_chr21_cytosine_report.txt.gz'` -# echo ${file_base} -# gunzip -c ${file} | awk -v OFS='\t' '$4 + $5 > 5 {print $0}' | gzip > ${file_base}.txt.gz -# done - -devtools::load_all() - -# Get data -files = list.files( - path = 'inst/extdata', - pattern = 'MDAMB', full.names = TRUE) - -sample.ids = basename(files) -sample.ids = gsub('.txt.gz', '', sample.ids) - -pData = data.frame( - Sample_Names = sample.ids, - DR_vs_DS = relevel(factor(c('DR','DS','DR','DS','DR','DS')), ref = 'DS'), - row.names = sample.ids, - stringsAsFactors = FALSE) - -meth = methylSigReadData( - fileList = files, - pData = pData, - assembly = 'hg19', - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 4, - fileType = 'cytosineReport') - -tiled_meth = methylSigTile(meth = meth, tiles = NULL, win.size = 1000) - -msig_cpgs = methylSigCalc( - meth = meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -msig_tiles = methylSigCalc( - meth = tiled_meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -tfbs_file = system.file('extdata','tfbs.bed.gz', package = 'methylSig') -tfbs = rtracklayer::import(tfbs_file, genome = 'hg19') - -cpg_annots = 
annotatr::build_annotations(genome = 'hg19', annotations = 'hg19_cpg_islands') - -save(meth, tiled_meth, msig_cpgs, msig_tiles, tfbs, cpg_annots, file = 'data/sample_data.RData', compress = 'xz') diff --git a/data-raw/create_package.R b/data-raw/create_package.R new file mode 100644 index 0000000..6ea41bd --- /dev/null +++ b/data-raw/create_package.R @@ -0,0 +1,87 @@ +# docker run --interactive --tty --rm --volume /Users/rcavalca/Projects:/Projects rcavalcante/bioconductor_docker:RELEASE_3_10 + +library(devtools) + +# Description fields +description = list( + Title = 'MethylSig: Differential Methylation Testing for WGBS and RRBS Data', + Version = '0.99.0', + Date = '2020-02-28', + `Authors@R` = 'c( + person(given = "Yongseok", + family = "Park", + role = c("aut"), + email = "yongpark@pitt.edu"), + person(given = "Raymond G.", + family = "Cavalcante", + role = c("aut", "cre"), + email = "rcavalca@umich.edu"))', + Description = 'MethylSig is a package for testing for differentially methylated + cytosines (DMCs) or regions (DMRs) in whole-genome bisulfite sequencing + (WGBS) or reduced representation bisulfite sequencing (RRBS) experiments. + MethylSig uses a beta binomial model to test for significant differences + between groups of samples. 
Several options exist for either site-specific + or sliding window tests, and variance estimation.', + BugReports = 'https://github.com/sartorlab/methylSig/issues', + biocViews = 'DNAMethylation, DifferentialMethylation, Epigenetics, Regression, MethylSeq', + License = 'GPL-3', + Depends = 'R (>= 3.6)' +) + +# Create package +path = '/Projects/methylSig' +create_package(path, fields = description) +activate_project(path) +# use_description(fields = description) # For updating + +# Build ignore +build_ignore_files = c('README.md', '.travis.yml', '.git', '.gitignore') +use_build_ignore(files = build_ignore_files) + +# Data +use_data_raw(name = '01-create_cov_files') +use_data_raw(name = '02-create_bsseq_rda') +use_data_raw(name = '03-create_internal_rda') + +# Documentation +use_readme_md() +use_news_md() +use_package_doc() +use_vignette(name = 'using-methylSig', title = 'Using methylSig') +use_vignette(name = 'updating-methylSig-code', title = 'Updating methylSig code') + +# Travis +use_travis() +use_travis_badge(ext = 'org') + +# Coverage +use_coverage(type = 'coveralls') + +# Testing +use_testthat() + +# Package dependencies +use_package('bsseq', type = 'Imports') + +# R files and test files +use_r('filter_loci_by_coverage') +use_r('filter_loci_by_location') +use_r('filter_loci_by_snps') +use_r('tile_by_windows') +use_r('tile_by_regions') +use_r('filter_loci_by_group_coverage') +use_r('diff_binomial') +use_r('diff_methylsig') +use_r('diff_dss_fit') +use_r('diff_dss_test') + +use_test('filter_loci_by_coverage') +use_test('filter_loci_by_location') +use_test('filter_loci_by_snps') +use_test('tile_by_windows') +use_test('tile_by_regions') +use_test('filter_loci_by_group_coverage') +use_test('diff_binomial') +use_test('diff_methylsig') +use_test('diff_dss_fit') +use_test('diff_dss_test') diff --git a/data/CT_SNPs_hg19.RData b/data/CT_SNPs_hg19.RData deleted file mode 100644 index ee4dcdd..0000000 Binary files a/data/CT_SNPs_hg19.RData and /dev/null differ diff --git 
a/data/bsseq_destranded.rda b/data/bsseq_destranded.rda new file mode 100644 index 0000000..f3d1188 Binary files /dev/null and b/data/bsseq_destranded.rda differ diff --git a/data/bsseq_multichrom.rda b/data/bsseq_multichrom.rda new file mode 100644 index 0000000..7a426b0 Binary files /dev/null and b/data/bsseq_multichrom.rda differ diff --git a/data/bsseq_stranded.rda b/data/bsseq_stranded.rda new file mode 100644 index 0000000..3253d64 Binary files /dev/null and b/data/bsseq_stranded.rda differ diff --git a/data/datalist b/data/datalist deleted file mode 100644 index 23be2a8..0000000 --- a/data/datalist +++ /dev/null @@ -1,2 +0,0 @@ -CT_SNPs_hg19 -sample_data diff --git a/data/promoters_gr.rda b/data/promoters_gr.rda new file mode 100644 index 0000000..c768b5f Binary files /dev/null and b/data/promoters_gr.rda differ diff --git a/data/sample_data.RData b/data/sample_data.RData deleted file mode 100644 index a701c81..0000000 Binary files a/data/sample_data.RData and /dev/null differ diff --git a/doc/methylSig.R b/doc/methylSig.R deleted file mode 100644 index 91032ed..0000000 --- a/doc/methylSig.R +++ /dev/null @@ -1,166 +0,0 @@ -## ---- echo=FALSE----------------------------------------------------------- -library(methylSig) -library(rtracklayer) - -## ---- eval=FALSE----------------------------------------------------------- -# devtools::install_github('sartorlab/methylSig') - -## -------------------------------------------------------------------------- -# The following bismark cytosine reports are included in inst/extdata -files = c( - system.file('extdata', 'MDAMB_231_1DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_1DS.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_2DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_2DS.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_3DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_3DS.txt.gz', package='methylSig')) - 
-sample.ids = basename(files) -sample.ids = gsub('.txt.gz', '', sample.ids) - -# Build a pData matrix with columns for the samples, group memberships, and phenotype data -pData = data.frame( - Sample_Names = sample.ids, - DR_vs_DS = relevel(factor(c('DR','DS','DR','DS','DR','DS')), ref = 'DS'), - row.names = sample.ids, - stringsAsFactors = FALSE) - -meth = methylSigReadData( - fileList = files, - pData = pData, - assembly = 'hg19', - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 1, - fileType = 'cytosineReport') - -print(meth) - -## -------------------------------------------------------------------------- -### Test on CpGs -result = methylSigCalc( - meth = meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -print(result) - -## -------------------------------------------------------------------------- -# Must create a design matrix -design1 = data.frame(group = bsseq::pData(meth)$DR_vs_DS) - -print(design1) - -# NOTE this model has an intercept -contrast_intercept = matrix(c(0,1), ncol = 1) -result_dss_intercept = methylSigDSS( - meth = meth, - design = design1, - formula = '~ group', - contrast = contrast_intercept, - group.term = 'group', - min.per.group=c(3,3)) - -print(result_dss_intercept) - -## -------------------------------------------------------------------------- -# Add a covariate column, note specification as a factor, but can -# also use a numeric covariate -design2 = data.frame( - group = bsseq::pData(meth)$DR_vs_DS, - subject = factor(c(1,1,2,2,3,3))) - -print(design2) - -# NOTE the contrast vector has as many entries as the sum of the -# levels in group and subject, in the formula. 
-contrast_covariates = matrix(c(0,1,0,0), ncol = 1) -result_dss_covariates = methylSigDSS( - meth = meth, - design = design2, - formula = '~ group + subject', - contrast = contrast_covariates, - group.term = 'group', - min.per.group=c(3,3)) - -print(result_dss_covariates) - -## -------------------------------------------------------------------------- -### Test on 10000bp windows -windowed_meth = methylSigTile(meth, tiles = NULL, win.size = 10000) - -tiled_result = methylSigCalc( - meth = windowed_meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -print(tiled_result) - -## -------------------------------------------------------------------------- -### Test on CpG islands -library(annotatr) - -cpg_islands = annotatr::build_annotations(genome = 'hg19', annotations = 'hg19_cpg_islands') - -cpg_island_meth = methylSigTile(meth, tiles = cpg_islands) - -cpg_island_result = methylSigCalc( - meth = cpg_island_meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -print(cpg_island_result) - -## -------------------------------------------------------------------------- -# Get CpG island annotations from built-in data they could be built with the following: -# cpg_annots = annotatr::build_annotations(genome = 'hg19', annotations = c('hg19_cpg_islands')) -utils::data(sample_data, package = 'methylSig') - -# Determine what CpGs should be considered significant -dmcList = result$fdr < 0.05 & abs(result$meth.diff) > 25 - -annotated_result = methylSigAnnotation(myDiff = result, dmcList = dmcList, annotations = cpg_annots) - -## -------------------------------------------------------------------------- -print(annotated_result) - -## 
-------------------------------------------------------------------------- -print(head(as.data.frame(annotated_result))) - -## -------------------------------------------------------------------------- -# Use preloaded tfbs from package sample_data. Could be manually loaded as with: -# tfbs_file = system.file('extdata','tfbs.bed.gz', package = 'methylSig') -# tfbs = rtracklayer::import(tfbs_file, genome = 'hg19') - -print(tfbs) - -## -------------------------------------------------------------------------- -# Significance threshold -dmcList = result$fdr < 0.05 & abs(result$meth.diff) > 25 - -# Perform the test -tfbs_enrichment = methylSig.tfbsEnrichTest(myDiff = result, dmcList = dmcList, tfbsInfo = tfbs) - -# Take a look at the first few rows -print(head(tfbs_enrichment)) - diff --git a/doc/methylSig.Rmd b/doc/methylSig.Rmd deleted file mode 100644 index 6db1de5..0000000 --- a/doc/methylSig.Rmd +++ /dev/null @@ -1,269 +0,0 @@ ---- -title: "methylSig: A package for whole genome DNA methylation analysis" -author: "Yongseok Park, Raymond G. Cavalcante, Maria E. Figueroa, Laura S. Rozek, and Maureen A. Sartor" -date: "`r Sys.Date()`" -output: - BiocStyle::html_document -vignette: > - %\VignetteIndexEntry{Introduction to methylSig} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, echo=FALSE} -library(methylSig) -library(rtracklayer) -``` - -# Introduction - -DNA methylation plays critical roles in gene regulation and cellular specification without altering DNA sequences. It is one of the best understood and most intensively studied epigenetic marks in mammalian cells. Treatment of DNA with sodium bisulfite deaminates unmethylated cytosines to uracil while methylated cytosines are resistant to this conversion thus allowing for the discrimination between methylated and unmethylated CpG sites. 
Sodium bisulfite pre-treatment of DNA coupled with next-generation sequencing has allowed DNA methylation to be studied quantitatively and genome-wide at single cytosine site resolution. - -`methylSig` is a method for testing for differential methylated cytosines (DMCs) or regions (DMRs) in whole-genome bisulfite sequencing (bis-seq) or reduced representation bisulfite sequencing (RRBS) experiments. `methylSig` uses a beta binomial model to test for significant differences between groups of samples. Several options exist for either site-specific or sliding window tests, combining strands, and for variance estimation. It allows annotating the resulting regions to multiple genome features, and visualizing the results for chosen genomic regions. - -# Installation - -`methylSig` is available on GitHub at , and the easiest way to install it is as follows: - -```{r, eval=FALSE} -devtools::install_github('sartorlab/methylSig') -``` - -# Basic usage - -## Reading data - -As of version 0.5.0, `methylSig` is able to read `bismark_methylation_extractor` outputs directly using the `bsseq` Bioconductor package. The `methylSigReadData()` function is a wrapper for `bsseq::read_bismark()` that adds some userful features: - -1. Users can set a `minCount` and `maxCount` for the coverage of sites. -2. Users analyzing data aligned to `hg19` can filter out C > T or G > A SNPs. 
- -The following code uses data contained in the package to demonstrate how to read methylation data: - -```{r} -# The following bismark cytosine reports are included in inst/extdata -files = c( - system.file('extdata', 'MDAMB_231_1DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_1DS.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_2DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_2DS.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_3DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_3DS.txt.gz', package='methylSig')) - -sample.ids = basename(files) -sample.ids = gsub('.txt.gz', '', sample.ids) - -# Build a pData matrix with columns for the samples, group memberships, and phenotype data -pData = data.frame( - Sample_Names = sample.ids, - DR_vs_DS = relevel(factor(c('DR','DS','DR','DS','DR','DS')), ref = 'DS'), - row.names = sample.ids, - stringsAsFactors = FALSE) - -meth = methylSigReadData( - fileList = files, - pData = pData, - assembly = 'hg19', - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 1, - fileType = 'cytosineReport') - -print(meth) -``` - -## Differential methylation analysis - -The package consists of two methods to test for differential methlyation: `methylSigCalc()` and `methylSigDSS()`. - -The first, `methylSigCalc()`, calculates differential methylation statistics between two groups of samples. It uses a beta-binomial approach to calculate differential methylation statistics, accounting for coverage and variation among samples within each group. - -The second, `methylSigDSS()`, is a wrapper for the `DSS::DMLfit.multiFactor()` and `DSS::DMLtest.multiFactor()` functions in the [DSS Bioconductor package](https://bioconductor.org/packages/release/bioc/html/DSS.html). Essentially the test in DSS uses a linear model over an arbitrary design matrix, thus allowing for correction by covariates. 
The wrapper function provided here enables enforcement of the a minimum number of data points per group to test a site/region, as well as reporting of averaged methlyation levels over the groups. - -### Site specific analysis with `methylSigCalc()` - -The default is to do site specific analysis and to use both groups to estimate variances. - -```{r} -### Test on CpGs -result = methylSigCalc( - meth = meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -print(result) -``` - -The output includes the estimated dispersion (`phiCommonEst`), the log-likelihood ratio (`logLikRatio`), the degrees of freedom (`df`), the group methylation estimates (`muEstC_group1` and `muEstC_group2` where group 1 is the reference factor in the comparison column of the `pData` matrix), the methylation difference (`meth.diff = muEstC_group2 - muEstC_group1`), the group for which the site is hyper-methylated (`hyper.direction`, note, this is regardless of significance), the `pvalue`, and `fdr`. - -#### Variance from one group - -Using the `dispersion` argument, it is possible to estimate variances from one group rather than from both groups. This can be accomplished by changing the `dispersion` parameter in the previous example from `'both'` to `'DS'` or `'DR'` - -#### Using local information - -It is also possible to use information from nearby CpG sites to improve the variance and methylation level estimates. The default `local.winsize` is 200 bps. The `local.winsize` is only used when `local.info = TRUE`. - -### Site specific analysis with `methylSigDSS()` - -The following example illustrates a case with no covariates and where the model has an intercept. 
- -```{r} -# Must create a design matrix -design1 = data.frame(group = bsseq::pData(meth)$DR_vs_DS) - -print(design1) - -# NOTE this model has an intercept -contrast_intercept = matrix(c(0,1), ncol = 1) -result_dss_intercept = methylSigDSS( - meth = meth, - design = design1, - formula = '~ group', - contrast = contrast_intercept, - group.term = 'group', - min.per.group=c(3,3)) - -print(result_dss_intercept) -``` - -The following illustrates an paired-type test. - -```{r} -# Add a covariate column, note specification as a factor, but can -# also use a numeric covariate -design2 = data.frame( - group = bsseq::pData(meth)$DR_vs_DS, - subject = factor(c(1,1,2,2,3,3))) - -print(design2) - -# NOTE the contrast vector has as many entries as the sum of the -# levels in group and subject, in the formula. -contrast_covariates = matrix(c(0,1,0,0), ncol = 1) -result_dss_covariates = methylSigDSS( - meth = meth, - design = design2, - formula = '~ group + subject', - contrast = contrast_covariates, - group.term = 'group', - min.per.group=c(3,3)) - -print(result_dss_covariates) -``` - -### Tiled analysis - -`methylSig` also provides `methylSigTile()` to tile data within continuous non-overlapping windows. Users can tile the genome according to a window size, give a `data.frame` with genomic regions, or give a `GRanges` object. Examples are below. Note that tiling analysis is also possible with `methylSigDSS()`. 
- -#### Windowed analysis - -```{r} -### Test on 10000bp windows -windowed_meth = methylSigTile(meth, tiles = NULL, win.size = 10000) - -tiled_result = methylSigCalc( - meth = windowed_meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -print(tiled_result) -``` - -#### Region analysis - -As mentioned, users can provide a `data.frame` in the `tiles` parameter, so long as it has column names acceptable to `makeGRangesFromDataFrame`, i.e. `chr`, `start`, and `end`. - -Finally, users can provide tiling regions as a `GRanges` object. If we wanted to test for differential methylation in CpG islands, we could use the `annotatr` package to create the CpG island regions. - -```{r} -### Test on CpG islands -library(annotatr) - -cpg_islands = annotatr::build_annotations(genome = 'hg19', annotations = 'hg19_cpg_islands') - -cpg_island_meth = methylSigTile(meth, tiles = cpg_islands) - -cpg_island_result = methylSigCalc( - meth = cpg_island_meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -print(cpg_island_result) -``` - -# Annotation - -## Annotating differentially methylated CpGs - -Once differential methylation has been determined with `methylSigCalc()`, it may be of interest understand where differential methylation occurs in terms of genes and CpG features (islands, shores, shelves). `methylSig` uses the `annotatr` package to accomplish this. 
- -```{r} -# Get CpG island annotations from built-in data they could be built with the following: -# cpg_annots = annotatr::build_annotations(genome = 'hg19', annotations = c('hg19_cpg_islands')) -utils::data(sample_data, package = 'methylSig') - -# Determine what CpGs should be considered significant -dmcList = result$fdr < 0.05 & abs(result$meth.diff) > 25 - -annotated_result = methylSigAnnotation(myDiff = result, dmcList = dmcList, annotations = cpg_annots) -``` - -The result is a `GRanges` object with the same columns as the `result` with the addition of columns giving the differential methylation status (`dm_status`), a unique locus id (`locus_id`), and information about the annotation which is itself a `GRanges` object (`annot`). It is important to note that regions tested for differential methylation may occur on multiple rows, depending on the number of features it is annotated to. The `locus_id` column helps to quickly see when this is the case. - -```{r} -print(annotated_result) -``` - -It is more illuminating to view this object as a coerced `data.frame`, wherein the information about the annotations are displayed. - -```{r} -print(head(as.data.frame(annotated_result))) -``` - -## Transcription factor (TF) enrichment test - -Changes in DNA methylation have been shown to alter transcription factor binding. The `methylSig` package has implemented `methylSig.tfbsEnrichTest()` to test a set of transcription factor binding sites (TFBSs) are enriched for differentially methylated CpGs. - -We demonstrate this funciton on a set of TFBSs from ENCODE. The `rtracklayer::import()` function makes reading in `BED` files simple. - -```{r} -# Use preloaded tfbs from package sample_data. 
Could be manually loaded as with: -# tfbs_file = system.file('extdata','tfbs.bed.gz', package = 'methylSig') -# tfbs = rtracklayer::import(tfbs_file, genome = 'hg19') - -print(tfbs) -``` - -This file mixes TFBSs from a number of TFs and keeps track of them in the name column (4th) of the `BED`. Next, we indicate what is considered a differentially methylated CpG and perform the test. - -```{r} -# Significance threshold -dmcList = result$fdr < 0.05 & abs(result$meth.diff) > 25 - -# Perform the test -tfbs_enrichment = methylSig.tfbsEnrichTest(myDiff = result, dmcList = dmcList, tfbsInfo = tfbs) - -# Take a look at the first few rows -print(head(tfbs_enrichment)) -``` diff --git a/doc/methylSig.html b/doc/methylSig.html deleted file mode 100644 index 840667c..0000000 --- a/doc/methylSig.html +++ /dev/null @@ -1,760 +0,0 @@ - - - - - - - - - - - - - - - -methylSig: A package for whole genome DNA methylation analysis - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - -

Contents

- - -
## 
-
## 
-
## 
-
## 
-
## Loading required package: GenomicRanges
-
## Loading required package: stats4
-
## Loading required package: BiocGenerics
-
## Loading required package: parallel
-
## 
-## Attaching package: 'BiocGenerics'
-
## The following objects are masked from 'package:parallel':
-## 
-##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
-##     clusterExport, clusterMap, parApply, parCapply, parLapply,
-##     parLapplyLB, parRapply, parSapply, parSapplyLB
-
## The following objects are masked from 'package:stats':
-## 
-##     IQR, mad, sd, var, xtabs
-
## The following objects are masked from 'package:base':
-## 
-##     anyDuplicated, append, as.data.frame, cbind, colMeans, colnames,
-##     colSums, do.call, duplicated, eval, evalq, Filter, Find, get,
-##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
-##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
-##     pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
-##     rowSums, sapply, setdiff, sort, table, tapply, union, unique,
-##     unsplit, which, which.max, which.min
-
## Loading required package: S4Vectors
-
## 
-## Attaching package: 'S4Vectors'
-
## The following object is masked from 'package:base':
-## 
-##     expand.grid
-
## Loading required package: IRanges
-
## Loading required package: GenomeInfoDb
-
-

1 Introduction

-

DNA methylation plays critical roles in gene regulation and cellular specification without altering DNA sequences. It is one of the best understood and most intensively studied epigenetic marks in mammalian cells. Treatment of DNA with sodium bisulfite deaminates unmethylated cytosines to uracil while methylated cytosines are resistant to this conversion thus allowing for the discrimination between methylated and unmethylated CpG sites. Sodium bisulfite pre-treatment of DNA coupled with next-generation sequencing has allowed DNA methylation to be studied quantitatively and genome-wide at single cytosine site resolution.

-

methylSig is a method for testing for differentially methylated cytosines (DMCs) or regions (DMRs) in whole-genome bisulfite sequencing (bis-seq) or reduced representation bisulfite sequencing (RRBS) experiments. methylSig uses a beta binomial model to test for significant differences between groups of samples. Several options exist for either site-specific or sliding window tests, combining strands, and for variance estimation. It allows annotating the resulting regions to multiple genome features, and visualizing the results for chosen genomic regions.

-
-
-

2 Installation

-

methylSig is available on GitHub at http://www.github.com/sartorlab/methylSig, and the easiest way to install it is as follows:

-
devtools::install_github('sartorlab/methylSig')
-
-
-

3 Basic usage

-
-

3.1 Reading data

-

As of version 0.5.0, methylSig is able to read bismark_methylation_extractor outputs directly using the bsseq Bioconductor package. The methylSigReadData() function is a wrapper for bsseq::read_bismark() that adds some useful features:

-
    -
  1. Users can set a minCount and maxCount for the coverage of sites.
  2. -
  3. Users analyzing data aligned to hg19 can filter out C > T or G > A SNPs.
  4. -
-

The following code uses data contained in the package to demonstrate how to read methylation data:

-
# The following bismark cytosine reports are included in inst/extdata
-files = c(
-    system.file('extdata', 'MDAMB_231_1DR.txt.gz', package='methylSig'),
-    system.file('extdata', 'MDAMB_231_1DS.txt.gz', package='methylSig'),
-    system.file('extdata', 'MDAMB_231_2DR.txt.gz', package='methylSig'),
-    system.file('extdata', 'MDAMB_231_2DS.txt.gz', package='methylSig'),
-    system.file('extdata', 'MDAMB_231_3DR.txt.gz', package='methylSig'),
-    system.file('extdata', 'MDAMB_231_3DS.txt.gz', package='methylSig'))
-
-sample.ids = basename(files)
-sample.ids = gsub('.txt.gz', '', sample.ids)
-
-# Build a pData matrix with columns for the samples, group memberships, and phenotype data
-pData = data.frame(
-    Sample_Names = sample.ids,
-    DR_vs_DS = relevel(factor(c('DR','DS','DR','DS','DR','DS')), ref = 'DS'),
-    row.names = sample.ids,
-    stringsAsFactors = FALSE)
-
-meth = methylSigReadData(
-    fileList = files,
-    pData = pData,
-    assembly = 'hg19',
-    destranded = TRUE,
-    maxCount = 500,
-    minCount = 10,
-    filterSNPs = TRUE,
-    num.cores = 1,
-    fileType = 'cytosineReport')
-
## Assuming file type is cytosineReport
-
## [read.bismark] Reading file '/tmp/Rtmp6sEBze/temp_libpath2056f65bde/methylSig/extdata/MDAMB_231_1DR.txt.gz' ... done in 1.8 secs
-## [read.bismark] Reading file '/tmp/Rtmp6sEBze/temp_libpath2056f65bde/methylSig/extdata/MDAMB_231_1DS.txt.gz' ... done in 1.1 secs
-## [read.bismark] Reading file '/tmp/Rtmp6sEBze/temp_libpath2056f65bde/methylSig/extdata/MDAMB_231_2DR.txt.gz' ... done in 0.3 secs
-## [read.bismark] Reading file '/tmp/Rtmp6sEBze/temp_libpath2056f65bde/methylSig/extdata/MDAMB_231_2DS.txt.gz' ... done in 0.3 secs
-## [read.bismark] Reading file '/tmp/Rtmp6sEBze/temp_libpath2056f65bde/methylSig/extdata/MDAMB_231_3DR.txt.gz' ... done in 0.3 secs
-## [read.bismark] Reading file '/tmp/Rtmp6sEBze/temp_libpath2056f65bde/methylSig/extdata/MDAMB_231_3DS.txt.gz' ... done in 0.3 secs
-## [read.bismark] Joining samples ... done in 0.2 secs
-
## Filtering SNPs
-
print(meth)
-
## An object of type 'BSseq' with
-##   4577 methylation loci
-##   6 samples
-## has not been smoothed
-## All assays are in-memory
-
-
-

3.2 Differential methylation analysis

-

The package consists of two methods to test for differential methylation: methylSigCalc() and methylSigDSS().

-

The first, methylSigCalc(), calculates differential methylation statistics between two groups of samples. It uses a beta-binomial approach to calculate differential methylation statistics, accounting for coverage and variation among samples within each group.

-

The second, methylSigDSS(), is a wrapper for the DSS::DMLfit.multiFactor() and DSS::DMLtest.multiFactor() functions in the DSS Bioconductor package. Essentially the test in DSS uses a linear model over an arbitrary design matrix, thus allowing for correction by covariates. The wrapper function provided here enables enforcement of a minimum number of data points per group to test a site/region, as well as reporting of averaged methylation levels over the groups.

-
-

3.2.1 Site specific analysis with methylSigCalc()

-

The default is to do site specific analysis and to use both groups to estimate variances.

-
### Test on CpGs
-result = methylSigCalc(
-    meth = meth,
-    comparison = 'DR_vs_DS',
-    dispersion = 'both',
-    local.info = FALSE,
-    local.winsize = 200,
-    min.per.group = c(3,3),
-    weightFunc = methylSig_weightFunc,
-    T.approx = TRUE,
-    num.cores = 1)
-
-print(result)
-
## GRanges object with 2962 ranges and 9 metadata columns:
-##          seqnames               ranges strand | variance.est logLikRatio
-##             <Rle>            <IRanges>  <Rle> |    <numeric>   <numeric>
-##      [1]    chr21   [9437432, 9437432]      * | 2.783517e+01   1.2816869
-##      [2]    chr21   [9437444, 9437444]      * | 1.000000e+06   0.2131537
-##      [3]    chr21   [9437458, 9437458]      * | 5.163216e+01   0.6232777
-##      [4]    chr21   [9437461, 9437461]      * | 2.508937e+01   1.6203272
-##      [5]    chr21   [9437464, 9437464]      * | 5.434622e+01   9.3657230
-##      ...      ...                  ...    ... .          ...         ...
-##   [2958]    chr21 [26980251, 26980251]      * |  1.00000e+06   0.0000000
-##   [2959]    chr21 [26980274, 26980274]      * |  4.14772e+00   1.4538396
-##   [2960]    chr21 [26980313, 26980313]      * |  1.00000e+06   0.0000000
-##   [2961]    chr21 [26980346, 26980346]      * |  1.00000e+06   0.8909992
-##   [2962]    chr21 [26980353, 26980353]      * |  1.00000e+06   0.0000000
-##                 df   meth.DR    meth.DS  meth.diff hyper.direction
-##          <numeric> <numeric>  <numeric>  <numeric>     <character>
-##      [1]         6  98.72839   95.58233  3.1460617              DR
-##      [2]         6  96.99333   97.87526 -0.8819329              DS
-##      [3]         6  95.23565   92.12952  3.1061310              DR
-##      [4]         6  94.22184   87.23211  6.9897317              DR
-##      [5]         6  97.29838   83.97197 13.3264146              DR
-##      ...       ...       ...        ...        ...             ...
-##   [2958]         6  0.000000  0.0000000  0.0000000              DR
-##   [2959]         6  3.576359 12.4291240 -8.8527652              DS
-##   [2960]         6  0.000000  0.0000000  0.0000000              DR
-##   [2961]         6  0.000000  0.2568636 -0.2568636              DS
-##   [2962]         6  0.000000  0.0000000  0.0000000              DR
-##              pvalue        fdr
-##           <numeric>  <numeric>
-##      [1] 0.30079244 0.46548966
-##      [2] 0.66057168 0.84303415
-##      [3] 0.45988357 0.63034480
-##      [4] 0.25014206 0.41362110
-##      [5] 0.02221512 0.06351466
-##      ...        ...        ...
-##   [2958]  1.0000000  1.0000000
-##   [2959]  0.2732998  0.4356911
-##   [2960]  1.0000000  1.0000000
-##   [2961]  0.3816470  0.5506923
-##   [2962]  1.0000000  1.0000000
-##   -------
-##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
-

The output includes the estimated dispersion (phiCommonEst), the log-likelihood ratio (logLikRatio), the degrees of freedom (df), the group methylation estimates (muEstC_group1 and muEstC_group2 where group 1 is the reference factor in the comparison column of the pData matrix), the methylation difference (meth.diff = muEstC_group2 - muEstC_group1), the group for which the site is hyper-methylated (hyper.direction, note, this is regardless of significance), the pvalue, and fdr.

-
-

3.2.1.1 Variance from one group

-

Using the dispersion argument, it is possible to estimate variances from one group rather than from both groups. This can be accomplished by changing the dispersion parameter in the previous example from 'both' to 'DS' or 'DR'.

-
-
-

3.2.1.2 Using local information

-

It is also possible to use information from nearby CpG sites to improve the variance and methylation level estimates. The default local.winsize is 200 bps. The local.winsize is only used when local.info = TRUE.

-
-
-
-

3.2.2 Site specific analysis with methylSigDSS()

-

The following example illustrates a case with no covariates and where the model has an intercept.

-
# Must create a design matrix
-design1 = data.frame(group = bsseq::pData(meth)$DR_vs_DS)
-
-print(design1)
-
##   group
-## 1    DR
-## 2    DS
-## 3    DR
-## 4    DS
-## 5    DR
-## 6    DS
-
# NOTE this model has an intercept
-contrast_intercept = matrix(c(0,1), ncol = 1)
-result_dss_intercept = methylSigDSS(
-    meth = meth,
-    design = design1,
-    formula = '~ group',
-    contrast = contrast_intercept,
-    group.term = 'group',
-    min.per.group=c(3,3))
-
## Fitting DML model for CpG site:
-
print(result_dss_intercept)
-
## GRanges object with 2962 ranges and 7 metadata columns:
-##          seqnames               ranges strand |      stat   meth.DR
-##             <Rle>            <IRanges>  <Rle> | <numeric> <numeric>
-##      [1]    chr21   [9437432, 9437432]      * | 1.0009952  99.35897
-##      [2]    chr21   [9437444, 9437444]      * | 0.5713017  97.56852
-##      [3]    chr21   [9437458, 9437458]      * | 0.7730615  96.35279
-##      [4]    chr21   [9437461, 9437461]      * | 1.0119228  93.08807
-##      [5]    chr21   [9437464, 9437464]      * | 2.5388560  97.40130
-##      ...      ...                  ...    ... .       ...       ...
-##   [2958]    chr21 [26980251, 26980251]      * | 0.2692223 0.0000000
-##   [2959]    chr21 [26980274, 26980274]      * | 0.9297976 0.8130081
-##   [2960]    chr21 [26980313, 26980313]      * | 0.2404340 0.0000000
-##   [2961]    chr21 [26980346, 26980346]      * | 0.2829581 0.0000000
-##   [2962]    chr21 [26980353, 26980353]      * | 0.2333046 0.0000000
-##             meth.DS   meth.diff hyper.direction     pvalue        fdr
-##           <numeric>   <numeric>     <character>  <numeric>  <numeric>
-##      [1]   94.83092   4.5280565              DR 0.31682912 0.53230168
-##      [2]   97.88406  -0.3155345              DS 0.56779516 0.79244608
-##      [3]   91.80998   4.5428012              DR 0.43948598 0.67343894
-##      [4]   87.56200   5.5260746              DR 0.31157495 0.52646035
-##      [5]   83.69082  13.7104781              DR 0.01112156 0.03098971
-##      ...        ...         ...             ...        ...        ...
-##   [2958]  0.0000000   0.0000000              DR  0.7877586  0.9520460
-##   [2959] 15.7205924 -14.9075843              DS  0.3524759  0.5779137
-##   [2960]  0.0000000   0.0000000              DR  0.8099938  0.9520460
-##   [2961]  0.2364066  -0.2364066              DS  0.7772090  0.9520460
-##   [2962]  0.0000000   0.0000000              DR  0.8155249  0.9520460
-##   -------
-##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
-

The following illustrates a paired-type test.

-
# Add a covariate column, note specification as a factor, but can
-# also use a numeric covariate
-design2 = data.frame(
-    group = bsseq::pData(meth)$DR_vs_DS,
-    subject = factor(c(1,1,2,2,3,3)))
-
-print(design2)
-
##   group subject
-## 1    DR       1
-## 2    DS       1
-## 3    DR       2
-## 4    DS       2
-## 5    DR       3
-## 6    DS       3
-
# NOTE the contrast vector has as many entries as the sum of the
-# levels in group and subject, in the formula.
-contrast_covariates = matrix(c(0,1,0,0), ncol = 1)
-result_dss_covariates = methylSigDSS(
-    meth = meth,
-    design = design2,
-    formula = '~ group + subject',
-    contrast = contrast_covariates,
-    group.term = 'group',
-    min.per.group=c(3,3))
-
## Fitting DML model for CpG site:
-
print(result_dss_covariates)
-
## GRanges object with 2962 ranges and 10 metadata columns:
-##          seqnames               ranges strand |      stat   meth.DR
-##             <Rle>            <IRanges>  <Rle> | <numeric> <numeric>
-##      [1]    chr21   [9437432, 9437432]      * | 0.9903242  99.35897
-##      [2]    chr21   [9437444, 9437444]      * | 0.4428470  97.56852
-##      [3]    chr21   [9437458, 9437458]      * | 0.6141460  96.35279
-##      [4]    chr21   [9437461, 9437461]      * | 1.8382529  93.08807
-##      [5]    chr21   [9437464, 9437464]      * | 1.8214267  97.40130
-##      ...      ...                  ...    ... .       ...       ...
-##   [2958]    chr21 [26980251, 26980251]      * | 0.2850336 0.0000000
-##   [2959]    chr21 [26980274, 26980274]      * | 1.4152062 0.8130081
-##   [2960]    chr21 [26980313, 26980313]      * | 0.1633590 0.0000000
-##   [2961]    chr21 [26980346, 26980346]      * | 0.2843347 0.0000000
-##   [2962]    chr21 [26980353, 26980353]      * | 0.1590852 0.0000000
-##             meth.DS    meth.1    meth.2    meth.3   meth.diff
-##           <numeric> <numeric> <numeric> <numeric>   <numeric>
-##      [1]   94.83092  93.33333 100.00000  97.95151   4.5280565
-##      [2]   97.88406  98.27586  99.00000  95.90301  -0.3155345
-##      [3]   91.80998  96.30268  91.00000  94.94147   4.5428012
-##      [4]   87.56200  97.77778  82.47826  90.71906   5.5260746
-##      [5]   83.69082  94.94253  90.82609  85.86957  13.7104781
-##      ...        ...       ...       ...       ...         ...
-##   [2958]  0.0000000   0.00000 0.0000000 0.0000000   0.0000000
-##   [2959] 15.7205924  23.60757 0.6493506 0.5434783 -14.9075843
-##   [2960]  0.0000000   0.00000 0.0000000 0.0000000   0.0000000
-##   [2961]  0.2364066   0.00000 0.3546099 0.0000000  -0.2364066
-##   [2962]  0.0000000   0.00000 0.0000000 0.0000000   0.0000000
-##          hyper.direction     pvalue       fdr
-##              <character>  <numeric> <numeric>
-##      [1]              DR 0.32201570 0.5367415
-##      [2]              DS 0.65787644 0.8726188
-##      [3]              DR 0.53911882 0.7677259
-##      [4]              DR 0.06602515 0.1432426
-##      [5]              DR 0.06854202 0.1481563
-##      ...             ...        ...       ...
-##   [2958]              DR  0.7756184 0.9547039
-##   [2959]              DS  0.1570081 0.3039594
-##   [2960]              DR  0.8702357 0.9547039
-##   [2961]              DS  0.7761539 0.9547039
-##   [2962]              DR  0.8736017 0.9547039
-##   -------
-##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
-
-
-

3.2.3 Tiled analysis

-

methylSig also provides methylSigTile() to tile data within continuous non-overlapping windows. Users can tile the genome according to a window size, give a data.frame with genomic regions, or give a GRanges object. Examples are below. Note that tiling analysis is also possible with methylSigDSS().

-
-

3.2.3.1 Windowed analysis

-
### Test on 10000bp windows
-windowed_meth = methylSigTile(meth, tiles = NULL, win.size = 10000)
-
-tiled_result = methylSigCalc(
-    meth = windowed_meth,
-    comparison = 'DR_vs_DS',
-    dispersion = 'both',
-    local.info = FALSE,
-    local.winsize = 200,
-    min.per.group = c(3,3),
-    weightFunc = methylSig_weightFunc,
-    T.approx = TRUE,
-    num.cores = 1)
-
-print(tiled_result)
-
## GRanges object with 403 ranges and 9 metadata columns:
-##         seqnames               ranges strand | variance.est logLikRatio
-##            <Rle>            <IRanges>  <Rle> |    <numeric>   <numeric>
-##     [1]    chr21   [9430001, 9440000]      * |    3217.7251    31.87458
-##     [2]    chr21   [9470001, 9480000]      * | 1000000.0000   222.95755
-##     [3]    chr21   [9480001, 9490000]      * |     367.0496    11.88060
-##     [4]    chr21   [9540001, 9550000]      * |     398.7476   119.61740
-##     [5]    chr21   [9570001, 9580000]      * | 1000000.0000    15.98406
-##     ...      ...                  ...    ... .          ...         ...
-##   [399]    chr21 [26770001, 26780000]      * | 9.045037e+01  47.1302186
-##   [400]    chr21 [26920001, 26930000]      * | 1.466130e+01  14.9949174
-##   [401]    chr21 [26930001, 26940000]      * | 2.062805e+02   0.3858422
-##   [402]    chr21 [26970001, 26980000]      * | 1.000000e+06   2.7957758
-##   [403]    chr21 [26980001, 26990000]      * | 6.313615e+04  20.4396536
-##                df    meth.DR    meth.DS    meth.diff hyper.direction
-##         <numeric>  <numeric>  <numeric>    <numeric>     <character>
-##     [1]         6  58.512432  63.592261    -5.079830              DS
-##     [2]         6   9.006623  35.966085   -26.959463              DS
-##     [3]         6  86.915490  92.014703    -5.099213              DS
-##     [4]         6   5.572836  22.749576   -17.176740              DS
-##     [5]         6   0.000000   4.195631    -4.195631              DS
-##     ...       ...        ...        ...          ...             ...
-##   [399]         6  0.9656335 26.9541773 -25.98854385              DS
-##   [400]         6 61.1776884 18.8547058  42.32298260              DR
-##   [401]         6 96.1423558 95.4135724   0.72878334              DR
-##   [402]         6  0.0244081  0.1039525  -0.07954435              DS
-##   [403]         6  0.4466644  1.2952075  -0.84854310              DS
-##               pvalue          fdr
-##            <numeric>    <numeric>
-##     [1] 1.324096e-03 3.707253e-03
-##     [2] 5.680069e-06 9.461202e-05
-##     [3] 1.368606e-02 2.522605e-02
-##     [4] 3.468094e-05 2.911754e-04
-##     [5] 7.135159e-03 1.509949e-02
-##     ...          ...          ...
-##   [399] 0.0004705309  0.001679357
-##   [400] 0.0082436191  0.016910968
-##   [401] 0.5573388553  0.629152825
-##   [402] 0.1455439010  0.199504055
-##   [403] 0.0040132048  0.009403032
-##   -------
-##   seqinfo: 1 sequence from an unspecified genome
-
-
-

3.2.3.2 Region analysis

-

As mentioned, users can provide a data.frame in the tiles parameter, so long as it has column names acceptable to makeGRangesFromDataFrame, i.e. chr, start, and end.

-

Finally, users can provide tiling regions as a GRanges object. If we wanted to test for differential methylation in CpG islands, we could use the annotatr package to create the CpG island regions.

-
### Test on CpG islands
-library(annotatr)
-
-cpg_islands = annotatr::build_annotations(genome = 'hg19', annotations = 'hg19_cpg_islands')
-
## snapshotDate(): 2017-10-27
-
## Building CpG islands...
-
## loading from cache '/root//.AnnotationHub/5086'
-
cpg_island_meth = methylSigTile(meth, tiles = cpg_islands)
-
-cpg_island_result = methylSigCalc(
-    meth = cpg_island_meth,
-    comparison = 'DR_vs_DS',
-    dispersion = 'both',
-    local.info = FALSE,
-    local.winsize = 200,
-    min.per.group = c(3,3),
-    weightFunc = methylSig_weightFunc,
-    T.approx = TRUE,
-    num.cores = 1)
-
-print(cpg_island_result)
-
## GRanges object with 35 ranges and 9 metadata columns:
-##        seqnames               ranges strand | variance.est logLikRatio
-##           <Rle>            <IRanges>  <Rle> |    <numeric>   <numeric>
-##    [1]    chr21   [9437273, 9439473]      * |  10108.31324 49.47148386
-##    [2]    chr21   [9483486, 9484663]      * |    677.34297 14.91396011
-##    [3]    chr21   [9647867, 9648116]      * |    338.35837 85.02550366
-##    [4]    chr21   [9708936, 9709231]      * |    227.92331  0.86175261
-##    [5]    chr21   [9825443, 9826296]      * |      4.90187  0.06391833
-##    ...      ...                  ...    ... .          ...         ...
-##   [31]    chr21 [19617099, 19617874]      * |    2150.3170 106.0834863
-##   [32]    chr21 [22369424, 22370582]      * |     223.8728  45.0812745
-##   [33]    chr21 [26734109, 26734485]      * |     148.5777 333.3295868
-##   [34]    chr21 [26934424, 26934805]      * |     197.2834   0.4653393
-##   [35]    chr21 [26979578, 26980252]      * |    3956.9207   6.7192656
-##               df    meth.DR   meth.DS   meth.diff hyper.direction
-##        <numeric>  <numeric> <numeric>   <numeric>     <character>
-##    [1]         6   72.17284  77.00457   -4.831728              DS
-##    [2]         6   88.84971  93.18360   -4.333888              DS
-##    [3]         6   28.61071  49.46677  -20.856061              DS
-##    [4]         6   39.76191  42.56069   -2.798781              DS
-##    [5]         6   49.97605  54.17314   -4.197089              DS
-##    ...       ...        ...       ...         ...             ...
-##   [31]         6 55.8142135 69.071588 -13.2573749              DS
-##   [32]         6 12.1540430  2.840372   9.3136710              DR
-##   [33]         6  2.0245908 52.419599 -50.3950084              DS
-##   [34]         6 96.5735977 97.269836  -0.6962384              DS
-##   [35]         6  0.2125352  0.515617  -0.3030818              DS
-##              pvalue          fdr
-##           <numeric>    <numeric>
-##    [1] 0.0004126188 0.0014441658
-##    [2] 0.0083442742 0.0183734669
-##    [3] 0.0000918022 0.0004016346
-##    [4] 0.3890673165 0.4863341457
-##    [5] 0.8088455587 0.8846748298
-##    ...          ...          ...
-##   [31] 4.893231e-05 0.0002550898
-##   [32] 5.304487e-04 0.0016877914
-##   [33] 1.739160e-06 0.0000304353
-##   [34] 5.205895e-01 0.6073543910
-##   [35] 4.109283e-02 0.0625325619
-##   -------
-##   seqinfo: 93 sequences (1 circular) from hg19 genome
-
-
-
-
-
-

4 Annotation

-
-

4.1 Annotating differentially methylated CpGs

-

Once differential methylation has been determined with methylSigCalc(), it may be of interest to understand where differential methylation occurs in terms of genes and CpG features (islands, shores, shelves). methylSig uses the annotatr package to accomplish this.

-
# Get CpG island annotations from built-in data they could be built with the following:
-# cpg_annots = annotatr::build_annotations(genome = 'hg19', annotations = c('hg19_cpg_islands'))
-utils::data(sample_data, package = 'methylSig')
-
-# Determine what CpGs should be considered significant
-dmcList = result$fdr < 0.05 & abs(result$meth.diff) > 25
-
-annotated_result = methylSigAnnotation(myDiff = result, dmcList = dmcList, annotations = cpg_annots)
-
## Annotating...
-

The result is a GRanges object with the same columns as the result with the addition of columns giving the differential methylation status (dm_status), a unique locus id (locus_id), and information about the annotation which is itself a GRanges object (annot). It is important to note that regions tested for differential methylation may occur on multiple rows, depending on the number of features it is annotated to. The locus_id column helps to quickly see when this is the case.

-
print(annotated_result)
-
## GRanges object with 985 ranges and 12 metadata columns:
-##         seqnames               ranges strand | variance.est logLikRatio
-##            <Rle>            <IRanges>  <Rle> |    <numeric>   <numeric>
-##     [1]    chr21   [9437432, 9437432]      * | 2.783517e+01   1.2816869
-##     [2]    chr21   [9437444, 9437444]      * | 1.000000e+06   0.2131537
-##     [3]    chr21   [9437458, 9437458]      * | 5.163216e+01   0.6232777
-##     [4]    chr21   [9437461, 9437461]      * | 2.508937e+01   1.6203272
-##     [5]    chr21   [9437464, 9437464]      * | 5.434622e+01   9.3657230
-##     ...      ...                  ...    ... .          ...         ...
-##   [981]    chr21 [26980131, 26980131]      * |     107.3813    7.680714
-##   [982]    chr21 [26980148, 26980148]      * | 1000000.0000   12.786767
-##   [983]    chr21 [26980233, 26980233]      * | 1000000.0000    0.000000
-##   [984]    chr21 [26980249, 26980249]      * | 1000000.0000    0.000000
-##   [985]    chr21 [26980251, 26980251]      * | 1000000.0000    0.000000
-##                df   meth.DR   meth.DS  meth.diff hyper.direction     pvalue
-##         <numeric> <numeric> <numeric>  <numeric>     <character>  <numeric>
-##     [1]         6  98.72839  95.58233  3.1460617              DR 0.30079244
-##     [2]         6  96.99333  97.87526 -0.8819329              DS 0.66057168
-##     [3]         6  95.23565  92.12952  3.1061310              DR 0.45988357
-##     [4]         6  94.22184  87.23211  6.9897317              DR 0.25014206
-##     [5]         6  97.29838  83.97197 13.3264146              DR 0.02221512
-##     ...       ...       ...       ...        ...             ...        ...
-##   [981]         6  0.000000  2.273272  -2.273272              DS 0.03236211
-##   [982]         6  1.851407 19.999972 -18.148565              DS 0.01170130
-##   [983]         6  0.000000  0.000000   0.000000              DR 1.00000000
-##   [984]         6  0.000000  0.000000   0.000000              DR 1.00000000
-##   [985]         6  0.000000  0.000000   0.000000              DR 1.00000000
-##                fdr   dm_status  locus_id                   annot
-##          <numeric> <character> <integer>               <GRanges>
-##     [1] 0.46548966       No DM         1   chr21:9437273-9439473
-##     [2] 0.84303415       No DM         2   chr21:9437273-9439473
-##     [3] 0.63034480       No DM         3   chr21:9437273-9439473
-##     [4] 0.41362110       No DM         4   chr21:9437273-9439473
-##     [5] 0.06351466       No DM         5   chr21:9437273-9439473
-##     ...        ...         ...       ...                     ...
-##   [981] 0.08467895       No DM      2954 chr21:26979578-26980252
-##   [982] 0.03868219       No DM      2955 chr21:26979578-26980252
-##   [983] 1.00000000       No DM      2956 chr21:26979578-26980252
-##   [984] 1.00000000       No DM      2957 chr21:26979578-26980252
-##   [985] 1.00000000       No DM      2958 chr21:26979578-26980252
-##   -------
-##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
-

It is more illuminating to view this object as a coerced data.frame, wherein the information about the annotations is displayed.

-
print(head(as.data.frame(annotated_result)))
-
##   seqnames   start     end width strand variance.est logLikRatio df  meth.DR
-## 1    chr21 9437432 9437432     1      * 2.783517e+01   1.2816869  6 98.72839
-## 2    chr21 9437444 9437444     1      * 1.000000e+06   0.2131537  6 96.99333
-## 3    chr21 9437458 9437458     1      * 5.163216e+01   0.6232777  6 95.23565
-## 4    chr21 9437461 9437461     1      * 2.508937e+01   1.6203272  6 94.22184
-## 5    chr21 9437464 9437464     1      * 5.434622e+01   9.3657230  6 97.29838
-## 6    chr21 9437470 9437470     1      * 2.166604e+02  11.0351514  6 95.29008
-##    meth.DS  meth.diff hyper.direction     pvalue        fdr dm_status
-## 1 95.58233  3.1460617              DR 0.30079244 0.46548966     No DM
-## 2 97.87526 -0.8819329              DS 0.66057168 0.84303415     No DM
-## 3 92.12952  3.1061310              DR 0.45988357 0.63034480     No DM
-## 4 87.23211  6.9897317              DR 0.25014206 0.41362110     No DM
-## 5 83.97197 13.3264146              DR 0.02221512 0.06351466     No DM
-## 6 77.80258 17.4874962              DR 0.01596620 0.04962424     No DM
-##   locus_id annot.seqnames annot.start annot.end annot.width annot.strand
-## 1        1          chr21     9437273   9439473        2201            *
-## 2        2          chr21     9437273   9439473        2201            *
-## 3        3          chr21     9437273   9439473        2201            *
-## 4        4          chr21     9437273   9439473        2201            *
-## 5        5          chr21     9437273   9439473        2201            *
-## 6        6          chr21     9437273   9439473        2201            *
-##       annot.id annot.tx_id annot.gene_id annot.symbol       annot.type
-## 1 island:25558          NA            NA           NA hg19_cpg_islands
-## 2 island:25558          NA            NA           NA hg19_cpg_islands
-## 3 island:25558          NA            NA           NA hg19_cpg_islands
-## 4 island:25558          NA            NA           NA hg19_cpg_islands
-## 5 island:25558          NA            NA           NA hg19_cpg_islands
-## 6 island:25558          NA            NA           NA hg19_cpg_islands
-
-
-

4.2 Transcription factor (TF) enrichment test

-

Changes in DNA methylation have been shown to alter transcription factor binding. The methylSig package has implemented methylSig.tfbsEnrichTest() to test whether a set of transcription factor binding sites (TFBSs) is enriched for differentially methylated CpGs.

-

We demonstrate this function on a set of TFBSs from ENCODE. The rtracklayer::import() function makes reading in BED files simple.

-
# Use preloaded tfbs from package sample_data. Could be manually loaded as with:
-# tfbs_file = system.file('extdata','tfbs.bed.gz', package = 'methylSig')
-# tfbs = rtracklayer::import(tfbs_file, genome = 'hg19')
-
-print(tfbs)
-
## GRanges object with 47964 ranges and 2 metadata columns:
-##           seqnames               ranges strand |              name     score
-##              <Rle>            <IRanges>  <Rle> |       <character> <numeric>
-##       [1]    chr21   [9720034, 9720145]      * |       Gm12878Zzz3       564
-##       [2]    chr21   [9720060, 9720250]      * |       Gm12878Cjun       666
-##       [3]    chr21   [9720200, 9720201]      * |        Helas3Brf1       450
-##       [4]    chr21   [9720665, 9720748]      * |      Helas3Rpc155       445
-##       [5]    chr21   [9720836, 9721493]      * |        Helas3Bdp1       870
-##       ...      ...                  ...    ... .               ...       ...
-##   [47960]    chr21 [46943784, 46944183]      * |  K562bSetdb1Mnase       437
-##   [47961]    chr21 [46943788, 46943923]      * |        Hct116Tcf4         0
-##   [47962]    chr21 [46943821, 46944068]      * | Gm15510NfkbIggrab       414
-##   [47963]    chr21 [46943829, 46944301]      * |        Gm12878Tr4       438
-##   [47964]    chr21 [46943981, 46944050]      * |          K562Cjun         0
-##   -------
-##   seqinfo: 1 sequence from hg19 genome; no seqlengths
-

This file mixes TFBSs from a number of TFs and keeps track of them in the name column (4th) of the BED. Next, we indicate what is considered a differentially methylated CpG and perform the test.

-
# Significance threshold
-dmcList = result$fdr < 0.05 & abs(result$meth.diff) > 25
-
-# Perform the test
-tfbs_enrichment = methylSig.tfbsEnrichTest(myDiff = result, dmcList = dmcList, tfbsInfo = tfbs)
-
-# Take a look at the first few rows
-print(head(tfbs_enrichment))
-
##                   n_total_by_tf n_dmc_by_tf N_total N_dmc    p_total
-## Gm10847Pol2Musigg             5           2     144    31 0.03472222
-## Gm12878Cfos                  14           0     144    31 0.09722222
-## Gm12878CfosV2                14           0     144    31 0.09722222
-## Gm12878CfosV3                17           0     144    31 0.11805556
-## Gm12878Cjun                  24           2     144    31 0.16666667
-## Gm12878Cmyc                  26           5     144    31 0.18055556
-##                        p_dmc     logLik    pvalue
-## Gm10847Pol2Musigg 0.06451613 0.65972499 0.4166571
-## Gm12878Cfos       0.00000000 6.34128865 1.0000000
-## Gm12878CfosV2     0.00000000 6.34128865 1.0000000
-## Gm12878CfosV3     0.00000000 7.78882521 1.0000000
-## Gm12878Cjun       0.06451613 2.91022835 1.0000000
-## Gm12878Cmyc       0.16129032 0.08005114 1.0000000
-
-
- - - - -
- - - - - - - - - diff --git a/inst/CITATION b/inst/CITATION old mode 100755 new mode 100644 diff --git a/inst/extdata/MDAMB_231_1DR.txt.gz b/inst/extdata/MDAMB_231_1DR.txt.gz deleted file mode 100644 index bbd3bb4..0000000 Binary files a/inst/extdata/MDAMB_231_1DR.txt.gz and /dev/null differ diff --git a/inst/extdata/MDAMB_231_1DS.txt.gz b/inst/extdata/MDAMB_231_1DS.txt.gz deleted file mode 100644 index f51fbb6..0000000 Binary files a/inst/extdata/MDAMB_231_1DS.txt.gz and /dev/null differ diff --git a/inst/extdata/MDAMB_231_2DR.txt.gz b/inst/extdata/MDAMB_231_2DR.txt.gz deleted file mode 100644 index fabbef1..0000000 Binary files a/inst/extdata/MDAMB_231_2DR.txt.gz and /dev/null differ diff --git a/inst/extdata/MDAMB_231_2DS.txt.gz b/inst/extdata/MDAMB_231_2DS.txt.gz deleted file mode 100644 index c0aee60..0000000 Binary files a/inst/extdata/MDAMB_231_2DS.txt.gz and /dev/null differ diff --git a/inst/extdata/MDAMB_231_3DR.txt.gz b/inst/extdata/MDAMB_231_3DR.txt.gz deleted file mode 100644 index 6ad52c6..0000000 Binary files a/inst/extdata/MDAMB_231_3DR.txt.gz and /dev/null differ diff --git a/inst/extdata/MDAMB_231_3DS.txt.gz b/inst/extdata/MDAMB_231_3DS.txt.gz deleted file mode 100644 index 27341a5..0000000 Binary files a/inst/extdata/MDAMB_231_3DS.txt.gz and /dev/null differ diff --git a/inst/extdata/bis_cov1.cov b/inst/extdata/bis_cov1.cov new file mode 100644 index 0000000..493de5d --- /dev/null +++ b/inst/extdata/bis_cov1.cov @@ -0,0 +1,9 @@ +chr1 10 11 80 4 1 +chr1 11 12 80 4 1 +chr1 25 26 0 0 30 +chr1 26 27 7.14285714285714 5 65 +chr1 40 41 90 9 1 +chr1 41 42 95 19 1 +chr1 60 61 87.5 35 5 +chr1 75 76 90 900 100 +chr1 76 77 93.3333333333333 1400 100 diff --git a/inst/extdata/bis_cov2.cov b/inst/extdata/bis_cov2.cov new file mode 100644 index 0000000..0149c2e --- /dev/null +++ b/inst/extdata/bis_cov2.cov @@ -0,0 +1,11 @@ +chr1 10 11 90 9 1 +chr1 11 12 90 9 1 +chr1 25 26 2 1 49 +chr1 26 27 10 5 45 +chr1 40 41 93.3333333333333 14 1 +chr1 41 42 97.1428571428571 
34 1 +chr1 50 51 100 5 0 +chr1 51 52 100 5 0 +chr1 60 61 75 15 5 +chr1 75 76 99 99 1 +chr1 76 77 99.5 199 1 diff --git a/inst/extdata/bis_cov3.cov b/inst/extdata/bis_cov3.cov new file mode 100644 index 0000000..c1e0a5b --- /dev/null +++ b/inst/extdata/bis_cov3.cov @@ -0,0 +1,5 @@ +chr1 10 12 80 8 2 +chr1 25 27 5 5 95 +chr1 40 42 93.3333333333333 28 2 +chr1 60 62 87.5 35 5 +chr1 75 77 92 2300 200 diff --git a/inst/extdata/bis_cov4.cov b/inst/extdata/bis_cov4.cov new file mode 100644 index 0000000..b24f148 --- /dev/null +++ b/inst/extdata/bis_cov4.cov @@ -0,0 +1,6 @@ +chr1 10 12 90 18 2 +chr1 25 27 6 6 94 +chr1 40 42 96 48 2 +chr1 50 52 100 10 0 +chr1 60 62 75 15 5 +chr1 75 77 99.3333333333333 298 2 diff --git a/inst/extdata/test_1.txt b/inst/extdata/test_1.txt deleted file mode 100644 index cc6040c..0000000 --- a/inst/extdata/test_1.txt +++ /dev/null @@ -1,8 +0,0 @@ -chr21 9413839 + 20 0 CG CGG -chr21 9413840 - 20 5 CG CGG -chr21 9419908 + 14 0 CG CGG -chr21 9419909 - 20 0 CG CGG -chr21 43052356 - 550 0 CG CGG -chr21 43053297 + 23 1 CG CGG -chr21 43053298 - 64 8 CG CGG -chr21 43053323 + 4 0 CG CGG diff --git a/inst/extdata/test_2.txt b/inst/extdata/test_2.txt deleted file mode 100644 index cf61500..0000000 --- a/inst/extdata/test_2.txt +++ /dev/null @@ -1,6 +0,0 @@ -chr21 9419908 + 5 0 CG CGG -chr21 9419909 - 5 0 CG CGG -chr21 43052356 - 300 0 CG CGG -chr21 43053297 + 300 18 CG CGG -chr21 43053298 - 134 6 CG CGG -chr21 43053323 + 4 0 CG CGG diff --git a/inst/extdata/test_tiles.txt b/inst/extdata/test_tiles.txt deleted file mode 100644 index 50c28d1..0000000 --- a/inst/extdata/test_tiles.txt +++ /dev/null @@ -1,3 +0,0 @@ -chr start end -chr21 9400000 9420000 -chr21 43052000 43054000 diff --git a/inst/extdata/test_tiles2.txt b/inst/extdata/test_tiles2.txt deleted file mode 100644 index 8b7059d..0000000 --- a/inst/extdata/test_tiles2.txt +++ /dev/null @@ -1,4 +0,0 @@ -chr start end -chr21 9400000 9420000 -chr21 43052000 43054000 -chr10 368938 370000 diff --git 
a/inst/extdata/tfbs.bed.gz b/inst/extdata/tfbs.bed.gz deleted file mode 100644 index 0b2946d..0000000 Binary files a/inst/extdata/tfbs.bed.gz and /dev/null differ diff --git a/man/CT_SNPs_hg19.Rd b/man/CT_SNPs_hg19.Rd deleted file mode 100644 index 0bcb8e6..0000000 --- a/man/CT_SNPs_hg19.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methylSig_data_doc.R -\docType{data} -\name{CT_SNPs_hg19} -\alias{CT_SNPs_hg19} -\title{CpG Index for hg19} -\format{A GenomicRanges object of length 1,321,463} -\source{ -\url{ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.vcf.gz} -} -\description{ -A GenomicsRanges object giving the coordinates (in hg19) of all C > T SNPs. -Start coordinates are 0-based and end coordinates are 1-based. Starting from -1000 Genomes Data we used \code{bcftools filter} with \code{-i 'AF[0]>0.05'} -to pull all sites with alternate frequency greater than 0.05. We then used -\code{grep -P '(C\tT)'} and \code{grep -P '(VT=SNP)'} to collect all C > T -SNPs. -} -\keyword{datasets} diff --git a/man/binomialDiffCalc.Rd b/man/binomialDiffCalc.Rd deleted file mode 100644 index bfa6605..0000000 --- a/man/binomialDiffCalc.Rd +++ /dev/null @@ -1,38 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/test_binomial.R -\name{binomialDiffCalc} -\alias{binomialDiffCalc} -\title{Differential methylation analysis using binomial model} -\usage{ -binomialDiffCalc(meth, comparison, min.per.group = c(3, 3)) -} -\arguments{ -\item{meth}{A \code{BSseq-class} object to calculate differential methylation statistics. 
See \code{methylSigReadData} for how to read in methylation data.} - -\item{comparison}{The name of the column in \code{pData(meth)} to use for the comparisons, with the correct reference level set.} - -\item{min.per.group}{A vector with two numbers specifying the minimum number of samples required to perform the test for differential methylation. If it is a single number, both groups will use it as the minimum requried number of samples. Default is \code{c(3,3)}.} -} -\value{ -\code{GRanges} object containing the differential methylation statistics and locations. \code{p.adjust} with \code{method='BH'} option is used for p-value correction. -} -\description{ -This function calculates differential methylation statistics using a binomial-based approach. See `Warning' message below. -} -\details{ -This function uses a binomial-based model to calculate differential methylation statistics. It is nearly identical to the \code{methylKit::calculateDiffMeth} function in the \code{methylKit} R package except that only the likelihood ratio test and \code{p.adjust()} with \code{method='BH'} are used to calculate significance levels. It is significantly faster than \code{methylKit::calculateDiffMeth} function. -} -\section{Warning}{ - This function does not take into account the variability among samples in each group being compared. 
-} - -\examples{ -utils::data(sample_data, package = 'methylSig') - -myDiff = binomialDiffCalc(meth = meth, comparison = 'DR_vs_DS') - -} -\seealso{ -\code{\link{methylSigCalc}} -} -\keyword{differentialMethylation} diff --git a/man/bsseq_destranded.Rd b/man/bsseq_destranded.Rd new file mode 100644 index 0000000..72c519c --- /dev/null +++ b/man/bsseq_destranded.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methylSig-data.R +\docType{data} +\name{bsseq_destranded} +\alias{bsseq_destranded} +\title{BSseq object read from destranded coverage files} +\format{A BSseq object} +\source{ +data-raw/02-create_bsseq_rda.R +} +\usage{ +bsseq_destranded +} +\description{ +Data contains 6 methylation loci and 2 samples +} +\examples{ +data(bsseq_destranded, package = 'methylSig') +} +\keyword{datasets} diff --git a/man/bsseq_multichrom.Rd b/man/bsseq_multichrom.Rd new file mode 100644 index 0000000..50c4811 --- /dev/null +++ b/man/bsseq_multichrom.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methylSig-data.R +\docType{data} +\name{bsseq_multichrom} +\alias{bsseq_multichrom} +\title{BSseq object with loci on multiple chromosomes} +\format{A BSseq object} +\source{ +data-raw/02-create_bsseq_rda.R +} +\usage{ +bsseq_multichrom +} +\description{ +Data contains 4 methylation loci for 2 samples on 2 chromosomes +} +\examples{ +data(bsseq_multichrom, package = 'methylSig') +} +\keyword{datasets} diff --git a/man/bsseq_stranded.Rd b/man/bsseq_stranded.Rd new file mode 100644 index 0000000..88ad31b --- /dev/null +++ b/man/bsseq_stranded.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methylSig-data.R +\docType{data} +\name{bsseq_stranded} +\alias{bsseq_stranded} +\title{BSseq object read from stranded coverage files} +\format{A BSseq object} +\source{ +data-raw/02-create_bsseq_rda.R +} +\usage{ +bsseq_stranded +} +\description{ 
+Data contains 11 methylation loci and 2 samples +} +\examples{ +data(bsseq_stranded, package = 'methylSig') +} +\keyword{datasets} diff --git a/man/diff_binomial.Rd b/man/diff_binomial.Rd new file mode 100644 index 0000000..72ae866 --- /dev/null +++ b/man/diff_binomial.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/diff_binomial.R +\name{diff_binomial} +\alias{diff_binomial} +\title{Differential methylation analysis using binomial model} +\usage{ +diff_binomial(bs, group_column, comparison_groups) +} +\arguments{ +\item{bs}{A \code{BSseq-class} object to calculate differential methylation statistics. See \code{bsseq::read.bismark} for how to read in methylation data.} + +\item{group_column}{a \code{character} string indicating the column of \code{pData(bs)} to use for determining group membership.} + +\item{comparison_groups}{a named \code{character} vector indicating the \code{case} and \code{control} factors of \code{group_column} for the comparison.} +} +\value{ +A \code{GRanges} object containing the following \code{mcols}: +\describe{ + \item{meth_case:}{ Methylation estimate for case. } + \item{meth_control:}{ Methylation estimate for control. } + \item{meth_diff:}{ The difference \code{meth_case - meth_control}. } + \item{direction:}{ The group for which the locus is hyper-methylated. Note, this is not subject to significance thresholds. } + \item{pvalue:}{ The p-value from the likelihood ratio test. } + \item{fdr:}{ The Benjamini-Hochberg adjusted p-values using \code{p.adjust(method = 'BH')}. } + \item{log_lik_ratio:}{ The log likelihood ratio. } +} +} +\description{ +This function calculates differential methylation statistics using a binomial-based approach. See `Warning' message below. +} +\details{ +This function uses a binomial-based model to calculate differential methylation statistics.
It is nearly identical to the \code{methylKit::calculateDiffMeth} function in the \code{methylKit} R package except that only the likelihood ratio test and \code{p.adjust(..., method='BH')} are used to calculate significance levels. It is significantly faster than \code{methylKit::calculateDiffMeth} function. +} +\section{Warning}{ + This function does not take into account the variability among samples in each group being compared. +} + +\examples{ +data(BS.cancer.ex, package = 'bsseqData') + +bs = filter_loci_by_group_coverage( + bs = BS.cancer.ex, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) + +small_test = bs[1:50] + +diff_gr = diff_binomial( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal')) + +} diff --git a/man/diff_dss_fit.Rd b/man/diff_dss_fit.Rd new file mode 100644 index 0000000..e4f4b8b --- /dev/null +++ b/man/diff_dss_fit.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/diff_dss_fit.R +\name{diff_dss_fit} +\alias{diff_dss_fit} +\title{Performs model fit for general experimental design} +\usage{ +diff_dss_fit(bs, design, formula) +} +\arguments{ +\item{bs}{a \code{BSseq} object to calculate differential methylation statistics.} + +\item{design}{a \code{data.frame} or \code{DataFrame} for experimental design. Should contain as many rows as there are columns (samples) in \code{bs}, and the order of the rows should match the columns of \code{bs}. If omitted, will default to \code{pData(bs)}.} + +\item{formula}{a formula for the linear model. It should refer to column names from \code{design}. NOTE: The intercept is included by default if omitted. One can omit the intercept with a formula such as \code{'~ 0 + group'}. For clarity, it helps to include the intercept explicitly as in \code{'~ 1 + group'}.} +} +\value{ +A \code{list} object with: +\describe{ + \item{gr:}{ a \code{GRanges} object with loci fit. 
} + \item{design:}{ the \code{data.frame} input as the experimental design. } + \item{formula:}{ the \code{formula} representing the model. Can be \code{character} or \code{formula}. } + \item{X:}{ the design \code{matrix} used in regression based on the \code{design} and \code{formula}. This should be consulted to determine the appropriate contrast to use in \code{diff_dss_test()}. } + \item{fit:}{ a \code{list} with model fitting results. It has components \code{beta}, the estimated coefficients, and \code{var.beta} the estimated variance/covariance matrix for \code{beta}. } +} +} +\description{ +This function is a wrapper for \code{DSS::DMLfit.multiFactor}. +} +\examples{ +data(BS.cancer.ex, package = 'bsseqData') + +bs = filter_loci_by_group_coverage( + bs = BS.cancer.ex, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) + +small_test = bs[1:50] + +diff_fit = diff_dss_fit( + bs = small_test, + design = bsseq::pData(bs), + formula = '~ Type') + +} diff --git a/man/diff_dss_test.Rd b/man/diff_dss_test.Rd new file mode 100644 index 0000000..618ef5f --- /dev/null +++ b/man/diff_dss_test.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/diff_dss_test.R +\name{diff_dss_test} +\alias{diff_dss_test} +\title{Calculates differential methylation statistics under general experimental design} +\usage{ +diff_dss_test( + bs, + diff_fit, + contrast, + methylation_group_column = NA, + methylation_groups = NA +) +} +\arguments{ +\item{bs}{a \code{BSseq}, the same one used to create \code{diff_fit}.} + +\item{diff_fit}{a \code{list} object output by \code{diff_dss_fit()}.} + +\item{contrast}{a contrast matrix for hypothesis testing. The number of rows should match the number of columns of the design matrix. Consult \code{diff_fit$X} to ensure the contrast corresponds to the intended test.} + +\item{methylation_group_column}{Optionally, a column from \code{diff_fit$design} by which to group samples and capture methylation rates.
This column can be a \code{character}, \code{factor}, or \code{numeric}. In the case of \code{numeric} the samples are grouped according to the top and bottom 25 percentiles of the covariate, and the mean methylation for each group is calculated. If not a \code{numeric}, use the \code{methylation_groups} parameter to specify case and control.} + +\item{methylation_groups}{Optionally, a named \code{character} vector indicating the \code{case} and \code{control} factors of \code{methylation_group_column} by which to group samples and capture methylation rates. If specified, must also specify \code{methylation_group_column}.} +} +\value{ +A \code{GRanges} object containing the following \code{mcols}: +\describe{ + \item{stat:}{ The test statistic. } + \item{pvalue:}{ The p-value. } + \item{fdr:}{ The Benjamini-Hochberg adjusted p-values using \code{p.adjust(method = 'BH')}. } +} +} +\description{ +This function is a wrapper for \code{DSS::DMLtest.multiFactor} with the added feature of reporting methylation rates alongside the test results via the \code{methylation_group_column} and \code{methylation_groups} parameters. See documentation below.
+} +\examples{ +data(BS.cancer.ex, package = 'bsseqData') + +bs = filter_loci_by_group_coverage( + bs = BS.cancer.ex, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) + +small_test = bs[1:50] + +diff_fit = diff_dss_fit( + bs = small_test, + design = bsseq::pData(bs), + formula = '~ Type') + +result = diff_dss_test( + bs = small_test, + diff_fit = diff_fit, + contrast = matrix(c(0,1), ncol = 1) +) + +result_with_meth = diff_dss_test( + bs = small_test, + diff_fit = diff_fit, + contrast = matrix(c(0,1), ncol = 1), + methylation_group_column = 'Type', + methylation_groups = c('case' = 'cancer', 'control' = 'normal') +) + +} diff --git a/man/diff_methylsig.Rd b/man/diff_methylsig.Rd new file mode 100644 index 0000000..7a55438 --- /dev/null +++ b/man/diff_methylsig.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/diff_methylsig.R +\name{diff_methylsig} +\alias{diff_methylsig} +\title{Calculates differential methylation statistics using a Beta-binomial approach.} +\usage{ +diff_methylsig( + bs, + group_column, + comparison_groups, + disp_groups, + local_window_size = 0, + local_weight_function, + t_approx = TRUE, + n_cores = 1 +) +} +\arguments{ +\item{bs}{a \code{BSseq} object.} + +\item{group_column}{a \code{character} string indicating the column of \code{pData(bs)} to use for determining group membership.} + +\item{comparison_groups}{a named \code{character} vector indicating the \code{case} and \code{control} factors of \code{group_column} for the comparison.} + +\item{disp_groups}{a named \code{logical} vector indicating whether to use \code{case}, \code{control}, or both to estimate the dispersion.} + +\item{local_window_size}{an \code{integer} indicating the size of the window for use in determining local information to improve mean and dispersion parameter estimations. In addition to the distance constraint, a maximum of 5 loci upstream and downstream of the locus are used.
The default is \code{0}, indicating no local information is used.} + +\item{local_weight_function}{a weight kernel function. The default is the tri-weight kernel function defined as \code{function(u) = (1-u^2)^3}. The domain of any given weight function should be [-1,1], and the range should be [0,1].} + +\item{t_approx}{a \code{logical} value indicating whether to use squared t approximation for the likelihood ratio statistics. Chi-square approximation (\code{t_approx = FALSE}) is recommended when the sample size is large. Default is \code{TRUE}.} + +\item{n_cores}{an \code{integer} denoting how many cores should be used for differential methylation calculations.} +} +\value{ +A \code{GRanges} object containing the following \code{mcols}: +\describe{ + \item{meth_case:}{ Methylation estimate for case. } + \item{meth_control:}{ Methylation estimate for control. } + \item{meth_diff:}{ The difference \code{meth_case - meth_control}. } + \item{direction:}{ The group for which the locus is hyper-methylated. Note, this is not subject to significance thresholds. } + \item{pvalue:}{ The p-value from the t-test (\code{t_approx = TRUE}) or the Chi-Square test (\code{t_approx = FALSE}). } + \item{fdr:}{ The Benjamini-Hochberg adjusted p-values using \code{p.adjust(method = 'BH')}. } + \item{disp_est:}{ The dispersion estimate. } + \item{log_lik_ratio:}{ The log likelihood ratio. } + \item{df:}{ Degrees of freedom used when \code{t_approx = TRUE}. } +} +} +\description{ +The function calculates differential methylation statistics between two groups of samples using a beta-binomial approach, accounting for variation among samples within each group. The function can be applied to a \code{BSseq} object subjected to \code{filter_loci_by_coverage()}, \code{filter_loci_by_snps()}, \code{filter_loci_by_group_coverage()} or any combination thereof.
Moreover, the function can be applied to a \code{BSseq} object which has been tiled with \code{tile_by_regions()} or \code{tile_by_windows()}. +} +\examples{ +data(BS.cancer.ex, package = 'bsseqData') + +bs = filter_loci_by_group_coverage( + bs = BS.cancer.ex, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) + +small_test = bs[1:50] + +diff_gr = diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1) + +} diff --git a/man/filter_loci_by_coverage.Rd b/man/filter_loci_by_coverage.Rd new file mode 100644 index 0000000..cdc0d12 --- /dev/null +++ b/man/filter_loci_by_coverage.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter_loci_by_coverage.R +\name{filter_loci_by_coverage} +\alias{filter_loci_by_coverage} +\title{Filter BSseq object by coverage} +\usage{ +filter_loci_by_coverage(bs, min_count = 5, max_count = 500) +} +\arguments{ +\item{bs}{a \code{BSseq} object resulting from \code{bsseq::read.bismark} or constructed manually by the user.} + +\item{min_count}{an \code{integer} giving the minimum coverage required at a locus.} + +\item{max_count}{an \code{integer} giving the maximum coverage allowed at a locus.} +} +\value{ +A \code{BSseq} object with samples/loci in the coverage and methylation matrix set to 0 where the coverage was less than \code{min_count} or greater than \code{max_count}. The number of samples and loci are conserved. +} +\description{ +Used after \code{bsseq::read.bismark} to mark loci in samples below \code{min_count} or above \code{max_count} to 0. These loci will then be removed prior to differential analysis by \code{filter_loci_by_group_coverage()} if there are not a sufficient number of samples with appropriate coverage. 
+} +\examples{ +bis_cov_file1 = system.file('extdata', 'bis_cov1.cov', package = 'methylSig') +bis_cov_file2 = system.file('extdata', 'bis_cov2.cov', package = 'methylSig') +test = bsseq::read.bismark( + files = c(bis_cov_file1, bis_cov_file2), + colData = data.frame(row.names = c('test1','test2')), + rmZeroCov = FALSE, + strandCollapse = FALSE +) +test = filter_loci_by_coverage(bs = test, min_count = 10, max_count = 500) +} diff --git a/man/filter_loci_by_group_coverage.Rd b/man/filter_loci_by_group_coverage.Rd new file mode 100644 index 0000000..ba576f1 --- /dev/null +++ b/man/filter_loci_by_group_coverage.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter_loci_by_group_coverage.R +\name{filter_loci_by_group_coverage} +\alias{filter_loci_by_group_coverage} +\title{Filter loci by the minimum number of covered samples per group} +\usage{ +filter_loci_by_group_coverage(bs, group_column, min_samples_per_group) +} +\arguments{ +\item{bs}{a \code{BSseq} object.} + +\item{group_column}{a \code{character} string indicating the column of \code{pData(bs)} to use for determining group membership.} + +\item{min_samples_per_group}{a named \code{integer} vector indicating the minimum number of samples with non-zero coverage required for maintaining a locus.} +} +\value{ +A \code{BSseq} object with only those loci having at least \code{min_samples_per_group} samples with non-zero coverage. +} +\description{ +A function to retain only those loci of a \code{BSseq} object for which the groups named in \code{min_samples_per_group} have at least the given number of samples with non-zero coverage.
+} +\examples{ +data(BS.cancer.ex, package = 'bsseqData') + +filter_loci_by_group_coverage( + bs = BS.cancer.ex, + group_column = 'Type', + min_samples_per_group = c('cancer' = 3, 'normal' = 3) +) + +} diff --git a/man/filter_loci_by_location.Rd b/man/filter_loci_by_location.Rd new file mode 100644 index 0000000..457c53a --- /dev/null +++ b/man/filter_loci_by_location.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter_loci_by_location.R +\name{filter_loci_by_location} +\alias{filter_loci_by_location} +\title{Remove loci by overlap with a \code{GRanges} object} +\usage{ +filter_loci_by_location(bs, gr) +} +\arguments{ +\item{bs}{a \code{BSseq} object.} + +\item{gr}{a \code{GRanges} object.} +} +\value{ +A \code{BSseq} object with loci intersecting \code{gr} removed. +} +\description{ +A function to remove loci from a \code{BSseq} object based on intersection with loci in a \code{GRanges} object. +} +\examples{ +data(bsseq_stranded, package = 'methylSig') +regions = GenomicRanges::GRanges( + seqnames = c('chr1','chr1','chr1','chr1'), + ranges = IRanges::IRanges( + start = c(5,25,45,70), + end = c(15,40,55,80) + ) +) +filtered = filter_loci_by_location(bs = bsseq_stranded, gr = regions) + +} diff --git a/man/methylSig-package.Rd b/man/methylSig-package.Rd deleted file mode 100644 index 78ef603..0000000 --- a/man/methylSig-package.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methylSig_pkg_doc.R -\docType{package} -\name{methylSig-package} -\alias{methylSig-package} -\alias{methylSig} -\title{methylSig: a whole genome DNA methylation analysis pipeline} -\description{ -MethylSig is a method for testing differential methylated cytosines (DMCs) or regions (DMRs) in whole-genome bisulfite sequencing (bis-seq) or reduced representation bisulfite sequencing (RRBS) experiments. 
MethylSig uses a beta binomial model to test for significant differences between groups of samples. Several options exist for either site-specific or sliding window tests, combining strands, and for variance estimation. It allows annotating the resulting regions to multiple genome features, and visualizing the results for chosen genomic regions. -} -\references{ -https://www.github.com/sartorlab/methylSig -} -\author{ -Yongseok Park \email{yongpark@pitt.edu}, Raymond Cavalcante \email{rcavalca@umich.edu}, and Maureen A. Sartor -} diff --git a/man/methylSig.Rd b/man/methylSig.Rd new file mode 100644 index 0000000..db9eb7a --- /dev/null +++ b/man/methylSig.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methylSig-package.R +\docType{package} +\name{methylSig} +\alias{methylSig} +\alias{methylSig-package} +\title{MethylSig: Differential Methylation Testing for WGBS and RRBS Data} +\description{ +MethylSig is a package for testing for + differentially methylated cytosines (DMCs) or regions (DMRs) in + whole-genome bisulfite sequencing (WGBS) or reduced representation + bisulfite sequencing (RRBS) experiments. MethylSig uses a beta + binomial model to test for significant differences between groups of + samples. Several options exist for either site-specific or sliding + window tests, and variance estimation. +} +\section{methylSig functions}{ + +filter_loci_by_coverage() +filter_loci_by_snps() +tile_by_regions() +tile_by_windows() +filter_loci_by_group_coverage() +diff_binomial() +diff_methylsig() +diff_methylsig_dss() +annotate_diff() +visualize_diff() +region_enrichment_diff() +} + +\seealso{ +Useful links: +\itemize{ + \item Report bugs at \url{https://github.com/sartorlab/methylSig/issues} +} + +} +\author{ +\strong{Maintainer}: Raymond G. 
Cavalcante \email{rcavalca@umich.edu} + +Authors: +\itemize{ + \item Yongseok Park \email{yongpark@pitt.edu} +} + +} +\keyword{internal} diff --git a/man/methylSig.tfbsEnrichTest.Rd b/man/methylSig.tfbsEnrichTest.Rd deleted file mode 100644 index f0f511f..0000000 --- a/man/methylSig.tfbsEnrichTest.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tfbs_enrichment.R -\name{methylSig.tfbsEnrichTest} -\alias{methylSig.tfbsEnrichTest} -\title{Perform transcription factor enrichment test among differentially methylated cytosines or regions} -\usage{ -methylSig.tfbsEnrichTest(myDiff, dmcList, tfbsInfo) -} -\arguments{ -\item{myDiff}{\code{GRanges} object resulting from \code{methylSigCalc} that contains all CpG sites that are tested for differential methylation.} - -\item{dmcList}{A \code{logical} of the same length as \code{myDiff} defining the DMCs or DMRs.} - -\item{tfbsInfo}{A \code{GRanges} object of the genomic regions representing peaks. The \code{name} column should indicate which TF the peak is for.} -} -\value{ -A \code{data.frame} whose \code{rownames} are inherited from the \code{name} column of the input BED file, and whose columns are: -\describe{ - \item{n_total_by_tf}{ The number of tested CpGs in a TFBS for a TF. } - \item{n_dmc_by_tf}{ The number of DM CpGs in a TFBS for a TF. } - \item{N_total}{ The total number of tested CpGs in a TFBS across all the TFs. } - \item{N_dmc}{ The total number of DM CpGs in a TFBS across all the TFs. } - \item{p_total}{ \code{n_total_by_tf} / \code{N_total}, used in the likelihood calculation. } - \item{p_dmc}{ \code{n_dmc_by_tf} / \code{N_dmc}, used in the likelihood calculation. } - \item{logLik}{ The log-likelihood based on the binomial distribution. } - \item{pvalue}{ The p-value from the likelihood ratio test. 
} -} -} -\description{ -This function tests for enriched transcription binding sites among differentially methylated sites or regions using a binomial test. -} -\details{ -Likelihood ratio test is used based on the binomial distribution. -} -\examples{ -utils::data(sample_data, package = 'methylSig') - -dmcList = msig_cpgs$fdr < 0.05 & abs(msig_cpgs$meth.diff) > 25 - -methylSig.tfbsEnrichTest(myDiff = msig_cpgs, dmcList = dmcList, tfbsInfo = tfbs) - -} diff --git a/man/methylSigAnnotation.Rd b/man/methylSigAnnotation.Rd deleted file mode 100644 index acb6536..0000000 --- a/man/methylSigAnnotation.Rd +++ /dev/null @@ -1,52 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/annotations.R -\name{methylSigAnnotation} -\alias{methylSigAnnotation} -\title{Wrapper function for annotatr annotations} -\usage{ -methylSigAnnotation(myDiff, dmcList, annotations) -} -\arguments{ -\item{myDiff}{A \code{GRanges} object resulting from \code{methylSigCalc} that contains all CpG sites that are tested for differential methylation.} - -\item{dmcList}{A \code{logical} of the same length as \code{myDiff} defining the DMCs or DMRs.} - -\item{annotations}{A \code{GRanges} object resulting from \code{annotatr::build_annotations()} to be used for annotating \code{myDiff} in conjunction with \code{dmcList}. See the documentation for \code{annotatr::build_annotations()} for guidance on how to chose different annotations, or use custom annotations.} -} -\value{ -A \code{GRanges} object whose \code{mcols} include all those in \code{myDiff} in addition to: -\describe{ - \item{locus_ids}{ A unique identifier for the tested locus from \code{methylSigCalc}. } - \item{dm_status}{ A \code{character} indicating which group the CpG / region is hyper-methylated in (based on the \code{levels} of the \code{comparison} column in the \code{pData} used in \code{methylSigCalc}), or "No DM" if it is \code{FALSE} in \code{dmcList}. 
} - \item{annot}{ A \code{GRanges} containing annotation information for the CpG or region. See the "Annotation structure" section. } -} -} -\description{ -Annotate the result of \code{methylSigCalc} to \code{annotatr} annotations, while also categorizing regions tested as hyper-methylated in either group or "No DM". -} -\section{Annotation structure}{ - -Annotations from \code{annotatr} are embedded in the \code{myDiff} \code{GRanges} object as a column of \code{GRanges} (named \code{annot}). Of note is that a region annotated to multiple annotations will appear in as many rows, one for each annotation. A convenience column, \code{locus_id}, clearly indicates when a locus is multiply annotated. The easiest way to see all tested loci (CpGs or regions) and their annotations as a flat table is to coerce the result with \code{as.data.frame}. The \code{mcols} of \code{annot} are: -\describe{ - \item{id}{ A unique ID for the annotation. } - \item{tx_id}{ Either a UCSC knownGene transcript ID (genic annotations) or a Ensembl transcript ID (lncRNA annotations), } - \item{gene_id}{ Entrez ID. } - \item{symbol}{ The gene symbol from the \code{org.*.eg.db} mapping from the Entrez ID. } - \item{type}{ A code of the form \code{[genome]_[type]_[name]} indicating the annotation type. 
} -} -} - -\examples{ -# Annotate the msig_cpgs results -utils::data(sample_data, package = 'methylSig') -# This includes the cpg_annots object to save time - -# Use the genome of msig_cpgs and build annotations for CpG features built with -# cpg_annots = annotatr::build_annotations(genome = 'hg19', annotations = 'hg19_cpg_islands') - -# Decide what counts as differentially methylated -dmcList = msig_cpgs$fdr < 0.05 & abs(msig_cpgs$meth.diff) > 25 - -myDiff_annotated = methylSigAnnotation(myDiff = msig_cpgs, dmcList = dmcList, annotations = cpg_annots) - -} diff --git a/man/methylSigCalc.Rd b/man/methylSigCalc.Rd deleted file mode 100644 index 045bf21..0000000 --- a/man/methylSigCalc.Rd +++ /dev/null @@ -1,68 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/test_methylSig.R -\name{methylSigCalc} -\alias{methylSigCalc} -\title{Calculates differential methylation statistics using a Beta-binomial approach.} -\usage{ -methylSigCalc(meth, comparison = NA, dispersion = "both", - local.info = FALSE, local.winsize = 200, min.per.group = c(3, 3), - weightFunc = methylSig_weightFunc, T.approx = TRUE, num.cores = 1) -} -\arguments{ -\item{meth}{A \code{BSseq-class} object to calculate differential methylation statistics. See \code{methylSigReadData} for how to read in methylation data.} - -\item{comparison}{The name of the column in \code{pData(meth)} to use for the comparisons, with the correct reference level set.} - -\item{dispersion}{One of \code{both}, or either group name. Indicates which set of samples to use to estimate the dispersion parameter. Default is \code{both}.} - -\item{local.info}{A \code{logical} value indicating whether to use local information to improve mean and dispersion parameter estimations. Default is \code{FALSE}.} - -\item{local.winsize}{An \code{integer} to specify the distance upstream and downstream of a location to include local information for the mean and dispersion parameter estimations. 
NOTE: An additional constraint is placed whereby a maximum of 5 loci upstream and downstream of the locus of interest are used. Default is \code{200}.} - -\item{min.per.group}{A vector with two numbers specifying the minimum number of samples required to perform the test for differential methylation. If it is a single number, both groups will use it as the minimum requried number of samples. Default is \code{c(3,3)}. NOTE: The ordering of this parameter with respect to the groups should be \code{c(reference, other)}, where \code{reference} refers to the reference level in the \code{pData(meth)[, comparison]} factor.} - -\item{weightFunc}{A weight kernel function. The input of this function is from -1 to 1. The default is the tri-weight kernel function defined as \code{function(u) = (1-u^2)^3}. Function value and range of parameter for weight function should be from 0 to 1.} - -\item{T.approx}{A \code{logical} value indicating whether to use squared t approximation for the likelihood ratio statistics. Chi-square approximation (\code{T.approx = FALSE}) is recommended when the sample size is large. Default is \code{TRUE}.} - -\item{num.cores}{An integer denoting how many cores should be used for differential methylation calculations.} -} -\value{ -A \code{GRanges} object containing the following \code{mcols}: -\describe{ - \item{phiCommonEst}{ The dispersion estimate. } - \item{logLikRatio}{ The log likelihood ratio. } - \item{df}{ Degrees of freedom used when \code{T.approx = TRUE}. } - \item{muEstC_group1}{ Methylation estimate for group1. Groups correspond to the levels in the column used for the comparison in \code{pdata(meth)}. } - \item{muEstC_group2}{ Methylation estimate for group2. } - \item{meth.diff}{ The difference \code{muEstC_group2 - muEstC_group1}. } - \item{hyper.direction}{ The group for which the CpG/region is hyper-methylated. Groups correspond to the levels in the column used for the comparison in \code{pdata(meth)}. 
} - \item{pvalue}{ The p-value from the t-test (\code{T.approx = TRUE}) or the Chi-Square test (\code{T.approx = FALSE}). } - \item{fdr}{ The Benjamini-Hochberg adjusted p-values using \code{p.adjust(method = 'BH')}. } -} -} -\description{ -The function calculates differential methylation statistics between two groups of samples. This is the main function of the methylSig package, and the method most users should use to test for DMCs or DMRs. The function uses a Beta-binomial approach to calculate differential methylation statistics, accounting for variation among samples within each group. -} -\details{ -The function calculates differential methylation statistics between two groups of samples. The function uses Beta-binomial approach to calculate differential methylation statistics, accounting for variation among samples within each group. Users who wish to tile their data and test for differentially methylated regions (DMRs) instead DMCs should first use the \code{\link{methylSigTile}} function before using this function. 
-} -\examples{ -utils::data(sample_data, package = 'methylSig') - -result = methylSigCalc( - meth = meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -} -\seealso{ -\code{\link{methylSigReadData}} -} -\keyword{differentialMethylation} diff --git a/man/methylSigDSS.Rd b/man/methylSigDSS.Rd deleted file mode 100644 index 77d5c43..0000000 --- a/man/methylSigDSS.Rd +++ /dev/null @@ -1,66 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/test_multifactor.R -\name{methylSigDSS} -\alias{methylSigDSS} -\title{Calculates differential methylation statistics under general experimental design} -\usage{ -methylSigDSS(meth, design, formula, contrast, group.term, - min.per.group = c(3, 3)) -} -\arguments{ -\item{meth}{A \code{BSseq-class} object to calculate differential methylation statistics. See \code{methylSigReadData} for how to read in methylation data.} - -\item{design}{A \code{data.frame} for experimental design. Should contain as many rows as there are columns (samples) in \code{meth}.} - -\item{formula}{A formula for the linear model. It should refer to column names from \code{design}. NOTE: The intercept is included by default if omitted. One can omit the intercept with a formula such as \code{'~ 0 + group'}. For clarity, it helps to include the intercept explicitly as in \code{'~ 1 + group'}.} - -\item{contrast}{A contrast matrix for hypothesis testing. The number of rows should match the number of columns \code{design}.} - -\item{group.term}{A string indicating which term in \code{formula} contains group information on which to apply the \code{min.per.group} parameter. Currently assumes that this factor contains ONLY TWO LEVELS.} - -\item{min.per.group}{A vector with two numbers specifying the minimum number of samples required to perform the test for differential methylation. 
If it is a single number, both groups will use it as the minimum requried number of samples. Default is \code{c(3,3)}. NOTE: The ordering of this parameter with respect to the groups should be \code{c(reference, other)}, where \code{reference} refers to the reference level in the \code{design[, group.term]} factor.} -} -\value{ -A \code{GRanges} object containing the following \code{mcols}: -\describe{ - \item{stat}{ The dispersion estimate. } - \item{pvalue}{ The log likelihood ratio. } - \item{fdr}{ Degrees of freedom used when \code{T.approx = TRUE}. } - \item{mean methylation columns}{ Mean methylation for each factor in each column of design, when there are fewer than 5 factors in the factor. } -} -} -\description{ -Calculates differential methylation statistics under general experimental design -} -\examples{ -utils::data(sample_data, package = 'methylSig') - -# Example with implicit intercept -design1 = data.frame(group = bsseq::pData(meth)$DR_vs_DS) -contrast1 = matrix(c(0,1), ncol = 1) -result1 = methylSigDSS( - meth = meth, - design = design1, - formula = '~ group', - contrast = contrast1, - group.term = 'group', - min.per.group=c(3,3)) - -# Example with subject pairing -design2 = data.frame( - group = bsseq::pData(meth)$DR_vs_DS, - subject = factor(c(1,1,2,2,3,3))) -contrast2 = matrix(c(0,1,0,0), ncol = 1) -result2 = methylSigDSS( - meth = meth, - design = design2, - formula = '~ group + subject', - contrast = contrast2, - group.term = 'group', - min.per.group=c(3,3)) - -} -\seealso{ -\code{\link{methylSigReadData}} -} -\keyword{differentialMethylation} diff --git a/man/methylSigReadData.Rd b/man/methylSigReadData.Rd deleted file mode 100644 index 1e7490c..0000000 --- a/man/methylSigReadData.Rd +++ /dev/null @@ -1,70 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read.R -\name{methylSigReadData} -\alias{methylSigReadData} -\title{Read methylation score files to make a 'BSseq' object.} -\usage{ 
-methylSigReadData(fileList, pData, assembly = NA, destranded = TRUE, - maxCount = 500, minCount = 10, filterSNPs = FALSE, num.cores = 1, - fileType = c("cov", "cytosineReport"), verbose = TRUE) -} -\arguments{ -\item{fileList}{Files to be read. These can be \code{cov} or \code{cytosine_reports} from the Bismark Methylation Extractor. See \code{fileType} for details.} - -\item{pData}{A \code{data.frame} containing phenotype information for the samples in \code{fileList}. The \code{row.names} attribute of the \code{data.frame} should match the \code{Sample_Names}. See example below.} - -\item{assembly}{The genome assembly used for alignment. e.g. \code{hg19}, \code{mm10}, etc.} - -\item{destranded}{A logical value indicating whether to destrand the reverse to forward strand. If TRUE, the reads from both will be combined. Default is TRUE.} - -\item{maxCount}{A number indicating the maximum coverage count to be included.} - -\item{minCount}{A number indicating the minimum coverage count to be included.} - -\item{filterSNPs}{A logical value indicating whether or not to filter out C > T SNPs based on the 1000 Genomes Project. NOTE: Only supported when \code{assembly = 'hg19'}.} - -\item{num.cores}{Number of cores to be used in reading files. Default is 1.} - -\item{fileType}{The format of the input file. Either \code{cov} or \code{cytosineReport}. One of the outputs of the Bismark Methylation Extractor.} - -\item{verbose}{A logical value indicating whether \code{bsseq::read.bismark} shoud print progress. Default TRUE.} -} -\value{ -A \code{BSseq-class} object. -} -\description{ -This function reads files created by the Bismark Methylation Extractor, and outputs a \code{BSseq} object. 
-} -\examples{ -files = c( - system.file('extdata', 'MDAMB_231_1DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_1DS.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_2DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_2DS.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_3DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_3DS.txt.gz', package='methylSig')) - -sample.ids = basename(files) -sample.ids = gsub('.txt.gz', '', sample.ids) - -pData = data.frame( - Sample_Names = sample.ids, - DR_vs_DS = relevel(factor(c('DR','DS','DR','DS','DR','DS')), ref = 'DS'), - row.names = sample.ids, - stringsAsFactors = FALSE) - -meth = methylSigReadData( - fileList = files, - pData = pData, - assembly = 'hg19', - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 1, - fileType = 'cytosineReport') - -} -\seealso{ -\code{\link{methylSigCalc}} -} diff --git a/man/methylSigTile.Rd b/man/methylSigTile.Rd deleted file mode 100644 index 8cdca29..0000000 --- a/man/methylSigTile.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tile.R -\name{methylSigTile} -\alias{methylSigTile} -\title{Obtain tiled methylation data in non-overlapping continuous windows.} -\usage{ -methylSigTile(meth, tiles = NULL, win.size = 200) -} -\arguments{ -\item{meth}{A \code{BSseq-class} object, as from \code{methylSigReadData}.} - -\item{tiles}{One of \code{NULL}, a \code{data.frame}, or a \code{GRanges} object. If not \code{NULL}, the regions should be non-overlapping. Those CpG sites not belonging to any tile will be removed from tiled data.} - -\item{win.size}{An \code{integer} indicating the desired window size in bps. Default is 200. Used only when \code{tiles = NULL}.} -} -\value{ -A \code{BSseq-class} object. -} -\description{ -This function summarizes methylation data within tiles or user-specified regions. 
For all CpGs within an intersecting genomic region, the coverage and methylation reads are summed. This is used prior to the \code{\link{methylSigCalc}} function when the user prefers to conduct a tiled analysis instead of a base specific analysis for differential methylation. Tiling may provide higher power to detect significant differences, especially for experiments with low coverage. -} -\examples{ -utils::data(sample_data, package = 'methylSig') -methTile = methylSigTile(meth, tiles = NULL, win.size = 200) - -} diff --git a/man/methylSig_weightFunc.Rd b/man/methylSig_weightFunc.Rd deleted file mode 100644 index bf07bf6..0000000 --- a/man/methylSig_weightFunc.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/test_methylSig.R -\name{methylSig_weightFunc} -\alias{methylSig_weightFunc} -\title{Default methylSig Weight Function} -\usage{ -methylSig_weightFunc(u) -} -\arguments{ -\item{u}{A numeric between 0 and 1} -} -\value{ -A \code{GRanges} object containing the following \code{mcols}: -} -\description{ -The default weight function used by methylSigCalc -} -\examples{ -methylSig_weightFunc(0.5) - -} diff --git a/man/promoters_gr.Rd b/man/promoters_gr.Rd new file mode 100644 index 0000000..97a0725 --- /dev/null +++ b/man/promoters_gr.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methylSig-data.R +\docType{data} +\name{promoters_gr} +\alias{promoters_gr} +\title{GRanges object with collapsed promoters on chr21 and chr22} +\format{A GRanges object} +\source{ +data-raw/02-create_bsseq_rda.R +} +\usage{ +promoters_gr +} +\description{ +Data contains 1466 promoters for use in the vignette +} +\examples{ +data(promoters_gr, package = 'methylSig') +} +\keyword{datasets} diff --git a/man/sample_data.Rd b/man/sample_data.Rd deleted file mode 100644 index f8819ac..0000000 --- a/man/sample_data.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do 
not edit by hand -% Please edit documentation in R/methylSig_data_doc.R -\docType{data} -\name{sample_data} -\alias{sample_data} -\title{Sample data} -\format{A mixture of BSseq and GRanges class objects. See documentation for test_data.} -\description{ -The \code{sample_data} object contains the following items: -\describe{ - \item{meth}{ A \code{BSseq-class} object containing 6 samples total, with three in each group. Genome is hg19. } - \item{tiled_meth}{ A tiled version of the \code{BSseq-class} object called \code{meth}. Tiles are 1000bp. Genome is hg19. } - \item{msig_cpgs}{ A \code{GRanges-class} object containing the results of \code{methylSigCalc} on \code{data}. } - \item{msig_tiles}{ A \code{GRanges-class} object containing the results of \code{methylSigCalc} on \code{tiled_meth}. } - \item{tfbs}{ A \code{GRanges-class} object representing transcription factor binding sites. For use in \code{methylSig.tfbsEnrichTest}. Genome is hg19. } -} -} -\keyword{datasets} diff --git a/man/tile_by_regions.Rd b/man/tile_by_regions.Rd new file mode 100644 index 0000000..9311596 --- /dev/null +++ b/man/tile_by_regions.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tile_by_regions.R +\name{tile_by_regions} +\alias{tile_by_regions} +\title{Group cytosine / CpG level data into regions based on genomic regions} +\usage{ +tile_by_regions(bs, gr) +} +\arguments{ +\item{bs}{a \code{BSseq} object.} + +\item{gr}{a \code{GRanges} object.} +} +\value{ +A \code{BSseq} object with loci of regions matching \code{gr}. Coverage and methylation read count matrices are aggregated by the sums of the cytosines / CpGs in the regions per sample. +} +\description{ +An optional function to aggregate cytosine / CpG level data into regions based on a \code{GRanges} set of genomic regions. 
+} +\examples{ +data(bsseq_stranded, package = 'methylSig') +regions = GenomicRanges::GRanges( + seqnames = c('chr1','chr1','chr1'), + ranges = IRanges::IRanges( + start = c(5,35,75), + end = c(30,70,80) + ) +) +tiled = tile_by_regions(bs = bsseq_stranded, gr = regions) + +} diff --git a/man/tile_by_windows.Rd b/man/tile_by_windows.Rd new file mode 100644 index 0000000..e8cf1af --- /dev/null +++ b/man/tile_by_windows.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tile_by_windows.R +\name{tile_by_windows} +\alias{tile_by_windows} +\title{Group cytosine / CpG level data into regions based on genomic windows} +\usage{ +tile_by_windows(bs, win_size = 200) +} +\arguments{ +\item{bs}{a \code{BSseq} object.} + +\item{win_size}{an \code{integer} indicating the size of the tiles. Default is 200bp.} +} +\value{ +A \code{BSseq} object with loci consisting of a tiling of the genome by \code{win_size} bp tiles. Coverage and methylation read count matrices are aggregated by the sums of the cytosines / CpGs in the regions per sample. +} +\description{ +An optional function to aggregate cytosine / CpG level data into regions based on a tiling of the genome by \code{win_size}. 
+} +\examples{ +data(bsseq_stranded, package = 'methylSig') + +tiled = tile_by_windows(bs = bsseq_stranded, win_size = 50) + +} diff --git a/tests/testthat/test-diff_binomial.R b/tests/testthat/test-diff_binomial.R new file mode 100644 index 0000000..0488880 --- /dev/null +++ b/tests/testthat/test-diff_binomial.R @@ -0,0 +1,128 @@ +data(BS.cancer.ex, package = 'bsseqData') + +bs = filter_loci_by_group_coverage( + bs = BS.cancer.ex, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) + +small_test = bs[1:50] + +small_test_tile = tile_by_windows(bs = small_test, win_size = 5000) + +##################################### + +test_that('bs missing check', { + expect_error( + diff_binomial(), + 'Must pass bs as a BSseq object', + fixed = TRUE + ) +}) + +test_that('group_column missing check', { + expect_error( + diff_binomial(bs = small_test), + 'Must pass group_column', + fixed = TRUE + ) +}) + +test_that('comparison_groups missing check', { + expect_error( + diff_binomial( + bs = small_test, + group_column = 'Type'), + 'Must pass comparison_groups', + fixed = TRUE + ) +}) + +##################################### + +test_that('bs type check', { + expect_error( + diff_binomial( + bs = 'blue', + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal')), + 'bs must be', + fixed = TRUE + ) +}) + +test_that('group_column type check', { + expect_error( + diff_binomial( + bs = small_test, + group_column = c(1, 3), + comparison_groups = c('case' = 'cancer', 'control' = 'normal')), + 'group_column must be', + fixed = TRUE + ) +}) + +test_that('comparison_groups type check', { + expect_error( + diff_binomial( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 1, 'control' = 2)), + 'comparison_groups must be', + fixed = TRUE + ) +}) + +##################################### + +test_that('Valid group_column name check', { + expect_error( + diff_binomial( + bs = small_test, + group_column = 'blue', + comparison_groups = 
c('case' = 'cancer', 'control' = 'normal')), + 'not in column names of pData(bs):', + fixed = TRUE + ) +}) + +test_that('Valid comparison_groups values check', { + expect_error( + diff_binomial( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'blue', 'control' = 'normal')), + 'Not all comparison_groups are in group_column', + fixed = TRUE + ) +}) + +test_that('Valid comparison_groups name check', { + expect_error( + diff_binomial( + bs = small_test, + group_column = 'Type', + comparison_groups = c('blue' = 'cancer', 'control' = 'normal')), + 'comparison_groups vector must be a named vector with', + fixed = TRUE + ) +}) + +##################################### + +test_that('Test 1', { + diff_gr = diff_binomial( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal')) + + expect_true(is(diff_gr, 'GRanges')) +}) + +test_that('Test 2', { + diff_gr = diff_binomial( + bs = small_test_tile, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal')) + + expect_true(is(diff_gr, 'GRanges')) +}) diff --git a/tests/testthat/test-diff_dss_fit.R b/tests/testthat/test-diff_dss_fit.R new file mode 100644 index 0000000..d66d20a --- /dev/null +++ b/tests/testthat/test-diff_dss_fit.R @@ -0,0 +1,105 @@ +data(BS.cancer.ex, package = 'bsseqData') + +bs = filter_loci_by_group_coverage( + bs = BS.cancer.ex, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) + +small_test = bs[1:50] + +##################################### + +test_that('bs missing check', { + expect_error( + diff_dss_fit(), + 'Must pass bs as a BSseq object', + fixed = TRUE + ) +}) + +test_that('formula missing check', { + expect_error( + diff_dss_fit(bs = small_test), + 'Must pass formula', + fixed = TRUE + ) +}) + +##################################### + +test_that('bs type check', { + expect_error( + diff_dss_fit( + bs = 'blue', + design = pData(small_test), + formula = '~ Type'), + 'bs must be', + 
fixed = TRUE + ) +}) + +test_that('design type check', { + expect_error( + diff_dss_fit( + bs = small_test, + design = 'hello', + formula = '~ Type'), + 'design must be', + fixed = TRUE + ) +}) + +test_that('formula type check', { + expect_error( + diff_dss_fit( + bs = small_test, + design = pData(small_test), + formula = 100), + 'formula must be', + fixed = TRUE + ) +}) + +##################################### + +test_that('design message check', { + expect_message( + diff_dss_fit( + bs = small_test, + formula = '~ Type'), + 'Missing design', + fixed = TRUE + ) +}) + +##################################### + +test_that('Valid return character formula check', { + diff_fit = diff_dss_fit( + bs = small_test, + design = pData(bs), + formula = '~ Type') + + expect_true(is(diff_fit, 'list')) + expect_true(all(c('gr', 'design', 'formula', 'X', 'fit') %in% names(diff_fit))) +}) + +test_that('Valid return formula formula check', { + diff_fit = diff_dss_fit( + bs = small_test, + design = pData(bs), + formula = as.formula('~ Type')) + + expect_true(is(diff_fit, 'list')) + expect_true(all(c('gr', 'design', 'formula', 'X', 'fit') %in% names(diff_fit))) +}) + +test_that('Valid return more complex model check', { + diff_fit = diff_dss_fit( + bs = small_test, + design = pData(bs), + formula = as.formula('~ Type + Pair')) + + expect_true(is(diff_fit, 'list')) + expect_true(all(c('gr', 'design', 'formula', 'X', 'fit') %in% names(diff_fit))) +}) diff --git a/tests/testthat/test-diff_dss_test.R b/tests/testthat/test-diff_dss_test.R new file mode 100644 index 0000000..6665160 --- /dev/null +++ b/tests/testthat/test-diff_dss_test.R @@ -0,0 +1,239 @@ +data(BS.cancer.ex, package = 'bsseqData') + +bs = filter_loci_by_group_coverage( + bs = BS.cancer.ex, + group_column = 'Type', + min_samples_per_group = c('cancer' = 2, 'normal' = 2)) + +pData(bs)$num_cov = c(9, 8, 10, 1, 3, 2) + +small_test = bs[1:50] + +bs_tile = tile_by_windows(bs, win_size = 5000) + +bs_tile = 
filter_loci_by_group_coverage( + bs = bs_tile, + group_column = 'Type', + min_samples_per_group = c('cancer' = 2, 'normal' = 2)) + +small_test_tile = bs_tile[1:50] + +diff_fit = diff_dss_fit( + bs = small_test, + design = pData(small_test), + formula = '~ Type') + +##################################### + +test_that('bs missing check', { + expect_error( + diff_dss_test(), + 'Must pass bs as a BSseq object', + fixed = TRUE + ) +}) + +test_that('diff_fit missing check', { + expect_error( + diff_dss_test(bs = small_test), + 'Must pass diff_fit', + fixed = TRUE + ) +}) + +test_that('contrast missing check', { + expect_error( + diff_dss_test(bs = small_test, diff_fit = diff_fit), + 'Must pass contrast', + fixed = TRUE + ) +}) + +##################################### + +test_that('bs type check', { + expect_error( + diff_dss_test( + bs = 'blue', + diff_fit = diff_fit, + contrast = matrix(c(0,1), ncol = 1)), + 'bs must be', + fixed = TRUE + ) +}) + +test_that('diff_fit type check', { + expect_error( + diff_dss_test( + bs = small_test, + diff_fit = 'blue', + contrast = matrix(c(0,1), ncol = 1)), + 'diff_fit must be a list.', + fixed = TRUE + ) +}) + +test_that('diff_fit a list with correct names check', { + expect_error( + diff_dss_test( + bs = small_test, + diff_fit = list('a' = 'hello', 'b' = 'goodbye'), + contrast = matrix(c(0,1), ncol = 1)), + 'diff_fit must be a list returned from diff_dss_fit', + fixed = TRUE + ) +}) + +##################################### + +test_that('Valid methylation_group_column name check', { + expect_error( + diff_dss_test( + bs = small_test, + diff_fit = diff_fit, + contrast = matrix(c(0,1), ncol = 1), + methylation_group_column = 'blue'), + 'not in column names of diff_fit$design', + fixed = TRUE + ) +}) + +test_that('methylation_groups and methylation_group_column check', { + expect_error( + diff_dss_test( + bs = small_test, + diff_fit = diff_fit, + contrast = matrix(c(0,1), ncol = 1), + methylation_groups = c('case' = 'blue', 'control' = 
'read')), + 'If methylation_groups is specified', + fixed = TRUE + ) +}) + +test_that('methylation_groups type check', { + expect_error( + diff_dss_test( + bs = small_test, + diff_fit = diff_fit, + contrast = matrix(c(0,1), ncol = 1), + methylation_group_column = 'Type', + methylation_groups = 2), + 'methylation_groups must be a named character vector', + fixed = TRUE + ) +}) + +test_that('Valid methylation_groups name check', { + expect_error( + diff_dss_test( + bs = small_test, + diff_fit = diff_fit, + contrast = matrix(c(0,1), ncol = 1), + methylation_group_column = 'Type', + methylation_groups = c('blue' = 'blue', 'red' = 'red')), + 'methylation_groups must be a named vector with names', + fixed = TRUE + ) +}) + +test_that('Valid methylation_groups values check', { + expect_error( + diff_dss_test( + bs = small_test, + diff_fit = diff_fit, + contrast = matrix(c(0,1), ncol = 1), + methylation_group_column = 'Type', + methylation_groups = c('case' = 'blue', 'control' = 'red')), + 'Not all methylation_groups are in methylation_group_column', + fixed = TRUE + ) +}) + +##################################### + +test_that('Valid return, simple model, group methylation check', { + diff_fit = diff_dss_fit( + bs = small_test, + design = pData(small_test), + formula = '~ Type') + + diff_gr = diff_dss_test( + bs = small_test, + diff_fit = diff_fit, + contrast = matrix(c(0,1), ncol = 1), + methylation_group_column = 'Type', + methylation_groups = c('case' = 'cancer', 'control' = 'normal') + ) + + expect_true(is(diff_gr, 'GRanges')) + +}) + +test_that('Valid return, more complex model, no methylation check', { + diff_fit = diff_dss_fit( + bs = small_test, + design = pData(small_test), + formula = '~ Type + Pair') + + diff_gr = diff_dss_test( + bs = small_test, + diff_fit = diff_fit, + contrast = matrix(c(0,1,0,0), ncol = 1) + ) + + expect_true(is(diff_gr, 'GRanges')) + +}) + +test_that('Valid return, more complex model, methylation check', { + diff_fit = 
diff_dss_fit( + bs = small_test, + design = pData(small_test), + formula = '~ Type + num_cov') + + diff_gr = diff_dss_test( + bs = small_test, + diff_fit = diff_fit, + contrast = matrix(c(0,1,0), ncol = 1), + methylation_group_column = 'Type', + methylation_groups = c('case' = 'cancer', 'control' = 'normal') + ) + + expect_true(is(diff_gr, 'GRanges')) + +}) + +test_that('Valid return, numerical covariate model, percentile methylation check', { + diff_fit = diff_dss_fit( + bs = small_test, + design = pData(small_test), + formula = '~ num_cov') + + diff_gr = diff_dss_test( + bs = small_test, + diff_fit = diff_fit, + contrast = matrix(c(0,1), ncol = 1), + methylation_group_column = 'num_cov' + ) + + expect_true(is(diff_gr, 'GRanges')) + +}) + +test_that('Valid return, simple model tiled, methylation check', { + diff_fit = diff_dss_fit( + bs = small_test_tile, + design = pData(small_test_tile), + formula = '~ Type') + + diff_gr = diff_dss_test( + bs = small_test_tile, + diff_fit = diff_fit, + contrast = matrix(c(0,1), ncol = 1), + methylation_group_column = 'Type', + methylation_groups = c('case' = 'cancer', 'control' = 'normal') + ) + + expect_true(is(diff_gr, 'GRanges')) + +}) diff --git a/tests/testthat/test-diff_methylsig.R b/tests/testthat/test-diff_methylsig.R new file mode 100644 index 0000000..0764e92 --- /dev/null +++ b/tests/testthat/test-diff_methylsig.R @@ -0,0 +1,359 @@ +data(BS.cancer.ex, package = 'bsseqData') + +bs = filter_loci_by_group_coverage( + bs = BS.cancer.ex, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) + +small_test = bs[1:50] + +small_test_tile = tile_by_windows(bs = small_test, win_size = 5000) + +##################################### + +test_that('bs missing check', { + expect_error( + diff_methylsig(), + 'Must pass bs as a BSseq object', + fixed = TRUE + ) +}) + +test_that('group_column missing check', { + expect_error( + diff_methylsig(bs = small_test), + 'Must pass group_column', + fixed = TRUE + ) +}) + 
+test_that('comparison_groups missing check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type'), + 'Must pass comparison_groups', + fixed = TRUE + ) +}) + +test_that('disp_groups missing check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal')), + 'Must pass disp_groups', + fixed = TRUE + ) +}) + +##################################### + +test_that('bs type check', { + expect_error( + diff_methylsig( + bs = 'blue', + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1), + 'bs must be', + fixed = TRUE + ) +}) + +test_that('group_column type check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = c(1, 3), + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1), + 'group_column must be', + fixed = TRUE + ) +}) + +test_that('comparison_groups type check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 1, 'control' = 2), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1), + 'comparison_groups must be', + fixed = TRUE + ) +}) + +test_that('disp_groups type check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = 1, 'control' = 2), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1), + 'disp_groups must be', + fixed = TRUE + ) +}) + +test_that('local_window_size type check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 
'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 'a', + t_approx = TRUE, + n_cores = 1), + 'local_window_size must be', + fixed = TRUE + ) +}) + +test_that('local_weight_function type check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + local_weight_function = 'b', + t_approx = TRUE, + n_cores = 1), + 'local_weight_function must be', + fixed = TRUE + ) +}) + +test_that('t_approx type check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = 'c', + n_cores = 1), + 't_approx must be', + fixed = TRUE + ) +}) + +test_that('n_cores type check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 'a'), + 'n_cores must be', + fixed = TRUE + ) +}) + +##################################### + +test_that('Valid group_column name check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'blue', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1), + 'not in column names of pData(bs):', + fixed = TRUE + ) +}) + +test_that('Valid comparison_groups values check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'blue', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1), + 'Not all comparison_groups 
are in group_column', + fixed = TRUE + ) +}) + +test_that('Valid comparison_groups name check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('blue' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1), + 'comparison_groups vector must be a named vector with', + fixed = TRUE + ) +}) + +test_that('Valid disp_groups values check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = FALSE, 'control' = FALSE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1), + 'disp_groups must be a named logical vector with at least one TRUE value corresponding', + fixed = TRUE + ) +}) + +test_that('Valid disp_groups name check', { + expect_error( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('blue' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1), + 'disp_groups vector must be a named vector with names', + fixed = TRUE + ) +}) + +test_that('Check for invalid local_window_size != 0 && regions state', { + expect_error( + diff_methylsig( + bs = small_test_tile, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 50, + t_approx = TRUE, + n_cores = 1), + 'Cannot use local information on region-resolution data. 
Detected local_window_size', + fixed = TRUE + ) +}) + +##################################### + +test_that('Test 1', { + diff_gr = diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1) + + expect_true(is(diff_gr, 'GRanges')) +}) + +test_that('Check dropped loci message', { + expect_message( + diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = FALSE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1), + 'loci were dropped due to insufficient degrees', + fixed = TRUE + ) +}) + +test_that('Test 2', { + diff_gr = diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = FALSE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1) + + expect_true(is(diff_gr, 'GRanges')) +}) + +test_that('Test 3', { + diff_gr = diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = FALSE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1) + + expect_true(is(diff_gr, 'GRanges')) +}) + +test_that('Test 4', { + diff_gr = diff_methylsig( + bs = small_test_tile, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = FALSE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1) + + expect_true(is(diff_gr, 'GRanges')) +}) + +test_that('Test 5', { + diff_gr = diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = FALSE, 'control' = TRUE), + local_window_size = 0, + 
t_approx = FALSE, + n_cores = 1) + + expect_true(is(diff_gr, 'GRanges')) +}) + +test_that('Test 6', { + diff_gr = diff_methylsig( + bs = small_test, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = FALSE, 'control' = TRUE), + local_window_size = 50, + t_approx = TRUE, + n_cores = 1) + + expect_true(is(diff_gr, 'GRanges')) +}) diff --git a/tests/testthat/test-filter_loci_by_coverage.R b/tests/testthat/test-filter_loci_by_coverage.R new file mode 100644 index 0000000..1730bf3 --- /dev/null +++ b/tests/testthat/test-filter_loci_by_coverage.R @@ -0,0 +1,51 @@ +test_that('BSseq class check', { + expect_error( + filter_loci_by_coverage(5), + 'bs must be class BSseq', + fixed = TRUE + ) +}) + +test_that('min_count numeric check', { + expect_error( + filter_loci_by_coverage(bsseq_stranded, min_count = 'a'), + 'min_count must be an integer', + fixed = TRUE + ) +}) + +test_that('max_count numeric check', { + expect_error( + filter_loci_by_coverage(bsseq_stranded, max_count = 'a'), + 'max_count must be an integer', + fixed = TRUE + ) +}) + +test_that('min_count less than max_count check', { + expect_error( + filter_loci_by_coverage(bsseq_stranded, min_count = 600 ), + 'min_count not less than max_count', + fixed = TRUE + ) +}) + +test_that('correct set to 0 check', { + bs = filter_loci_by_coverage(bsseq_stranded, min_count = 10, max_count = 500) + bs_cov = bsseq::getCoverage(bs, type = 'Cov') + bs_meth = bsseq::getCoverage(bs, type = 'M') + + expect_equivalent(bs_cov[1,'test1'], 0) + expect_equivalent(bs_cov[2,'test1'], 0) + expect_equivalent(bs_cov[7,'test2'], 0) + expect_equivalent(bs_cov[8,'test2'], 0) + expect_equivalent(bs_cov[10,'test1'], 0) + expect_equivalent(bs_cov[11,'test1'], 0) + + expect_equivalent(bs_meth[1,'test1'], 0) + expect_equivalent(bs_meth[2,'test1'], 0) + expect_equivalent(bs_meth[7,'test2'], 0) + expect_equivalent(bs_meth[8,'test2'], 0) + expect_equivalent(bs_meth[10,'test1'], 0) + 
expect_equivalent(bs_meth[11,'test1'], 0) +}) diff --git a/tests/testthat/test-filter_loci_by_group_coverage.R b/tests/testthat/test-filter_loci_by_group_coverage.R new file mode 100644 index 0000000..ba11bf6 --- /dev/null +++ b/tests/testthat/test-filter_loci_by_group_coverage.R @@ -0,0 +1,160 @@ +data(BS.cancer.ex, package = 'bsseqData') + +small_test = BS.cancer.ex[1:10] + +expected_cov_cancer2_normal2 = bsseq::getCoverage(small_test, type = 'Cov')[c(3,4,7,8,9,10), ] +expected_meth_cancer2_normal2 = bsseq::getCoverage(small_test, type = 'M')[c(3,4,7,8,9,10), ] + +expected_cov_cancer2_normal3 = bsseq::getCoverage(small_test, type = 'Cov')[c(3,8,9), ] +expected_meth_cancer2_normal3 = bsseq::getCoverage(small_test, type = 'M')[c(3,8,9), ] + +expected_cov_cancer3_normal3 = bsseq::getCoverage(small_test, type = 'Cov')[c(3,8), ] +expected_meth_cancer3_normal3 = bsseq::getCoverage(small_test, type = 'M')[c(3,8), ] + +##################################### + +test_that('bs missing check', { + expect_error( + filter_loci_by_group_coverage(), + 'Must pass bs as a BSseq object', + fixed = TRUE + ) +}) + +test_that('group_column missing check', { + expect_error( + filter_loci_by_group_coverage(bs = small_test), + 'Must pass group_column as a character string', + fixed = TRUE + ) +}) + +test_that('min_samples_per_group missing check', { + expect_error( + filter_loci_by_group_coverage(bs = small_test, group_column = 'Type'), + 'Must pass min_samples_per_group as a named integer vector', + fixed = TRUE + ) +}) + +##################################### + +test_that('bs type check', { + expect_error( + filter_loci_by_group_coverage( + bs = 'a', + group_column = 'Type', + c('cancer' = 3, 'normal' = 3)), + 'bs must be class BSseq', + fixed = TRUE + ) +}) + +test_that('group_column type check', { + expect_error( + filter_loci_by_group_coverage( + bs = small_test, + group_column = 6, + c('cancer' = 3, 'normal' = 3)), + 'group_column must be a character string', + fixed = TRUE + ) +}) 
+ +test_that('min_samples_per_group type check', { + expect_error( + filter_loci_by_group_coverage( + bs = small_test, + group_column = 'Type', + c('cancer' = 'a', 'normal' = 3)), + 'min_samples_per_group must be a named integer vector', + fixed = TRUE + ) +}) + +##################################### + +test_that('Valid group_column name check', { + expect_error( + filter_loci_by_group_coverage( + bs = small_test, + group_column = 'blue', + c('cancer' = 3, 'normal' = 3)), + 'group_column: blue not in column names of pData', # () seem to be a problem + fixed = TRUE + ) +}) + +test_that('Valid factor name check', { + expect_error( + filter_loci_by_group_coverage( + bs = small_test, + group_column = 'Type', + c('blue' = 3, 'normal' = 3)), + 'Not all names of min_samples_per_group are in group_column', + fixed = TRUE + ) +}) + +test_that('All loci removed check', { + expect_error( + filter_loci_by_group_coverage( + bs = small_test, + group_column = 'Type', + c('cancer' = 4, 'normal' = 4)), + 'Thresholds for the following groups were too strict' + ) +}) + +##################################### + +test_that('Test cancer 2 normal 2', { + test = filter_loci_by_group_coverage( + bs = small_test, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + expected_cov_cancer2_normal2 + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + expected_meth_cancer2_normal2 + ) +}) + +test_that('Test cancer 2 normal 3', { + test = filter_loci_by_group_coverage( + bs = small_test, + group_column = 'Type', + c('cancer' = 2, 'normal' = 3)) + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + expected_cov_cancer2_normal3 + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + expected_meth_cancer2_normal3 + ) +}) + +test_that('Test cancer 3 normal 3', { + test = filter_loci_by_group_coverage( + bs = small_test, + group_column = 'Type', + c('cancer' = 3, 'normal' = 3)) + + 
expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + expected_cov_cancer3_normal3 + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + expected_meth_cancer3_normal3 + ) +}) diff --git a/tests/testthat/test-filter_loci_by_location.R b/tests/testthat/test-filter_loci_by_location.R new file mode 100644 index 0000000..6d01573 --- /dev/null +++ b/tests/testthat/test-filter_loci_by_location.R @@ -0,0 +1,83 @@ +test_that('bs missing check', { + expect_error( + filter_loci_by_location(), + 'Must pass bs as a BSseq object', + fixed = TRUE + ) +}) + +test_that('gr missing check', { + expect_error( + filter_loci_by_location(bs = bsseq_stranded), + 'Must pass gr as a GRanges object', + fixed = TRUE + ) +}) + +test_that('bs class check', { + expect_error( + filter_loci_by_location(bs = '5', gr = gr_tiles1), + 'bs must be class BSseq', + fixed = TRUE + ) +}) + +test_that('gr class check', { + expect_error( + filter_loci_by_location(bs = bsseq_stranded, gr = '5'), + 'gr must be class GRanges', + fixed = TRUE + ) +}) + +##################################### + +test_that('correct filtering gr1', { + test = filter_loci_by_location(bs = bsseq_stranded, gr = gr_tiles1) + + expect_equivalent( + as.matrix(bsseq::getCoverage(test, type = 'Cov')), + as.matrix(bsseq::getCoverage(filter_loc_tiles1, type = 'Cov')) + ) + + expect_equivalent( + as.matrix(bsseq::getCoverage(test, type = 'M')), + as.matrix(bsseq::getCoverage(filter_loc_tiles1, type = 'M')) + ) +}) + +test_that('correct filtering gr3', { + expect_error( + filter_loci_by_location(bs = bsseq_stranded, gr = gr_tiles3), + 'All loci in bs were removed by gr', + fixed = TRUE + ) +}) + +test_that('correct filtering gr4', { + test = filter_loci_by_location(bs = bsseq_stranded, gr = gr_tiles4) + + expect_equivalent( + as.matrix(bsseq::getCoverage(test, type = 'Cov')), + as.matrix(bsseq::getCoverage(filter_loc_tiles4, type = 'Cov')) + ) + + expect_equivalent( + as.matrix(bsseq::getCoverage(test, type = 'M')), + 
as.matrix(bsseq::getCoverage(filter_loc_tiles4, type = 'M')) + ) +}) + +test_that('correct filtering gr5', { + test = filter_loci_by_location(bs = bsseq_stranded, gr = gr_tiles5) + + expect_equivalent( + as.matrix(bsseq::getCoverage(test, type = 'Cov')), + as.matrix(bsseq::getCoverage(filter_loc_tiles5, type = 'Cov')) + ) + + expect_equivalent( + as.matrix(bsseq::getCoverage(test, type = 'M')), + as.matrix(bsseq::getCoverage(filter_loc_tiles5, type = 'M')) + ) +}) diff --git a/tests/testthat/test-tile_by_regions.R b/tests/testthat/test-tile_by_regions.R new file mode 100644 index 0000000..9454707 --- /dev/null +++ b/tests/testthat/test-tile_by_regions.R @@ -0,0 +1,167 @@ +test_that('bs missing check', { + expect_error( + tile_by_regions(), + 'Must pass bs as a BSseq object', + fixed = TRUE + ) +}) + +test_that('gr missing check', { + expect_error( + tile_by_regions(bs = bsseq_stranded), + 'Must pass gr as a GRanges object', + fixed = TRUE + ) +}) + +test_that('bs class check', { + expect_error( + tile_by_regions(bs = '5', gr = gr_tiles1), + 'bs must be class BSseq', + fixed = TRUE + ) +}) + +test_that('gr class check', { + expect_error( + tile_by_regions(bs = bsseq_stranded, gr = '5'), + 'gr must be class GRanges', + fixed = TRUE + ) +}) + +##################################### + +test_that('correct tiling stranded gr1', { + test = tile_by_regions(bs = bsseq_stranded, gr = gr_tiles1) + + # NOTE, it is not sufficient to do + # expect_equivalent(test, bsseq_stranded_tiled1) because the test + # expect_equivalent(test, bsseq_destranded_tiled1) does not throw an + # error but testing equivalence of the Cov or M matrices of test and + # bsseq_destranded_tiled1 will.
Consequently, testing at this + # level is necessary + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + bsseq::getCoverage(bsseq_stranded_tiled1, type = 'Cov') + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + bsseq::getCoverage(bsseq_stranded_tiled1, type = 'M') + ) +}) + +test_that('correct tiling stranded gr2', { + test = tile_by_regions(bs = bsseq_stranded, gr = gr_tiles2) + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + bsseq::getCoverage(bsseq_stranded_tiled2, type = 'Cov') + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + bsseq::getCoverage(bsseq_stranded_tiled2, type = 'M') + ) +}) + +test_that('correct tiling stranded gr3', { + test = tile_by_regions(bs = bsseq_stranded, gr = gr_tiles3) + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + bsseq::getCoverage(bsseq_stranded_tiled3, type = 'Cov') + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + bsseq::getCoverage(bsseq_stranded_tiled3, type = 'M') + ) +}) + +test_that('correct tiling stranded gr4', { + test = tile_by_regions(bs = bsseq_stranded, gr = gr_tiles4) + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + bsseq::getCoverage(bsseq_stranded_tiled4, type = 'Cov') + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + bsseq::getCoverage(bsseq_stranded_tiled4, type = 'M') + ) +}) + +test_that('error tiling stranded gr5', { + expect_error( + tile_by_regions(bs = bsseq_stranded, gr = gr_tiles5), + 'No regions overlap between bs and gr' + ) +}) + +test_that('correct tiling destranded gr1', { + test = tile_by_regions(bs = bsseq_destranded, gr = gr_tiles1) + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + bsseq::getCoverage(bsseq_destranded_tiled1, type = 'Cov') + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + bsseq::getCoverage(bsseq_destranded_tiled1, type = 'M') + ) +}) + +test_that('correct tiling destranded gr2', { + test = 
tile_by_regions(bs = bsseq_destranded, gr = gr_tiles2) + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + bsseq::getCoverage(bsseq_destranded_tiled2, type = 'Cov') + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + bsseq::getCoverage(bsseq_destranded_tiled2, type = 'M') + ) +}) + +test_that('correct tiling destranded gr3', { + test = tile_by_regions(bs = bsseq_destranded, gr = gr_tiles3) + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + bsseq::getCoverage(bsseq_destranded_tiled3, type = 'Cov') + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + bsseq::getCoverage(bsseq_destranded_tiled3, type = 'M') + ) +}) + +test_that('correct tiling destranded gr4', { + test = tile_by_regions(bs = bsseq_destranded, gr = gr_tiles4) + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + bsseq::getCoverage(bsseq_destranded_tiled4, type = 'Cov') + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + bsseq::getCoverage(bsseq_destranded_tiled4, type = 'M') + ) +}) + +test_that('error tiling destranded gr5', { + expect_error( + tile_by_regions(bs = bsseq_destranded, gr = gr_tiles5), + 'No regions overlap between bs and gr', + fixed = TRUE + ) +}) diff --git a/tests/testthat/test-tile_by_windows.R b/tests/testthat/test-tile_by_windows.R new file mode 100644 index 0000000..e239bf8 --- /dev/null +++ b/tests/testthat/test-tile_by_windows.R @@ -0,0 +1,84 @@ +test_that('bs missing check', { + expect_error( + tile_by_windows(), + 'Must pass bs as a BSseq object', + fixed = TRUE + ) +}) + +test_that('bs class check', { + expect_error( + tile_by_windows(bs = '5', win_size = 200), + 'bs must be class BSseq', + fixed = TRUE + ) +}) + +test_that('win_size class check', { + expect_error( + tile_by_windows(bs = bsseq_stranded, win_size = TRUE), + 'win_size must be an integer', + fixed = TRUE + ) +}) + +test_that('correct tiling stranded win25', { + test = tile_by_windows(bs = bsseq_stranded, win_size = 25) 
+ + # NOTE, it is not sufficient to do + # expect_equivalent(test, bsseq_stranded_tiled1) because the test + # expect_equivalent(test, bsseq_destranded_tiled1) does not throw an + # error but testing equivalence of the Cov or M matrices of test and + # bsseq_destranded_tiled1 will. Consequently, testing at this + # level is necessary + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + bsseq::getCoverage(bsseq_stranded_win25, type = 'Cov') + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + bsseq::getCoverage(bsseq_stranded_win25, type = 'M') + ) + + expect_true( + all(is.na(seqlengths(test))) + ) +}) + +test_that('correct tiling destranded win25', { + test = tile_by_windows(bs = bsseq_destranded, win_size = 25) + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + bsseq::getCoverage(bsseq_destranded_win25, type = 'Cov') + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + bsseq::getCoverage(bsseq_destranded_win25, type = 'M') + ) + + expect_true( + all(is.na(seqlengths(test))) + ) +}) + +test_that('correct tiling multichrom multichrom25', { + test = tile_by_windows(bs = bsseq_multichrom, win_size = 25) + + expect_equivalent( + bsseq::getCoverage(test, type = 'Cov'), + bsseq::getCoverage(bsseq_multichrom_win25, type = 'Cov') + ) + + expect_equivalent( + bsseq::getCoverage(test, type = 'M'), + bsseq::getCoverage(bsseq_multichrom_win25, type = 'M') + ) + + expect_true( + all(is.na(seqlengths(test))) + ) +}) diff --git a/tests/testthat/test_1_read.R b/tests/testthat/test_1_read.R deleted file mode 100644 index 264b877..0000000 --- a/tests/testthat/test_1_read.R +++ /dev/null @@ -1,238 +0,0 @@ -context('Test methylSigReadData') - -# Lay out what you expect to happen in all cases - -######################## -### test_1.txt -# destrand = T, filterSNPs = T, min/max = 10/500: cov = -; M = - # at 9413839 -# destrand = T, filterSNPs = F, min/max = 10/500: cov = 45; M = 40 # at 9413839 -# destrand = F, filterSNPs =
T, min/max = 10/500: cov = -/25; M = -/20 -# destrand = F, filterSNPs = F, min/max = 10/500: cov = 20/25; M = 0/5 -# 9413839 is a CT SNP -# chr21 9413839 + 20 0 CG CGG -# chr21 9413840 - 20 5 CG CGG - -# destrand = T, filterSNPs = T, min/max = 10/500: cov = 34; M = 34 # at 9419909 -# destrand = T, filterSNPs = F, min/max = 10/500: cov = 34; M = 34 # at 9419909 -# destrand = F, filterSNPs = T, min/max = 10/500: cov = 14/-; M = 14/- -# destrand = F, filterSNPs = F, min/max = 10/500: cov = 14/20; M = 14/20 -# 9419909 is a CT SNP -# chr21 9419908 + 14 0 CG CGG -# chr21 9419909 - 20 0 CG CGG - -# destrand = T, filterSNPs = T, min/max = 10/500: cov = 0; M = 0 (kept because test_2.txt has identical location that is kept) -# destrand = T, filterSNPs = F, min/max = 10/500: cov = 0; M = 0 (kept because test_2.txt has identical location that is kept) -# destrand = F, filterSNPs = T, min/max = 10/500: cov = 0; M = 0 (kept because test_2.txt has identical location that is kept) -# destrand = F, filterSNPs = F, min/max = 10/500: cov = 0; M = 0 (kept because test_2.txt has identical location that is kept) -# No SNP -# chr21 43052356 - 550 0 CG CGG - -# destrand = T, filterSNPs = T, min/max = 10/500: cov = 96; M = 87 # at 43053297 -# destrand = T, filterSNPs = F, min/max = 10/500: cov = 96; M = 87 # at 43053297 -# destrand = F, filterSNPs = T, min/max = 10/500: cov = 24/72; M = 23/64 -# destrand = F, filterSNPs = F, min/max = 10/500: cov = 24/72; M = 23/64 -# No SNP -# chr21 43053297 + 23 1 CG CGG -# chr21 43053298 - 64 8 CG CGG - -# destrand = T, filterSNPs = T, min/max = 10/500: cov = -; M = - (removed because test_2.txt has identical and 0 in both) -# destrand = T, filterSNPs = F, min/max = 10/500: cov = -; M = - (removed because test_2.txt has identical and 0 in both) -# destrand = F, filterSNPs = T, min/max = 10/500: cov = -; M = - (removed because test_2.txt has identical and 0 in both) -# destrand = F, filterSNPs = F, min/max = 10/500: cov = -; M = - (removed because 
test_2.txt has identical and 0 in both) -# No SNP -# chr21 43053323 + 4 0 CG CGG - -######################## -### test_2.txt -# destrand = T, filterSNPs = T, min/max = 10/500: cov = -; M = - # at 9413839 -# destrand = T, filterSNPs = F, min/max = 10/500: cov = 0; M = 0 # at 9413839 -# destrand = F, filterSNPs = T, min/max = 10/500: cov = -/0; M = -/0 -# destrand = F, filterSNPs = F, min/max = 10/500: cov = 0/0; M = 0/0 -# EXCLUDE THESE FROM THIS FILE -# chr21 9413839 + 20 0 CG CGG -# chr21 9413840 - 20 5 CG CGG - -# destrand = T, filterSNPs = T, min/max = 10/500: cov = 10; M = 10 # at 9419909 -# destrand = T, filterSNPs = F, min/max = 10/500: cov = 10; M = 10 # at 9419909 -# destrand = F, filterSNPs = T, min/max = 10/500: cov = 0/-; M = 0/- -# destrand = F, filterSNPs = F, min/max = 10/500: cov = 0/0; M = 0/0 -# 9419910 is a CT SNP -# chr21 9419908 + 5 0 CG CGG -# chr21 9419909 - 5 0 CG CGG - -# destrand = T, filterSNPs = T, min/max = 10/500: cov = 300; M = 300 -# destrand = T, filterSNPs = F, min/max = 10/500: cov = 300; M = 300 -# destrand = F, filterSNPs = T, min/max = 10/500: cov = 300; M = 300 -# destrand = F, filterSNPs = F, min/max = 10/500: cov = 300; M = 300 -# No SNP -# chr21 43052356 - 300 0 CG CGG - -# destrand = T, filterSNPs = T, min/max = 10/500: cov = 458; M = 434 # at 43053297 (kept because test_1.txt has identical location that is kept) -# destrand = T, filterSNPs = F, min/max = 10/500: cov = 458; M = 434 # at 43053297 (kept because test_1.txt has identical location that is kept) -# destrand = F, filterSNPs = T, min/max = 10/500: cov = 318/140; M = 300/134 -# destrand = F, filterSNPs = F, min/max = 10/500: cov = 318/140; M = 300/134 -# No SNP -# chr21 43053297 + 300 18 CG CGG -# chr21 43053298 - 134 6 CG CGG - -# destrand = T, filterSNPs = T, min/max = 10/500: cov = -; M = - (removed because test_2.txt has identical and 0 in both) -# destrand = T, filterSNPs = F, min/max = 10/500: cov = -; M = - (removed because test_2.txt has identical and 0 in 
both) -# destrand = F, filterSNPs = T, min/max = 10/500: cov = -; M = - (removed because test_2.txt has identical and 0 in both) -# destrand = F, filterSNPs = F, min/max = 10/500: cov = -; M = - (removed because test_2.txt has identical and 0 in both) -# No SNP -# chr21 43053323 + 4 0 CG CGG - -result1_cov = matrix(c(34,0,96,10,300,458), nrow = 3, ncol = 2, byrow = FALSE) -result1_M = matrix(c(34,0,87,10,300,434), nrow = 3, ncol = 2, byrow = FALSE) - -result2_cov = matrix(c(45,34,0,96,0,10,300,458), nrow = 4, ncol = 2, byrow = FALSE) -result2_M = matrix(c(40,34,0,87,0,10,300,434), nrow = 4, ncol = 2, byrow = FALSE) - -result3_cov = matrix(c(25,14,0,24,72,0,0,300,318,140), nrow = 5, ncol = 2, byrow = FALSE) -result3_M = matrix(c(20,14,0,23,64,0,0,300,300,134), nrow = 5, ncol = 2, byrow = FALSE) - -result4_cov = matrix(c(20,25,14,20,0,24,72,0,0,0,0,300,318,140), nrow = 7, ncol = 2, byrow = FALSE) -result4_M = matrix(c(20,20,14,20,0,23,64,0,0,0,0,300,300,134), nrow = 7, ncol = 2, byrow = FALSE) - - -################################################################################ -# Test min/max filters - -files = c(system.file('extdata', 'test_1.txt', package='methylSig'), - system.file('extdata', 'test_2.txt', package='methylSig')) - -sample_names = gsub('.txt', '', basename(files)) - -pData = data.frame( - Sample_Names = sample_names, - Group = relevel(factor(c(1,0)), ref = '0'), - Note = c("Hello", "Goodbye"), - row.names = sample_names, - stringsAsFactors = FALSE) - -test_that('Test messages/warnings and trivial seqinfo', { - expect_message( - suppressWarnings(methylSigReadData( - fileList = files, - pData = pData, - assembly = NA, - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 1, - fileType = 'cytosineReport')), - 'Skipping SNP filtering' - ) - - expect_warning( - suppressMessages(methylSigReadData( - fileList = files, - pData = pData, - assembly = NA, - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs 
= TRUE, - num.cores = 1, - fileType = 'cytosineReport')), - 'Leaving assembly as NA will give the resulting' - ) - - expect_warning( - suppressMessages(methylSigReadData( - fileList = files, - pData = pData, - assembly = 'hg1', - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 1, - fileType = 'cytosineReport')), - 'is not supported by GenomeInfoDb::fetchExtendedChromInfoFromUCSC' - ) - - data = suppressWarnings(suppressMessages( - methylSigReadData( - fileList = files, - pData = pData, - assembly = NA, - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 1, - fileType = 'cytosineReport') - )) - expect_true(is.na(seqlengths(data))) - expect_true(is.na(genome(data))) -}) - -test_that('data coverage and M matrices are as expected', { - data = methylSigReadData( - fileList = files, - pData = pData, - assembly = 'hg19', - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 1, - fileType = 'cytosineReport') - - expect_true(all(as.matrix(bsseq::getCoverage(data, type = 'Cov')) == result1_cov)) - expect_true(all(as.matrix(bsseq::getCoverage(data, type = 'M')) == result1_M)) - expect_true(all(genome(data) == 'hg19')) -}) - -test_that('data coverage and M matrices are as expected', { - data = methylSigReadData( - fileList = files, - pData = pData, - assembly = 'hg19', - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = FALSE, - num.cores = 1, - fileType = 'cytosineReport') - - expect_true(all(as.matrix(bsseq::getCoverage(data, type = 'Cov')) == result2_cov)) - expect_true(all(as.matrix(bsseq::getCoverage(data, type = 'M')) == result2_M)) - expect_true(all(genome(data) == 'hg19')) -}) - -test_that('data coverage and M matrices are as expected', { - data = methylSigReadData( - fileList = files, - pData = pData, - assembly = 'hg19', - destranded = FALSE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 1, - fileType = 
'cytosineReport') - - expect_true(all(as.matrix(bsseq::getCoverage(data, type = 'Cov')) == result3_cov)) - expect_true(all(as.matrix(bsseq::getCoverage(data, type = 'M')) == result3_M)) - expect_true(all(genome(data) == 'hg19')) -}) - -test_that('data coverage and M matrices are as expected', { - data = methylSigReadData( - fileList = files, - pData = pData, - assembly = 'hg19', - destranded = FALSE, - maxCount = 500, - minCount = 10, - filterSNPs = FALSE, - num.cores = 1, - fileType = 'cytosineReport') - - expect_true(all(as.matrix(bsseq::getCoverage(data, type = 'Cov')) == result4_cov)) - expect_true(all(as.matrix(bsseq::getCoverage(data, type = 'M')) == result4_M)) - expect_true(all(genome(data) == 'hg19')) -}) diff --git a/tests/testthat/test_2_tile.R b/tests/testthat/test_2_tile.R deleted file mode 100644 index 99fa924..0000000 --- a/tests/testthat/test_2_tile.R +++ /dev/null @@ -1,103 +0,0 @@ -context('Test methylSigTile') - -################################################################################ - -# Tiles on the same chromosomes as data -tiles_df_samechr = read.table(system.file('extdata','test_tiles.txt', package='methylSig'), header = T, sep = '\t', as.is = T) -tiles_gr_samechr_noseqinfo = GenomicRanges::makeGRangesFromDataFrame(tiles_df_samechr) - -# Tiles with some tiles on different chromosomes as data -tiles_df_extrachr = read.table(system.file('extdata','test_tiles2.txt', package='methylSig'), header = T, sep = '\t', as.is = T) -tiles_gr_extrachr_noseqinfo = GenomicRanges::makeGRangesFromDataFrame(tiles_df_extrachr) - -files = c(system.file('extdata', 'test_1.txt', package='methylSig'), - system.file('extdata', 'test_2.txt', package='methylSig')) - -sample_names = gsub('.txt', '', basename(files)) - -pData = data.frame( - Sample_Names = sample_names, - Group = relevel(factor(c(1,0)), ref = '0'), - Note = c("Hello", "Goodbye"), - row.names = sample_names, - stringsAsFactors = FALSE) - -meth = methylSigReadData( - fileList = files, - pData = 
pData, - assembly = 'hg19', - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 1, - fileType = 'cytosineReport') - -################################################################################ - -truth1_meth = matrix(c(34,10,0,300,87,434), nrow = 3, ncol = 2, byrow = TRUE) -truth1_cov = matrix(c(34,10,0,300,96,458), nrow = 3, ncol = 2, byrow = TRUE) - -test_that('Test tileGenome tiling', { - tiled_data = methylSigTile( - meth = meth, - tiles = NULL, - win.size = 200) - - expect_true(all(truth1_meth == as.matrix(bsseq::getCoverage(tiled_data, type='M')))) - expect_true(all(truth1_cov == as.matrix(bsseq::getCoverage(tiled_data, type='Cov')))) - expect_true(all(dim(bsseq::pData(tiled_data)) == c(2,3))) - expect_true(all(S4Vectors::metadata(tiled_data)$cpgs.per.tile == c(1,1,1))) - expect_true(all(genome(tiled_data) == 'hg19')) -}) - -truth2_meth = matrix(c(34,10,87,734), nrow = 2, ncol = 2, byrow = TRUE) -truth2_cov = matrix(c(34,10,96,758), nrow = 2, ncol = 2, byrow = TRUE) - -test_that('Test data.frame tiling with matching chromosomes', { - tiled_data = methylSigTile( - meth = meth, - tiles = tiles_df_samechr, - win.size = 200) - - expect_true(all(truth2_meth == as.matrix(bsseq::getCoverage(tiled_data, type='M')))) - expect_true(all(truth2_cov == as.matrix(bsseq::getCoverage(tiled_data, type='Cov')))) - expect_true(all(dim(bsseq::pData(tiled_data)) == c(2,3))) - expect_true(all(genome(tiled_data) == 'hg19')) -}) - -test_that('Test GRanges tiling with matching chromosomes and no seqinfo', { - tiled_data = methylSigTile( - meth = meth, - tiles = tiles_gr_samechr_noseqinfo, - win.size = 200) - - expect_true(all(truth2_meth == as.matrix(bsseq::getCoverage(tiled_data, type='M')))) - expect_true(all(truth2_cov == as.matrix(bsseq::getCoverage(tiled_data, type='Cov')))) - expect_true(all(dim(bsseq::pData(tiled_data)) == c(2,3))) - expect_true(all(genome(tiled_data) == 'hg19')) -}) - -test_that('Test data.frame tiling with extra 
chromosomes', { - tiled_data = methylSigTile( - meth = meth, - tiles = tiles_df_extrachr, - win.size = 200) - - expect_true(all(truth2_meth == as.matrix(bsseq::getCoverage(tiled_data, type='M')))) - expect_true(all(truth2_cov == as.matrix(bsseq::getCoverage(tiled_data, type='Cov')))) - expect_true(all(dim(bsseq::pData(tiled_data)) == c(2,3))) - expect_true(all(genome(tiled_data) == 'hg19')) -}) - -test_that('Test GRanges tiling with extra chromosomes and no seqinfo', { - tiled_data = methylSigTile( - meth = meth, - tiles = tiles_gr_extrachr_noseqinfo, - win.size = 200) - - expect_true(all(truth2_meth == as.matrix(bsseq::getCoverage(tiled_data, type='M')))) - expect_true(all(truth2_cov == as.matrix(bsseq::getCoverage(tiled_data, type='Cov')))) - expect_true(all(dim(bsseq::pData(tiled_data)) == c(2,3))) - expect_true(all(genome(tiled_data) == 'hg19')) -}) diff --git a/tests/testthat/test_3_binomialDiffCalc.R b/tests/testthat/test_3_binomialDiffCalc.R deleted file mode 100644 index 478b208..0000000 --- a/tests/testthat/test_3_binomialDiffCalc.R +++ /dev/null @@ -1,14 +0,0 @@ -context('Test binomialDiffCalc') - -utils::data(sample_data, package = 'methylSig') - -test_that('Test CpG no local both dispersion', { - result_binomial = binomialDiffCalc( - meth = meth, - comparison = 'DR_vs_DS', - min.per.group = c(3,3)) - - expect_true(is(result_binomial, 'GRanges')) - expect_match(S4Vectors::metadata(result_binomial)$method, 'binomialDiffCalc') - expect_true(all(genome(result_binomial) == 'hg19')) -}) diff --git a/tests/testthat/test_4_methylSigCalc.R b/tests/testthat/test_4_methylSigCalc.R deleted file mode 100644 index f5a7d00..0000000 --- a/tests/testthat/test_4_methylSigCalc.R +++ /dev/null @@ -1,216 +0,0 @@ -context('Test methylSigCalc') - -utils::data(sample_data, package = 'methylSig') - -test_that('Test CpG no local both dispersion', { - result_calc = methylSigCalc( - meth = meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - 
local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - - expect_true(is(result_calc, 'GRanges')) - expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') - expect_true(all(genome(result_calc) == 'hg19')) -}) - -# test_that('Test CpG no local DR dispersion', { -# result_calc = methylSigCalc( -# meth = meth, -# comparison = 'DR_vs_DS', -# dispersion = 'DR', -# local.info = FALSE, -# local.winsize = 200, -# min.per.group = c(3,3), -# weightFunc = methylSig_weightFunc, -# T.approx = TRUE, -# num.cores = 1) -# -# expect_true(is(result_calc, 'GRanges')) -# expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -# }) - -# test_that('Test CpG no local DS dispersion', { -# result_calc = methylSigCalc( -# meth = meth, -# comparison = 'DR_vs_DS', -# dispersion = 'DS', -# local.info = FALSE, -# local.winsize = 200, -# min.per.group = c(3,3), -# weightFunc = methylSig_weightFunc, -# T.approx = TRUE, -# num.cores = 1) -# -# expect_true(is(result_calc, 'GRanges')) -# expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -# }) - -# test_that('Test CpG local both dispersion', { -# result_calc = methylSigCalc( -# meth = meth, -# comparison = 'DR_vs_DS', -# dispersion = 'both', -# local.info = TRUE, -# local.winsize = 200, -# min.per.group = c(3,3), -# weightFunc = methylSig_weightFunc, -# T.approx = TRUE, -# num.cores = 1) -# -# expect_true(is(result_calc, 'GRanges')) -# expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -# }) - -# test_that('Test CpG local DR dispersion', { -# result_calc = methylSigCalc( -# meth = meth, -# comparison = 'DR_vs_DS', -# dispersion = 'DR', -# local.info = TRUE, -# local.winsize = 200, -# min.per.group = c(3,3), -# weightFunc = methylSig_weightFunc, -# T.approx = TRUE, -# num.cores = 1) -# -# expect_true(is(result_calc, 'GRanges')) -# expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -# }) - -# 
test_that('Test CpG local DS dispersion', { -# result_calc = methylSigCalc( -# meth = meth, -# comparison = 'DR_vs_DS', -# dispersion = 'DS', -# local.info = TRUE, -# local.winsize = 200, -# min.per.group = c(3,3), -# weightFunc = methylSig_weightFunc, -# T.approx = TRUE, -# num.cores = 1) -# -# expect_true(is(result_calc, 'GRanges')) -# expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -# }) - -test_that('Test tiled no local both dispersion', { - result_calc = methylSigCalc( - meth = tiled_meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - - expect_true(is(result_calc, 'GRanges')) - expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -}) - -# test_that('Test tiled no local DR dispersion', { -# result_calc = methylSigCalc( -# meth = tiled_meth, -# comparison = 'DR_vs_DS', -# dispersion = 'DR', -# local.info = FALSE, -# local.winsize = 200, -# min.per.group = c(3,3), -# weightFunc = methylSig_weightFunc, -# T.approx = TRUE, -# num.cores = 1) -# -# expect_true(is(result_calc, 'GRanges')) -# expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -# }) - -# test_that('Test tiled no local DS dispersion', { -# result_calc = methylSigCalc( -# meth = tiled_meth, -# comparison = 'DR_vs_DS', -# dispersion = 'DS', -# local.info = FALSE, -# local.winsize = 200, -# min.per.group = c(3,3), -# weightFunc = methylSig_weightFunc, -# T.approx = TRUE, -# num.cores = 1) -# -# expect_true(is(result_calc, 'GRanges')) -# expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -# }) - -# test_that('Test tiled local both dispersion', { -# result_calc = methylSigCalc( -# meth = tiled_meth, -# comparison = 'DR_vs_DS', -# dispersion = 'both', -# local.info = TRUE, -# local.winsize = 200, -# min.per.group = c(3,3), -# weightFunc = methylSig_weightFunc, -# T.approx = TRUE, -# 
num.cores = 1) -# -# expect_true(is(result_calc, 'GRanges')) -# expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -# }) - -# test_that('Test tiled local DR dispersion', { -# result_calc = methylSigCalc( -# meth = tiled_meth, -# comparison = 'DR_vs_DS', -# dispersion = 'DR', -# local.info = TRUE, -# local.winsize = 200, -# min.per.group = c(3,3), -# weightFunc = methylSig_weightFunc, -# T.approx = TRUE, -# num.cores = 1) -# -# expect_true(is(result_calc, 'GRanges')) -# expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -# }) - -# test_that('Test tiled local DS dispersion', { -# result_calc = methylSigCalc( -# meth = tiled_meth, -# comparison = 'DR_vs_DS', -# dispersion = 'DS', -# local.info = TRUE, -# local.winsize = 200, -# min.per.group = c(3,3), -# weightFunc = methylSig_weightFunc, -# T.approx = TRUE, -# num.cores = 1) -# -# expect_true(is(result_calc, 'GRanges')) -# expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -# }) - - -# test_that('Test specified tiles', { -# cpg_islands = annotatr::build_annotations(genome = 'hg19', annotations = 'hg19_cpg_islands') -# cpg_island_meth = methylSigTile(meth, tiles = cpg_islands) -# -# result_calc = methylSigCalc( -# meth = cpg_island_meth, -# comparison = 'DR_vs_DS', -# dispersion = 'both', -# local.info = FALSE, -# local.winsize = 200, -# min.per.group = c(3,3), -# weightFunc = methylSig_weightFunc, -# T.approx = TRUE, -# num.cores = 1) -# -# expect_true(is(result_calc, 'GRanges')) -# expect_match(S4Vectors::metadata(result_calc)$method, 'methylSigCalc') -# }) diff --git a/tests/testthat/test_5_methylSigDSS.R b/tests/testthat/test_5_methylSigDSS.R deleted file mode 100644 index cd3d2a9..0000000 --- a/tests/testthat/test_5_methylSigDSS.R +++ /dev/null @@ -1,83 +0,0 @@ -context('Test methylSigDSS') - -utils::data(sample_data, package = 'methylSig') - -design1 = data.frame( - group = bsseq::pData(meth)$DR_vs_DS) - -design2 = data.frame( - group = 
bsseq::pData(meth)$DR_vs_DS, - subject = factor(c(1,1,2,2,3,3))) - -test_that('Test with intercept', { - contrast = matrix(c(0,1), ncol = 1) - result_dss = methylSigDSS( - meth = meth, - design = design1, - formula = '~ group', - contrast = contrast, - group.term = 'group', - min.per.group=c(3,3)) - - expect_match(class(result_dss), 'GRanges') - expect_match(S4Vectors::metadata(result_dss)$method, 'methylSigDSS') - expect_true(all(genome(result_dss) == 'hg19')) -}) - -# test_that('Test without intercept', { -# contrast = matrix(c(-1,1), ncol = 1) -# result_dss = methylSigDSS( -# meth = meth, -# design = design1, -# formula = '~ 0 + group', -# contrast = contrast, -# group.term = 'group', -# min.per.group=c(3,3)) -# -# expect_match(class(result_dss), 'GRanges') -# expect_match(S4Vectors::metadata(result_dss)$method, 'methylSigDSS') -# }) - -# test_that('Test similar to first but with extra design columns', { -# contrast = matrix(c(0,1), ncol = 1) -# result_dss = methylSigDSS( -# meth = meth, -# design = design2, -# formula = '~ group', -# contrast = contrast, -# group.term = 'group', -# min.per.group=c(3,3)) -# -# expect_match(class(result_dss), 'GRanges') -# expect_match(S4Vectors::metadata(result_dss)$method, 'methylSigDSS') -# }) - -test_that('Test multiple formula terms', { - contrast = matrix(c(0,1,0,0), ncol = 1) - result_dss = methylSigDSS( - meth = meth, - design = design2, - formula = '~ group + subject', - contrast = contrast, - group.term = 'group', - min.per.group=c(3,3)) - - expect_match(class(result_dss), 'GRanges') - expect_true('meth.2' %in% colnames(GenomicRanges::mcols(result_dss))) - expect_match(S4Vectors::metadata(result_dss)$method, 'methylSigDSS') - expect_true(all(genome(result_dss) == 'hg19')) -}) - -# test_that('Test alternate formula', { -# contrast = matrix(c(0,0,0,1), ncol = 1) -# result_dss = methylSigDSS( -# meth = meth, -# design = design2, -# formula = '~ subject + group', -# contrast = contrast, -# group.term = 'group', -# 
min.per.group=c(3,3)) -# -# expect_match(class(result_dss), 'GRanges') -# expect_match(S4Vectors::metadata(result_dss)$method, 'methylSigDSS') -# }) diff --git a/tests/testthat/test_6_annotations.R b/tests/testthat/test_6_annotations.R deleted file mode 100644 index ab01d14..0000000 --- a/tests/testthat/test_6_annotations.R +++ /dev/null @@ -1,9 +0,0 @@ -utils::data(sample_data, package = 'methylSig') - -test_that('Test annotations', { - dmcList = msig_cpgs$fdr < 0.05 & abs(msig_cpgs$meth.diff) > 25 - myDiff_annotated = methylSigAnnotation(myDiff = msig_cpgs, dmcList = dmcList, annotations = cpg_annots) - - expect_match(class(myDiff_annotated), 'GRanges') - expect_true(all(genome(myDiff_annotated) == 'hg19')) -}) diff --git a/tests/testthat/test_7_tfbs.R b/tests/testthat/test_7_tfbs.R deleted file mode 100644 index 963f8a5..0000000 --- a/tests/testthat/test_7_tfbs.R +++ /dev/null @@ -1,8 +0,0 @@ -utils::data(sample_data, package = 'methylSig') - -test_that('Test annotations', { - dmcList = msig_cpgs$fdr < 0.05 & abs(msig_cpgs$meth.diff) > 25 - tfbs_results = methylSig.tfbsEnrichTest(myDiff = msig_cpgs, dmcList = dmcList, tfbsInfo = tfbs) - - expect_equal(sum(tfbs_results$pvalue < 0.05), 4) -}) diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/methylSig.Rmd b/vignettes/methylSig.Rmd deleted file mode 100644 index 6db1de5..0000000 --- a/vignettes/methylSig.Rmd +++ /dev/null @@ -1,269 +0,0 @@ ---- -title: "methylSig: A package for whole genome DNA methylation analysis" -author: "Yongseok Park, Raymond G. Cavalcante, Maria E. Figueroa, Laura S. Rozek, and Maureen A. 
Sartor" -date: "`r Sys.Date()`" -output: - BiocStyle::html_document -vignette: > - %\VignetteIndexEntry{Introduction to methylSig} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, echo=FALSE} -library(methylSig) -library(rtracklayer) -``` - -# Introduction - -DNA methylation plays critical roles in gene regulation and cellular specification without altering DNA sequences. It is one of the best understood and most intensively studied epigenetic marks in mammalian cells. Treatment of DNA with sodium bisulfite deaminates unmethylated cytosines to uracil while methylated cytosines are resistant to this conversion thus allowing for the discrimination between methylated and unmethylated CpG sites. Sodium bisulfite pre-treatment of DNA coupled with next-generation sequencing has allowed DNA methylation to be studied quantitatively and genome-wide at single cytosine site resolution. - -`methylSig` is a method for testing for differential methylated cytosines (DMCs) or regions (DMRs) in whole-genome bisulfite sequencing (bis-seq) or reduced representation bisulfite sequencing (RRBS) experiments. `methylSig` uses a beta binomial model to test for significant differences between groups of samples. Several options exist for either site-specific or sliding window tests, combining strands, and for variance estimation. It allows annotating the resulting regions to multiple genome features, and visualizing the results for chosen genomic regions. - -# Installation - -`methylSig` is available on GitHub at , and the easiest way to install it is as follows: - -```{r, eval=FALSE} -devtools::install_github('sartorlab/methylSig') -``` - -# Basic usage - -## Reading data - -As of version 0.5.0, `methylSig` is able to read `bismark_methylation_extractor` outputs directly using the `bsseq` Bioconductor package. The `methylSigReadData()` function is a wrapper for `bsseq::read_bismark()` that adds some userful features: - -1. 
Users can set a `minCount` and `maxCount` for the coverage of sites. -2. Users analyzing data aligned to `hg19` can filter out C > T or G > A SNPs. - -The following code uses data contained in the package to demonstrate how to read methylation data: - -```{r} -# The following bismark cytosine reports are included in inst/extdata -files = c( - system.file('extdata', 'MDAMB_231_1DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_1DS.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_2DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_2DS.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_3DR.txt.gz', package='methylSig'), - system.file('extdata', 'MDAMB_231_3DS.txt.gz', package='methylSig')) - -sample.ids = basename(files) -sample.ids = gsub('.txt.gz', '', sample.ids) - -# Build a pData matrix with columns for the samples, group memberships, and phenotype data -pData = data.frame( - Sample_Names = sample.ids, - DR_vs_DS = relevel(factor(c('DR','DS','DR','DS','DR','DS')), ref = 'DS'), - row.names = sample.ids, - stringsAsFactors = FALSE) - -meth = methylSigReadData( - fileList = files, - pData = pData, - assembly = 'hg19', - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 1, - fileType = 'cytosineReport') - -print(meth) -``` - -## Differential methylation analysis - -The package consists of two methods to test for differential methlyation: `methylSigCalc()` and `methylSigDSS()`. - -The first, `methylSigCalc()`, calculates differential methylation statistics between two groups of samples. It uses a beta-binomial approach to calculate differential methylation statistics, accounting for coverage and variation among samples within each group. - -The second, `methylSigDSS()`, is a wrapper for the `DSS::DMLfit.multiFactor()` and `DSS::DMLtest.multiFactor()` functions in the [DSS Bioconductor package](https://bioconductor.org/packages/release/bioc/html/DSS.html). 
Essentially the test in DSS uses a linear model over an arbitrary design matrix, thus allowing for correction by covariates. The wrapper function provided here enables enforcement of the a minimum number of data points per group to test a site/region, as well as reporting of averaged methlyation levels over the groups. - -### Site specific analysis with `methylSigCalc()` - -The default is to do site specific analysis and to use both groups to estimate variances. - -```{r} -### Test on CpGs -result = methylSigCalc( - meth = meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -print(result) -``` - -The output includes the estimated dispersion (`phiCommonEst`), the log-likelihood ratio (`logLikRatio`), the degrees of freedom (`df`), the group methylation estimates (`muEstC_group1` and `muEstC_group2` where group 1 is the reference factor in the comparison column of the `pData` matrix), the methylation difference (`meth.diff = muEstC_group2 - muEstC_group1`), the group for which the site is hyper-methylated (`hyper.direction`, note, this is regardless of significance), the `pvalue`, and `fdr`. - -#### Variance from one group - -Using the `dispersion` argument, it is possible to estimate variances from one group rather than from both groups. This can be accomplished by changing the `dispersion` parameter in the previous example from `'both'` to `'DS'` or `'DR'` - -#### Using local information - -It is also possible to use information from nearby CpG sites to improve the variance and methylation level estimates. The default `local.winsize` is 200 bps. The `local.winsize` is only used when `local.info = TRUE`. - -### Site specific analysis with `methylSigDSS()` - -The following example illustrates a case with no covariates and where the model has an intercept. 
- -```{r} -# Must create a design matrix -design1 = data.frame(group = bsseq::pData(meth)$DR_vs_DS) - -print(design1) - -# NOTE this model has an intercept -contrast_intercept = matrix(c(0,1), ncol = 1) -result_dss_intercept = methylSigDSS( - meth = meth, - design = design1, - formula = '~ group', - contrast = contrast_intercept, - group.term = 'group', - min.per.group=c(3,3)) - -print(result_dss_intercept) -``` - -The following illustrates an paired-type test. - -```{r} -# Add a covariate column, note specification as a factor, but can -# also use a numeric covariate -design2 = data.frame( - group = bsseq::pData(meth)$DR_vs_DS, - subject = factor(c(1,1,2,2,3,3))) - -print(design2) - -# NOTE the contrast vector has as many entries as the sum of the -# levels in group and subject, in the formula. -contrast_covariates = matrix(c(0,1,0,0), ncol = 1) -result_dss_covariates = methylSigDSS( - meth = meth, - design = design2, - formula = '~ group + subject', - contrast = contrast_covariates, - group.term = 'group', - min.per.group=c(3,3)) - -print(result_dss_covariates) -``` - -### Tiled analysis - -`methylSig` also provides `methylSigTile()` to tile data within continuous non-overlapping windows. Users can tile the genome according to a window size, give a `data.frame` with genomic regions, or give a `GRanges` object. Examples are below. Note that tiling analysis is also possible with `methylSigDSS()`. 
- -#### Windowed analysis - -```{r} -### Test on 10000bp windows -windowed_meth = methylSigTile(meth, tiles = NULL, win.size = 10000) - -tiled_result = methylSigCalc( - meth = windowed_meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -print(tiled_result) -``` - -#### Region analysis - -As mentioned, users can provide a `data.frame` in the `tiles` parameter, so long as it has column names acceptable to `makeGRangesFromDataFrame`, i.e. `chr`, `start`, and `end`. - -Finally, users can provide tiling regions as a `GRanges` object. If we wanted to test for differential methylation in CpG islands, we could use the `annotatr` package to create the CpG island regions. - -```{r} -### Test on CpG islands -library(annotatr) - -cpg_islands = annotatr::build_annotations(genome = 'hg19', annotations = 'hg19_cpg_islands') - -cpg_island_meth = methylSigTile(meth, tiles = cpg_islands) - -cpg_island_result = methylSigCalc( - meth = cpg_island_meth, - comparison = 'DR_vs_DS', - dispersion = 'both', - local.info = FALSE, - local.winsize = 200, - min.per.group = c(3,3), - weightFunc = methylSig_weightFunc, - T.approx = TRUE, - num.cores = 1) - -print(cpg_island_result) -``` - -# Annotation - -## Annotating differentially methylated CpGs - -Once differential methylation has been determined with `methylSigCalc()`, it may be of interest understand where differential methylation occurs in terms of genes and CpG features (islands, shores, shelves). `methylSig` uses the `annotatr` package to accomplish this. 
- -```{r} -# Get CpG island annotations from built-in data they could be built with the following: -# cpg_annots = annotatr::build_annotations(genome = 'hg19', annotations = c('hg19_cpg_islands')) -utils::data(sample_data, package = 'methylSig') - -# Determine what CpGs should be considered significant -dmcList = result$fdr < 0.05 & abs(result$meth.diff) > 25 - -annotated_result = methylSigAnnotation(myDiff = result, dmcList = dmcList, annotations = cpg_annots) -``` - -The result is a `GRanges` object with the same columns as the `result` with the addition of columns giving the differential methylation status (`dm_status`), a unique locus id (`locus_id`), and information about the annotation which is itself a `GRanges` object (`annot`). It is important to note that regions tested for differential methylation may occur on multiple rows, depending on the number of features it is annotated to. The `locus_id` column helps to quickly see when this is the case. - -```{r} -print(annotated_result) -``` - -It is more illuminating to view this object as a coerced `data.frame`, wherein the information about the annotations are displayed. - -```{r} -print(head(as.data.frame(annotated_result))) -``` - -## Transcription factor (TF) enrichment test - -Changes in DNA methylation have been shown to alter transcription factor binding. The `methylSig` package has implemented `methylSig.tfbsEnrichTest()` to test a set of transcription factor binding sites (TFBSs) are enriched for differentially methylated CpGs. - -We demonstrate this funciton on a set of TFBSs from ENCODE. The `rtracklayer::import()` function makes reading in `BED` files simple. - -```{r} -# Use preloaded tfbs from package sample_data. 
Could be manually loaded as with: -# tfbs_file = system.file('extdata','tfbs.bed.gz', package = 'methylSig') -# tfbs = rtracklayer::import(tfbs_file, genome = 'hg19') - -print(tfbs) -``` - -This file mixes TFBSs from a number of TFs and keeps track of them in the name column (4th) of the `BED`. Next, we indicate what is considered a differentially methylated CpG and perform the test. - -```{r} -# Significance threshold -dmcList = result$fdr < 0.05 & abs(result$meth.diff) > 25 - -# Perform the test -tfbs_enrichment = methylSig.tfbsEnrichTest(myDiff = result, dmcList = dmcList, tfbsInfo = tfbs) - -# Take a look at the first few rows -print(head(tfbs_enrichment)) -``` diff --git a/vignettes/updating-methylSig-code.Rmd b/vignettes/updating-methylSig-code.Rmd new file mode 100644 index 0000000..b1a2f4a --- /dev/null +++ b/vignettes/updating-methylSig-code.Rmd @@ -0,0 +1,226 @@ +--- +title: "Updating methylSig code" +author: "Raymond G. Cavalcante" +date: "`r Sys.Date()`" +output: BiocStyle::html_document +vignette: > + %\VignetteIndexEntry{Updating methylSig code} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(methylSig) +``` + +# Introduction + +The purpose of this vignette is to show users how to retrofit their `methylSig` < 0.99.0 code to work with the refactor in version 0.99.0 and later. + +# Reading Data + +## Old methylSig + +In versions < 0.99.0 of `methylSig`, the `methylSigReadData()` function read Bismark coverage files, Bismark genome-wide CpG reports, or MethylDackel bedGraphs. Additionally, users could destrand the data, filter by coverage, and filter SNPs. 
+ +```{r eval = FALSE} +meth = methylSigReadData( + fileList = files, + pData = pData, + assembly = 'hg19', + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = TRUE, + num.cores = 1, + fileType = 'cytosineReport') +``` + +## New methylSig + +In versions >= 0.99.0 of `methylSig`, the user should read data with `bsseq::read.bismark()` and then apply functions that were once bundled within `methylSigReadData()`. + +```{r read} +files = c( + system.file('extdata', 'bis_cov1.cov', package='methylSig'), + system.file('extdata', 'bis_cov2.cov', package='methylSig') +) + +bsseq_stranded = bsseq::read.bismark( + files = files, + colData = data.frame(row.names = c('test1','test2')), + rmZeroCov = FALSE, + strandCollapse = FALSE +) +``` + +After reading data, filter by coverage. Note, we are changing our dataset to something we can use with the downstream functions. + +```{r filter_by_coverage} +# Load data for use in the rest of the vignette +data(BS.cancer.ex, package = 'bsseqData') +bs = BS.cancer.ex[1:10000] + +bs = filter_loci_by_coverage(bs, min_count = 5, max_count = 500) +``` + +If the locations of C-to-T and G-to-A SNPs are known, or some other set of locations should be removed: + +```{r filter_by_location} +# Construct GRanges object +remove_gr = GenomicRanges::GRanges( + seqnames = c('chr21', 'chr21', 'chr21'), + ranges = IRanges::IRanges( + start = c(9411552, 9411784, 9412099), + end = c(9411552, 9411784, 9412099) + ) +) + +bs = filter_loci_by_location(bs = bs, gr = remove_gr) +``` + +# Tiling Data + +## Old methylSig + +In versions < 0.99.0 of `methylSig`, the `methylSigTile()` function combined aggregating CpG data over pre-defined tiles and genomic windows. + +```{r eval = FALSE} +# For genomic windows, tiles = NULL +windowed_meth = methylSigTile(meth, tiles = NULL, win.size = 10000) + +# For pre-defined tiles, tiles should be a GRanges object.
+``` + +## New methylSig + +In versions >= 0.99.0 of `methylSig`, tiling is separated into two functions, `tile_by_regions()` and `tile_by_windows()`. Users should choose one or the other. + +```{r tile_by_windows} +windowed_bs = tile_by_windows(bs = bs, win_size = 10000) +``` + +```{r tile_by_regions} +# Collapsed promoters on chr21 and chr22 +data(promoters_gr, package = 'methylSig') + +promoters_bs = tile_by_regions(bs = bs, gr = promoters_gr) +``` + +# Testing + +## MethylSig Test + +### Old methylSig + +In versions < 0.99.0 of `methylSig`, the `methylSigCalc` function had a `min.per.group` parameter to determine how many samples per group had to have coverage in order to be tested. + +```{r eval = FALSE} +result = methylSigCalc( + meth = meth, + comparison = 'DR_vs_DS', + dispersion = 'both', + local.info = FALSE, + local.winsize = 200, + min.per.group = c(3,3), + weightFunc = methylSig_weightFunc, + T.approx = TRUE, + num.cores = 1) +``` + +### New methylSig + +In versions >= 0.99.0 of `methylSig`, the `min.per.group` functionality is performed by a separate function `filter_loci_by_group_coverage()`. Also note the change in form to define dispersion calculations, and the use of local information. + +```{r filter_by_group_coverage} +# Look at the phenotype data for bs +bsseq::pData(bs) + +# Require at least two samples from cancer and two samples from normal +bs = filter_loci_by_group_coverage( + bs = bs, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) +``` + +After removing loci with insufficient information, we can now use the `diff_methylsig()` test.
+ +```{r diff_methylsig} +# Test cancer versus normal with dispersion from both groups +diff_gr = diff_methylsig( + bs = bs, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1) +``` + +## DSS Test + +### Old methylSig + +In versions < 0.99.0 of `methylSig`, the `methylSigDSS` function also had a `min.per.group` parameter to determine how many samples per group had to have coverage. Users also couldn't specify which methylation groups to recover. The form of `design`, `formula`, and `contrast`, remain the same in versions >= 0.99.0. + +```{r eval = FALSE} +contrast = matrix(c(0,1), ncol = 1) +result_dss = methylSigDSS( + meth = meth, + design = design1, + formula = '~ group', + contrast = contrast, + group.term = 'group', + min.per.group=c(3,3)) +``` + +### New methylSig + +In versions >= 0.99.0 of `methylSig`, the single `methylSigDSS()` function is replaced by a fit function `diff_dss_fit()` and a test function `diff_dss_test()`. As with `diff_methylsig()`, users should ensure enough samples have sufficient coverage with the `filter_loci_by_group_coverage()` function. The `design` and `formula` are unchanged in their forms. + +If a continuous covariate is to be tested, `filter_loci_by_group_coverage()` should be skipped, as there are no groups. In prior versions of `methylSigDSS()`, this was not possible, and the group constraints were incorrectly applied prior to testing on a continuous covariate.
+ +```{r filter_by_group_coverage2, eval = FALSE} +# IF NOT DONE PREVIOUSLY +# Require at least two samples from cancer and two samples from normal +bs = filter_loci_by_group_coverage( + bs = bs, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) +``` + +```{r diff_dss_fit_simple} +# Test the simplest model with an intercept and Type +diff_fit_simple = diff_dss_fit( + bs = bs, + design = bsseq::pData(bs), + formula = as.formula('~ Type')) +``` + +The `contrast` parameter is also changed in its form. Note the additional parameters to specify how to recover group methylation. `methylation_group_column` and `methylation_groups` should be specified for group versus group comparisons. For continuous covariates, `methylation_group_column` is sufficient, and the samples will be grouped into top/bottom 25 percentile based on the continuous covariate column name given in `methylation_group_column`. + +```{r diff_dss_test_simple} +# Test the simplest model for cancer vs normal +# Note, 2 rows corresponds to 2 columns in diff_fit_simple$X +simple_contrast = matrix(c(0,1), ncol = 1) + +diff_simple_gr = diff_dss_test( + bs = bs, + diff_fit = diff_fit_simple, + contrast = simple_contrast, + methylation_group_column = 'Type', + methylation_groups = c('case' = 'cancer', 'control' = 'normal')) +``` + +# Session Info + +```{r sessionInfo} +sessionInfo() +``` diff --git a/vignettes/using-methylSig.Rmd b/vignettes/using-methylSig.Rmd new file mode 100644 index 0000000..e3575ac --- /dev/null +++ b/vignettes/using-methylSig.Rmd @@ -0,0 +1,338 @@ +--- +title: "Using methylSig" +author: "Yongseok Park, Raymond G. Cavalcante, Maria E. Figueroa, Laura S. Rozek, and Maureen A.
Sartor" +date: "`r Sys.Date()`" +output: BiocStyle::html_document +vignette: > + %\VignetteIndexEntry{Using methylSig} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(methylSig) +``` + +# Introduction + +DNA methylation plays critical roles in gene regulation and cellular specification without altering DNA sequences. It is one of the best understood and most intensively studied epigenetic marks in mammalian cells. Treatment of DNA with sodium bisulfite deaminates unmethylated cytosines to uracil while methylated cytosines are resistant to this conversion thus allowing for the discrimination between methylated and unmethylated CpG sites. Sodium bisulfite pre-treatment of DNA coupled with next-generation sequencing has allowed DNA methylation to be studied quantitatively and genome-wide at single cytosine site resolution. + +`methylSig` is a method for testing for differentially methylated cytosines (DMCs) or regions (DMRs) in whole-genome bisulfite sequencing (WGBS) or reduced representation bisulfite sequencing (RRBS) experiments. `methylSig` uses a beta-binomial model to test for significant differences between groups of samples. Several options exist for either site-specific or sliding window tests, combining strands, and for variance estimation. + +# Installation + +`methylSig` is available on GitHub at <https://github.com/sartorlab/methylSig>, and the easiest way to install it is as follows: + +```{r install, eval=FALSE} +devtools::install_github('sartorlab/methylSig') +``` + +# Usage + +The basic flow of analysis with `methylSig` is to: + +* Read data +* Optionally filter data by coverage and/or location +* Optionally aggregate data into regions +* Optionally filter data by coverage in a minimum number of samples per group +* Test for differential methylation + +The sections below walk through each step with small test data.
+ +## Reading Data + +Methylation calls output by either [MethylDackel](https://github.com/dpryan79/MethylDackel#single-cytosine-methylation-metrics-extraction) or [Bismark](https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-coverage-output-looks-like-this-tab-delimited-1-based-genomic-coords) can be read by the `bsseq::read.bismark()` function from the [`bsseq`](https://www.bioconductor.org/packages/release/bioc/html/bsseq.html) R/Bioconductor package. + +This function accepts `bedGraph`s from [MethylDackel](https://github.com/dpryan79/MethylDackel#single-cytosine-methylation-metrics-extraction) and either the coverage or genome-wide cytosine reports from [Bismark](https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-coverage-output-looks-like-this-tab-delimited-1-based-genomic-coords). Options to consider when reading data are: + +* `colData`, a `data.frame` or `DataFrame` whose rows are samples and columns are phenotype data. The row ordering should match the ordering of files in `files`. This matrix will be needed for downstream differential methylation testing. +* `strandCollapse`, a `logical` (`TRUE`/`FALSE`) indicating whether or not to collapse +/- CpG data onto the + strand. Note, this can only be `TRUE` when the input type is the genome-wide cytosine report from Bismark. MethylDackel has an option to destrand data when methylation calls are made so that the output is already destranded. In this case, `strandCollapse` should be `FALSE`. + +For all options, see the `bsseq` [reference manual](https://www.bioconductor.org/packages/release/bioc/manuals/bsseq/man/bsseq.pdf), and the [section on reading data](https://www.bioconductor.org/packages/release/bioc/vignettes/bsseq/inst/doc/bsseq.html#4_reading_data) in the package vignette. 
+ +```{r read} +files = c( + system.file('extdata', 'bis_cov1.cov', package='methylSig'), + system.file('extdata', 'bis_cov2.cov', package='methylSig') +) + +bsseq_stranded = bsseq::read.bismark( + files = files, + colData = data.frame(row.names = c('test1','test2')), + rmZeroCov = FALSE, + strandCollapse = FALSE +) +``` + +The result is a `BSseq` object. Aspects of the object can be accessed via: + +```{r bsseq_access} +# pData +bsseq::pData(bsseq_stranded) + +# GRanges +GenomicRanges::granges(bsseq_stranded) + +# Coverage matrix +bsseq::getCoverage(bsseq_stranded, type = 'Cov') + +# Methylation matrix +bsseq::getCoverage(bsseq_stranded, type = 'M') +``` + +## Filtering Data + +After data is loaded, it is good practice to filter loci that have too few or too many reads, and C-to-T and G-to-A SNPs which confound bisulfite conversion. + +### By Coverage + +Low coverage loci (typically those with fewer than 5 reads) should be marked because they adversely affect the variance calculation in downstream differential methylation tests. Very high coverage loci (typically those with more than 500 reads) are likely the result of PCR duplication, and should also be marked. + +`MethylSig` marks such sites by setting their coverage and methylation matrix entries to 0 for each sample in which this happens. Prior to testing, these sites can be removed, see below. + +```{r filter_by_coverage} +# Load data for use in the rest of the vignette +data(BS.cancer.ex, package = 'bsseqData') +bs = BS.cancer.ex[1:10000] + +bs = filter_loci_by_coverage(bs, min_count = 5, max_count = 500) +``` + +### By Location + +As noted above, locations with C-to-T and G-to-A SNPs confound bisulfite conversion in WGBS and ERRBS. Filtering them out can be accomplished by constructing a `GRanges` object with their location. For now, we leave locating such SNPs to the user. 
+ +```{r filter_by_location} +# Show locations of bs +GenomicRanges::granges(bs) + +# Construct GRanges object +remove_gr = GenomicRanges::GRanges( + seqnames = c('chr21', 'chr21', 'chr21'), + ranges = IRanges::IRanges( + start = c(9411552, 9411784, 9412099), + end = c(9411552, 9411784, 9412099) + ) +) + +bs = filter_loci_by_location(bs = bs, gr = remove_gr) + +# Show removal +GenomicRanges::granges(bs) +``` + +## Aggregating Data + +One way to increase the power of differential methylation testing is to aggregate the CpG-level data into regions. Regions can take two forms: tiling the entire genome by windows of a certain width or defining a set of regions such as CpG islands or gene promoters. + +### By Tiling the Genome + +Given that CpG methylation is strongly correlated over short genomic distances, a reasonable upper threshold might be 500bp. For the example below, in the interest of speed, we tile by larger windows. + +```{r tile_by_windows} +windowed_bs = tile_by_windows(bs = bs, win_size = 10000) + +# Show tiling +GenomicRanges::granges(windowed_bs) +``` + +### By Pre-defined Regions + +It may be the case that differential methylation is only relevant at promoter regions of genes for a particular project. In this case, aggregation of methylation calls over these regions may increase power, and decrease computation time. + +```{r tile_by_regions} +# Collapsed promoters on chr21 and chr22 +data(promoters_gr, package = 'methylSig') + +promoters_bs = tile_by_regions(bs = bs, gr = promoters_gr) +``` + +## Testing for Differential Methylation + +`MethylSig` offers three tests for differential methylation: + +1. `diff_binomial()` +2. `diff_methylsig()` +3. `diff_dss_fit()` and `diff_dss_test()` + +Each returns a `GRanges` object with tested loci and the corresponding statistics and methylation levels (if applicable). See the documentation for each function for more information (`?diff_binomial`, `?diff_methylsig`, `?diff_dss_fit`, and `?diff_dss_test`). 
+ +### Filtering by Coverage in a Minimum Number of Samples + +Prior to applying any test function, loci without a minimum number of samples having appropriate coverage should be removed to avoid testing loci where one sample dominates the test. + +```{r filter_by_group_coverage} +# Look at the phenotype data for bs +bsseq::pData(bs) + +# Require at least two samples from cancer and two samples from normal +bs = filter_loci_by_group_coverage( + bs = bs, + group_column = 'Type', + c('cancer' = 2, 'normal' = 2)) +``` + +### Binomial Test + +`diff_binomial()` is a binomial test based on that in the [`methylKit`](https://bioconductor.org/packages/release/bioc/html/methylKit.html) R/Bioconductor package. This was included for benchmarking purposes in the publication. It does not take into account the variability among samples being compared. + +```{r diff_binomial} +# Test cancer versus normal +diff_gr = diff_binomial( + bs = bs, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal')) + +diff_gr +``` + +### MethylSig Test + +The `diff_methylsig()` is a beta-binomial test which takes into account the variability among samples being compared. It can perform group versus group comparisons with no covariates. + +```{r diff_methylsig} +# Test cancer versus normal with dispersion from both groups +diff_gr = diff_methylsig( + bs = bs, + group_column = 'Type', + comparison_groups = c('case' = 'cancer', 'control' = 'normal'), + disp_groups = c('case' = TRUE, 'control' = TRUE), + local_window_size = 0, + t_approx = TRUE, + n_cores = 1) + +diff_gr +``` + +### General Models with DSS + +`diff_dss_fit()` and `diff_dss_test()` are tests supporting general models, and are wrappers for functions in the [`DSS`](https://bioconductor.org/packages/release/bioc/html/DSS.html) R/Bioconductor package. We have added the ability to recover group methylation for group comparisons, or top/bottom 25 percentile methylation rates based on a continuous covariate.
+ +The `DSS` style test is in two stages similar to tests in the `edgeR` or `limma` R/Bioconductor packages. The first stage is a fit, and the second stage is a test on a contrast. + +First we add a numerical covariate to the `pData(bs)` so that we can give an example of such a test. + +```{r add_numerical_covariate} +bsseq::pData(bs)$num_covariate = c(84, 96, 93, 10, 18, 9) +``` + +#### Model Fitting + +Fit the simplest group versus group model on just the type. + +```{r diff_dss_fit_simple} +diff_fit_simple = diff_dss_fit( + bs = bs, + design = bsseq::pData(bs), + formula = as.formula('~ Type')) +``` + +Fit a paired model where cancer and normal samples are paired by patient. + +```{r diff_dss_fit_paired} +# Paired-test +diff_fit_paired = diff_dss_fit( + bs = bs, + design = bsseq::pData(bs), + formula = '~ Type + Pair') +``` + +Fit a model on the numerical covariate. + +```{r diff_dss_fit_num} +# Numerical covariate test +diff_fit_num = diff_dss_fit( + bs = bs, + design = bsseq::pData(bs), + formula = '~ num_covariate') +``` + +The result of `diff_dss_fit()` is a `list` with the following elements: + +* `gr`, the `GRanges` of the fit loci. +* `design`, the phenotype matrix passed via the `design` parameter. +* `formula`, the formula used in conjunction with `design` to create the model matrix. +* `X`, the result of `model.matrix` with `design` and `formula`. +* `fit`, the `beta` and `var.beta` matrices. + +#### Building Contrasts + +Prior to calling `diff_dss_test()`, it may help to look at the model matrix used for fitting in order to build the contrast. + +```{r diff_dss_fit_model} +diff_fit_simple$X + +diff_fit_paired$X + +diff_fit_num$X +``` + +The contrast passed to `diff_dss_test()` should be a column vector or a matrix whose rows correspond to the columns of the model matrix above. 
See the [DSS user guide](http://bioconductor.org/packages/release/bioc/vignettes/DSS/inst/doc/DSS.html#34_dmldmr_detection_from_general_experimental_design) for more information. + +```{r contrast} +# Test the simplest model for cancer vs normal +# Note, 2 rows correspond to 2 columns in diff_fit_simple$X +simple_contrast = matrix(c(0,1), ncol = 1) + +# Test the paired model for cancer vs normal +# Note, 4 rows correspond to 4 columns in diff_fit_paired$X +paired_contrast = matrix(c(0,1,0,0), ncol = 1) + +# Test the numerical covariate +num_contrast = matrix(c(0,1), ncol = 1) +``` + +#### Testing + +The `diff_dss_test()` function enables the recovery of group methylation rates via the optional `methylation_group_column` and `methylation_groups` parameters. + +The simple, group versus group, test. + +```{r diff_dss_test_simple} +diff_simple_gr = diff_dss_test( + bs = bs, + diff_fit = diff_fit_simple, + contrast = simple_contrast, + methylation_group_column = 'Type', + methylation_groups = c('case' = 'cancer', 'control' = 'normal')) + +diff_simple_gr +``` + +The paired test. + +```{r diff_dss_test_paired} +diff_paired_gr = diff_dss_test( + bs = bs, + diff_fit = diff_fit_paired, + contrast = paired_contrast, + methylation_group_column = 'Type', + methylation_groups = c('case' = 'cancer', 'control' = 'normal')) + +diff_paired_gr +``` + +The numerical covariate test. Note, here the `methylation_groups` parameter is omitted because there are no groups. By giving the numerical covariate column, we will group samples by the top/bottom 25 percentile over the covariate, and compute mean methylation within those groups of samples. + +```{r diff_dss_test_num} +diff_num_gr = diff_dss_test( + bs = bs, + diff_fit = diff_fit_num, + contrast = num_contrast, + methylation_group_column = 'num_covariate') + +diff_num_gr +``` + +# Session Info + +```{r sessionInfo} +sessionInfo() +```