diff --git a/DESCRIPTION b/DESCRIPTION index 6427432..901f4d9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: methylSig Type: Package Title: a whole genome DNA methylation analysis pipeline -Version: 0.4.2 -Date: 2016-04-20 +Version: 0.4.3 +Date: 2016-05-11 Author: Yongseok Park Maintainer: Who to complain to Description: MethylSig is a method for testing for differentially methylated diff --git a/R/read.R b/R/read.R index f6f11f4..cb4c95a 100644 --- a/R/read.R +++ b/R/read.R @@ -1,55 +1,158 @@ # Called by methylSigReadData methylSigReadDataSingleFile <- function(fileIndex, fileList, header, minCount, maxCount, destranded, filterSNPs, quiet=FALSE) { - if(quiet==FALSE) { - message(sprintf('Reading file (%s / %s) -- %s', fileIndex, nrow(fileList), fileList[[fileIndex]])) - } - chr = read.table(fileList[[fileIndex]], header=header, stringsAsFactors=FALSE) - #### order for base #### - ##chr = chr[order(chr$base),] - #### - names(chr) <- c("id", "chr", "start", "strand", "coverage", "numCs", "numTs") + if(!quiet) { + message(sprintf('Reading file (%s/%s) -- %s', fileIndex, length(fileList), fileList[[fileIndex]])) + } + + df = read.table(fileList[[fileIndex]], header=header, stringsAsFactors=FALSE) + names(df) <- c("id", "chr", "start", "strand", "coverage", "numCs", "numTs") + + # Convert all strand information into +/-/* from {+,-,*,F,R,.} + df$strand[which(df$strand == 'F')] = '+' + df$strand[which(df$strand == 'R')] = '-' + df$strand[which(df$strand == '.')] = '*' + df$strand = factor(df$strand, levels=c('+','-','*')) # At this point, numCs and numTs are actually freqC and freqT from methylKit. - freqInvalidList = which(chr$numCs + chr$numTs < 95) + # Filter sites with C + T < 95% + freqInvalidList = which(df$numCs + df$numTs < 95) if(length(freqInvalidList) > 0) { - message(sprintf('File (%s / %s) Sites with numCs + numTs < 95: %s / %s = %s', - fileIndex, nrow(fileList), nrow(freqInvalidList), nrow(chr), signif(nrow(freqInvalidList) / nrow(chr), 3))) - chr$coverage[freqInvalidList] = 0 - } + if(!quiet) { + message(sprintf('File (%s/%s) Sites with numCs + numTs < 95: %s/%s = %s', + fileIndex, length(fileList), length(freqInvalidList), nrow(df), signif(length(freqInvalidList) / nrow(df), 3))) + } + df$coverage[freqInvalidList] = 0 + } else { + if(!quiet) { + message(sprintf('File (%s/%s) Sites with numCs + numTs < 95: 0 / %s = 0', + fileIndex, length(fileList), nrow(df))) + } + } - # Now numCs and numTs have frequencies replaced by counts - chr$numCs<- round(chr$numCs * chr$coverage / 100) - chr$numTs<- round(chr$numTs * chr$coverage / 100) + # Now numCs and numTs have frequencies replaced by counts + df$numCs = round(df$numCs * df$coverage / 100) + df$numTs = round(df$numTs * df$coverage / 100) - # countInvalidList = which(chr$coverage > maxCount | chr$coverage < minCount) - # message(sprintf('File (%s / %s) Sites > maxCount or < minCount: %s / %s = %s', - # fileIndex, nrow(fileList), nrow(countInvalidList), nrow(chr), signif(nrow(countInvalidList) / nrow(chr), 3))) - # chr$coverage[countInvalidList] = 0 + # Destrand (or not) + if(destranded == TRUE) { + if(!quiet) { + message(sprintf('File (%s/%s) Destranding', + fileIndex, length(fileList))) + } - if(destranded == TRUE) { - destrandList = which(chr$strand == "-" | chr$strand == "R") - chr$start[destrandList] = chr$start[destrandList] - 1 - } + # Shift positions on - strand back by 1bp to combine with + strand + destrandList = which(df$strand == "-" | df$strand == "R") + df$start[destrandList] = df$start[destrandList] - 1 - if(filterSNPs) { - data('CT_SNPs_hg19', envir=environment()) - chr_gr = GRanges(seqnames=chr$chr, ranges=IRanges(start=chr$start, end=chr$start)) + # Need to re-sort the df in case destranding puts things out of order + # For example, + # chr1 6 + + # chr1 6 - + # Would be destranded to + # chr1 6 + + # chr1 5 + + df = df[order(df$chr, df$start),] - overlaps = findOverlaps(chr_gr, CT_SNPs_hg19) - snpInvalidList = overlaps@queryHits + # Some ingredients for the hash / destranding + MAXBASE = 10^{ceiling(log10(max(df$start) + 1))} + uniqueChr = sort(unique(df$chr)) - message(sprintf('File (%s / %s) Sites overlapping C > T SNP: %s / %s = %s', - fileIndex, nrow(fileList), nrow(snpInvalidList), nrow(chr), signif(nrow(snpInvalidList) / nrow(chr), 3))) + # Hash the locations + uniqueLoc = unique(as.numeric(as.factor(df$chr)) * MAXBASE + df$start) + numSites = length(uniqueLoc) - chr$coverage[snpInvalidList] = 0 - } + # Create empty matrices where file data will be columns + chrom = pos = coverage = numCs = numTs = rep.int(0, numSites) + strand = factor(rep(NA, numSites), levels=c('+','-','*')) - #chr$chr = as.factor(chr$chr) - chr$strand = as.factor(chr$strand) - ## When only F observed in strand column, as.factor convert F to FALSE - levels(chr$strand) = list("+"="F","-"="R","*"="*", "+"="FALSE") + # Determine locations of data in uniqueLoc. This should behave so that + # base shifted - strand sites have the same "location". + location = findInterval( (as.numeric(as.factor(df$chr)) * MAXBASE) + df$start, uniqueLoc) - chr[chr$coverage>0,2:7] + # Deal with locations having strand NA and set to * + # settingList = is.na(strand[location]) + # strand[location][settingList] = df$strand[settingList] + # settingList = !settingList & (strand[location] != df$strand) + # strand[location][settingList] = "*" + strand = factor(rep.int('*', numSites), levels=c('+','-','*')) + + # Deal with positive strand + forward = (df$strand == "+") | (df$strand == "F") + chrom[location[forward]] = df$chr[forward] + pos[location[forward]] = df$start[forward] + coverage[location[forward]] = df$coverage[forward] + numCs[location[forward]] = df$numCs[forward] + numTs[location[forward]] = df$numTs[forward] + + # Deal with reverse strand + reverse = (df$strand == "-") | (df$strand == "R") + chrom[location[reverse]] = df$chr[reverse] + pos[location[reverse]] = df$start[reverse] + coverage[location[reverse]] = coverage[location[reverse]] + df$coverage[reverse] + numCs[location[reverse]] = numCs[location[reverse]] + df$numCs[reverse] + numTs[location[reverse]] = numTs[location[reverse]] + df$numTs[reverse] + + # Rebuild the data.frame + df = data.frame( + chr = chrom, + start = pos, + strand = strand, + coverage = coverage, + numCs = numCs, + numTs = numTs, + stringsAsFactors=F) + } else { + if(!quiet) { + message(sprintf('File (%s/%s) Not destranding', + fileIndex, length(fileList))) + } + # Remove the first column of the data.frame + df = df[,2:7] + } + + # Filter for minCount or maxCount + countInvalidList = which(df$coverage > maxCount | df$coverage < minCount) + if(length(countInvalidList) > 0) { + if(!quiet) { + message(sprintf('File (%s/%s) Sites > maxCount or < minCount: %s/%s = %s', + fileIndex, length(fileList), length(countInvalidList), nrow(df), signif(length(countInvalidList) / nrow(df), 3))) + } + df$coverage[countInvalidList] = 0 + } else { + if(!quiet) { + message(sprintf('File (%s/%s) Sites > maxCount or < minCount: 0 / %s = 0', + fileIndex, length(fileList), nrow(df))) + } + } + + # Filter C > T SNPs + if(filterSNPs) { + data('CT_SNPs_hg19', envir=environment()) + + df_gr = GRanges(seqnames=df$chr, ranges=IRanges(start=df$start, end=df$start)) + overlaps = findOverlaps(df_gr, CT_SNPs_hg19) + snpInvalidList = queryHits(overlaps) + + if(length(snpInvalidList) > 0) { + if(!quiet) { + message(sprintf('File (%s/%s) Sites overlapping C > T SNP: %s/%s = %s', + fileIndex, length(fileList), length(snpInvalidList), nrow(df), signif(length(snpInvalidList) / nrow(df), 3))) + } + df$coverage[snpInvalidList] = 0 + } else { + if(!quiet) { + message(sprintf('File (%s/%s) Sites overlapping C > T SNP: 0 / %s = 0', + fileIndex, length(fileList), nrow(df))) + } + } + } else { + if(!quiet) { + message(sprintf('File (%s/%s) filterSNPs == FALSE', + fileIndex, length(fileList))) + } + } + + return(df[df$coverage>0,]) } #' Read methylation score files to make a 'methylSigData' object. @@ -108,6 +211,8 @@ methylSigReadData = function(fileList, } } + # Collect chromosomes present in each of the files and determine the maximum + # start location. (MAXBASE PURPOSE NOT CLEAR YET) MAXBASE = 0 uniqueChr = NULL for(fileIndex in 1:n.files) { @@ -115,90 +220,47 @@ methylSigReadData = function(fileList, MAXBASE = max(MAXBASE, max(chrList[[fileIndex]]$start)) } + # Determine the unique chromosomes and order them uniqueChr = unique(uniqueChr) uniqueChr = uniqueChr[order(uniqueChr)] + # Basically round MAXBASE to the nearest power of 10 MAXBASE = 10^{ceiling(log10(MAXBASE + 1))} - uniqueLoc = NULL + + # Create a hash for unique locations + uniqueLoc = NULL for(fileIndex in 1:n.files) { chrList[[fileIndex]]$chr = factor(chrList[[fileIndex]]$chr, levels=uniqueChr) uniqueLoc = unique(c(uniqueLoc, as.numeric(chrList[[fileIndex]]$chr) * MAXBASE+chrList[[fileIndex]]$start)) } + # Determine only the unique locations and the number to create the empty + # matrices for aggregation uniqueLoc = uniqueLoc[order(uniqueLoc)] - sizeRet = NROW(uniqueLoc) + # Create empty matrices where file data will be columns coverage = numCs = numTs = matrix(0, nrow=sizeRet, ncol=n.files) strand = factor(rep(NA, sizeRet), levels=levels(chrList[[1]]$strand)) + # Aggregate for(fileIndex in 1:n.files) { - if(quiet == FALSE) { - message(sprintf('(%s)', fileIndex)) - } + # if(quiet == FALSE) { + # message(sprintf('(%s)', fileIndex)) + # } location = findInterval( (as.numeric(chrList[[fileIndex]]$chr) * MAXBASE+chrList[[fileIndex]]$start) , uniqueLoc) - if(destranded == FALSE) { - strand[location] = chrList[[fileIndex]]$strand - coverage[location,fileIndex] = chrList[[fileIndex]]$coverage - numCs[location,fileIndex] = chrList[[fileIndex]]$numCs - numTs[location,fileIndex] = chrList[[fileIndex]]$numTs - } else { - # Deal with locations having strand NA and set to * - settingList = is.na(strand[location]) - strand[location][settingList] = chrList[[fileIndex]]$strand[settingList] - settingList = !settingList & (strand[location] != chrList[[fileIndex]]$strand) - strand[location][settingList] = "*" - - # Deal with positive strand - forward = (chrList[[fileIndex]]$strand == "+") - coverage[location[forward],fileIndex] = chrList[[fileIndex]]$coverage[forward] - numCs[location[forward],fileIndex] = chrList[[fileIndex]]$numCs[forward] - numTs[location[forward],fileIndex] = chrList[[fileIndex]]$numTs[forward] - - # Deal with reverse strand - reverse = (chrList[[fileIndex]]$strand == "-") - coverage[location[reverse],fileIndex] = coverage[location[reverse],fileIndex] + chrList[[fileIndex]]$coverage[reverse] - numCs[location[reverse],fileIndex] = numCs[location[reverse],fileIndex] + chrList[[fileIndex]]$numCs[reverse] - numTs[location[reverse],fileIndex] = numTs[location[reverse],fileIndex] + chrList[[fileIndex]]$numTs[reverse] - } - } - - # Filter by minCount and maxCount while also modifying chr, uniqueLoc, strand, numCs, numTs - # This is in the idiom of the what's been implemented - countInvalidList = NULL - for(fileIndex in 1:n.files) { - # Record the countInvalidList - countInvalidList = unique( c(countInvalidList, which( - ((coverage[, fileIndex] != 0) & (coverage[, fileIndex] < minCount)) | - ((coverage[, fileIndex] != 0) & (coverage[, fileIndex] > maxCount)) ) ) ) - } - - # For simulated data you could have nothing in the countInvalidList - if(length(countInvalidList) > 0) { - uniqueLoc = uniqueLoc[-countInvalidList] - strand = strand[-countInvalidList] - coverage = coverage[-countInvalidList, ] - numCs = numCs[-countInvalidList, ] - numTs = numTs[-countInvalidList, ] - - message(sprintf('Sites > maxCount or < minCount: %s / %s = %s', - length(countInvalidList), nrow(coverage) + length(countInvalidList), - signif( length(countInvalidList) / (nrow(coverage) + length(countInvalidList)) , 3))) - } else { - message('Sites > maxCount or < minCount = 0') - } -# strand = as.factor(strand) -# levels(strand) = list("+"="1","-"="2","*"="3") + strand[location] = chrList[[fileIndex]]$strand + coverage[location,fileIndex] = chrList[[fileIndex]]$coverage + numCs[location,fileIndex] = chrList[[fileIndex]]$numCs + numTs[location,fileIndex] = chrList[[fileIndex]]$numTs + } options = paste("maxCount=", maxCount, " & minCount=", minCount, " & filterSNPs=", filterSNPs, sep="") if(!is.na(assembly)) options = paste(options, " & assembly=", assembly, sep="") if(!is.na(context)) options = paste(options, " & context=", context, sep="") if(!is.na(pipeline)) options = paste(options, " & pipeline=", pipeline, sep="") - # numTs[coverage==0] = NA - # numCs[coverage==0] = NA - # coverage[coverage==0] = NA methylSig.newData(data.ids=uniqueLoc, data.chr=as.factor(uniqueChr[as.integer(uniqueLoc/MAXBASE)]), data.start=uniqueLoc%%MAXBASE, data.end=uniqueLoc%%MAXBASE, data.strand=strand, data.coverage = coverage, data.numTs = numTs, data.numCs = numCs, diff --git a/R/tile.R b/R/tile.R index d092302..7f4d292 100644 --- a/R/tile.R +++ b/R/tile.R @@ -13,11 +13,11 @@ #' methTile = methylSigTile(meth) #' @export methylSigTile <- function(meth, tiles = NULL, win.size=25) { - if(meth@resolution == "region") stop("Object has already been tiled") + if(meth@resolution == "region") stop("Object has already been tiled") - if(is.null(tiles)) { - message(sprintf('Tiling by %s bp windows', win.size)) - MAXBASE = max(meth@data.start) + win.size + 1 + if(is.null(tiles)) { + message(sprintf('Tiling by %s bp windows', win.size)) + MAXBASE = max(meth@data.start) + win.size + 1 # MAXBASE10 = MAXBASE = 10^{ceiling(log10(MAXBASE + 1))} startList = as.numeric(meth@data.chr)*MAXBASE + meth@data.start diff --git a/inst/extdata/test_combo_1.txt b/inst/extdata/test_combo_1.txt new file mode 100644 index 0000000..9041906 --- /dev/null +++ b/inst/extdata/test_combo_1.txt @@ -0,0 +1,14 @@ +chr21.1 chr21 1 F 14 30 40 +chr21.3 chr21 3 F 5 100 0 +chr21.4 chr21 4 R 7 100 0 +chr21.6 chr21 6 F 16 80 20 +chr21.7 chr21 7 R 20 100 0 +chr21.8 chr21 8 F 550 100 0 +chr21.10 chr21 10 F 2 50 50 +chr21.15 chr21 15 F 300 33 67 +chr21.16 chr21 16 R 300 20 80 +chr21.20 chr21 20 F 30 100 0 +chr21.25 chr21 25 R 50 25 75 +chr21.30 chr21 30 F 2 100 0 +chr21.31 chr21 31 R 3 100 0 +chr21.9411410 chr21 9411410 F 72 89 11 diff --git a/inst/extdata/test_combo_2.txt b/inst/extdata/test_combo_2.txt new file mode 100644 index 0000000..49d9281 --- /dev/null +++ b/inst/extdata/test_combo_2.txt @@ -0,0 +1,12 @@ +chr21.1 chr21 1 F 14 30 70 +chr21.6 chr21 6 F 30 75 25 +chr21.7 chr21 7 R 25 90 10 +chr21.8 chr21 8 F 400 96 4 +chr21.10 chr21 10 F 2 50 50 +chr21.15 chr21 15 F 250 33 67 +chr21.16 chr21 16 R 225 20 80 +chr21.20 chr21 20 F 80 100 0 +chr21.25 chr21 25 R 25 25 75 +chr21.30 chr21 30 F 2 100 0 +chr21.31 chr21 31 R 3 100 0 +chr21.48086570 chr21 48086570 R 34 89 11 diff --git a/inst/extdata/test_freq_1.txt b/inst/extdata/test_freq_1.txt new file mode 100644 index 0000000..ea41c37 --- /dev/null +++ b/inst/extdata/test_freq_1.txt @@ -0,0 +1,6 @@ +chr21.3 chr21 3 F 10 30 40 +chr21.4 chr21 4 R 14 100 0 +chr21.6 chr21 6 F 16 80 20 +chr21.7 chr21 7 R 20 100 0 +chr21.8 chr21 8 F 200 100 0 +chr21.9 chr21 9 R 200 98 2 diff --git a/inst/extdata/test_freq_2.txt b/inst/extdata/test_freq_2.txt new file mode 100644 index 0000000..a06387c --- /dev/null +++ b/inst/extdata/test_freq_2.txt @@ -0,0 +1,6 @@ +chr21.3 chr21 3 F 10 100 0 +chr21.4 chr21 4 R 14 100 0 +chr21.6 chr21 6 F 16 80 20 +chr21.7 chr21 7 R 20 100 0 +chr21.8 chr21 8 F 200 100 0 +chr21.9 chr21 9 R 100 90 2 diff --git a/inst/extdata/test_minmax_1.txt b/inst/extdata/test_minmax_1.txt new file mode 100644 index 0000000..51b8759 --- /dev/null +++ b/inst/extdata/test_minmax_1.txt @@ -0,0 +1,6 @@ +chr21.3 chr21 3 F 5 100 0 +chr21.4 chr21 4 R 14 100 0 +chr21.6 chr21 6 F 16 80 20 +chr21.7 chr21 7 R 20 100 0 +chr21.8 chr21 8 F 200 100 0 +chr21.9 chr21 9 R 200 98 2 diff --git a/inst/extdata/test_minmax_2.txt b/inst/extdata/test_minmax_2.txt new file mode 100644 index 0000000..b87a0c0 --- /dev/null +++ b/inst/extdata/test_minmax_2.txt @@ -0,0 +1,6 @@ +chr21.3 chr21 3 F 10 100 0 +chr21.4 chr21 4 R 14 100 0 +chr21.6 chr21 6 F 16 80 20 +chr21.7 chr21 7 R 20 100 0 +chr21.8 chr21 8 F 550 100 0 +chr21.9 chr21 9 R 400 98 2 diff --git a/tests/testthat/test_1_read.R b/tests/testthat/test_1_read.R index 4289f89..5e98ad5 100644 --- a/tests/testthat/test_1_read.R +++ b/tests/testthat/test_1_read.R @@ -1,138 +1,530 @@ context('Test methylSigReadData') ################################################################################ -# Test with default settings (except header) +# Test min/max filters -files = c( - system.file('extdata', 'test_1.txt', package='methylSig'), - system.file('extdata', 'test_2.txt', package='methylSig')) +files = system.file('extdata', 'test_minmax_1.txt', package='methylSig') -data_defaults = methylSigReadData( - fileList = files, - sample.ids = c('test_1','test_2'), - assembly = 'hg19', - pipeline = 'bismark and methylKit', - header = FALSE, - context = 'CpG', - resolution = "base", - treatment = c(1,0), - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = FALSE, - num.cores = 1, - quiet = TRUE) +data1 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = FALSE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) -test_that('Test minCount removals after destrand',{ - expect_equal( 43053323 %in% data_defaults@data.start, expected = FALSE ) +test_that('Test minCount removal (data1)', { + expect_equal( length(data1@data.start), expected = 5 ) + expect_equal( 3 %in% data1@data.start, expected = FALSE ) }) -test_that('Test maxCount removals after destrand',{ - expect_equal( 43052356 %in% data_defaults@data.start, expected = FALSE ) +data2 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test minCount rescue (data2)', { + expect_equal( length(data2@data.start), expected = 3 ) + expect_equal( 3 %in% data2@data.start, expected = TRUE ) + expect_equal( all( c(3,6,8) %in% data2@data.start ), expected = TRUE ) }) -test_that('Test invalid frequency removals',{ - expect_equal( 43045383 %in% data_defaults@data.start, expected = FALSE ) +############## + +files = system.file('extdata', 'test_minmax_2.txt', package='methylSig') + +data3 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = FALSE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test maxCount removal (data3)', { + expect_equal( length(data3@data.start), expected = 5 ) + expect_equal( 8 %in% data3@data.start, expected = FALSE ) }) -# NOTE: This test indicates that destranding does not behave as we expect. -# If we want a minimum count of 10 after destranding, we need to set the -# minCount to 5 because updating after destranding occurs after filtering -# for minCount. -test_that('Test destranding rescue',{ - expect_equal( 43045074 %in% data_defaults@data.start, expected = TRUE ) - expect_equal( data_defaults@data.coverage[which(data_defaults@data.start == 43045074), 1], expected = 10) +data4 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test maxCount removal (data4)', { + expect_equal( length(data4@data.start), expected = 2 ) + expect_equal( all( c(3,6) %in% data4@data.start ), expected = TRUE ) + expect_equal( 8 %in% data4@data.start, expected = FALSE ) + expect_equal( all( c(4,7,9) %in% data4@data.start ), expected = FALSE ) }) -test_that('Test destranding aggregation',{ - # Removal of sites moved to the other strand - expect_equal( 43053298 %in% data_defaults@data.start, expected = FALSE ) - # Correct coverage sums - expect_equal( data_defaults@data.coverage[which(data_defaults@data.start == 43053297), 1], expected = 96 ) - expect_equal( data_defaults@data.coverage[which(data_defaults@data.start == 43053297), 2], expected = 207 ) - # Correct numCs - expect_equal( data_defaults@data.numCs[which(data_defaults@data.start == 43053297), 1], expected = 87 ) +############## + +files = c( + system.file('extdata', 'test_minmax_1.txt', package='methylSig'), + system.file('extdata', 'test_minmax_2.txt', package='methylSig') + ) + +data5 = methylSigReadData( + fileList = files, + sample.ids = c('test_1','test_2'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1,0), + destranded = FALSE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test minCount/maxCount site preservation (data5)', { + expect_equal( length(data5@data.start), expected = 6 ) + expect_equal( data5@data.coverage[which(data5@data.start == 3), 1], expected = 0 ) + expect_equal( data5@data.coverage[which(data5@data.start == 3), 2], expected = 10 ) + expect_equal( data5@data.coverage[which(data5@data.start == 8), 1], expected = 200 ) + expect_equal( data5@data.coverage[which(data5@data.start == 8), 2], expected = 0 ) }) -test_that('Test for correct nrows in methylSigData object',{ - expect_equal( nrow(data_defaults@data.coverage), expected = 8 ) +data6 = methylSigReadData( + fileList = files, + sample.ids = c('test_1','test_2'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1,0), + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test minCount/maxCount site preservation (data6)', { + expect_equal( length(data6@data.start), expected = 3 ) + expect_equal( data6@data.coverage[which(data6@data.start == 3), 1], expected = 19 ) + expect_equal( data6@data.coverage[which(data6@data.start == 3), 2], expected = 24 ) + expect_equal( data6@data.coverage[which(data6@data.start == 8), 1], expected = 400 ) + expect_equal( data6@data.coverage[which(data6@data.start == 8), 2], expected = 0 ) }) ################################################################################ -# Test with SNP removal +# Test %C + %T < 95% + +files = system.file('extdata', 'test_freq_1.txt', package='methylSig') -data_snps = methylSigReadData( - fileList = files, - sample.ids = c('test_1','test_2'), - assembly = 'hg19', - pipeline = 'bismark and methylKit', - header = FALSE, - context = 'CpG', - resolution = "base", - treatment = c(1,0), - destranded = TRUE, - maxCount = 500, - minCount = 10, - filterSNPs = TRUE, - num.cores = 1, - quiet = TRUE) +data7 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = FALSE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) -test_that('Test SNP removal',{ - expect_equal( 9419909 %in% data_snps@data.start, expected = FALSE ) +test_that('Test %C + %T < 95% removal (data7)', { + expect_equal( length(data7@data.start), expected = 5 ) + expect_equal( 3 %in% data7@data.start, expected = FALSE ) }) -test_that('Test for correct nrows in methylSigData object',{ - expect_equal( length(data_snps@data.start), expected = 7 ) +data8 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +# Is this the behavior we want? In this case, destranding causes the data +# on the forward strand to be destroyed and shifts the data on the reverse +# strand to that position. Or do we want to do this check after destranding? +# That would cause the CpG at 3 to be lost completely. +test_that('Test %C + %T < 95% removal (data8)', { + expect_equal( length(data8@data.start), expected = 3 ) + expect_equal( 3 %in% data8@data.start, expected = TRUE ) + expect_equal( data8@data.coverage[which(data8@data.start == 3), 1], expected = 14 ) }) -################################################################################ -# Test with strandedness (destranded = FALSE) -# And reduced minCount to 5 - -data_stranded = methylSigReadData( - fileList = files, - sample.ids = c('test_1','test_2'), - assembly = 'hg19', - pipeline = 'bismark and methylKit', - header = FALSE, - context = 'CpG', - resolution = "base", - treatment = c(1,0), - destranded = FALSE, - maxCount = 500, - minCount = 5, - filterSNPs = FALSE, - num.cores = 1, - quiet = TRUE) - -test_that('Test for correct nrows in methylSigData object',{ - expect_equal( length(data_stranded@data.start), expected = 10) +############## + +files = system.file('extdata', 'test_freq_2.txt', package='methylSig') + +data9 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = FALSE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test %C + %T < 95% removal (data9)', { + expect_equal( length(data9@data.start), expected = 5 ) + expect_equal( 9 %in% data9@data.start, expected = FALSE ) +}) + +data10 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test %C + %T < 95% removal (data10)', { + expect_equal( length(data10@data.start), expected = 3 ) + expect_equal( 8 %in% data10@data.start, expected = TRUE ) + expect_equal( data10@data.coverage[which(data10@data.start == 8), 1], expected = 200 ) +}) + +############## + +files = c( + system.file('extdata', 'test_freq_1.txt', package='methylSig'), + system.file('extdata', 'test_freq_2.txt', package='methylSig') + ) + +data11 = methylSigReadData( + fileList = files, + sample.ids = c('test_1','test_2'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1,0), + destranded = FALSE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test %C + %T < 95% removal (data11)', { + expect_equal( length(data11@data.start), expected = 6 ) + expect_equal( data11@data.coverage[which(data11@data.start == 3), 1], expected = 0 ) + expect_equal( data11@data.coverage[which(data11@data.start == 3), 2], expected = 10 ) + expect_equal( data11@data.coverage[which(data11@data.start == 9), 1], expected = 200 ) + expect_equal( data11@data.coverage[which(data11@data.start == 9), 2], expected = 0 ) +}) + +data12 = methylSigReadData( + fileList = files, + sample.ids = c('test_1','test_2'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1,0), + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test %C + %T < 95% removal (data12)', { + expect_equal( length(data12@data.start), expected = 3 ) + expect_equal( data12@data.coverage[which(data12@data.start == 3), 1], expected = 14 ) + expect_equal( data12@data.coverage[which(data12@data.start == 3), 2], expected = 24 ) + expect_equal( data12@data.coverage[which(data12@data.start == 8), 1], expected = 400 ) + expect_equal( data12@data.coverage[which(data12@data.start == 8), 2], expected = 200 ) }) ################################################################################ -# Test with destranded -# And maxCount lowered to 200 - -data_ceiling = methylSigReadData( - fileList = files, - sample.ids = c('test_1','test_2'), - assembly = 'hg19', - pipeline = 'bismark and methylKit', - header = FALSE, - context = 'CpG', - resolution = "base", - treatment = c(1,0), - destranded = TRUE, - maxCount = 200, - minCount = 10, - filterSNPs = FALSE, - num.cores = 1, - quiet = TRUE) - -test_that('Test for removal of combined > maxCount',{ - expect_equal( 43053297 %in% data_ceiling@data.start, expected = FALSE) -}) - -test_that('Test for correct nrows in methylSigData object',{ - expect_equal( length(data_ceiling@data.start), expected = 7 ) +# Test combinations + +files = system.file('extdata', 'test_combo_1.txt', package='methylSig') + +data13 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = FALSE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test combination (data13)', { + expect_equal( length(data13@data.start), expected = 7 ) + expect_equal( all( c(1,3,4,8,10,30,31) %in% data13@data.start ), expected = FALSE ) + expect_equal( all( c(6,7,15,16,20,25,9411410) %in% data13@data.start ), expected = TRUE ) +}) + +data14 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = FALSE, + maxCount = 500, + minCount = 10, + filterSNPs = TRUE, + num.cores = 1, + quiet = TRUE) + +test_that('Test combination (data14)', { + expect_equal( length(data14@data.start), expected = 6 ) + expect_equal( all( c(1,3,4,8,10,30,31) %in% data14@data.start ), expected = FALSE ) + expect_equal( all( c(6,7,15,16,20,25) %in% data14@data.start ), expected = TRUE ) +}) + +data15 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test combination (data15)', { + expect_equal( length(data15@data.start), expected = 5 ) + expect_equal( all( c(1,4,8,10,15,25,30,31) %in% data15@data.start ), expected = FALSE ) + expect_equal( all( c(3,6,20,24,9411410) %in% data15@data.start ), expected = TRUE ) +}) + +data16 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = TRUE, + num.cores = 1, + quiet = TRUE) + +test_that('Test combination (data16)', { + expect_equal( length(data16@data.start), expected = 4 ) + expect_equal( all( c(1,4,8,10,15,25,30,31) %in% data16@data.start ), expected = FALSE ) + expect_equal( all( c(3,6,20,24) %in% data16@data.start ), expected = TRUE ) +}) + +############## + +files = system.file('extdata', 'test_combo_2.txt', package='methylSig') + +data17 = methylSigReadData( + fileList = files, + sample.ids = c('test_1'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1), + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = TRUE, + num.cores = 1, + quiet = TRUE) + +test_that('Test combination (data17)', { + expect_equal( length(data17@data.start), expected = 6 ) + expect_equal( all( c(10,16,25,30,31,48086569,48086570) %in% data17@data.start ), expected = FALSE ) + expect_equal( all( c(1,6,8,15,20,24) %in% data17@data.start ), expected = TRUE ) + expect_equal( data17@data.coverage[which(data17@data.start == 6), 1], expected = 55 ) + expect_equal( data17@data.coverage[which(data17@data.start == 15), 1], expected = 475 ) +}) + +############## + +files = c( + system.file('extdata', 'test_combo_1.txt', package='methylSig'), + system.file('extdata', 'test_combo_2.txt', package='methylSig') + ) + +data18 = methylSigReadData( + fileList = files, + sample.ids = c('test_1','test_2'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1,0), + destranded = FALSE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test combination (data18)', { + expect_equal( length(data18@data.start), expected = 10 ) + expect_equal( all( c(10,30,31) %in% data18@data.start ), expected = FALSE ) + expect_equal( all( c(1,6,7,8,15,16,20,25,9411410,48086570) %in% data18@data.start ), expected = TRUE ) + expect_equal( data18@data.coverage[which(data18@data.start == 1), 1], expected = 0 ) + expect_equal( data18@data.coverage[which(data18@data.start == 1), 2], expected = 14 ) +}) + +data19 = methylSigReadData( + fileList = files, + sample.ids = c('test_1','test_2'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1,0), + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = FALSE, + num.cores = 1, + quiet = TRUE) + +test_that('Test combination (data19)', { + expect_equal( length(data19@data.start), expected = 9 ) + expect_equal( all( c(4,7,10,16,25,30,31,48086570) %in% data19@data.start ), expected = FALSE ) + expect_equal( all( c(1,3,6,8,15,20,24,9411410,48086569) %in% data19@data.start ), expected = TRUE ) + expect_equal( data19@data.coverage[which(data19@data.start == 3), 1], expected = 12 ) + expect_equal( data19@data.coverage[which(data19@data.start == 3), 2], expected = 0 ) + expect_equal( data19@data.coverage[which(data19@data.start == 6), 1], expected = 36 ) + expect_equal( data19@data.coverage[which(data19@data.start == 6), 2], expected = 55 ) + expect_equal( data19@data.coverage[which(data19@data.start == 8), 1], expected = 0 ) + expect_equal( data19@data.coverage[which(data19@data.start == 8), 2], expected = 400 ) + expect_equal( data19@data.coverage[which(data19@data.start == 15), 1], expected = 0 ) + expect_equal( data19@data.coverage[which(data19@data.start == 15), 2], expected = 475 ) + expect_equal( data19@data.coverage[which(data19@data.start == 20), 1], expected = 30 ) + expect_equal( data19@data.coverage[which(data19@data.start == 20), 2], expected = 80 ) + expect_equal( data19@data.coverage[which(data19@data.start == 24), 1], expected = 50 ) + expect_equal( data19@data.coverage[which(data19@data.start == 24), 2], expected = 25 ) + expect_equal( data19@data.coverage[which(data19@data.start == 9411410), 1], expected = 72 ) + expect_equal( data19@data.coverage[which(data19@data.start == 9411410), 2], expected = 0 ) + expect_equal( data19@data.coverage[which(data19@data.start == 48086569), 1], expected = 0 ) + expect_equal( data19@data.coverage[which(data19@data.start == 48086569), 2], expected = 34 ) +}) + +data20 = methylSigReadData( + fileList = files, + sample.ids = c('test_1','test_2'), + assembly = 'hg19', + pipeline = 'bismark and methylKit', + header = FALSE, + context = 'CpG', + resolution = "base", + treatment = c(1,0), + destranded = TRUE, + maxCount = 500, + minCount = 10, + filterSNPs = TRUE, + num.cores = 1, + quiet = TRUE) + +test_that('Test combination (data20)', { + expect_equal( length(data20@data.start), expected = 7 ) + expect_equal( all( c(4,7,10,16,25,30,31,9411410,48086569,48086570) %in% data20@data.start ), expected = FALSE ) + expect_equal( all( c(1,3,6,8,15,20,24) %in% data20@data.start ), expected = TRUE ) + expect_equal( data20@data.coverage[which(data20@data.start == 3), 1], expected = 12 ) + expect_equal( data20@data.coverage[which(data20@data.start == 3), 2], expected = 0 ) + expect_equal( data20@data.coverage[which(data20@data.start == 6), 1], expected = 36 ) + expect_equal( data20@data.coverage[which(data20@data.start == 6), 2], expected = 55 ) + expect_equal( data20@data.coverage[which(data20@data.start == 8), 1], expected = 0 ) + expect_equal( data20@data.coverage[which(data20@data.start == 8), 2], expected = 400 ) + expect_equal( data20@data.coverage[which(data20@data.start == 15), 1], expected = 0 ) + expect_equal( data20@data.coverage[which(data20@data.start == 15), 2], expected = 475 ) + expect_equal( data20@data.coverage[which(data20@data.start == 20), 1], expected = 30 ) + expect_equal( data20@data.coverage[which(data20@data.start == 20), 2], expected = 80 ) + expect_equal( data20@data.coverage[which(data20@data.start == 24), 1], expected = 50 ) + expect_equal( data20@data.coverage[which(data20@data.start == 24), 2], expected = 25 ) }) diff --git a/tests/testthat/test_2_tile.R b/tests/testthat/test_2_tile.R index 24480ee..50072a3 100644 --- a/tests/testthat/test_2_tile.R +++ b/tests/testthat/test_2_tile.R @@ -21,7 +21,7 @@ data_error_tiled = methylSigReadData( minCount = 10, filterSNPs = FALSE, num.cores = 1, - quiet = FALSE) + quiet = TRUE) test_that('Already tiled data throws error', { expect_error(methylSigTile(data_error_tiled), 'Object has already been tiled') @@ -44,7 +44,7 @@ data_default = methylSigReadData( minCount = 10, filterSNPs = FALSE, num.cores = 1, - quiet = FALSE) + quiet = TRUE) tiled_windows = methylSigTile(meth = data_default, win.size = 25) diff --git a/vignettes/methylSig.Rmd b/vignettes/methylSig.Rmd index 6b0d1e3..d7ff2e6 100644 --- a/vignettes/methylSig.Rmd +++ b/vignettes/methylSig.Rmd @@ -144,7 +144,7 @@ myDiffSignormTile = methylSigCalc(methTile, groups=c(1,0), ## Using local information -It is also possible to use information from nearby CpG sites to improve the variance and methylation level estimates. The default \verb@winsize.disp@ and \verb@winsize.meth@ are 200 bps. The \verb@winsize.disp@ argument only takes into effect when \verb@local.disp@ is set to `TRUE'. Similarly \verb@winsize.meth@ argument only takes into effect when \verb@local.meth@ is set to `TRUE'. +It is also possible to use information from nearby CpG sites to improve the variance and methylation level estimates. The default `winsize.disp` and `winsize.meth` are 200 bps. The `winsize.disp` argument only takes into effect when `local.disp` is set to `TRUE`. Similarly `winsize.meth` argument only takes into effect when `local.meth` is set to `TRUE'. ```{r} ### Variance from both groups and using local information for variance @@ -208,7 +208,7 @@ cpgAnnotationPlot(cpgAnnDmc,main="DMCs") ## RefGene annotation -Again, there are two functions, \verb@refGeneAnnotation()@ and \verb@refGeneAnnotationPlot()@, in `methylSig` package for annotation using RefGene models. The refGene information file can be download from websites such UCSC genome browser. The appropriate genome assembly (the same genome assembly of the provided data) should be used. +Again, there are two functions, `refGeneAnnotation()` and `refGeneAnnotationPlot()`, in `methylSig` package for annotation using RefGene models. The refGene information file can be download from websites such UCSC genome browser. The appropriate genome assembly (the same genome assembly of the provided data) should be used. In a linux server, the user may use the following command to download the annotation file for hg19. Please use appropriate directories for hg18, mm9 or mm10. @@ -287,7 +287,7 @@ methylSigPlot(meth, "chr21", c(43800000, 43900000), groups=c(1,0), ### S4 data structure -The `methylSig` package uses S4 object. The contents of `methylSigData@` can be shown using the `show()` function in R or just type the object itself. +The `methylSig` package uses S4 object. The contents of `methylSigData` can be shown using the `show()` function in R or just type the object itself. ```{r} show(meth)