Merge pull request #15 from sartorlab/v0.4.3-bugfix
* Fix #14
* Use `queryHits()` accessor instead of `@` in SNP filtering.
Raymond Cavalcante committed May 12, 2016
2 parents e2b4e08 + 9ba232c commit 089c561
Showing 12 changed files with 722 additions and 218 deletions.
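The core fix (the second bullet in the commit message) swaps direct slot access on the Hits object returned by findOverlaps() for the queryHits() accessor, which keeps the SNP filtering working even when the S4 internals of the Hits class change between IRanges/S4Vectors releases. A minimal sketch of the pattern, using made-up ranges instead of the packaged CT_SNPs_hg19 object:

library(GenomicRanges)

sites <- GRanges(seqnames = "chr21", ranges = IRanges(start = c(1, 6, 9411410), width = 1))
snps  <- GRanges(seqnames = "chr21", ranges = IRanges(start = 9411410, width = 1))

overlaps <- findOverlaps(sites, snps)

# The old code reached into the object with overlaps@queryHits, which depends on the
# internal slot layout; the accessor below is the supported way to get the query indices.
snpInvalidList <- queryHits(overlaps)
snpInvalidList
# [1] 3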
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
 Package: methylSig
 Type: Package
 Title: a whole genome DNA methylation analysis pipeline
-Version: 0.4.2
-Date: 2016-04-20
+Version: 0.4.3
+Date: 2016-05-11
 Author: Yongseok Park
 Maintainer: Who to complain to <[email protected]>
 Description: MethylSig is a method for testing for differentially methylated
258 changes: 160 additions & 98 deletions R/read.R
@@ -1,55 +1,158 @@
# Called by methylSigReadData
methylSigReadDataSingleFile <- function(fileIndex, fileList, header, minCount, maxCount, destranded, filterSNPs, quiet=FALSE) {
if(quiet==FALSE) {
message(sprintf('Reading file (%s / %s) -- %s', fileIndex, nrow(fileList), fileList[[fileIndex]]))
}
chr = read.table(fileList[[fileIndex]], header=header, stringsAsFactors=FALSE)
#### order for base ####
##chr = chr[order(chr$base),]
####
names(chr) <- c("id", "chr", "start", "strand", "coverage", "numCs", "numTs")
if(!quiet) {
message(sprintf('Reading file (%s/%s) -- %s', fileIndex, length(fileList), fileList[[fileIndex]]))
}

df = read.table(fileList[[fileIndex]], header=header, stringsAsFactors=FALSE)
names(df) <- c("id", "chr", "start", "strand", "coverage", "numCs", "numTs")

# Convert all strand information into +/-/* from {+,-,*,F,R,.}
df$strand[which(df$strand == 'F')] = '+'
df$strand[which(df$strand == 'R')] = '-'
df$strand[which(df$strand == '.')] = '*'
df$strand = factor(df$strand, levels=c('+','-','*'))

# At this point, numCs and numTs are actually freqC and freqT from methylKit.
freqInvalidList = which(chr$numCs + chr$numTs < 95)
# Filter sites with C + T < 95%
freqInvalidList = which(df$numCs + df$numTs < 95)
if(length(freqInvalidList) > 0) {
message(sprintf('File (%s / %s) Sites with numCs + numTs < 95: %s / %s = %s',
fileIndex, nrow(fileList), nrow(freqInvalidList), nrow(chr), signif(nrow(freqInvalidList) / nrow(chr), 3)))
chr$coverage[freqInvalidList] = 0
}
if(!quiet) {
message(sprintf('File (%s/%s) Sites with numCs + numTs < 95: %s/%s = %s',
fileIndex, length(fileList), length(freqInvalidList), nrow(df), signif(length(freqInvalidList) / nrow(df), 3)))
}
df$coverage[freqInvalidList] = 0
} else {
if(!quiet) {
message(sprintf('File (%s/%s) Sites with numCs + numTs < 95: 0 / %s = 0',
fileIndex, length(fileList), nrow(df)))
}
}

# Now numCs and numTs have frequencies replaced by counts
chr$numCs<- round(chr$numCs * chr$coverage / 100)
chr$numTs<- round(chr$numTs * chr$coverage / 100)
# Now numCs and numTs have frequencies replaced by counts
df$numCs = round(df$numCs * df$coverage / 100)
df$numTs = round(df$numTs * df$coverage / 100)

# countInvalidList = which(chr$coverage > maxCount | chr$coverage < minCount)
# message(sprintf('File (%s / %s) Sites > maxCount or < minCount: %s / %s = %s',
# fileIndex, nrow(fileList), nrow(countInvalidList), nrow(chr), signif(nrow(countInvalidList) / nrow(chr), 3)))
# chr$coverage[countInvalidList] = 0
# Destrand (or not)
if(destranded == TRUE) {
if(!quiet) {
message(sprintf('File (%s/%s) Destranding',
fileIndex, length(fileList)))
}

if(destranded == TRUE) {
destrandList = which(chr$strand == "-" | chr$strand == "R")
chr$start[destrandList] = chr$start[destrandList] - 1
}
# Shift positions on - strand back by 1bp to combine with + strand
destrandList = which(df$strand == "-" | df$strand == "R")
df$start[destrandList] = df$start[destrandList] - 1

if(filterSNPs) {
data('CT_SNPs_hg19', envir=environment())
chr_gr = GRanges(seqnames=chr$chr, ranges=IRanges(start=chr$start, end=chr$start))
# Need to re-sort the df in case destranding puts things out of order
# For example,
# chr1 6 +
# chr1 6 -
# Would be destranded to
# chr1 6 +
# chr1 5 +
df = df[order(df$chr, df$start),]

overlaps = findOverlaps(chr_gr, CT_SNPs_hg19)
snpInvalidList = overlaps@queryHits
# Some ingredients for the hash / destranding
MAXBASE = 10^{ceiling(log10(max(df$start) + 1))}
uniqueChr = sort(unique(df$chr))

message(sprintf('File (%s / %s) Sites overlapping C > T SNP: %s / %s = %s',
fileIndex, nrow(fileList), nrow(snpInvalidList), nrow(chr), signif(nrow(snpInvalidList) / nrow(chr), 3)))
# Hash the locations
uniqueLoc = unique(as.numeric(as.factor(df$chr)) * MAXBASE + df$start)
numSites = length(uniqueLoc)

chr$coverage[snpInvalidList] = 0
}
# Create empty matrices where file data will be columns
chrom = pos = coverage = numCs = numTs = rep.int(0, numSites)
strand = factor(rep(NA, numSites), levels=c('+','-','*'))

#chr$chr = as.factor(chr$chr)
chr$strand = as.factor(chr$strand)
## When only F observed in strand column, as.factor convert F to FALSE
levels(chr$strand) = list("+"="F","-"="R","*"="*", "+"="FALSE")
# Determine locations of data in uniqueLoc. This should behave so that
# base shifted - strand sites have the same "location".
location = findInterval( (as.numeric(as.factor(df$chr)) * MAXBASE) + df$start, uniqueLoc)

chr[chr$coverage>0,2:7]
# Deal with locations having strand NA and set to *
# settingList = is.na(strand[location])
# strand[location][settingList] = df$strand[settingList]
# settingList = !settingList & (strand[location] != df$strand)
# strand[location][settingList] = "*"
strand = factor(rep.int('*', numSites), levels=c('+','-','*'))

# Deal with positive strand
forward = (df$strand == "+") | (df$strand == "F")
chrom[location[forward]] = df$chr[forward]
pos[location[forward]] = df$start[forward]
coverage[location[forward]] = df$coverage[forward]
numCs[location[forward]] = df$numCs[forward]
numTs[location[forward]] = df$numTs[forward]

# Deal with reverse strand
reverse = (df$strand == "-") | (df$strand == "R")
chrom[location[reverse]] = df$chr[reverse]
pos[location[reverse]] = df$start[reverse]
coverage[location[reverse]] = coverage[location[reverse]] + df$coverage[reverse]
numCs[location[reverse]] = numCs[location[reverse]] + df$numCs[reverse]
numTs[location[reverse]] = numTs[location[reverse]] + df$numTs[reverse]

# Rebuild the data.frame
df = data.frame(
chr = chrom,
start = pos,
strand = strand,
coverage = coverage,
numCs = numCs,
numTs = numTs,
stringsAsFactors=F)
} else {
if(!quiet) {
message(sprintf('File (%s/%s) Not destranding',
fileIndex, length(fileList)))
}
# Remove the first column of the data.frame
df = df[,2:7]
}

# Filter for minCount or maxCount
countInvalidList = which(df$coverage > maxCount | df$coverage < minCount)
if(length(countInvalidList) > 0) {
if(!quiet) {
message(sprintf('File (%s/%s) Sites > maxCount or < minCount: %s/%s = %s',
fileIndex, length(fileList), length(countInvalidList), nrow(df), signif(length(countInvalidList) / nrow(df), 3)))
}
df$coverage[countInvalidList] = 0
} else {
if(!quiet) {
message(sprintf('File (%s/%s) Sites > maxCount or < minCount: 0 / %s = 0',
fileIndex, length(fileList), nrow(df)))
}
}

# Filter C > T SNPs
if(filterSNPs) {
data('CT_SNPs_hg19', envir=environment())

df_gr = GRanges(seqnames=df$chr, ranges=IRanges(start=df$start, end=df$start))
overlaps = findOverlaps(df_gr, CT_SNPs_hg19)
snpInvalidList = queryHits(overlaps)

if(length(snpInvalidList) > 0) {
if(!quiet) {
message(sprintf('File (%s/%s) Sites overlapping C > T SNP: %s/%s = %s',
fileIndex, length(fileList), length(snpInvalidList), nrow(df), signif(length(snpInvalidList) / nrow(df), 3)))
}
df$coverage[snpInvalidList] = 0
} else {
if(!quiet) {
message(sprintf('File (%s/%s) Sites overlapping C > T SNP: 0 / %s = 0',
fileIndex, length(fileList), nrow(df)))
}
}
} else {
if(!quiet) {
message(sprintf('File (%s/%s) filterSNPs == FALSE',
fileIndex, length(fileList)))
}
}

return(df[df$coverage>0,])
}
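To make the destranding above concrete, here is a small worked sketch (not part of the package, which uses the MAXBASE position hash and findInterval() rather than aggregate(), with the same effect). A CpG measured on both strands shows up as a + site at one position and a - site one base downstream; shifting the - row back by 1 bp makes the two rows collapse into one, with coverage and C/T counts summed:

df <- data.frame(
  chr      = c("chr21", "chr21"),
  start    = c(6, 7),
  strand   = c("+", "-"),
  coverage = c(16, 20),
  numCs    = c(13, 20),
  numTs    = c(3, 0),
  stringsAsFactors = FALSE)

# Shift - strand positions back by 1 bp so both strands of the CpG share a start
minus <- which(df$strand == "-")
df$start[minus] <- df$start[minus] - 1

# Sum counts per (chr, start); strand is reported as '*' in the destranded output
aggregate(cbind(coverage, numCs, numTs) ~ chr + start, data = df, FUN = sum)
#     chr start coverage numCs numTs
# 1 chr21     6       36    33     3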

#' Read methylation score files to make a 'methylSigData' object.
@@ -108,97 +211,56 @@ methylSigReadData = function(fileList,
}
}

# Collect chromosomes present in each of the files and determine the maximum
# start location. (MAXBASE PURPOSE NOT CLEAR YET)
MAXBASE = 0
uniqueChr = NULL
for(fileIndex in 1:n.files) {
uniqueChr = c(uniqueChr, chrList[[fileIndex]]$chr)
MAXBASE = max(MAXBASE, max(chrList[[fileIndex]]$start))
}

# Determine the unique chromosomes and order them
uniqueChr = unique(uniqueChr)
uniqueChr = uniqueChr[order(uniqueChr)]

# Basically round MAXBASE to the nearest power of 10
MAXBASE = 10^{ceiling(log10(MAXBASE + 1))}
uniqueLoc = NULL

# Create a hash for unique locations
uniqueLoc = NULL
for(fileIndex in 1:n.files) {
chrList[[fileIndex]]$chr = factor(chrList[[fileIndex]]$chr, levels=uniqueChr)
uniqueLoc = unique(c(uniqueLoc, as.numeric(chrList[[fileIndex]]$chr) * MAXBASE+chrList[[fileIndex]]$start))
}

# Determine only the unique locations and the number to create the empty
# matrices for aggregation
uniqueLoc = uniqueLoc[order(uniqueLoc)]

sizeRet = NROW(uniqueLoc)

# Create empty matrices where file data will be columns
coverage = numCs = numTs = matrix(0, nrow=sizeRet, ncol=n.files)
strand = factor(rep(NA, sizeRet), levels=levels(chrList[[1]]$strand))

# Aggregate
for(fileIndex in 1:n.files) {
if(quiet == FALSE) {
message(sprintf('(%s)', fileIndex))
}
# if(quiet == FALSE) {
# message(sprintf('(%s)', fileIndex))
# }
location = findInterval( (as.numeric(chrList[[fileIndex]]$chr) * MAXBASE+chrList[[fileIndex]]$start) , uniqueLoc)
if(destranded == FALSE) {
strand[location] = chrList[[fileIndex]]$strand
coverage[location,fileIndex] = chrList[[fileIndex]]$coverage
numCs[location,fileIndex] = chrList[[fileIndex]]$numCs
numTs[location,fileIndex] = chrList[[fileIndex]]$numTs
} else {
# Deal with locations having strand NA and set to *
settingList = is.na(strand[location])
strand[location][settingList] = chrList[[fileIndex]]$strand[settingList]
settingList = !settingList & (strand[location] != chrList[[fileIndex]]$strand)
strand[location][settingList] = "*"

# Deal with positive strand
forward = (chrList[[fileIndex]]$strand == "+")
coverage[location[forward],fileIndex] = chrList[[fileIndex]]$coverage[forward]
numCs[location[forward],fileIndex] = chrList[[fileIndex]]$numCs[forward]
numTs[location[forward],fileIndex] = chrList[[fileIndex]]$numTs[forward]

# Deal with reverse strand
reverse = (chrList[[fileIndex]]$strand == "-")
coverage[location[reverse],fileIndex] = coverage[location[reverse],fileIndex] + chrList[[fileIndex]]$coverage[reverse]
numCs[location[reverse],fileIndex] = numCs[location[reverse],fileIndex] + chrList[[fileIndex]]$numCs[reverse]
numTs[location[reverse],fileIndex] = numTs[location[reverse],fileIndex] + chrList[[fileIndex]]$numTs[reverse]
}
}

# Filter by minCount and maxCount while also modifying chr, uniqueLoc, strand, numCs, numTs
# This is in the idiom of the what's been implemented
countInvalidList = NULL
for(fileIndex in 1:n.files) {
# Record the countInvalidList
countInvalidList = unique( c(countInvalidList, which(
((coverage[, fileIndex] != 0) & (coverage[, fileIndex] < minCount)) |
((coverage[, fileIndex] != 0) & (coverage[, fileIndex] > maxCount)) ) ) )
}

# For simulated data you could have nothing in the countInvalidList
if(length(countInvalidList) > 0) {
uniqueLoc = uniqueLoc[-countInvalidList]
strand = strand[-countInvalidList]
coverage = coverage[-countInvalidList, ]
numCs = numCs[-countInvalidList, ]
numTs = numTs[-countInvalidList, ]

message(sprintf('Sites > maxCount or < minCount: %s / %s = %s',
length(countInvalidList), nrow(coverage) + length(countInvalidList),
signif( length(countInvalidList) / (nrow(coverage) + length(countInvalidList)) , 3)))
} else {
message('Sites > maxCount or < minCount = 0')
}

# strand = as.factor(strand)
# levels(strand) = list("+"="1","-"="2","*"="3")
strand[location] = chrList[[fileIndex]]$strand
coverage[location,fileIndex] = chrList[[fileIndex]]$coverage
numCs[location,fileIndex] = chrList[[fileIndex]]$numCs
numTs[location,fileIndex] = chrList[[fileIndex]]$numTs
}

options = paste("maxCount=", maxCount, " & minCount=", minCount, " & filterSNPs=", filterSNPs, sep="")
if(!is.na(assembly)) options = paste(options, " & assembly=", assembly, sep="")
if(!is.na(context)) options = paste(options, " & context=", context, sep="")
if(!is.na(pipeline)) options = paste(options, " & pipeline=", pipeline, sep="")

# numTs[coverage==0] = NA
# numCs[coverage==0] = NA
# coverage[coverage==0] = NA
methylSig.newData(data.ids=uniqueLoc, data.chr=as.factor(uniqueChr[as.integer(uniqueLoc/MAXBASE)]),
data.start=uniqueLoc%%MAXBASE, data.end=uniqueLoc%%MAXBASE,
data.strand=strand, data.coverage = coverage, data.numTs = numTs, data.numCs = numCs,
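Both the per-file reader and the aggregation above key each site by a single number, chromosome index * MAXBASE + start, where MAXBASE is the smallest power of 10 larger than any start position; methylSig.newData() then recovers the chromosome by integer division and the position by modulo. A standalone sketch of that encode/decode round trip:

chrs   <- c("chr21", "chr21", "chr22")
starts <- c(6, 9411410, 15)

uniqueChr <- sort(unique(chrs))
chrIndex  <- as.numeric(factor(chrs, levels = uniqueChr))

# Smallest power of 10 greater than the largest start position (1e7 here)
MAXBASE <- 10^ceiling(log10(max(starts) + 1))

loc <- chrIndex * MAXBASE + starts   # 10000006, 19411410, 20000015

# Decode: chromosome from the integer part, position from the remainder
decodedChr   <- uniqueChr[as.integer(loc / MAXBASE)]
decodedStart <- loc %% MAXBASE

all(decodedChr == chrs)       # TRUE
all(decodedStart == starts)   # TRUE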
8 changes: 4 additions & 4 deletions R/tile.R
@@ -13,11 +13,11 @@
#' methTile = methylSigTile(meth)
#' @export
methylSigTile <- function(meth, tiles = NULL, win.size=25) {
if(meth@resolution == "region") stop("Object has already been tiled")
if(meth@resolution == "region") stop("Object has already been tiled")

if(is.null(tiles)) {
message(sprintf('Tiling by %s bp windows', win.size))
MAXBASE = max(meth@data.start) + win.size + 1
if(is.null(tiles)) {
message(sprintf('Tiling by %s bp windows', win.size))
MAXBASE = max(meth@data.start) + win.size + 1
# MAXBASE10 = MAXBASE = 10^{ceiling(log10(MAXBASE + 1))}

startList = as.numeric(meth@data.chr)*MAXBASE + meth@data.start
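The methylSigTile() hunk is truncated, so the window-assignment step itself isn't visible. As a rough sketch of the idea only — assuming non-overlapping win.size-bp tiles anchored at position 1, which is an assumption rather than the package's confirmed boundary convention:

win.size <- 25
starts   <- c(1, 6, 25, 26, 9411410)

# Assign each site to a fixed-width window (assumed convention: [k*win.size + 1, (k+1)*win.size])
tile_start <- ((starts - 1) %/% win.size) * win.size + 1
tile_end   <- tile_start + win.size - 1

data.frame(start = starts, tile_start = tile_start, tile_end = tile_end)
#     start tile_start tile_end
# 1       1          1       25
# 2       6          1       25
# 3      25          1       25
# 4      26         26       50
# 5 9411410    9411401  9411425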
14 changes: 14 additions & 0 deletions inst/extdata/test_combo_1.txt
@@ -0,0 +1,14 @@
chr21.1 chr21 1 F 14 30 40
chr21.3 chr21 3 F 5 100 0
chr21.4 chr21 4 R 7 100 0
chr21.6 chr21 6 F 16 80 20
chr21.7 chr21 7 R 20 100 0
chr21.8 chr21 8 F 550 100 0
chr21.10 chr21 10 F 2 50 50
chr21.15 chr21 15 F 300 33 67
chr21.16 chr21 16 R 300 20 80
chr21.20 chr21 20 F 30 100 0
chr21.25 chr21 25 R 50 25 75
chr21.30 chr21 30 F 2 100 0
chr21.31 chr21 31 R 3 100 0
chr21.9411410 chr21 9411410 F 72 89 11
12 changes: 12 additions & 0 deletions inst/extdata/test_combo_2.txt
@@ -0,0 +1,12 @@
chr21.1 chr21 1 F 14 30 70
chr21.6 chr21 6 F 30 75 25
chr21.7 chr21 7 R 25 90 10
chr21.8 chr21 8 F 400 96 4
chr21.10 chr21 10 F 2 50 50
chr21.15 chr21 15 F 250 33 67
chr21.16 chr21 16 R 225 20 80
chr21.20 chr21 20 F 80 100 0
chr21.25 chr21 25 R 25 25 75
chr21.30 chr21 30 F 2 100 0
chr21.31 chr21 31 R 3 100 0
chr21.48086570 chr21 48086570 R 34 89 11
6 changes: 6 additions & 0 deletions inst/extdata/test_freq_1.txt
@@ -0,0 +1,6 @@
chr21.3 chr21 3 F 10 30 40
chr21.4 chr21 4 R 14 100 0
chr21.6 chr21 6 F 16 80 20
chr21.7 chr21 7 R 20 100 0
chr21.8 chr21 8 F 200 100 0
chr21.9 chr21 9 R 200 98 2
6 changes: 6 additions & 0 deletions inst/extdata/test_freq_2.txt
@@ -0,0 +1,6 @@
chr21.3 chr21 3 F 10 100 0
chr21.4 chr21 4 R 14 100 0
chr21.6 chr21 6 F 16 80 20
chr21.7 chr21 7 R 20 100 0
chr21.8 chr21 8 F 200 100 0
chr21.9 chr21 9 R 100 90 2
6 changes: 6 additions & 0 deletions inst/extdata/test_minmax_1.txt
@@ -0,0 +1,6 @@
chr21.3 chr21 3 F 5 100 0
chr21.4 chr21 4 R 14 100 0
chr21.6 chr21 6 F 16 80 20
chr21.7 chr21 7 R 20 100 0
chr21.8 chr21 8 F 200 100 0
chr21.9 chr21 9 R 200 98 2
6 changes: 6 additions & 0 deletions inst/extdata/test_minmax_2.txt
@@ -0,0 +1,6 @@
chr21.3 chr21 3 F 10 100 0
chr21.4 chr21 4 R 14 100 0
chr21.6 chr21 6 F 16 80 20
chr21.7 chr21 7 R 20 100 0
chr21.8 chr21 8 F 550 100 0
chr21.9 chr21 9 R 400 98 2
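The new extdata files above all follow the methylKit-style layout the reader expects: id, chr, start, strand, coverage, freqC, freqT, with the last two columns as percentages. A quick sketch of the < 95% frequency filter from methylSigReadDataSingleFile() applied to the first combo file (the path is illustrative; from an installed package it would come from system.file()):

df <- read.table("inst/extdata/test_combo_1.txt", header = FALSE, stringsAsFactors = FALSE)
names(df) <- c("id", "chr", "start", "strand", "coverage", "numCs", "numTs")

# numCs/numTs are still percentages here; sites summing to < 95 are dropped by
# zeroing coverage. For example, chr21:1 has 30 + 40 = 70 < 95.
invalid <- which(df$numCs + df$numTs < 95)
df$coverage[invalid] <- 0

# Convert percentages to counts for the surviving sites
df$numCs <- round(df$numCs * df$coverage / 100)
df$numTs <- round(df$numTs * df$coverage / 100)

df[df$coverage > 0, c("chr", "start", "coverage", "numCs", "numTs")]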