Merge pull request #15 from sartorlab/v0.4.3-bugfix
* Fix #14
* Use `queryHits()` accessor instead of `@` in SNP filtering.
Raymond Cavalcante committed May 12, 2016
2 parents e2b4e08 + 9ba232c commit 089c561
Showing 12 changed files with 722 additions and 218 deletions.
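The core fix (the second bullet in the commit message) swaps direct slot access on the Hits object returned by findOverlaps() for the queryHits() accessor, which keeps the SNP filtering working even when the S4 internals of the Hits class change between IRanges/S4Vectors releases. A minimal sketch of the pattern, using made-up ranges instead of the packaged CT_SNPs_hg19 object:

library(GenomicRanges)

sites <- GRanges(seqnames = "chr21", ranges = IRanges(start = c(1, 6, 9411410), width = 1))
snps  <- GRanges(seqnames = "chr21", ranges = IRanges(start = 9411410, width = 1))

overlaps <- findOverlaps(sites, snps)

# The old code reached into the object with overlaps@queryHits, which depends on the
# internal slot layout; the accessor below is the supported way to get the query indices.
snpInvalidList <- queryHits(overlaps)
snpInvalidList
# [1] 3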
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
 Package: methylSig
 Type: Package
 Title: a whole genome DNA methylation analysis pipeline
-Version: 0.4.2
-Date: 2016-04-20
+Version: 0.4.3
+Date: 2016-05-11
 Author: Yongseok Park
 Maintainer: Who to complain to <[email protected]>
 Description: MethylSig is a method for testing for differentially methylated
258 changes: 160 additions & 98 deletions R/read.R
@@ -1,55 +1,158 @@
# Called by methylSigReadData
methylSigReadDataSingleFile <- function(fileIndex, fileList, header, minCount, maxCount, destranded, filterSNPs, quiet=FALSE) {
if(quiet==FALSE) {
message(sprintf('Reading file (%s / %s) -- %s', fileIndex, nrow(fileList), fileList[[fileIndex]]))
}
chr = read.table(fileList[[fileIndex]], header=header, stringsAsFactors=FALSE)
#### order for base ####
##chr = chr[order(chr$base),]
####
names(chr) <- c("id", "chr", "start", "strand", "coverage", "numCs", "numTs")
if(!quiet) {
message(sprintf('Reading file (%s/%s) -- %s', fileIndex, length(fileList), fileList[[fileIndex]]))
}

df = read.table(fileList[[fileIndex]], header=header, stringsAsFactors=FALSE)
names(df) <- c("id", "chr", "start", "strand", "coverage", "numCs", "numTs")

# Convert all strand information into +/-/* from {+,-,*,F,R,.}
df$strand[which(df$strand == 'F')] = '+'
df$strand[which(df$strand == 'R')] = '-'
df$strand[which(df$strand == '.')] = '*'
df$strand = factor(df$strand, levels=c('+','-','*'))

# At this point, numCs and numTs are actually freqC and freqT from methylKit.
freqInvalidList = which(chr$numCs + chr$numTs < 95)
# Filter sites with C + T < 95%
freqInvalidList = which(df$numCs + df$numTs < 95)
if(length(freqInvalidList) > 0) {
message(sprintf('File (%s / %s) Sites with numCs + numTs < 95: %s / %s = %s',
fileIndex, nrow(fileList), nrow(freqInvalidList), nrow(chr), signif(nrow(freqInvalidList) / nrow(chr), 3)))
chr$coverage[freqInvalidList] = 0
}
if(!quiet) {
message(sprintf('File (%s/%s) Sites with numCs + numTs < 95: %s/%s = %s',
fileIndex, length(fileList), length(freqInvalidList), nrow(df), signif(length(freqInvalidList) / nrow(df), 3)))
}
df$coverage[freqInvalidList] = 0
} else {
if(!quiet) {
message(sprintf('File (%s/%s) Sites with numCs + numTs < 95: 0 / %s = 0',
fileIndex, length(fileList), nrow(df)))
}
}

# Now numCs and numTs have frequencies replaced by counts
chr$numCs<- round(chr$numCs * chr$coverage / 100)
chr$numTs<- round(chr$numTs * chr$coverage / 100)
# Now numCs and numTs have frequencies replaced by counts
df$numCs = round(df$numCs * df$coverage / 100)
df$numTs = round(df$numTs * df$coverage / 100)

# countInvalidList = which(chr$coverage > maxCount | chr$coverage < minCount)
# message(sprintf('File (%s / %s) Sites > maxCount or < minCount: %s / %s = %s',
# fileIndex, nrow(fileList), nrow(countInvalidList), nrow(chr), signif(nrow(countInvalidList) / nrow(chr), 3)))
# chr$coverage[countInvalidList] = 0
# Destrand (or not)
if(destranded == TRUE) {
if(!quiet) {
message(sprintf('File (%s/%s) Destranding',
fileIndex, length(fileList)))
}

if(destranded == TRUE) {
destrandList = which(chr$strand == "-" | chr$strand == "R")
chr$start[destrandList] = chr$start[destrandList] - 1
}
# Shift positions on - strand back by 1bp to combine with + strand
destrandList = which(df$strand == "-" | df$strand == "R")
df$start[destrandList] = df$start[destrandList] - 1

if(filterSNPs) {
data('CT_SNPs_hg19', envir=environment())
chr_gr = GRanges(seqnames=chr$chr, ranges=IRanges(start=chr$start, end=chr$start))
# Need to re-sort the df in case destranding puts things out of order
# For example,
# chr1 6 +
# chr1 6 -
# Would be destranded to
# chr1 6 +
# chr1 5 +
df = df[order(df$chr, df$start),]

overlaps = findOverlaps(chr_gr, CT_SNPs_hg19)
snpInvalidList = overlaps@queryHits
# Some ingredients for the hash / destranding
MAXBASE = 10^{ceiling(log10(max(df$start) + 1))}
uniqueChr = sort(unique(df$chr))

message(sprintf('File (%s / %s) Sites overlapping C > T SNP: %s / %s = %s',
fileIndex, nrow(fileList), nrow(snpInvalidList), nrow(chr), signif(nrow(snpInvalidList) / nrow(chr), 3)))
# Hash the locations
uniqueLoc = unique(as.numeric(as.factor(df$chr)) * MAXBASE + df$start)
numSites = length(uniqueLoc)

chr$coverage[snpInvalidList] = 0
}
# Create empty matrices where file data will be columns
chrom = pos = coverage = numCs = numTs = rep.int(0, numSites)
strand = factor(rep(NA, numSites), levels=c('+','-','*'))

#chr$chr = as.factor(chr$chr)
chr$strand = as.factor(chr$strand)
## When only F observed in strand column, as.factor convert F to FALSE
levels(chr$strand) = list("+"="F","-"="R","*"="*", "+"="FALSE")
# Determine locations of data in uniqueLoc. This should behave so that
# base shifted - strand sites have the same "location".
location = findInterval( (as.numeric(as.factor(df$chr)) * MAXBASE) + df$start, uniqueLoc)

chr[chr$coverage>0,2:7]
# Deal with locations having strand NA and set to *
# settingList = is.na(strand[location])
# strand[location][settingList] = df$strand[settingList]
# settingList = !settingList & (strand[location] != df$strand)
# strand[location][settingList] = "*"
strand = factor(rep.int('*', numSites), levels=c('+','-','*'))

# Deal with positive strand
forward = (df$strand == "+") | (df$strand == "F")
chrom[location[forward]] = df$chr[forward]
pos[location[forward]] = df$start[forward]
coverage[location[forward]] = df$coverage[forward]
numCs[location[forward]] = df$numCs[forward]
numTs[location[forward]] = df$numTs[forward]

# Deal with reverse strand
reverse = (df$strand == "-") | (df$strand == "R")
chrom[location[reverse]] = df$chr[reverse]
pos[location[reverse]] = df$start[reverse]
coverage[location[reverse]] = coverage[location[reverse]] + df$coverage[reverse]
numCs[location[reverse]] = numCs[location[reverse]] + df$numCs[reverse]
numTs[location[reverse]] = numTs[location[reverse]] + df$numTs[reverse]

# Rebuild the data.frame
df = data.frame(
chr = chrom,
start = pos,
strand = strand,
coverage = coverage,
numCs = numCs,
numTs = numTs,
stringsAsFactors=F)
} else {
if(!quiet) {
message(sprintf('File (%s/%s) Not destranding',
fileIndex, length(fileList)))
}
# Remove the first column of the data.frame
df = df[,2:7]
}

# Filter for minCount or maxCount
countInvalidList = which(df$coverage > maxCount | df$coverage < minCount)
if(length(countInvalidList) > 0) {
if(!quiet) {
message(sprintf('File (%s/%s) Sites > maxCount or < minCount: %s/%s = %s',
fileIndex, length(fileList), length(countInvalidList), nrow(df), signif(length(countInvalidList) / nrow(df), 3)))
}
df$coverage[countInvalidList] = 0
} else {
if(!quiet) {
message(sprintf('File (%s/%s) Sites > maxCount or < minCount: 0 / %s = 0',
fileIndex, length(fileList), nrow(df)))
}
}

# Filter C > T SNPs
if(filterSNPs) {
data('CT_SNPs_hg19', envir=environment())

df_gr = GRanges(seqnames=df$chr, ranges=IRanges(start=df$start, end=df$start))
overlaps = findOverlaps(df_gr, CT_SNPs_hg19)
snpInvalidList = queryHits(overlaps)

if(length(snpInvalidList) > 0) {
if(!quiet) {
message(sprintf('File (%s/%s) Sites overlapping C > T SNP: %s/%s = %s',
fileIndex, length(fileList), length(snpInvalidList), nrow(df), signif(length(snpInvalidList) / nrow(df), 3)))
}
df$coverage[snpInvalidList] = 0
} else {
if(!quiet) {
message(sprintf('File (%s/%s) Sites overlapping C > T SNP: 0 / %s = 0',
fileIndex, length(fileList), nrow(df)))
}
}
} else {
if(!quiet) {
message(sprintf('File (%s/%s) filterSNPs == FALSE',
fileIndex, length(fileList)))
}
}

return(df[df$coverage>0,])
}
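To make the destranding above concrete, here is a small worked sketch (not part of the package, which uses the MAXBASE position hash and findInterval() rather than aggregate(), with the same effect). A CpG measured on both strands shows up as a + site at one position and a - site one base downstream; shifting the - row back by 1 bp makes the two rows collapse into one, with coverage and C/T counts summed:

df <- data.frame(
  chr      = c("chr21", "chr21"),
  start    = c(6, 7),
  strand   = c("+", "-"),
  coverage = c(16, 20),
  numCs    = c(13, 20),
  numTs    = c(3, 0),
  stringsAsFactors = FALSE)

# Shift - strand positions back by 1 bp so both strands of the CpG share a start
minus <- which(df$strand == "-")
df$start[minus] <- df$start[minus] - 1

# Sum counts per (chr, start); strand is reported as '*' in the destranded output
aggregate(cbind(coverage, numCs, numTs) ~ chr + start, data = df, FUN = sum)
#     chr start coverage numCs numTs
# 1 chr21     6       36    33     3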

#' Read methylation score files to make a 'methylSigData' object.
@@ -108,97 +211,56 @@ methylSigReadData = function(fileList,
}
}

# Collect chromosomes present in each of the files and determine the maximum
# start location. (MAXBASE PURPOSE NOT CLEAR YET)
MAXBASE = 0
uniqueChr = NULL
for(fileIndex in 1:n.files) {
uniqueChr = c(uniqueChr, chrList[[fileIndex]]$chr)
MAXBASE = max(MAXBASE, max(chrList[[fileIndex]]$start))
}

# Determine the unique chromosomes and order them
uniqueChr = unique(uniqueChr)
uniqueChr = uniqueChr[order(uniqueChr)]

# Basically round MAXBASE to the nearest power of 10
MAXBASE = 10^{ceiling(log10(MAXBASE + 1))}
uniqueLoc = NULL

# Create a hash for unique locations
uniqueLoc = NULL
for(fileIndex in 1:n.files) {
chrList[[fileIndex]]$chr = factor(chrList[[fileIndex]]$chr, levels=uniqueChr)
uniqueLoc = unique(c(uniqueLoc, as.numeric(chrList[[fileIndex]]$chr) * MAXBASE+chrList[[fileIndex]]$start))
}

# Determine only the unique locations and the number to create the empty
# matrices for aggregation
uniqueLoc = uniqueLoc[order(uniqueLoc)]

sizeRet = NROW(uniqueLoc)

# Create empty matrices where file data will be columns
coverage = numCs = numTs = matrix(0, nrow=sizeRet, ncol=n.files)
strand = factor(rep(NA, sizeRet), levels=levels(chrList[[1]]$strand))

# Aggregate
for(fileIndex in 1:n.files) {
if(quiet == FALSE) {
message(sprintf('(%s)', fileIndex))
}
# if(quiet == FALSE) {
# message(sprintf('(%s)', fileIndex))
# }
location = findInterval( (as.numeric(chrList[[fileIndex]]$chr) * MAXBASE+chrList[[fileIndex]]$start) , uniqueLoc)
if(destranded == FALSE) {
strand[location] = chrList[[fileIndex]]$strand
coverage[location,fileIndex] = chrList[[fileIndex]]$coverage
numCs[location,fileIndex] = chrList[[fileIndex]]$numCs
numTs[location,fileIndex] = chrList[[fileIndex]]$numTs
} else {
# Deal with locations having strand NA and set to *
settingList = is.na(strand[location])
strand[location][settingList] = chrList[[fileIndex]]$strand[settingList]
settingList = !settingList & (strand[location] != chrList[[fileIndex]]$strand)
strand[location][settingList] = "*"

# Deal with positive strand
forward = (chrList[[fileIndex]]$strand == "+")
coverage[location[forward],fileIndex] = chrList[[fileIndex]]$coverage[forward]
numCs[location[forward],fileIndex] = chrList[[fileIndex]]$numCs[forward]
numTs[location[forward],fileIndex] = chrList[[fileIndex]]$numTs[forward]

# Deal with reverse strand
reverse = (chrList[[fileIndex]]$strand == "-")
coverage[location[reverse],fileIndex] = coverage[location[reverse],fileIndex] + chrList[[fileIndex]]$coverage[reverse]
numCs[location[reverse],fileIndex] = numCs[location[reverse],fileIndex] + chrList[[fileIndex]]$numCs[reverse]
numTs[location[reverse],fileIndex] = numTs[location[reverse],fileIndex] + chrList[[fileIndex]]$numTs[reverse]
}
}

# Filter by minCount and maxCount while also modifying chr, uniqueLoc, strand, numCs, numTs
# This is in the idiom of the what's been implemented
countInvalidList = NULL
for(fileIndex in 1:n.files) {
# Record the countInvalidList
countInvalidList = unique( c(countInvalidList, which(
((coverage[, fileIndex] != 0) & (coverage[, fileIndex] < minCount)) |
((coverage[, fileIndex] != 0) & (coverage[, fileIndex] > maxCount)) ) ) )
}

# For simulated data you could have nothing in the countInvalidList
if(length(countInvalidList) > 0) {
uniqueLoc = uniqueLoc[-countInvalidList]
strand = strand[-countInvalidList]
coverage = coverage[-countInvalidList, ]
numCs = numCs[-countInvalidList, ]
numTs = numTs[-countInvalidList, ]

message(sprintf('Sites > maxCount or < minCount: %s / %s = %s',
length(countInvalidList), nrow(coverage) + length(countInvalidList),
signif( length(countInvalidList) / (nrow(coverage) + length(countInvalidList)) , 3)))
} else {
message('Sites > maxCount or < minCount = 0')
}

# strand = as.factor(strand)
# levels(strand) = list("+"="1","-"="2","*"="3")
strand[location] = chrList[[fileIndex]]$strand
coverage[location,fileIndex] = chrList[[fileIndex]]$coverage
numCs[location,fileIndex] = chrList[[fileIndex]]$numCs
numTs[location,fileIndex] = chrList[[fileIndex]]$numTs
}

options = paste("maxCount=", maxCount, " & minCount=", minCount, " & filterSNPs=", filterSNPs, sep="")
if(!is.na(assembly)) options = paste(options, " & assembly=", assembly, sep="")
if(!is.na(context)) options = paste(options, " & context=", context, sep="")
if(!is.na(pipeline)) options = paste(options, " & pipeline=", pipeline, sep="")

# numTs[coverage==0] = NA
# numCs[coverage==0] = NA
# coverage[coverage==0] = NA
methylSig.newData(data.ids=uniqueLoc, data.chr=as.factor(uniqueChr[as.integer(uniqueLoc/MAXBASE)]),
data.start=uniqueLoc%%MAXBASE, data.end=uniqueLoc%%MAXBASE,
data.strand=strand, data.coverage = coverage, data.numTs = numTs, data.numCs = numCs,
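Both the per-file reader and the aggregation above key each site by a single number, chromosome index * MAXBASE + start, where MAXBASE is the smallest power of 10 larger than any start position; methylSig.newData() then recovers the chromosome by integer division and the position by modulo. A standalone sketch of that encode/decode round trip:

chrs   <- c("chr21", "chr21", "chr22")
starts <- c(6, 9411410, 15)

uniqueChr <- sort(unique(chrs))
chrIndex  <- as.numeric(factor(chrs, levels = uniqueChr))

# Smallest power of 10 greater than the largest start position (1e7 here)
MAXBASE <- 10^ceiling(log10(max(starts) + 1))

loc <- chrIndex * MAXBASE + starts   # 10000006, 19411410, 20000015

# Decode: chromosome from the integer part, position from the remainder
decodedChr   <- uniqueChr[as.integer(loc / MAXBASE)]
decodedStart <- loc %% MAXBASE

all(decodedChr == chrs)       # TRUE
all(decodedStart == starts)   # TRUE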
8 changes: 4 additions & 4 deletions R/tile.R
@@ -13,11 +13,11 @@
#' methTile = methylSigTile(meth)
#' @export
methylSigTile <- function(meth, tiles = NULL, win.size=25) {
if(meth@resolution == "region") stop("Object has already been tiled")
if(meth@resolution == "region") stop("Object has already been tiled")

if(is.null(tiles)) {
message(sprintf('Tiling by %s bp windows', win.size))
MAXBASE = max(meth@data.start) + win.size + 1
if(is.null(tiles)) {
message(sprintf('Tiling by %s bp windows', win.size))
MAXBASE = max(meth@data.start) + win.size + 1
# MAXBASE10 = MAXBASE = 10^{ceiling(log10(MAXBASE + 1))}

startList = as.numeric(meth@data.chr)*MAXBASE + meth@data.start
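The methylSigTile() hunk is truncated, so the window-assignment step itself isn't visible. As a rough sketch of the idea only — assuming non-overlapping win.size-bp tiles anchored at position 1, which is an assumption rather than the package's confirmed boundary convention:

win.size <- 25
starts   <- c(1, 6, 25, 26, 9411410)

# Assign each site to a fixed-width window (assumed convention: [k*win.size + 1, (k+1)*win.size])
tile_start <- ((starts - 1) %/% win.size) * win.size + 1
tile_end   <- tile_start + win.size - 1

data.frame(start = starts, tile_start = tile_start, tile_end = tile_end)
#     start tile_start tile_end
# 1       1          1       25
# 2       6          1       25
# 3      25          1       25
# 4      26         26       50
# 5 9411410    9411401  9411425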
14 changes: 14 additions & 0 deletions inst/extdata/test_combo_1.txt
@@ -0,0 +1,14 @@
chr21.1 chr21 1 F 14 30 40
chr21.3 chr21 3 F 5 100 0
chr21.4 chr21 4 R 7 100 0
chr21.6 chr21 6 F 16 80 20
chr21.7 chr21 7 R 20 100 0
chr21.8 chr21 8 F 550 100 0
chr21.10 chr21 10 F 2 50 50
chr21.15 chr21 15 F 300 33 67
chr21.16 chr21 16 R 300 20 80
chr21.20 chr21 20 F 30 100 0
chr21.25 chr21 25 R 50 25 75
chr21.30 chr21 30 F 2 100 0
chr21.31 chr21 31 R 3 100 0
chr21.9411410 chr21 9411410 F 72 89 11
12 changes: 12 additions & 0 deletions inst/extdata/test_combo_2.txt
@@ -0,0 +1,12 @@
chr21.1 chr21 1 F 14 30 70
chr21.6 chr21 6 F 30 75 25
chr21.7 chr21 7 R 25 90 10
chr21.8 chr21 8 F 400 96 4
chr21.10 chr21 10 F 2 50 50
chr21.15 chr21 15 F 250 33 67
chr21.16 chr21 16 R 225 20 80
chr21.20 chr21 20 F 80 100 0
chr21.25 chr21 25 R 25 25 75
chr21.30 chr21 30 F 2 100 0
chr21.31 chr21 31 R 3 100 0
chr21.48086570 chr21 48086570 R 34 89 11
6 changes: 6 additions & 0 deletions inst/extdata/test_freq_1.txt
@@ -0,0 +1,6 @@
chr21.3 chr21 3 F 10 30 40
chr21.4 chr21 4 R 14 100 0
chr21.6 chr21 6 F 16 80 20
chr21.7 chr21 7 R 20 100 0
chr21.8 chr21 8 F 200 100 0
chr21.9 chr21 9 R 200 98 2
6 changes: 6 additions & 0 deletions inst/extdata/test_freq_2.txt
@@ -0,0 +1,6 @@
chr21.3 chr21 3 F 10 100 0
chr21.4 chr21 4 R 14 100 0
chr21.6 chr21 6 F 16 80 20
chr21.7 chr21 7 R 20 100 0
chr21.8 chr21 8 F 200 100 0
chr21.9 chr21 9 R 100 90 2
6 changes: 6 additions & 0 deletions inst/extdata/test_minmax_1.txt
@@ -0,0 +1,6 @@
chr21.3 chr21 3 F 5 100 0
chr21.4 chr21 4 R 14 100 0
chr21.6 chr21 6 F 16 80 20
chr21.7 chr21 7 R 20 100 0
chr21.8 chr21 8 F 200 100 0
chr21.9 chr21 9 R 200 98 2
6 changes: 6 additions & 0 deletions inst/extdata/test_minmax_2.txt
@@ -0,0 +1,6 @@
chr21.3 chr21 3 F 10 100 0
chr21.4 chr21 4 R 14 100 0
chr21.6 chr21 6 F 16 80 20
chr21.7 chr21 7 R 20 100 0
chr21.8 chr21 8 F 550 100 0
chr21.9 chr21 9 R 400 98 2
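The new extdata files above all follow the methylKit-style layout the reader expects: id, chr, start, strand, coverage, freqC, freqT, with the last two columns as percentages. A quick sketch of the < 95% frequency filter from methylSigReadDataSingleFile() applied to the first combo file (the path is illustrative; from an installed package it would come from system.file()):

df <- read.table("inst/extdata/test_combo_1.txt", header = FALSE, stringsAsFactors = FALSE)
names(df) <- c("id", "chr", "start", "strand", "coverage", "numCs", "numTs")

# numCs/numTs are still percentages here; sites summing to < 95 are dropped by
# zeroing coverage. For example, chr21:1 has 30 + 40 = 70 < 95.
invalid <- which(df$numCs + df$numTs < 95)
df$coverage[invalid] <- 0

# Convert percentages to counts for the surviving sites
df$numCs <- round(df$numCs * df$coverage / 100)
df$numTs <- round(df$numTs * df$coverage / 100)

df[df$coverage > 0, c("chr", "start", "coverage", "numCs", "numTs")]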