From 03667374e872f3ba1df8b5a4fd16f6ef2fec0012 Mon Sep 17 00:00:00 2001 From: Devon Ryan Date: Sat, 17 Jul 2021 23:11:02 +0200 Subject: [PATCH 1/5] Implement #61, allow filtering by non-CpG conversion efficiency. Also don't kill threads if there's a sequence fetch error. --- MBias.c | 13 +++++++ MethylDackel.h | 8 +++++ common.c | 74 +++++++++++++++++++++++++++++++++++++++ extract.c | 15 +++++++- tests/chgchh.fa | 2 ++ tests/chgchh_aln.bam | Bin 0 -> 346 bytes tests/chgchh_aln.bam.bai | Bin 0 -> 96 bytes tests/test.py | 33 +++++++++++++++++ 8 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 tests/chgchh.fa create mode 100644 tests/chgchh_aln.bam create mode 100644 tests/chgchh_aln.bam.bai diff --git a/MBias.c b/MBias.c index c062e8d..1cd374e 100644 --- a/MBias.c +++ b/MBias.c @@ -151,6 +151,9 @@ void *extractMBias(void *foo) { fprintf(stderr, "Note that the output will be truncated!\n"); return NULL; } + data->seq = seq; + data->lseq = seqlen; + data->offset = localPos; //Start the pileup iter = bam_mplp_init(1, filter_func, (void **) &data); @@ -263,6 +266,11 @@ void mbias_usage() { " present, or else the alignment is ignored. This is equivalent\n" " to the -f option in samtools. The default is 0, which\n" " includes all alignments.\n" +" --minConversionEfficiency The minimum non-CpG conversion efficiency observed\n" +" in a read to include it in the output. The default is 0.0 and\n" +" the maximum is 1.0 (100%% conversion). You are strongly\n" +" encouraged to NOT use this option without an EXTREMELY\n" +" compelling reason!\n" " --txt Output tab separated metrics to the screen. These can be\n" " imported into R or another program for manual plotting and\n" " analysis. Note that coordinates are 1-based.\n" @@ -312,6 +320,7 @@ int mbias_main(int argc, char *argv[]) { config.requireFlags = 0; config.nThreads = 1; config.chunkSize = 1000000; + config.minConversionEfficiency = 0.0; for(i=0; i<16; i++) config.bounds[i] = 0; for(i=0; i<16; i++) config.absoluteBounds[i] = 0; @@ -330,6 +339,7 @@ int mbias_main(int argc, char *argv[]) { {"nCTOB", 1, NULL, 12}, {"chunkSize", 1, NULL, 13}, {"keepStrand", 0, NULL, 14}, + {"minConversionEfficiency", 1, NULL, 15}, {"ignoreFlags", 1, NULL, 'F'}, {"requireFlags", 1, NULL, 'R'}, {"help", 0, NULL, 'h'}, @@ -400,6 +410,9 @@ int mbias_main(int argc, char *argv[]) { case 14: keepStrand = 1; break; + case 15: + config.minConversionEfficiency = atof(optarg); + break; case 'F' : config.ignoreFlags = atoi(optarg); break; diff --git a/MethylDackel.h b/MethylDackel.h index 1661f14..a2f2d9e 100644 --- a/MethylDackel.h +++ b/MethylDackel.h @@ -75,6 +75,7 @@ typedef struct { @field output_fp Output file pointers (to CpG, CHG, and CHH, respectively) @field reg A region specified by -r @field BAMName The BAM file name + @field minConversionEfficiency The minimum acceptable conversion efficiency in non-CpG positions for read inclusion @field fp Input file pointer (must be a BAM or CRAM file) @field bai The index for fp @field bedName The BED file name @@ -96,6 +97,7 @@ typedef struct { FILE **output_fp; char *reg; char *BAMName; + float minConversionEfficiency; char *BWName; char *outBBMName; char *BBMName; @@ -128,6 +130,9 @@ typedef struct { @field iter: The alignment iterator that should be traversed. @field ohash: A pointer to the hash table needed for overlap detection @field bedIdx: The last index into the BED file + @field lseq: The length of seq. + @field seq: The sequence for the current region + @field offset: The beginning position of seq on the relevant contig */ typedef struct { Config *config; @@ -136,6 +141,9 @@ typedef struct { hts_itr_t *iter; void *ohash; int32_t bedIdx; + int lseq; + char *seq; + uint32_t offset; } mplp_data; /*! @function diff --git a/common.c b/common.c index 740ce95..599d27e 100644 --- a/common.c +++ b/common.c @@ -312,6 +312,75 @@ char check_mappability(void *data, bam1_t *b) { return num_mappable_reads; } +// This is the same as updateMetrics, 1 on methylation, -1 on unmethylation +int getMethylState(bam1_t *b, int seqPos, Config *config) { + uint8_t base = bam_seqi(bam_get_seq(b), seqPos); + int strand = getStrand(b); //1=OT, 2=OB, 3=CTOT, 4=CTOB + + if(strand==0) { + fprintf(stderr, "Can't determine the strand of a read!\n"); + assert(strand != 0); + } + //Is the phred score even high enough? + if(bam_get_qual(b)[seqPos] < config->minPhred) return 0; + + if(base == 2 && (strand==1 || strand==3)) return 1; //C on an OT/CTOT alignment + else if(base == 8 && (strand==1 || strand==3)) return -1; //T on an OT/CTOT alignment + else if(base == 4 && (strand==2 || strand==4)) return 1; //G on an OB/CTOB alignment + else if(base == 1 && (strand==2 || strand==4)) return -1; //A on an OB/CTOB alignment + return 0; +} + +float computeEfficiency(unsigned int nMethyl, unsigned int nUMethyl) { + if(nMethyl + nUMethyl == 0) return 1.0; + return nUMethyl / ((float)(nMethyl + nUMethyl)); +} + +float computeConversionEfficiency(bam1_t *b, mplp_data *ldata) { + unsigned int nMethyl = 0, nUMethyl = 0; + uint32_t i, j, seqEnd = ldata->offset + ldata->lseq; // 1-base after the end of the sequence + uint32_t *cigar = bam_get_cigar(b), op, opLen; + int direction, type, state; + int pos = b->core.pos, seqPos = 0; //position in the genome and position in the read + + for(i=0; icore.n_cigar; i++) { + op = bam_cigar_op(cigar[i]); + opLen = bam_cigar_oplen(cigar[i]); + + switch(op) { + case 0: + case 7: + case 8: + // do something + for(j=0; j= seqEnd) return computeEfficiency(nMethyl, nUMethyl); + if(isCpG(ldata->seq, pos+j-ldata->offset, ldata->lseq)) { + continue; + } else if((direction = isCHG(ldata->seq, pos+j-ldata->offset, ldata->lseq))) { + state = getMethylState(b, seqPos, ldata->config); + if(state > 0) nMethyl++; + else if(state < 0) nUMethyl++; + } else if((direction = isCHH(ldata->seq, pos+j-ldata->offset, ldata->lseq))) { + state = getMethylState(b, seqPos, ldata->config); + if(state > 0) nMethyl++; + else if(state < 0) nUMethyl++; + } + } + break; + case 1: // I, consume read + case 4: // S, consume read + seqPos += opLen; + break; + case 2: // D, consume seq + case 3: // N, consume seq + pos += opLen; + break; + } + } + + return computeEfficiency(nMethyl, nUMethyl); +} + //This will need to be restructured to handle multiple input files int filter_func(void *data, bam1_t *b) { int rv, NH, overlap; @@ -345,6 +414,11 @@ int filter_func(void *data, bam1_t *b) { } } + // This is (A) moderately expensive to compute and (B) not completely correct at chunk boundaries. + if(ldata->config->minConversionEfficiency > 0.0) { + if(computeConversionEfficiency(b, ldata) < ldata->config->minConversionEfficiency) continue; + } + /*********************************************************************** * * Deal with bounds inclusion (--OT, --OB, etc.) diff --git a/extract.c b/extract.c index a92ca71..94a6016 100644 --- a/extract.c +++ b/extract.c @@ -381,8 +381,11 @@ void *extractCalls(void *foo) { fprintf(stderr, "faidx_fetch_seq returned %i while trying to fetch the sequence for tid %s:%"PRIu32"-%"PRIu32"!\n",\ seqlen, hdr->target_name[localTid], localPos2, localEnd); fprintf(stderr, "Note that the output will be truncated!\n"); - return NULL; + continue; } + data->seq = seq; + data->offset = localPos2; + data->lseq = seqlen; //Start the pileup data->ohash = initOlapHash(); @@ -644,6 +647,11 @@ void extract_usage() { " -q apply here as well. Note further that if you use\n" " --mergeContext that a merged site will be excluded if either\n" " of its individual Cs would be excluded.\n" +" --minConversionEfficiency The minimum non-CpG conversion efficiency observed\n" +" in a read to include it in the output. The default is 0.0 and\n" +" the maximum is 1.0 (100%% conversion). You are strongly\n" +" encouraged to NOT use this option without an EXTREMELY\n" +" compelling reason!\n" " --maxVariantFrac The maximum fraction of A/T/C base calls on the strand\n" " opposite of a C to allow before a position is declared a\n" " variant (thereby being excluded from output). The default is\n" @@ -734,6 +742,7 @@ int extract_main(int argc, char *argv[]) { config.chunkSize = 1000000; config.cytosine_report = 0; config.noBAM = 0; + config.minConversionEfficiency = 0.0; for(i=0; i<16; i++) config.bounds[i] = 0; for(i=0; i<16; i++) config.absoluteBounds[i] = 0; @@ -764,6 +773,7 @@ int extract_main(int argc, char *argv[]) { {"chunkSize", 1, NULL, 19}, {"keepStrand", 0, NULL, 20}, {"cytosine_report", 0, NULL, 21}, + {"minConversionEfficiency", 1, NULL, 22}, {"ignoreFlags", 1, NULL, 'F'}, {"requireFlags", 1, NULL, 'R'}, {"help", 0, NULL, 'h'}, @@ -870,6 +880,9 @@ int extract_main(int argc, char *argv[]) { case 21: config.cytosine_report = 1; break; + case 22: + config.minConversionEfficiency = atof(optarg); + break; case 'M': config.BWName = optarg; break; diff --git a/tests/chgchh.fa b/tests/chgchh.fa new file mode 100644 index 0000000..79325ff --- /dev/null +++ b/tests/chgchh.fa @@ -0,0 +1,2 @@ +>contig1 +CGCTGCTGCTGCTGCTGCTGCTGCTGCTTCTTCTTCTTCG diff --git a/tests/chgchh_aln.bam b/tests/chgchh_aln.bam new file mode 100644 index 0000000000000000000000000000000000000000..001928607bbbabd68f401a92bc00f5cfd5c4ab75 GIT binary patch literal 346 zcmb2|=3rp}f&Xj_PR>jWvl)tuzNEfMPe>^EQ1FoF^Jb$f#t(SDZPz-heNJD?)AwN! z2j3H(Z=1c&o!36;WB%mJ$IV8D4~jSno_{IXsO5J?_vHEWn$LJXIBz(ZCLvX1_4~k4 zkyA#&r$tL9e-iZy>h|vXI@$QrWMiX8!o{DyOcE+KWNrMlL!!~|5ku07qdZbPJUk4t zB2mj%1Dzy~=C*|lv(Gj#)v>ft%aljV`L{onup@l$5aNIEg$z>xzBE1w(>0u#ncF-T6r^}eDQ0kXVTn*s(l}+=qIgKwhfV$e z?B0iajxxWoDRT literal 0 HcmV?d00001 diff --git a/tests/chgchh_aln.bam.bai b/tests/chgchh_aln.bam.bai new file mode 100644 index 0000000000000000000000000000000000000000..288c155bdc3bdfb07c3c86b70622b6dc182756aa GIT binary patch literal 96 vcmZ>A^kigYU|?VZVoxCk1`wNpVGfvNV6X#oy(U5A(ZyMys$dkVUWh0F4qF6b literal 0 HcmV?d00001 diff --git a/tests/test.py b/tests/test.py index 042e922..9b36e0b 100644 --- a/tests/test.py +++ b/tests/test.py @@ -94,4 +94,37 @@ def rm(f): lines = sum(1 for _ in open('test9_CpG.bedGraph')) assert lines == 48 rm('test9_CpG.bedGraph') + +# Check conversion efficiency. 2 read pairs, one mostly converted +# By default, 1 read is MAPQ filtered, another is kept +rm('test10_CpG.bedGraph') +check_call([MPath, 'extract', '-o', 'test10', 'chgchh.fa', 'chgchh_aln.bam']) +assert op.exists('test10_CpG.bedGraph') +lines = sum(1 for _ in open('test10_CpG.bedGraph')) +assert lines == 2 +rm('test10_CpG.bedGraph') + +# Ensure 2 reads/positions are covered by changing MAPQ +rm('test11_CpG.bedGraph') +check_call([MPath, 'extract', '-o', 'test11', '-q', '5', 'chgchh.fa', 'chgchh_aln.bam']) +assert op.exists('test11_CpG.bedGraph') +lines = sum(1 for _ in open('test11_CpG.bedGraph')) +assert lines == 3 +rm('test11_CpG.bedGraph') + +# Only 1 read has a conversion efficiency >=0.9 +rm('test12_CpG.bedGraph') +check_call([MPath, 'extract', '-o', 'test12', '-q', '5', '--minConversionEfficiency', '0.9', 'chgchh.fa', 'chgchh_aln.bam']) +assert op.exists('test12_CpG.bedGraph') +lines = sum(1 for _ in open('test12_CpG.bedGraph')) +assert lines == 2 +rm('test12_CpG.bedGraph') + +# No perfectly converted reads +rm('test13_CpG.bedGraph') +check_call([MPath, 'extract', '-o', 'test13', '-q', '5', '--minConversionEfficiency', '1.0', 'chgchh.fa', 'chgchh_aln.bam']) +assert op.exists('test13_CpG.bedGraph') +lines = sum(1 for _ in open('test13_CpG.bedGraph')) +assert lines == 1 +rm('test13_CpG.bedGraph') print("Finished correctly") From ad38eb33708546677c2605e34d60b03b59996159 Mon Sep 17 00:00:00 2001 From: Devon Ryan Date: Sat, 17 Jul 2021 23:21:09 +0200 Subject: [PATCH 2/5] expand htslib test versions --- azure-pipeline.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/azure-pipeline.yml b/azure-pipeline.yml index 5dabace..75aa38d 100644 --- a/azure-pipeline.yml +++ b/azure-pipeline.yml @@ -10,6 +10,10 @@ jobs: matrix: htslib111: htslib_version: '1.11' + htslib112: + htslib_version: '1.12' + htslib113: + htslib_version: '1.13' maxParallel: 1 steps: @@ -27,6 +31,10 @@ jobs: matrix: htslib111: htslib_version: '1.11' + htslib112: + htslib_version: '1.12' + htslib113: + htslib_version: '1.13' maxParallel: 1 steps: From 4911afd5f8d8c1c1177c15de13fc2f60f235b23d Mon Sep 17 00:00:00 2001 From: Devon Ryan Date: Thu, 22 Jul 2021 09:42:48 +0200 Subject: [PATCH 3/5] Document minConversionEfficiency closes #61 --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 9716381..b964382 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,11 @@ Excluding low-coverage regions If your downstream analysis requires an absolute minimum coverage (here, defined as the number of methylation calls kept after filtering for MAPQ, phred score, etc.), you can use the `--minDepth` option to achieve this. By default, `MethylDackel extract` will output all methylation metrics as long as the coverage is at least 1. If you use `--minDepth 10`, then only sites covered at least 10x will be output. This works in conjunction with the `--mergeContext` option, above. So if you request per-CpG context output (i.e., with `--mergeContext`) and `--minDepth 10` then only CpGs with a minimum coverage of 10 will be output. +Excluding partially converted reads +=================================== + +Some users wish to filter out reads that have evidence of incomplete bisulfite conversion. This can be determined by looking for CHH and CHG-context Cytosines in each read and determining their methylation state. Such filtering should generally be avoided, since there are regions in most genomes with at least some CHH and CHG-context Cytosine methylation, which would cause excess filtering in those regions. However, if you absolutely need to filter out only partially-converted alignments, you can do so with the `--minConversionEfficiency` option. The default is 0, meaning that CHH and CHG-context Cytosine conversion status is ignored. The maximum value is 1.0, meaning that 100% of the CHH and CHG-context Cytosines in an alignment must be converted to T. Note that an alignment with no CHH or CHG-context Cytosines will be given a conversion efficiency of 1.0. + Logit, fraction, and counts only output ======================================= From 045ed3d05a86334635bb92374d0f7f67eb087073 Mon Sep 17 00:00:00 2001 From: Devon Ryan Date: Thu, 22 Jul 2021 09:44:53 +0200 Subject: [PATCH 4/5] Bump to 0.6.0 --- CHANGELOG.md | 4 ++++ Makefile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c67d46a..f1fc190 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +Version 0.6.0: + + * Added the `minConversionEfficiency` option, which allows filtering out incompletely converted alignments on the fly. Note that doing this is generally NOT recommended. (issue #61) + Version 0.5.3: * Fixed an issue with the `perRead` subcommand, wherein the requireFlags option didn't fully work (a read would pass if it had at least one of the required flags set, rather than all of them). (issue #117) diff --git a/Makefile b/Makefile index d77c5d7..1564268 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ CFLAGS ?= -Wall -g -O3 -pthread all: MethylDackel OBJS = common.o bed.o svg.o overlaps.o extract.o MBias.o mergeContext.o perRead.o -VERSION = 0.5.3 +VERSION = 0.6.0 version.h: echo '#define VERSION "$(VERSION)"' > $@ From e52ba66070c4db748681da67a560365d866d61e6 Mon Sep 17 00:00:00 2001 From: Devon Ryan Date: Thu, 22 Jul 2021 09:48:25 +0200 Subject: [PATCH 5/5] one more thing --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1fc190..4323ac2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ Version 0.6.0: * Added the `minConversionEfficiency` option, which allows filtering out incompletely converted alignments on the fly. Note that doing this is generally NOT recommended. (issue #61) + * Fixed various segfaults and other issues related to mappability filtering (courtesy of @Valiec) Version 0.5.3: