From d089c5375159f1fa12e23063390ac2dfff794d68 Mon Sep 17 00:00:00 2001 From: Devon Ryan Date: Mon, 27 Sep 2021 08:42:23 +0200 Subject: [PATCH 1/3] implement #122, the --ignoreNH option --- MBias.c | 9 ++++++++- MethylDackel.h | 2 ++ common.c | 10 ++++++---- extract.c | 8 ++++++++ 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/MBias.c b/MBias.c index 1cd374e..6455c97 100644 --- a/MBias.c +++ b/MBias.c @@ -266,6 +266,9 @@ void mbias_usage() { " present, or else the alignment is ignored. This is equivalent\n" " to the -f option in samtools. The default is 0, which\n" " includes all alignments.\n" +" --ignoreNH Ignore NH auxiliary tags. By default, if an NH tag is present\n" +" and its value is >1 then an entry is ignored as a\n" +" multimapper.\n" " --minConversionEfficiency The minimum non-CpG conversion efficiency observed\n" " in a read to include it in the output. The default is 0.0 and\n" " the maximum is 1.0 (100%% conversion). You are strongly\n" @@ -309,7 +312,7 @@ int mbias_main(int argc, char *argv[]) { config.keepCpG = 1; config.keepCHG = 0; config.keepCHH = 0; config.minMapq = 10; config.minPhred = 5; config.keepDupes = 0; config.keepSingleton = 0, config.keepDiscordant = 0; - config.filterMappability = 0; + config.filterMappability = 0, config.ignoreNH = 0; config.fp = NULL; config.bai = NULL; @@ -340,6 +343,7 @@ int mbias_main(int argc, char *argv[]) { {"chunkSize", 1, NULL, 13}, {"keepStrand", 0, NULL, 14}, {"minConversionEfficiency", 1, NULL, 15}, + {"ignoreNH", 0, NULL, 16}, {"ignoreFlags", 1, NULL, 'F'}, {"requireFlags", 1, NULL, 'R'}, {"help", 0, NULL, 'h'}, @@ -413,6 +417,9 @@ int mbias_main(int argc, char *argv[]) { case 15: config.minConversionEfficiency = atof(optarg); break; + case 16: + config.ignoreNH = 1; + break; case 'F' : config.ignoreFlags = atoi(optarg); break; diff --git a/MethylDackel.h b/MethylDackel.h index a2f2d9e..0557e3e 100644 --- a/MethylDackel.h +++ b/MethylDackel.h @@ -67,6 +67,7 @@ typedef struct { @field merge 1: Merge Cs in either a CpG or CHG context into single entries @field methylKit Output in a format compatible with methylKit @field minOppositeDepth Minimum depth covering the opposite strand needed to look for variants + @field ignoreNH If set, don't exclude alignments with NH auxiliary tags with values >1 (i.e., marked multimappers). @field maxVariantFrac If the fraction of non-Gs on the opposite strand is greater than this then a position is excluded. @field fraction 1: Output should be the methylation fraction only, 0: otherwise @field counts 1: Output just the coverage @@ -91,6 +92,7 @@ typedef struct { int minMapq, minPhred, keepDupes, minDepth; int keepDiscordant, keepSingleton, ignoreFlags, requireFlags; int merge, methylKit, minOppositeDepth; + int ignoreNH; double maxVariantFrac; int fraction, counts, logit; int cytosine_report; diff --git a/common.c b/common.c index 31cf17d..8cd55a3 100644 --- a/common.c +++ b/common.c @@ -418,10 +418,12 @@ int filter_func(void *data, bam1_t *b) { if(b->core.flag & ldata->config->ignoreFlags) continue; //By default: secondary alignments, QC failed, PCR duplicates, and supplemental alignments if(ldata->config->requireFlags && (b->core.flag & ldata->config->requireFlags) != ldata->config->requireFlags) continue; if(!ldata->config->keepDupes && b->core.flag & BAM_FDUP) continue; - p = bam_aux_get(b, "NH"); - if(p != NULL) { - NH = bam_aux2i(p); - if(NH>1) continue; //Ignore obvious multimappers + if(!ldata->config->ignoreNH) { + p = bam_aux_get(b, "NH"); + if(p != NULL) { + NH = bam_aux2i(p); + if(NH>1) continue; //Ignore obvious multimappers + } } if((ldata->config->filterMappability) && check_mappability(ldata, b) == 0) continue; //Low mappability if(!ldata->config->keepSingleton && (b->core.flag & 0x9) == 0x9) continue; //Singleton diff --git a/extract.c b/extract.c index aa12581..44cc5b6 100644 --- a/extract.c +++ b/extract.c @@ -639,6 +639,9 @@ void extract_usage() { " file with a .counts.bedGraph extension.\n" " --logit Extract logit(M/(M+U)) (only) at each position. This produces\n" " a file with a .logit.bedGraph extension.\n" +" --ignoreNH Ignore NH auxiliary tags. By default, if an NH tag is present\n" +" and its value is >1 then an entry is ignored as a\n" +" multimapper.\n" " --minOppositeDepth If you would like to exclude sites that likely contain\n" " SNPs, then specifying a value greater than 0 here will\n" " indicate the minimum depth required on the strand opposite of\n" @@ -722,6 +725,7 @@ int extract_main(int argc, char *argv[]) { config.keepCpG = 1; config.keepCHG = 0; config.keepCHH = 0; config.minMapq = 10; config.minPhred = 5; config.keepDupes = 0; config.keepSingleton = 0, config.keepDiscordant = 0; + config.ignoreNH = 0; config.minDepth = 1; config.methylKit = 0; config.merge = 0; @@ -776,6 +780,7 @@ int extract_main(int argc, char *argv[]) { {"keepStrand", 0, NULL, 20}, {"cytosine_report", 0, NULL, 21}, {"minConversionEfficiency", 1, NULL, 22}, + {"ignoreNH", 0, NULL, 23}, {"ignoreFlags", 1, NULL, 'F'}, {"requireFlags", 1, NULL, 'R'}, {"help", 0, NULL, 'h'}, @@ -885,6 +890,9 @@ int extract_main(int argc, char *argv[]) { case 22: config.minConversionEfficiency = atof(optarg); break; + case 23: + config.ignoreNH = 1; + break; case 'M': config.BWName = optarg; break; From 48a25188de047cd2cd9e8c111dc9d3c80dbef311 Mon Sep 17 00:00:00 2001 From: Devon Ryan Date: Mon, 27 Sep 2021 09:36:16 +0200 Subject: [PATCH 2/3] --ignoreNH in perRead and add test for extract subcommand --- perRead.c | 7 +++++++ tests/NH.bam | Bin 0 -> 529 bytes tests/NH.bam.bai | Bin 0 -> 96 bytes tests/test.py | 18 ++++++++++++++++++ 4 files changed, 25 insertions(+) create mode 100644 tests/NH.bam create mode 100644 tests/NH.bam.bai diff --git a/perRead.c b/perRead.c index 366f701..2c9a00c 100644 --- a/perRead.c +++ b/perRead.c @@ -259,6 +259,9 @@ void perRead_usage() { " present, or else the alignment is ignored. This is equivalent to the\n" " -f option in samtools. The default is 0, which includes all\n" " alignments.\n" +" --ignoreNH Ignore NH auxiliary tags. By default, if an NH tag is present\n" +" and its value is >1 then an entry is ignored as a\n" +" multimapper.\n" " -@ INT The number of threads to use, the default 1\n" " --chunkSize INT The size of the genome processed by a single thread at a time.\n" " The default is 1000000 bases. This value MUST be at least 1.\n" @@ -280,6 +283,7 @@ int perRead_main(int argc, char *argv[]) { config.keepCpG = 1; config.keepCHG = 0; config.keepCHH = 0; config.minMapq = 10; config.minPhred = 5; config.keepDupes = 0; config.keepSingleton = 0, config.keepDiscordant = 0; + config.ignoreNH = 0; config.fp = NULL; config.bai = NULL; config.reg = NULL; @@ -344,6 +348,9 @@ int perRead_main(int argc, char *argv[]) { case 20: keepStrand = 1; break; + case 21: + config.ignoreNH = 1; + break; default : fprintf(stderr, "Invalid option '%c'\n", c); perRead_usage(); diff --git a/tests/NH.bam b/tests/NH.bam new file mode 100644 index 0000000000000000000000000000000000000000..a53eee620761f00d95204b98e75e1160c9a543e7 GIT binary patch literal 529 zcmb2|=3rp}f&Xj_PR>jWR*a<+PiGxA5OAI9>-e)v@wHdo68;Zbz0979rNJ`0OgF8) z#S|T+s-E|M)%*xc)lAQmNlPy7JuEnVroBz?J~xiDVcdUaxiEe${3Y{tN$9f2Pb;SF zdtlPpb?b=tp0a)K`_pHu*2TE&d}V#md5@>xTILp$CFT}!?!xQCbiaG$+Rgk@^zc=B z*9NzcNf8Q5%zd)+6w<>)j zda0DP%b&OA^^e(1X7FY9o4sLu-*_b;L^50DY-F|H;)9m&uf(%GH^t@5Gb_W-~K?MYpz;Hjx&A!*l-s|1Z8&ZJC6T z#Rfxz6%LNwToWAvo5LDp8D*O1N-F(nxN(2x!SmN=9=xu4>cW8rMIj9tNuj4Q2LuEe z#X3Ipc3pCA4Xp|cX_Kqs5p)bZr^qAtF2Ts+=luEY#=%@}7XE5Vb2{0!aHsR;S}ahQ zvyxx&;hT3I9M9u#9JZX9X<{49(CL5PXYtFKOdZO$%nS!Acz71DV!B+Kff<|zKm-7_ C-`a`* literal 0 HcmV?d00001 diff --git a/tests/NH.bam.bai b/tests/NH.bam.bai new file mode 100644 index 0000000000000000000000000000000000000000..5331b67544cb848273d8a1b09e8dec2866fb7fde GIT binary patch literal 96 ucmZ>A^kigYU|?VZVoxCk1`wNp!5Ts_dvGo%mlCi literal 0 HcmV?d00001 diff --git a/tests/test.py b/tests/test.py index 9b36e0b..fe16e3e 100644 --- a/tests/test.py +++ b/tests/test.py @@ -127,4 +127,22 @@ def rm(f): lines = sum(1 for _ in open('test13_CpG.bedGraph')) assert lines == 1 rm('test13_CpG.bedGraph') + +# Test ignoreNH +49 for --ignoreNH +rm('test14_CpG.bedGraph') +check_call([MPath, 'extract', '-o', 'test14', '-q', '1', 'cg100.fa', 'NH.bam']) +assert op.exists('test14_CpG.bedGraph') +lines = sum(1 for _ in open('test14_CpG.bedGraph')) +assert lines == 1 +rm('test14_CpG.bedGraph') + +# Test ignoreNH +rm('test15_CpG.bedGraph') +check_call([MPath, 'extract', '-o', 'test15', '--ignoreNH', '-q', '1', 'cg100.fa', 'NH.bam']) +assert op.exists('test15_CpG.bedGraph') +lines = sum(1 for _ in open('test15_CpG.bedGraph')) +assert lines == 49 +rm('test15_CpG.bedGraph') print("Finished correctly") + From e2013305b1ca5e10419bd0be4bc74a0996116922 Mon Sep 17 00:00:00 2001 From: Devon Ryan Date: Mon, 27 Sep 2021 09:41:03 +0200 Subject: [PATCH 3/3] typo --- tests/test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test.py b/tests/test.py index fe16e3e..2276cee 100644 --- a/tests/test.py +++ b/tests/test.py @@ -129,7 +129,6 @@ def rm(f): rm('test13_CpG.bedGraph') # Test ignoreNH -49 for --ignoreNH rm('test14_CpG.bedGraph') check_call([MPath, 'extract', '-o', 'test14', '-q', '1', 'cg100.fa', 'NH.bam']) assert op.exists('test14_CpG.bedGraph')