From 7fb35bdfdddd79923e62d7135d35e5588516675a Mon Sep 17 00:00:00 2001 From: Nicholas Youngblut Date: Tue, 26 Jan 2021 14:34:46 +0100 Subject: [PATCH] prefix of X for filler taxonomies; changed default fraction cutoff --- ncbi-gtdb_map.py | 10 +++++----- tests/data/ncbi-gtdb/gtdb_tax_queries.txt | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ncbi-gtdb_map.py b/ncbi-gtdb_map.py index b60a6c9..48ce7da 100755 --- a/ncbi-gtdb_map.py +++ b/ncbi-gtdb_map.py @@ -94,7 +94,7 @@ parser.add_argument('-q', '--query-taxonomy', type=str, default='ncbi_taxonomy', choices=['ncbi_taxonomy', 'gtdb_taxonomy'], help='Taxonomy of the query list (Default: %(default)s)') -parser.add_argument('-f', '--fraction', type=float, default=0.51, +parser.add_argument('-f', '--fraction', type=float, default=0.90, help='Homogeneity of LCA (fraction) in order to be used (Default: %(default)s)') parser.add_argument('-m', '--max-tips', type=int, default=100, help='Max no. of tips used for LCA determination. If more, subsampling w/out replacement (Default: %(default)s)') @@ -205,7 +205,7 @@ def format_taxonomy(T, hierarchy, acc): Tx = ['' for i in range(len(hierarchy))] for i,x in enumerate(hierarchy[:-1]): if len(T) < i + 1 or T[i] == '' or T[i] == 'unclassified' or regex.search(T[i]): - Tx[i] = '__'.join([x[0], acc]) + Tx[i] = '__'.join(['X' + x[0], acc]) else: Tx[i] = T[i] Tx[-1] = acc @@ -294,7 +294,7 @@ def load_gtdb_metadata(infile, G, completeness, contamination): raise KeyError('Cannot find "ncbi_taxonomy"') if X == 'none': stats['no ncbi tax'] += 1 - continue + continue # filtering by checkM stats try: X = line[header['checkm_completeness']] @@ -351,7 +351,7 @@ def lca_frac_pass(D, lca_frac): mc = D.most_common(1) except IndexError: return [None,None] - if re.search(r'^[pcofgs]__$', mc[0][0]): + if re.search(r'^[Xx][pcofgs]__', mc[0][0]): return [None,None] try: frac = mc[0][1] / float(sum(D.values())) @@ -401,7 +401,7 @@ def _query_tax(tax_queries, G, qtax, ttax, lca_frac=1.0, max_tips=100, verbose=F # iterating queries for Q in tax_queries: tips = [] - try: + try: # getting descendents of the node tips = [desc for desc in descendants(G[qtax], Q[0]) if \ G[qtax].nodes[desc]['taxonomy'] == 'strain'] diff --git a/tests/data/ncbi-gtdb/gtdb_tax_queries.txt b/tests/data/ncbi-gtdb/gtdb_tax_queries.txt index eb99f57..fdbcd74 100644 --- a/tests/data/ncbi-gtdb/gtdb_tax_queries.txt +++ b/tests/data/ncbi-gtdb/gtdb_tax_queries.txt @@ -1,3 +1,4 @@ +f__Gearchaeaceae g__Aquabacter s__Nitrosopumilus sp000746785 g__Escherichia @@ -6,5 +7,6 @@ s__Xanthomonas oryzae c__Gammaproteobacteria o__Burkholderiales c__Bacteroidia +f__BM003 s__Homo sapiens Blank