From 7abfa9aa59e791104f652db316267f1635f408c5 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 15 Nov 2024 13:20:29 +0100 Subject: [PATCH] add N90 to genome stats --- bakta/main.py | 2 ++ bakta/utils.py | 15 +++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/bakta/main.py b/bakta/main.py index 3a05f94a..cbcd6689 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -501,6 +501,7 @@ def main(): print(f"\tContigs/replicons: {len(data['sequences'])}") print(f"\tGC: {100 * data['stats']['gc']:.1f} %") print(f"\tN50: {data['stats']['n50']:,}") + print(f"\tN90: {data['stats']['n90']:,}") print(f"\tN ratio: {100 * data['stats']['n_ratio']:.1f} %") print(f"\tcoding density: {100 * data['stats']['coding_ratio']:.1f} %") print('\nannotation summary:') @@ -594,6 +595,7 @@ def main(): fh_out.write(f"Count: {len(data['sequences'])}\n") fh_out.write(f"GC: {100 * data['stats']['gc']:.1f}\n") fh_out.write(f"N50: {data['stats']['n50']:}\n") + fh_out.write(f"N90: {data['stats']['n90']:}\n") fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n") fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n") fh_out.write('\nAnnotation:\n') diff --git a/bakta/utils.py b/bakta/utils.py index 20e3d337..14e87dee 100644 --- a/bakta/utils.py +++ b/bakta/utils.py @@ -309,16 +309,19 @@ def calc_genome_stats(data: dict): data['stats']['n_ratio'] = n_ratio log.info('N=%0.3f', n_ratio) - n50 = 0 sequence_length_sum = 0 for seq in sorted(data['sequences'], key=lambda x: x['length'], reverse=True): nt_length = len(seq['nt']) sequence_length_sum += nt_length - if(sequence_length_sum >= genome_size / 2): - n50 = nt_length - break - data['stats']['n50'] = n50 - log.info('N50=%i', n50) + if(sequence_length_sum >= genome_size * 0.5): + if 'n50' not in data['stats']: + data['stats']['n50'] = nt_length + log.info('N50=%i', nt_length) + if(sequence_length_sum >= genome_size * 0.9): + if 'n90' not in data['stats']: + data['stats']['n90'] = nt_length + log.info('N90=%i', nt_length) + sequence_by_id = {seq['id']: seq for seq in data['sequences']} coding_nts = 0