Skip to content

Commit

Permalink
add N90 to genome stats
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Nov 15, 2024
1 parent 6332cfd commit 7abfa9a
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 6 deletions.
2 changes: 2 additions & 0 deletions bakta/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ def main():
print(f"\tContigs/replicons: {len(data['sequences'])}")
print(f"\tGC: {100 * data['stats']['gc']:.1f} %")
print(f"\tN50: {data['stats']['n50']:,}")
print(f"\tN90: {data['stats']['n90']:,}")
print(f"\tN ratio: {100 * data['stats']['n_ratio']:.1f} %")
print(f"\tcoding density: {100 * data['stats']['coding_ratio']:.1f} %")
print('\nannotation summary:')
Expand Down Expand Up @@ -594,6 +595,7 @@ def main():
fh_out.write(f"Count: {len(data['sequences'])}\n")
fh_out.write(f"GC: {100 * data['stats']['gc']:.1f}\n")
fh_out.write(f"N50: {data['stats']['n50']:}\n")
fh_out.write(f"N90: {data['stats']['n90']:}\n")
fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n")
fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n")
fh_out.write('\nAnnotation:\n')
Expand Down
15 changes: 9 additions & 6 deletions bakta/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,16 +309,19 @@ def calc_genome_stats(data: dict):
data['stats']['n_ratio'] = n_ratio
log.info('N=%0.3f', n_ratio)

n50 = 0
sequence_length_sum = 0
for seq in sorted(data['sequences'], key=lambda x: x['length'], reverse=True):
nt_length = len(seq['nt'])
sequence_length_sum += nt_length
if(sequence_length_sum >= genome_size / 2):
n50 = nt_length
break
data['stats']['n50'] = n50
log.info('N50=%i', n50)
if(sequence_length_sum >= genome_size * 0.5):
if 'n50' not in data['stats']:
data['stats']['n50'] = nt_length
log.info('N50=%i', nt_length)
if(sequence_length_sum >= genome_size * 0.9):
if 'n90' not in data['stats']:
data['stats']['n90'] = nt_length
log.info('N90=%i', nt_length)


sequence_by_id = {seq['id']: seq for seq in data['sequences']}
coding_nts = 0
Expand Down

0 comments on commit 7abfa9a

Please sign in to comment.