Skip to content

Commit

Permalink
ENH Better error messages in coverage
Browse files Browse the repository at this point in the history
Give more context in the error message when a contig name cannot be split by the separator or its sample has no threshold entry

Slight refactor to use better argument/variable names

/cc #168
  • Loading branch information
luispedro committed Aug 3, 2024
1 parent 8bb2343 commit e31fc97
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 9 deletions.
1 change: 1 addition & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
Unreleased
* logging: use 'SemiBin2' as logger name
* SemiBin: always log to file in DEBUG level and log command-line arguments
* coverage: Better error messages (#168)

Version 2.1.0 Mar 6 2024 by BigDataBiology
* SemiBin: Support running SemiBin with strobealign-aemb
Expand Down
26 changes: 18 additions & 8 deletions SemiBin/generate_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .atomicwrite import atomic_write

def calculate_coverage(depth_stream, bam_file, must_link_threshold, edge=75, is_combined=False,
contig_threshold=1000, sep=None, contig_threshold_dict=None):
contig_threshold=1000, sep=None, sample_contig_threshold=None):
"""
depth_stream : an iterable like the output of bedtools genomecov
bam_file : str filename
Expand Down Expand Up @@ -41,8 +41,13 @@ def calculate_coverage(depth_stream, bam_file, must_link_threshold, edge=75, is_
if sep is None:
cov_threshold = contig_threshold
else:
sample_name = contig_name.split(sep)[0]
cov_threshold = contig_threshold_dict[sample_name]
try:
sample, contig = contig_name.split(sep)
cov_threshold = sample_contig_threshold[sample]
except ValueError:
raise ValueError(f"Error parsing contig name '{contig_name}' in file {bam_file} (trying to split by {sep} separator)")
except KeyError:
raise KeyError(f"Error: {sample} not found (parsing {bam_file}")
if len(depth_value) < cov_threshold:
continue
depth_value_ = depth_value[edge:-edge]
Expand Down Expand Up @@ -104,7 +109,7 @@ def generate_cov(bam_file, bam_index, out, threshold,
is_combined=is_combined,
sep=sep,
contig_threshold=(contig_threshold if sep is None else 1000),
contig_threshold_dict=(contig_threshold if sep is not None else None))
sample_contig_threshold=(contig_threshold if sep is not None else None))

if bed_p.wait() != 0:
raise OSError(f"Failure in running bedtools ({bam_file})")
Expand Down Expand Up @@ -172,7 +177,7 @@ def combine_cov(cov_dir : str, bam_list, is_combined : bool): # bam_list : list[
data_split_cov = None
return data_cov, data_split_cov

def generate_cov_from_abundances(abundances, output, contig_path, contig_threshold=1000, sep=None, contig_threshold_dict=None):
def generate_cov_from_abundances(abundances, output, contig_path, contig_threshold=1000, sep=None, sample_contig_threshold=None):
import pandas as pd
import numpy as np
from .fasta import fasta_iter
Expand All @@ -188,8 +193,13 @@ def generate_cov_from_abundances(abundances, output, contig_path, contig_thresho
if sep is None:
cov_threshold = contig_threshold
else:
sample_name = h.split(sep)[0]
cov_threshold = contig_threshold_dict[sample_name]
try:
sample_name, contig_name = h.split(sep)
cov_threshold = sample_contig_threshold[sample_name]
except ValueError:
raise ValueError(f"Error parsing contig name '{h}' in file {abun_file} (trying to split by {sep} separator)")
except KeyError:
raise KeyError(f"Error: {sample_name} not found")

if len(seq) >= cov_threshold:
binned_contig.append(h + '_1')
Expand Down Expand Up @@ -219,4 +229,4 @@ def generate_cov_from_abundances(abundances, output, contig_path, contig_thresho
abun.to_csv(ofile)
return abun, abun_split
else:
return abun_split
return abun_split
6 changes: 5 additions & 1 deletion SemiBin/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,11 @@ def fasta_sample_iter(fn):

if args.abundances:
logger.info('Reading abundance information from abundance files.')
abun_split = generate_cov_from_abundances(args.abundances, os.path.join(args.output, 'samples'), args.contig_fasta, sep=args.separator, contig_threshold_dict=binning_threshold)
abun_split = generate_cov_from_abundances(args.abundances,
os.path.join(args.output, 'samples'),
contig_path=args.contig_fasta,
sep=args.separator,
sample_contig_threshold=binning_threshold)
abun_split = abun_split.reset_index()
columns_list = list(abun_split.columns)
columns_list[0] = 'contig_name'
Expand Down

0 comments on commit e31fc97

Please sign in to comment.