ENH Better error messages in coverage

Give more context in the error message when an error is found. Slight refactor to use better argument/variable names. /cc #168
BigDataBiology · Aug 3, 2024 · e31fc97
1 parent 8bb2343
commit e31fc97
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 9 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -1,6 +1,7 @@
 Unreleased
 	* logging: use 'SemiBin2' as logger name
 	* SemiBin: always log to file in DEBUG level and log command-line arguments
+	* coverage: Better error messages (#168)
 
 Version 2.1.0 Mar 6 2024 by BigDataBiology
 	* SemiBin: Support running SemiBin with strobealign-aemb

diff --git a/SemiBin/generate_coverage.py b/SemiBin/generate_coverage.py
@@ -3,7 +3,7 @@
 from .atomicwrite import atomic_write
 
 def calculate_coverage(depth_stream, bam_file, must_link_threshold, edge=75, is_combined=False,
-                       contig_threshold=1000, sep=None, contig_threshold_dict=None):
+                       contig_threshold=1000, sep=None, sample_contig_threshold=None):
     """
     depth_stream : an iterable like the output of bedtools genomecov
     bam_file : str filename
@@ -41,8 +41,13 @@ def calculate_coverage(depth_stream, bam_file, must_link_threshold, edge=75, is_
         if sep is None:
             cov_threshold = contig_threshold
         else:
-            sample_name = contig_name.split(sep)[0]
-            cov_threshold = contig_threshold_dict[sample_name]
+            try:
+                sample, contig = contig_name.split(sep)
+                cov_threshold = sample_contig_threshold[sample]
+            except ValueError:
+                raise ValueError(f"Error parsing contig name '{contig_name}' in file {bam_file} (trying to split by {sep} separator)")
+            except KeyError:
+                raise KeyError(f"Error: {sample} not found (parsing {bam_file}")
         if len(depth_value) < cov_threshold:
             continue
         depth_value_ = depth_value[edge:-edge]
@@ -104,7 +109,7 @@ def generate_cov(bam_file, bam_index, out, threshold,
                                     is_combined=is_combined,
                                     sep=sep,
                                     contig_threshold=(contig_threshold if sep is None else 1000),
-                                    contig_threshold_dict=(contig_threshold if sep is not None else None))
+                                    sample_contig_threshold=(contig_threshold if sep is not None else None))
 
     if bed_p.wait() != 0:
         raise OSError(f"Failure in running bedtools ({bam_file})")
@@ -172,7 +177,7 @@ def combine_cov(cov_dir : str, bam_list, is_combined : bool): # bam_list : list[
         data_split_cov = None
     return data_cov, data_split_cov
 
-def generate_cov_from_abundances(abundances, output, contig_path, contig_threshold=1000, sep=None, contig_threshold_dict=None):
+def generate_cov_from_abundances(abundances, output, contig_path, contig_threshold=1000, sep=None, sample_contig_threshold=None):
     import pandas as pd
     import numpy as np
     from .fasta import fasta_iter
@@ -188,8 +193,13 @@ def generate_cov_from_abundances(abundances, output, contig_path, contig_thresho
             if sep is None:
                 cov_threshold = contig_threshold
             else:
-                sample_name = h.split(sep)[0]
-                cov_threshold = contig_threshold_dict[sample_name]
+                try:
+                    sample_name, contig_name = h.split(sep)
+                    cov_threshold = sample_contig_threshold[sample_name]
+                except ValueError:
+                    raise ValueError(f"Error parsing contig name '{h}' in file {abun_file} (trying to split by {sep} separator)")
+                except KeyError:
+                    raise KeyError(f"Error: {sample_name} not found")
 
             if len(seq) >= cov_threshold:
                 binned_contig.append(h + '_1')
@@ -219,4 +229,4 @@ def generate_cov_from_abundances(abundances, output, contig_path, contig_thresho
             abun.to_csv(ofile)
         return abun, abun_split
     else:
-        return abun_split
+        return abun_split
diff --git a/SemiBin/main.py b/SemiBin/main.py
@@ -1030,7 +1030,11 @@ def fasta_sample_iter(fn):
 
     if args.abundances:
         logger.info('Reading abundance information from abundance files.')
-        abun_split = generate_cov_from_abundances(args.abundances, os.path.join(args.output, 'samples'), args.contig_fasta, sep=args.separator, contig_threshold_dict=binning_threshold)
+        abun_split = generate_cov_from_abundances(args.abundances,
+                                                  os.path.join(args.output, 'samples'),
+                                                  contig_path=args.contig_fasta,
+                                                  sep=args.separator,
+                                                  sample_contig_threshold=binning_threshold)
         abun_split = abun_split.reset_index()
         columns_list = list(abun_split.columns)
         columns_list[0] = 'contig_name'