diff --git a/isovar/__init__.py b/isovar/__init__.py index 105039f..fc0e0f3 100644 --- a/isovar/__init__.py +++ b/isovar/__init__.py @@ -14,4 +14,4 @@ from __future__ import print_function, division, absolute_import -__version__ = "0.5.0" +__version__ = "0.5.1" diff --git a/isovar/assembly.py b/isovar/assembly.py index a99d0b5..57f2719 100644 --- a/isovar/assembly.py +++ b/isovar/assembly.py @@ -17,6 +17,8 @@ from collections import defaultdict import logging +from six.moves import range + from .default_parameters import MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE logger = logging.getLogger(__name__) @@ -27,7 +29,7 @@ def greedy_merge_helper( """ Returns a list of merged VariantSequence objects, and True if any were successfully merged. - """ + """ merged_variant_sequences = {} merged_any = False @@ -37,22 +39,20 @@ def greedy_merge_helper( sequence1 = variant_sequences[i] # it works to loop over the triangle (i+1 onwards) because combine() tries flipping the # arguments if sequence1 is on the right of sequence2 - for j in range(i+1, len(variant_sequences)): + for j in range(i + 1, len(variant_sequences)): sequence2 = variant_sequences[j] - try: - s = sequence1.combine(sequence2) - if s.sequence in merged_variant_sequences: - existing_s = merged_variant_sequences[s.sequence] - # we may have already created the same sequence from another set of reads, in - # which case we need to merge the reads - if s.reads != existing_s.reads: - s = s.add_reads(existing_s.reads) - merged_variant_sequences[s.sequence] = s - unmerged_variant_sequences.discard(sequence1) - unmerged_variant_sequences.discard(sequence2) - merged_any = True - except ValueError: + combined = sequence1.combine(sequence2) + if combined is None: continue + if combined.sequence in merged_variant_sequences: + existing = merged_variant_sequences[combined.sequence] + # the existing VariantSequence and the newly merged + # VariantSequence should differ only in which reads support them + combined = combined.add_reads(existing.reads) + merged_variant_sequences[combined.sequence] = combined + unmerged_variant_sequences.discard(sequence1) + unmerged_variant_sequences.discard(sequence2) + merged_any = True result = list(merged_variant_sequences.values()) + list(unmerged_variant_sequences) return result, merged_any @@ -71,7 +71,7 @@ def greedy_merge( while merged_any: variant_sequences, merged_any = greedy_merge_helper( variant_sequences, - MIN_VARIANT_SEQUENCE_ASSEMBLY_OVERLAP_SIZE) + min_overlap_size=min_overlap_size) return variant_sequences def collapse_substrings(variant_sequences): @@ -85,12 +85,13 @@ def collapse_substrings(variant_sequences): Returns a (potentially shorter) list without any contained subsequences. """ - # dictionary mapping VariantSequence objects to lists of reads - # they absorb from substring VariantSequences if len(variant_sequences) <= 1: # if we don't have at least two VariantSequences then just # return your input return variant_sequences + + # dictionary mapping VariantSequence objects to lists of reads + # they absorb from substring VariantSequences extra_reads_from_substrings = defaultdict(set) result_list = [] # sort by longest to shortest total length @@ -139,4 +140,6 @@ def iterative_overlap_assembly( n_after_collapse) merged_variant_sequences = greedy_merge(variant_sequences, min_overlap_size) - return list(sorted(merged_variant_sequences, key=lambda seq: -len(seq.reads))) + return list(sorted( + merged_variant_sequences, + key=lambda seq: -len(seq.reads))) diff --git a/isovar/variant_sequences.py b/isovar/variant_sequences.py index d474c3a..e150ea8 100644 --- a/isovar/variant_sequences.py +++ b/isovar/variant_sequences.py @@ -138,13 +138,19 @@ def combine(self, other_sequence): them into a single VariantSequence object. If the other sequence is contained in this one, then add its reads to this VariantSequence. Also tries to flip the order (e.g. this sequence is a suffix or - this sequence is a subsequence). + this sequence is a subsequence). If sequences can't be combined + then returns None. """ if other_sequence.alt != self.alt: - raise ValueError( - "Cannot combine %s and %s with mismatching alt sequences" % ( - self, - other_sequence)) + logger.warn( + "Cannot combine %s and %s with mismatching alt sequences", + self, + other_sequence) + return None + elif self.contains(other_sequence): + return self.add_reads(other_sequence.reads) + elif other_sequence.contains(self): + return other_sequence.add_reads(self.reads) elif self.left_overlaps(other_sequence): # If sequences are like AABC and ABCC return VariantSequence( @@ -153,13 +159,14 @@ def combine(self, other_sequence): suffix=other_sequence.suffix, reads=self.reads.union(other_sequence.reads)) elif other_sequence.left_overlaps(self): - return other_sequence.combine(self) - elif self.contains(other_sequence): - return self.add_reads(other_sequence.reads) - elif other_sequence.contains(self): - return other_sequence.add_reads(self.reads) + return VariantSequence( + prefix=other_sequence.prefix, + alt=self.alt, + suffix=self.suffix, + reads=self.reads.union(other_sequence.reads)) else: - raise ValueError("%s does not overlap with %s" % (self, other_sequence)) + # sequences don't overlap + return None def variant_indices(self): """