diff --git a/isovar/dataframe_builder.py b/isovar/dataframe_builder.py index 2231a58..60f1345 100644 --- a/isovar/dataframe_builder.py +++ b/isovar/dataframe_builder.py @@ -32,7 +32,8 @@ def __init__( element_class, exclude=set([]), converters={}, - rename_dict={}): + rename_dict={}, + variant_columns=True): """ Parameters ---------- @@ -52,10 +53,14 @@ def __init__( rename_dict : dict Dictionary mapping element_class field names to desired column names in the produced DataFrame. + + variant_columns : bool + If True, then add four columns for fields of a Variant: chr/pos/ref/alt """ self.element_class = element_class self.rename_dict = rename_dict self.converters = converters + self.variant_columns = variant_columns # remove specified field names without changing the order of the others self.original_field_names = [ @@ -74,13 +79,16 @@ def __init__( self.rename_dict.get(x, x) for x in self.original_field_names ] - columns_list = [ - # fields related to variant - ("chr", []), - ("pos", []), - ("ref", []), - ("alt", []), - ] + if self.variant_columns: + columns_list = [ + # fields related to variant + ("chr", []), + ("pos", []), + ("ref", []), + ("alt", []), + ] + else: + columns_list = [] for name in self.renamed_field_names: columns_list.append((name, [])) @@ -88,13 +96,16 @@ def __init__( self.columns_dict = OrderedDict(columns_list) def add(self, variant, element): - assert isinstance(variant, Variant) - assert isinstance(element, self.element_class) + if self.variant_columns: + assert isinstance(variant, Variant) + self.columns_dict["chr"].append(variant.contig) + self.columns_dict["pos"].append(variant.original_start) + self.columns_dict["ref"].append(variant.original_ref) + self.columns_dict["alt"].append(variant.original_alt) + else: + assert variant is None - self.columns_dict["chr"].append(variant.contig) - self.columns_dict["pos"].append(variant.original_start) - self.columns_dict["ref"].append(variant.original_ref) - self.columns_dict["alt"].append(variant.original_alt) + assert isinstance(element, self.element_class) for name in self.original_field_names: value = getattr(element, name) diff --git a/isovar/default_parameters.py b/isovar/default_parameters.py index e5a3ec8..c853f56 100644 --- a/isovar/default_parameters.py +++ b/isovar/default_parameters.py @@ -38,11 +38,11 @@ CDNA_CONTEXT_SIZE = VARIANT_CDNA_SEQUENCE_LENGTH // 2 # only consider variant cDNA sequences with at least this many reads -MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE = 1 +MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE = 2 # number of nucleotides shared between reference and variant sequence # before variant for reference contexts used to establish ORF -MIN_TRANSCRIPT_PREFIX_LENGTH = 30 +MIN_TRANSCRIPT_PREFIX_LENGTH = 10 # maximum number of mismatching nucleotides between reference and variant # prefix sequences diff --git a/isovar/protein_sequence.py b/isovar/protein_sequence.py index 58f0606..537eba8 100644 --- a/isovar/protein_sequence.py +++ b/isovar/protein_sequence.py @@ -182,6 +182,7 @@ def variants_to_protein_sequences( Yields pairs of a Variant and a list of ProteinSequence objects """ + for (variant, translations) in translate_variants( variants=variants, samfile=samfile, diff --git a/isovar/read_at_locus.py b/isovar/read_at_locus.py index 550d8a5..c39d6b8 100644 --- a/isovar/read_at_locus.py +++ b/isovar/read_at_locus.py @@ -136,6 +136,7 @@ def read_at_locus_generator( continue mapping_quality = read.mapping_quality + missing_mapping_quality = ( mapping_quality is None or mapping_quality == 255) if min_mapping_quality > 0 and missing_mapping_quality: @@ -219,7 +220,7 @@ def reads_at_locus_dataframe(*args, **kwargs): Parameters are the same as those for read_locus_generator. """ - df_builder = DataFrameBuilder(ReadAtLocus) + df_builder = DataFrameBuilder(ReadAtLocus, variant_columns=False) for read_at_locus in read_at_locus_generator(*args, **kwargs): - df_builder.add(read_at_locus) + df_builder.add(variant=None, element=read_at_locus) return df_builder.to_dataframe() diff --git a/isovar/translation.py b/isovar/translation.py index a1d0f8d..2834f74 100644 --- a/isovar/translation.py +++ b/isovar/translation.py @@ -395,7 +395,8 @@ def translate_cdna(cdna_sequence_from_first_codon): def translate_variant_sequence( variant_sequence, reference_context, - max_transcript_mismatches): + max_transcript_mismatches, + protein_sequence_length=None): """ Attempt to translate a single VariantSequence using the reading frame from a single ReferenceContext. @@ -411,6 +412,9 @@ def translate_variant_sequence( sequences disagrees at more than this number of positions before the variant nucleotides. + protein_sequence_length : int, optional + Truncate protein to be at most this long + Returns either a ProteinSequence object or None if the number of mismatches between the RNA and reference transcript sequences exceeds the given threshold. @@ -449,6 +453,22 @@ def translate_variant_sequence( n_ref=len(reference_context.sequence_at_variant_locus), n_amino_acids=len(variant_amino_acids)) + if protein_sequence_length and len(variant_amino_acids) > protein_sequence_length: + if protein_sequence_length <= variant_aa_interval_start: + logging.warn( + ("Truncating amino acid sequence %s from variant sequence %s " + "to only %d elements loses all variant residues") % ( + variant_amino_acids, + variant_sequence, + protein_sequence_length)) + return None + # if the protein is too long then short it, which implies we're no longer + # stopping due to a stop codon and that the variant amino acids might + # need a new stop index + variant_amino_acids = variant_amino_acids[:protein_sequence_length] + variant_aa_interval_end = min(variant_aa_interval_end, protein_sequence_length) + ends_with_stop_codon = False + reference_sequence_before_variant = ( variant_sequence_in_reading_frame.reference_cdna_sequence_before_variant) # converting sequence objects to str since skbio.DNA objects aren't really @@ -473,7 +493,8 @@ def translate_variant_sequence( def translation_generator( variant_sequences, reference_contexts, - max_transcript_mismatches): + max_transcript_mismatches, + protein_sequence_length=None): """ Given all detected VariantSequence objects for a particular variant and all the ReferenceContext objects for that locus, translate @@ -490,6 +511,9 @@ def translation_generator( max_transcript_mismatches : int + protein_sequence_length : int, optional + Truncate protein to be at most this long + Yields a sequence of Translation objects. """ for reference_context in reference_contexts: @@ -497,7 +521,8 @@ def translation_generator( translation = translate_variant_sequence( variant_sequence=variant_sequence, reference_context=reference_context, - max_transcript_mismatches=max_transcript_mismatches) + max_transcript_mismatches=max_transcript_mismatches, + protein_sequence_length=protein_sequence_length) if translation is not None: yield translation @@ -549,16 +574,16 @@ def translate_variants( Translation objects. """ - # adding 2nt to total RNA sequence length in case we need to clip 1 or 2 - # bases of the sequence to match a reference ORF but still want to end up - # with the desired number of amino acids - rna_sequence_length = protein_sequence_length * 3 + 2 + # Adding an extra codon to the desired RNA sequence length in case we + # need to clip nucleotides at the start/end of the sequence + rna_sequence_length = (protein_sequence_length + 1) * 3 for variant, variant_sequences in variant_sequences_generator( variants=variants, samfile=samfile, sequence_length=rna_sequence_length, - min_reads=min_reads_supporting_rna_sequence): + min_reads=min_reads_supporting_rna_sequence, + min_mapping_quality=min_mapping_quality): if len(variant_sequences) == 0: logging.info( @@ -591,7 +616,8 @@ def translate_variants( translations = translation_generator( variant_sequences=variant_sequences, reference_contexts=reference_contexts, - max_transcript_mismatches=max_transcript_mismatches) + max_transcript_mismatches=max_transcript_mismatches, + protein_sequence_length=protein_sequence_length) yield variant, translations diff --git a/isovar/variant_read.py b/isovar/variant_read.py index 29cb7b6..b61907a 100644 --- a/isovar/variant_read.py +++ b/isovar/variant_read.py @@ -233,7 +233,6 @@ def variant_reads_generator( "Chromosome '%s' from variant %s not in alignment file %s" % ( chromosome, variant, samfile.filename)) continue - variant_reads = gather_reads_for_single_variant( samfile=samfile, chromosome=chromosome, diff --git a/isovar/variant_sequence.py b/isovar/variant_sequence.py index 6c06934..a99628f 100644 --- a/isovar/variant_sequence.py +++ b/isovar/variant_sequence.py @@ -192,7 +192,8 @@ def variant_sequences_dataframe( variants, samfile, sequence_length=VARIANT_CDNA_SEQUENCE_LENGTH, - min_reads=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE): + min_reads=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, + min_mapping_quality=MIN_READ_MAPPING_QUALITY): """ Creates a dataframe of all detected cDNA sequences for the given variant collection and alignment file. @@ -211,6 +212,9 @@ def variant_sequences_dataframe( min_reads : int Minimum number of reads supporting + min_mapping_quality : int + Minimum MAPQ value before a read gets ignored + Returns pandas.DataFrame """ df_builder = DataFrameBuilder(VariantSequence) @@ -218,7 +222,8 @@ def variant_sequences_dataframe( variants=variants, samfile=samfile, sequence_length=sequence_length, - min_reads=min_reads): + min_reads=min_reads, + min_mapping_quality=min_mapping_quality): for variant_sequence in variant_sequences: df_builder.add(variant, variant_sequence) return df_builder.to_dataframe() diff --git a/test/test_protein_sequence.py b/test/test_protein_sequence.py index 8459dbe..57b47d8 100644 --- a/test/test_protein_sequence.py +++ b/test/test_protein_sequence.py @@ -142,10 +142,69 @@ def test_sort_protein_sequences(): ] eq_(sort_protein_sequences(unsorted_protein_sequences), expected_order) +""" +protein_sequence_length=PROTEIN_SEQUENCE_LEGNTH, + min_reads_supporting_rna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE, + min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH, + max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES, + max_protein_sequences_per_variant=MAX_PROTEIN_SEQUENCES_PER_VARIANT, + min_mapping_quality=MIN_READ_MAPPING_QUALITY +""" + +def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant(): + variants = load_vcf("data/b16.f10/b16.vcf") + samfile = load_bam("data/b16.f10/b16.combined.sorted.bam") + df = variants_to_protein_sequences_dataframe( + variants, + samfile, + min_mapping_quality=0, + max_protein_sequences_per_variant=1) + print(df) + eq_( + len(df), + len(variants), + "Expected %d entries, got %d" % (len(variants), len(df))) -def test_variants_to_protein_sequences_dataframe(): +def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality(): + # since the B16 BAM has all MAPQ=255 values then all the reads should get dropped variants = load_vcf("data/b16.f10/b16.vcf") samfile = load_bam("data/b16.f10/b16.combined.sorted.bam") - df = variants_to_protein_sequences_dataframe(variants, samfile) + df = variants_to_protein_sequences_dataframe( + variants, + samfile, + min_mapping_quality=100, + max_protein_sequences_per_variant=1) print(df) - assert len(df) == 4, len(df) + eq_( + len(df), + 0, + "Expected 0 entries, got %d" % (len(df),)) + +def test_variants_to_protein_sequences_dataframe_protein_sequence_length(): + # since the B16 BAM has all MAPQ=255 values then all the reads should get dropped + variants = load_vcf("data/b16.f10/b16.vcf") + samfile = load_bam("data/b16.f10/b16.combined.sorted.bam") + + for desired_length in range(9, 20, 3): + df = variants_to_protein_sequences_dataframe( + variants, + samfile, + max_protein_sequences_per_variant=1, + protein_sequence_length=desired_length) + eq_( + len(df), + len(variants), + "Expected %d entries for protein_sequnece_length=%d, got %d results" % ( + len(variants), + desired_length, + len(df))) + protein_sequences = df["amino_acids"] + print(protein_sequences) + protien_sequence_lengths = protein_sequences.str.len() + assert (protien_sequence_lengths == desired_length).all(), protien_sequence_lengths + +if __name__ == "__main__": + test_sort_protein_sequences() + test_variants_to_protein_sequences_dataframe_one_sequence_per_variant() + test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality() + test_variants_to_protein_sequences_dataframe_protein_sequence_length()