From 23e8efcf39f7b444a6e5a271b52b30725bb881de Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn Date: Fri, 15 Apr 2016 17:08:43 -0400 Subject: [PATCH] revamped DataFrameBuilder, adding more tests --- isovar/dataframe_builder.py | 67 ++++++++++++++++++---------- isovar/protein_sequence.py | 11 +++-- isovar/read_at_locus.py | 14 +++++- isovar/reference_context.py | 30 +++++-------- isovar/translation.py | 4 +- isovar/variant_sequence.py | 78 ++++++++++++++++----------------- test/test_dataframe_builder.py | 75 +++++++++++++++++++++++++++---- test/test_reference_contexts.py | 18 +++++++- test/test_variant_sequences.py | 3 +- 9 files changed, 197 insertions(+), 103 deletions(-) diff --git a/isovar/dataframe_builder.py b/isovar/dataframe_builder.py index 3b33adb..2231a58 100644 --- a/isovar/dataframe_builder.py +++ b/isovar/dataframe_builder.py @@ -14,9 +14,13 @@ from __future__ import print_function, division, absolute_import from collections import OrderedDict +from six import integer_types, text_type, binary_type +from varcode import Variant import pandas as pd +VALID_ELEMENT_TYPES = integer_types + (text_type, binary_type, float, bool) + class DataFrameBuilder(object): """ Helper class for constructing a DataFrame which always has fields @@ -26,9 +30,9 @@ class DataFrameBuilder(object): def __init__( self, element_class, - exclude_field_names=set([]), - transform_fields={}, - convert_sequences_to_string=True): + exclude=set([]), + converters={}, + rename_dict={}): """ Parameters ---------- @@ -36,26 +40,39 @@ def __init__( Expected to a have a class-member named '_fields' which is a list of field names. - exclude_field_names : set + exclude : set Field names from element_class which should be used as columns for the DataFrame we're building - transform_fields : dict + converters : dict Dictionary of names mapping to functions. These functions will be applied to each element of a column before it's added to the DataFrame. - convert_sequences_to_string : bool - If a value being added to a column is a list or tuple, should - it be converted to a semi-colon separated string? + rename_dict : dict + Dictionary mapping element_class field names to desired column names + in the produced DataFrame. """ self.element_class = element_class - self.convert_sequences_to_string = convert_sequences_to_string + self.rename_dict = rename_dict + self.converters = converters + # remove specified field names without changing the order of the others - self.field_names = [ + self.original_field_names = [ x for x in element_class._fields - if x not in exclude_field_names + if x not in exclude + ] + + for name in converters: + if name not in self.original_field_names: + raise ValueError("No field named '%s', valid names: %s" % ( + name, + self.original_field_names)) + + self.renamed_field_names = [ + self.rename_dict.get(x, x) + for x in self.original_field_names ] columns_list = [ # fields related to variant @@ -65,31 +82,35 @@ def __init__( ("alt", []), ] - for name in self.field_names: + for name in self.renamed_field_names: columns_list.append((name, [])) + self.columns_dict = OrderedDict(columns_list) - self.transform_fields = transform_fields - for name in transform_fields: - if name not in self.columns_dict: - raise ValueError("No field named '%s' in %s" % ( - name, element_class)) def add(self, variant, element): + assert isinstance(variant, Variant) + assert isinstance(element, self.element_class) + self.columns_dict["chr"].append(variant.contig) self.columns_dict["pos"].append(variant.original_start) self.columns_dict["ref"].append(variant.original_ref) self.columns_dict["alt"].append(variant.original_alt) - for name in self.field_names: + for name in self.original_field_names: value = getattr(element, name) - if name in self.transform_fields: - fn = self.transform_fields[name] + if name in self.converters: + fn = self.converters[name] value = fn(value) - if isinstance(value, (list, tuple)): - if self.convert_sequences_to_string: - value = ";".join(value) + if not isinstance(value, VALID_ELEMENT_TYPES): + raise ValueError( + "Please provider converter for field '%s' : %s to make a scalar or string" % ( + name, + type(value))) + + if name in self.rename_dict: + name = self.rename_dict[name] self.columns_dict[name].append(value) def to_dataframe(self): diff --git a/isovar/protein_sequence.py b/isovar/protein_sequence.py index b5eb17d..58f0606 100644 --- a/isovar/protein_sequence.py +++ b/isovar/protein_sequence.py @@ -241,12 +241,11 @@ def variants_to_protein_sequences_dataframe(*args, **kwargs): """ df_builder = DataFrameBuilder( ProteinSequence, - transform_fields={ - "translations": len, - "supporting_variant_reads": len, - "supporting_transcripts": len, - "gene": lambda x: ";".join(x) - }) + converters=dict( + translations=len, + supporting_variant_reads=len, + supporting_transcripts=len, + gene=lambda x: ";".join(x))) for (variant, protein_sequences) in variants_to_protein_sequences(*args, **kwargs): for protein_sequence in protein_sequences: df_builder.add(variant, protein_sequence) diff --git a/isovar/read_at_locus.py b/isovar/read_at_locus.py index 7150af3..550d8a5 100644 --- a/isovar/read_at_locus.py +++ b/isovar/read_at_locus.py @@ -27,7 +27,7 @@ USE_DUPLICATE_READS, USE_SECONDARY_ALIGNMENTS, ) - +from .dataframe_builder import DataFrameBuilder ReadAtLocus = namedtuple( "ReadAtLocus", @@ -211,3 +211,15 @@ def read_at_locus_generator( quality_scores=base_qualities, base0_read_position_before_variant=base0_read_position_before_variant, base0_read_position_after_variant=base0_read_position_after_variant) + + +def reads_at_locus_dataframe(*args, **kwargs): + """ + Traverse a BAM file to find all the reads overlapping a specified locus. + + Parameters are the same as those for read_locus_generator. + """ + df_builder = DataFrameBuilder(ReadAtLocus) + for read_at_locus in read_at_locus_generator(*args, **kwargs): + df_builder.add(read_at_locus) + return df_builder.to_dataframe() diff --git a/isovar/reference_context.py b/isovar/reference_context.py index f036e9f..bdd3efb 100644 --- a/isovar/reference_context.py +++ b/isovar/reference_context.py @@ -16,12 +16,11 @@ from collections import namedtuple, OrderedDict, defaultdict import logging -import pandas as pd from skbio import DNA from .effect_prediction import reference_transcripts_for_variant from .variant_helpers import interbase_range_affected_by_variant_on_transcript - +from .dataframe_builder import DataFrameBuilder ########################## # @@ -438,24 +437,15 @@ def variants_to_reference_contexts_dataframe( Returns a DataFrame with {"chr", "pos", "ref", "alt"} columns for variants, as well as all the fields of ReferenceContext. """ - columns = [ - ("chr", []), - ("pos", []), - ("ref", []), - ("alt", []), - ] - for field in ReferenceContext._fields: - columns.append((field, [])) - columns_dict = OrderedDict(columns) - for variant, reference_context in reference_contexts_for_variants( + df_builder = DataFrameBuilder( + ReferenceContext, + exclude=["variant"], + converters=dict(transcripts=lambda ts: ";".join(t.name for t in ts))) + for variant, reference_contexts in reference_contexts_for_variants( variants=variants, context_size=context_size, - transcript_id_whitelist=transcript_id_whitelist): - columns_dict["chr"].append(variant.contig) - columns_dict["pos"].append(variant.original_start) - columns_dict["ref"].append(variant.original_ref) - columns_dict["alt"].append(variant.original_alt) - for field in ReferenceContext._fields: - columns_dict[field].append(getattr(reference_context, field)) - return pd.DataFrame(columns_dict) + transcript_id_whitelist=transcript_id_whitelist).items(): + for reference_context in reference_contexts: + df_builder.add(variant, reference_context) + return df_builder.to_dataframe() diff --git a/isovar/translation.py b/isovar/translation.py index ea48342..a1d0f8d 100644 --- a/isovar/translation.py +++ b/isovar/translation.py @@ -606,9 +606,9 @@ def translate_variants_dataframe(*args, **kwargs): """ df_builder = DataFrameBuilder( - element_class=Translation, + Translation, # exlude fields which are structured objects - exclude_field_names=[ + exclude=[ "variant_sequence", "reference_context", "variant_sequence_in_reading_frame"]) diff --git a/isovar/variant_sequence.py b/isovar/variant_sequence.py index cfeca34..6c06934 100644 --- a/isovar/variant_sequence.py +++ b/isovar/variant_sequence.py @@ -13,11 +13,9 @@ # limitations under the License. from __future__ import print_function, division, absolute_import -from collections import namedtuple, OrderedDict +from collections import namedtuple import logging -import numpy as np -import pandas as pd from .common import group_unique_sequences, get_variant_nucleotides from .variant_read import variant_reads_generator @@ -26,6 +24,7 @@ VARIANT_CDNA_SEQUENCE_LENGTH, MIN_READ_MAPPING_QUALITY, ) +from .dataframe_builder import DataFrameBuilder VariantSequence = namedtuple( "VariantSequence", @@ -46,7 +45,8 @@ def sort_key_decreasing_read_count(variant_sequence): def variant_reads_to_sequences( variant_reads, - context_size=None, + max_nucleotides_before_variant=None, + max_nucleotides_after_variant=None, min_sequence_length=None, min_reads_per_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE): """ @@ -59,9 +59,13 @@ def variant_reads_to_sequences( - suffix : str - name : str - context_size : int, optional - Number of nucleotides left and right of the variant. If omitted then - use full length of reads. + max_nucleotides_before_variant : int, optional + Number of nucleotides left of a variant. If omitted then + use all prefix nucleotides. + + max_nucleotides_after_variant : int, optional + Number of nucleotides right of a variant. If omitted then + use all suffix nucleotides. min_sequence_length : int, optional Minimum length of detected sequence @@ -72,6 +76,11 @@ def variant_reads_to_sequences( Returns a collection of VariantSequence objects """ + # just in case variant_reads is a generator, convert it to a list + variant_reads = list(variant_reads) + + if len(variant_reads) == 0: + return [] # Get all unique sequences from reads spanning the # variant locus. This will include partial sequences @@ -80,8 +89,8 @@ def variant_reads_to_sequences( # full length sequence unique_sequence_groups = group_unique_sequences( variant_reads, - max_prefix_size=context_size, - max_suffix_size=context_size) + max_prefix_size=max_nucleotides_before_variant, + max_suffix_size=max_nucleotides_after_variant) variant_seq = get_variant_nucleotides(variant_reads) variant_len = len(variant_seq) @@ -147,31 +156,34 @@ def variant_sequences_generator( min_mapping_quality : int Minimum MAPQ value before a read gets ignored - Generator that yields pairs of variants and sorted list of VariantSequence - objects. + Generator that yields tuples with the following fields: + - Variant + - list of VariantSequence objects """ for variant, variant_reads in variant_reads_generator( variants=variants, samfile=samfile, min_mapping_quality=min_mapping_quality): - logging.info("Generating variant sequences for %s" % variant) - if len(variant_reads) == 0: - logging.info("No variant reads found for %s" % variant) - continue - # the number of context nucleotides on either side of the variant # is half the desired length (minus the number of variant nucleotides) - n_surrounding_nucleotides = sequence_length - len(variant.alt) - - flanking_context_size = int(np.ceil(n_surrounding_nucleotides / 2.0)) + n_alt = len(variant.alt) + n_surrounding_nucleotides = sequence_length - n_alt + max_nucleotides_after_variant = n_surrounding_nucleotides // 2 + # if the number of nucleotides we need isn't divisible by 2 then + # prefer to have one more *before* the variant since we need the + # prefix sequence to match against reference transcripts + max_nucleotides_before_variant = ( + n_surrounding_nucleotides - max_nucleotides_after_variant) logging.info( - "Looking at %dnt RNA sequence context around %s" % ( - flanking_context_size, + "Looking at %dnt before and %dnt after variant %s" % ( + max_nucleotides_before_variant, + max_nucleotides_after_variant, variant)) variant_sequences = variant_reads_to_sequences( variant_reads, - context_size=flanking_context_size, + max_nucleotides_before_variant=max_nucleotides_before_variant, + max_nucleotides_after_variant=max_nucleotides_after_variant, min_reads_per_sequence=min_reads) yield variant, variant_sequences @@ -201,26 +213,12 @@ def variant_sequences_dataframe( Returns pandas.DataFrame """ - columns = OrderedDict([ - ("chr", []), - ("pos", []), - ("ref", []), - ("alt", []) - ]) - for field in VariantSequence._fields: - columns[field] = [] - - for variant, variant_sequences in variant_sequences_generator( + df_builder = DataFrameBuilder(VariantSequence) + for variant, variant_sequences, variant_reads in variant_sequences_generator( variants=variants, samfile=samfile, sequence_length=sequence_length, min_reads=min_reads): for variant_sequence in variant_sequences: - columns["chr"].append(variant.contig) - columns["pos"].append(variant.original_start) - columns["ref"].append(variant.original_ref) - columns["alt"].append(variant.original_alt) - for field_name in VariantSequence._fields: - columns[field_name] = getattr(variant_sequence, field_name) - - return pd.DataFrame(columns) + df_builder.add(variant, variant_sequence) + return df_builder.to_dataframe() diff --git a/test/test_dataframe_builder.py b/test/test_dataframe_builder.py index 27fbe0e..1793db4 100644 --- a/test/test_dataframe_builder.py +++ b/test/test_dataframe_builder.py @@ -19,11 +19,21 @@ from varcode import Variant import pandas as pd +TestClass = namedtuple("TestClass", "a b c") +test_obj = TestClass(a=1, b="s", c=3.0) +test_variant = Variant("X", 10, "CC", "C") + +def check_same_dataframes(df, expected): + eq_(len(df.columns), len(expected.columns)) + assert all(x == y for (x, y) in zip(df.columns, expected.columns)), \ + (df.columns, expected.columns) + # need to aggregate twice because the first time just rolls up each column, + # giving us a boolean Series + assert (df == expected).all().all(), "Expected %s == %s" % (df, expected) + + def test_dataframe_builder(): - TestClass = namedtuple("TestClass", "a b c") df_builder = DataFrameBuilder(TestClass) - test_obj = TestClass(a=1, b="s", c=3.0) - test_variant = Variant("X", 10, "CC", "C") df_builder.add(test_variant, test_obj) df = df_builder.to_dataframe() expected = pd.DataFrame(OrderedDict([ @@ -35,9 +45,56 @@ def test_dataframe_builder(): ("b", [test_obj.b]), ("c", [test_obj.c]), ])) - eq_(len(df.columns), len(expected.columns)) - assert all(x == y for (x, y) in zip(df.columns, expected.columns)), \ - (df.columns, expected.columns) - # need to aggregate twice because the first time just rolls up each column, - # giving us a boolean Series - assert (df == expected).all().all(), "Expected %s == %s" % (df, expected) + check_same_dataframes(df, expected) + +def test_dataframe_builder_rename(): + df_builder = DataFrameBuilder( + TestClass, + rename_dict={"a": "A", "b": "B", "c": "C"}) + df_builder.add(test_variant, test_obj) + df = df_builder.to_dataframe() + expected = pd.DataFrame(OrderedDict([ + ("chr", ["X"]), + ("pos", [10]), + ("ref", ["CC"]), + ("alt", ["C"]), + ("A", [test_obj.a]), + ("B", [test_obj.b]), + ("C", [test_obj.c]), + ])) + check_same_dataframes(df, expected) + +def test_dataframe_rename_and_converters(): + df_builder = DataFrameBuilder( + TestClass, + rename_dict={"a": "A", "b": "B", "c": "C"}, + converters=dict(a=float, c=int)) + df_builder.add(test_variant, test_obj) + df = df_builder.to_dataframe() + expected = pd.DataFrame(OrderedDict([ + ("chr", ["X"]), + ("pos", [10]), + ("ref", ["CC"]), + ("alt", ["C"]), + ("A", [float(test_obj.a)]), + ("B", [test_obj.b]), + ("C", [int(test_obj.c)]), + ])) + check_same_dataframes(df, expected) + +def test_dataframe_rename_and_converters_and_exclude(): + df_builder = DataFrameBuilder( + TestClass, + rename_dict={"c": "C"}, + converters=dict(c=int), + exclude=["a", "b"]) + df_builder.add(test_variant, test_obj) + df = df_builder.to_dataframe() + expected = pd.DataFrame(OrderedDict([ + ("chr", ["X"]), + ("pos", [10]), + ("ref", ["CC"]), + ("alt", ["C"]), + ("C", [int(test_obj.c)]), + ])) + check_same_dataframes(df, expected) diff --git a/test/test_reference_contexts.py b/test/test_reference_contexts.py index 2f1c251..e3f6257 100644 --- a/test/test_reference_contexts.py +++ b/test/test_reference_contexts.py @@ -16,13 +16,14 @@ from isovar.reference_context import ( reference_contexts_for_variants, + variants_to_reference_contexts_dataframe, ReferenceContext, ) from varcode import Variant, VariantCollection from pyensembl import ensembl_grch38 from nose.tools import eq_ -from testing_helpers import assert_equal_fields +from testing_helpers import assert_equal_fields, load_vcf def test_sequence_key_with_reading_frame_substitution_on_negative_strand(): # replace second codon of TP53-001 with 'CCC' @@ -94,3 +95,18 @@ def test_sequence_key_with_reading_frame_substitution_on_negative_strand(): variant=tp53_substitution, transcripts=[tp53_001]) assert_equal_fields(result, expected) + +def test_variants_to_reference_contexts_dataframe(): + variants = load_vcf("data/b16.f10/b16.vcf") + assert len(variants) > 0 + df = variants_to_reference_contexts_dataframe(variants, context_size=10) + print(df) + groups = df.groupby(["chr", "pos", "ref", "alt"]) + # make sure we have at least one reference context for each + # of the B16 coding variants + eq_(len(groups), len(variants)) + + +if __name__ == "__main__": + test_sequence_key_with_reading_frame_substitution_on_negative_strand() + test_variants_to_reference_contexts_dataframe() diff --git a/test/test_variant_sequences.py b/test/test_variant_sequences.py index 95c65a8..50c2016 100644 --- a/test/test_variant_sequences.py +++ b/test/test_variant_sequences.py @@ -36,7 +36,8 @@ def test_sequence_counts_snv(): variant_sequences = variant_reads_to_sequences( variant_reads, - context_size=45) + max_nucleotides_before_variant=45, + max_nucleotides_after_variant=45) assert len(variant_sequences) > 0 for variant_sequence in variant_sequences: print(variant_sequence)