Skip to content

Commit

Permalink
revamped DataFrameBuilder, adding more tests
Browse files Browse the repository at this point in the history
  • Loading branch information
iskandr committed Apr 15, 2016
1 parent d06c34e commit 23e8efc
Show file tree
Hide file tree
Showing 9 changed files with 197 additions and 103 deletions.
67 changes: 44 additions & 23 deletions isovar/dataframe_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,13 @@

from __future__ import print_function, division, absolute_import
from collections import OrderedDict
from six import integer_types, text_type, binary_type

from varcode import Variant
import pandas as pd

VALID_ELEMENT_TYPES = integer_types + (text_type, binary_type, float, bool)

class DataFrameBuilder(object):
"""
Helper class for constructing a DataFrame which always has fields
Expand All @@ -26,36 +30,49 @@ class DataFrameBuilder(object):
def __init__(
self,
element_class,
exclude_field_names=set([]),
transform_fields={},
convert_sequences_to_string=True):
exclude=set([]),
converters={},
rename_dict={}):
"""
Parameters
----------
element_class : type
Expected to have a class-member named '_fields' which is a list
of field names.
exclude_field_names : set
exclude : set
Field names from element_class which should be omitted from the
columns of the DataFrame we're building
transform_fields : dict
converters : dict
Dictionary of names mapping to functions. These functions will be
applied to each element of a column before it's added to the
DataFrame.
convert_sequences_to_string : bool
If a value being added to a column is a list or tuple, should
it be converted to a semi-colon separated string?
rename_dict : dict
Dictionary mapping element_class field names to desired column names
in the produced DataFrame.
"""
self.element_class = element_class
self.convert_sequences_to_string = convert_sequences_to_string
self.rename_dict = rename_dict
self.converters = converters

# remove specified field names without changing the order of the others
self.field_names = [
self.original_field_names = [
x
for x in element_class._fields
if x not in exclude_field_names
if x not in exclude
]

for name in converters:
if name not in self.original_field_names:
raise ValueError("No field named '%s', valid names: %s" % (
name,
self.original_field_names))

self.renamed_field_names = [
self.rename_dict.get(x, x)
for x in self.original_field_names
]
columns_list = [
# fields related to variant
Expand All @@ -65,31 +82,35 @@ def __init__(
("alt", []),
]

for name in self.field_names:
for name in self.renamed_field_names:
columns_list.append((name, []))

self.columns_dict = OrderedDict(columns_list)
self.transform_fields = transform_fields
for name in transform_fields:
if name not in self.columns_dict:
raise ValueError("No field named '%s' in %s" % (
name, element_class))

def add(self, variant, element):
assert isinstance(variant, Variant)
assert isinstance(element, self.element_class)

self.columns_dict["chr"].append(variant.contig)
self.columns_dict["pos"].append(variant.original_start)
self.columns_dict["ref"].append(variant.original_ref)
self.columns_dict["alt"].append(variant.original_alt)

for name in self.field_names:
for name in self.original_field_names:
value = getattr(element, name)

if name in self.transform_fields:
fn = self.transform_fields[name]
if name in self.converters:
fn = self.converters[name]
value = fn(value)

if isinstance(value, (list, tuple)):
if self.convert_sequences_to_string:
value = ";".join(value)
if not isinstance(value, VALID_ELEMENT_TYPES):
raise ValueError(
"Please provider converter for field '%s' : %s to make a scalar or string" % (
name,
type(value)))

if name in self.rename_dict:
name = self.rename_dict[name]
self.columns_dict[name].append(value)

def to_dataframe(self):
Expand Down
11 changes: 5 additions & 6 deletions isovar/protein_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,12 +241,11 @@ def variants_to_protein_sequences_dataframe(*args, **kwargs):
"""
df_builder = DataFrameBuilder(
ProteinSequence,
transform_fields={
"translations": len,
"supporting_variant_reads": len,
"supporting_transcripts": len,
"gene": lambda x: ";".join(x)
})
converters=dict(
translations=len,
supporting_variant_reads=len,
supporting_transcripts=len,
gene=lambda x: ";".join(x)))
for (variant, protein_sequences) in variants_to_protein_sequences(*args, **kwargs):
for protein_sequence in protein_sequences:
df_builder.add(variant, protein_sequence)
Expand Down
14 changes: 13 additions & 1 deletion isovar/read_at_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
USE_DUPLICATE_READS,
USE_SECONDARY_ALIGNMENTS,
)

from .dataframe_builder import DataFrameBuilder

ReadAtLocus = namedtuple(
"ReadAtLocus",
Expand Down Expand Up @@ -211,3 +211,15 @@ def read_at_locus_generator(
quality_scores=base_qualities,
base0_read_position_before_variant=base0_read_position_before_variant,
base0_read_position_after_variant=base0_read_position_after_variant)


def reads_at_locus_dataframe(*args, **kwargs):
    """
    Traverse a BAM file to find all the reads overlapping a specified locus,
    collected into a pandas.DataFrame with one row per overlapping read.

    Parameters are the same as those for read_at_locus_generator.
    """
    df_builder = DataFrameBuilder(ReadAtLocus)
    for read_at_locus in read_at_locus_generator(*args, **kwargs):
        # NOTE(review): DataFrameBuilder.add is defined as add(variant, element)
        # and asserts its first argument is a varcode.Variant, but here it is
        # called with a single ReadAtLocus argument — this looks like it would
        # raise a TypeError at runtime. Confirm against DataFrameBuilder.add;
        # a variant-free mode of DataFrameBuilder may be needed.
        df_builder.add(read_at_locus)
    return df_builder.to_dataframe()
30 changes: 10 additions & 20 deletions isovar/reference_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@
from collections import namedtuple, OrderedDict, defaultdict
import logging

import pandas as pd
from skbio import DNA

from .effect_prediction import reference_transcripts_for_variant
from .variant_helpers import interbase_range_affected_by_variant_on_transcript

from .dataframe_builder import DataFrameBuilder

##########################
#
Expand Down Expand Up @@ -438,24 +437,15 @@ def variants_to_reference_contexts_dataframe(
Returns a DataFrame with {"chr", "pos", "ref", "alt"} columns for variants,
as well as all the fields of ReferenceContext.
"""
columns = [
("chr", []),
("pos", []),
("ref", []),
("alt", []),
]
for field in ReferenceContext._fields:
columns.append((field, []))
columns_dict = OrderedDict(columns)

for variant, reference_context in reference_contexts_for_variants(
df_builder = DataFrameBuilder(
ReferenceContext,
exclude=["variant"],
converters=dict(transcripts=lambda ts: ";".join(t.name for t in ts)))
for variant, reference_contexts in reference_contexts_for_variants(
variants=variants,
context_size=context_size,
transcript_id_whitelist=transcript_id_whitelist):
columns_dict["chr"].append(variant.contig)
columns_dict["pos"].append(variant.original_start)
columns_dict["ref"].append(variant.original_ref)
columns_dict["alt"].append(variant.original_alt)
for field in ReferenceContext._fields:
columns_dict[field].append(getattr(reference_context, field))
return pd.DataFrame(columns_dict)
transcript_id_whitelist=transcript_id_whitelist).items():
for reference_context in reference_contexts:
df_builder.add(variant, reference_context)
return df_builder.to_dataframe()
4 changes: 2 additions & 2 deletions isovar/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,9 +606,9 @@ def translate_variants_dataframe(*args, **kwargs):
"""

df_builder = DataFrameBuilder(
element_class=Translation,
Translation,
# exclude fields which are structured objects
exclude_field_names=[
exclude=[
"variant_sequence",
"reference_context",
"variant_sequence_in_reading_frame"])
Expand Down
78 changes: 38 additions & 40 deletions isovar/variant_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,9 @@
# limitations under the License.

from __future__ import print_function, division, absolute_import
from collections import namedtuple, OrderedDict
from collections import namedtuple
import logging

import numpy as np
import pandas as pd

from .common import group_unique_sequences, get_variant_nucleotides
from .variant_read import variant_reads_generator
Expand All @@ -26,6 +24,7 @@
VARIANT_CDNA_SEQUENCE_LENGTH,
MIN_READ_MAPPING_QUALITY,
)
from .dataframe_builder import DataFrameBuilder

VariantSequence = namedtuple(
"VariantSequence",
Expand All @@ -46,7 +45,8 @@ def sort_key_decreasing_read_count(variant_sequence):

def variant_reads_to_sequences(
variant_reads,
context_size=None,
max_nucleotides_before_variant=None,
max_nucleotides_after_variant=None,
min_sequence_length=None,
min_reads_per_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE):
"""
Expand All @@ -59,9 +59,13 @@ def variant_reads_to_sequences(
- suffix : str
- name : str
context_size : int, optional
Number of nucleotides left and right of the variant. If omitted then
use full length of reads.
max_nucleotides_before_variant : int, optional
Number of nucleotides left of a variant. If omitted then
use all prefix nucleotides.
max_nucleotides_after_variant : int, optional
Number of nucleotides right of a variant. If omitted then
use all suffix nucleotides.
min_sequence_length : int, optional
Minimum length of detected sequence
Expand All @@ -72,6 +76,11 @@ def variant_reads_to_sequences(
Returns a collection of VariantSequence objects
"""
# just in case variant_reads is a generator, convert it to a list
variant_reads = list(variant_reads)

if len(variant_reads) == 0:
return []

# Get all unique sequences from reads spanning the
# variant locus. This will include partial sequences
Expand All @@ -80,8 +89,8 @@ def variant_reads_to_sequences(
# full length sequence
unique_sequence_groups = group_unique_sequences(
variant_reads,
max_prefix_size=context_size,
max_suffix_size=context_size)
max_prefix_size=max_nucleotides_before_variant,
max_suffix_size=max_nucleotides_after_variant)

variant_seq = get_variant_nucleotides(variant_reads)
variant_len = len(variant_seq)
Expand Down Expand Up @@ -147,31 +156,34 @@ def variant_sequences_generator(
min_mapping_quality : int
Minimum MAPQ value before a read gets ignored
Generator that yields pairs of variants and sorted list of VariantSequence
objects.
Generator that yields tuples with the following fields:
- Variant
- list of VariantSequence objects
"""
for variant, variant_reads in variant_reads_generator(
variants=variants,
samfile=samfile,
min_mapping_quality=min_mapping_quality):
logging.info("Generating variant sequences for %s" % variant)
if len(variant_reads) == 0:
logging.info("No variant reads found for %s" % variant)
continue

# the number of context nucleotides on either side of the variant
# is half the desired length (minus the number of variant nucleotides)
n_surrounding_nucleotides = sequence_length - len(variant.alt)

flanking_context_size = int(np.ceil(n_surrounding_nucleotides / 2.0))
n_alt = len(variant.alt)
n_surrounding_nucleotides = sequence_length - n_alt
max_nucleotides_after_variant = n_surrounding_nucleotides // 2
# if the number of nucleotides we need isn't divisible by 2 then
# prefer to have one more *before* the variant since we need the
# prefix sequence to match against reference transcripts
max_nucleotides_before_variant = (
n_surrounding_nucleotides - max_nucleotides_after_variant)
logging.info(
"Looking at %dnt RNA sequence context around %s" % (
flanking_context_size,
"Looking at %dnt before and %dnt after variant %s" % (
max_nucleotides_before_variant,
max_nucleotides_after_variant,
variant))

variant_sequences = variant_reads_to_sequences(
variant_reads,
context_size=flanking_context_size,
max_nucleotides_before_variant=max_nucleotides_before_variant,
max_nucleotides_after_variant=max_nucleotides_after_variant,
min_reads_per_sequence=min_reads)

yield variant, variant_sequences
Expand Down Expand Up @@ -201,26 +213,12 @@ def variant_sequences_dataframe(
Returns pandas.DataFrame
"""
columns = OrderedDict([
("chr", []),
("pos", []),
("ref", []),
("alt", [])
])
for field in VariantSequence._fields:
columns[field] = []

for variant, variant_sequences in variant_sequences_generator(
df_builder = DataFrameBuilder(VariantSequence)
for variant, variant_sequences, variant_reads in variant_sequences_generator(
variants=variants,
samfile=samfile,
sequence_length=sequence_length,
min_reads=min_reads):
for variant_sequence in variant_sequences:
columns["chr"].append(variant.contig)
columns["pos"].append(variant.original_start)
columns["ref"].append(variant.original_ref)
columns["alt"].append(variant.original_alt)
for field_name in VariantSequence._fields:
columns[field_name] = getattr(variant_sequence, field_name)

return pd.DataFrame(columns)
df_builder.add(variant, variant_sequence)
return df_builder.to_dataframe()
Loading

0 comments on commit 23e8efc

Please sign in to comment.