Skip to content

Commit

Permalink
fixed unit test for dropping MAPQ=255 reads, added unit test for mult…
Browse files Browse the repository at this point in the history
…iple protein lengths
  • Loading branch information
iskandr committed Apr 15, 2016
1 parent 23e8efc commit a38fe9d
Show file tree
Hide file tree
Showing 8 changed files with 135 additions and 33 deletions.
39 changes: 25 additions & 14 deletions isovar/dataframe_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ def __init__(
element_class,
exclude=set([]),
converters={},
rename_dict={}):
rename_dict={},
variant_columns=True):
"""
Parameters
----------
Expand All @@ -52,10 +53,14 @@ def __init__(
rename_dict : dict
Dictionary mapping element_class field names to desired column names
in the produced DataFrame.
variant_columns : bool
If True, then add four columns for fields of a Variant: chr/pos/ref/alt
"""
self.element_class = element_class
self.rename_dict = rename_dict
self.converters = converters
self.variant_columns = variant_columns

# remove specified field names without changing the order of the others
self.original_field_names = [
Expand All @@ -74,27 +79,33 @@ def __init__(
self.rename_dict.get(x, x)
for x in self.original_field_names
]
columns_list = [
# fields related to variant
("chr", []),
("pos", []),
("ref", []),
("alt", []),
]
if self.variant_columns:
columns_list = [
# fields related to variant
("chr", []),
("pos", []),
("ref", []),
("alt", []),
]
else:
columns_list = []

for name in self.renamed_field_names:
columns_list.append((name, []))

self.columns_dict = OrderedDict(columns_list)

def add(self, variant, element):
assert isinstance(variant, Variant)
assert isinstance(element, self.element_class)
if self.variant_columns:
assert isinstance(variant, Variant)
self.columns_dict["chr"].append(variant.contig)
self.columns_dict["pos"].append(variant.original_start)
self.columns_dict["ref"].append(variant.original_ref)
self.columns_dict["alt"].append(variant.original_alt)
else:
assert variant is None

self.columns_dict["chr"].append(variant.contig)
self.columns_dict["pos"].append(variant.original_start)
self.columns_dict["ref"].append(variant.original_ref)
self.columns_dict["alt"].append(variant.original_alt)
assert isinstance(element, self.element_class)

for name in self.original_field_names:
value = getattr(element, name)
Expand Down
4 changes: 2 additions & 2 deletions isovar/default_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@
CDNA_CONTEXT_SIZE = VARIANT_CDNA_SEQUENCE_LENGTH // 2

# only consider variant cDNA sequences with at least this many reads
MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE = 1
MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE = 2

# number of nucleotides shared between reference and variant sequence
# before variant for reference contexts used to establish ORF
MIN_TRANSCRIPT_PREFIX_LENGTH = 30
MIN_TRANSCRIPT_PREFIX_LENGTH = 10

# maximum number of mismatching nucleotides between reference and variant
# prefix sequences
Expand Down
1 change: 1 addition & 0 deletions isovar/protein_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ def variants_to_protein_sequences(
Yields pairs of a Variant and a list of ProteinSequence objects
"""

for (variant, translations) in translate_variants(
variants=variants,
samfile=samfile,
Expand Down
5 changes: 3 additions & 2 deletions isovar/read_at_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def read_at_locus_generator(
continue

mapping_quality = read.mapping_quality

missing_mapping_quality = (
mapping_quality is None or mapping_quality == 255)
if min_mapping_quality > 0 and missing_mapping_quality:
Expand Down Expand Up @@ -219,7 +220,7 @@ def reads_at_locus_dataframe(*args, **kwargs):
Parameters are the same as those for read_locus_generator.
"""
df_builder = DataFrameBuilder(ReadAtLocus)
df_builder = DataFrameBuilder(ReadAtLocus, variant_columns=False)
for read_at_locus in read_at_locus_generator(*args, **kwargs):
df_builder.add(read_at_locus)
df_builder.add(variant=None, element=read_at_locus)
return df_builder.to_dataframe()
44 changes: 35 additions & 9 deletions isovar/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,8 @@ def translate_cdna(cdna_sequence_from_first_codon):
def translate_variant_sequence(
variant_sequence,
reference_context,
max_transcript_mismatches):
max_transcript_mismatches,
protein_sequence_length=None):
"""
Attempt to translate a single VariantSequence using the reading frame
from a single ReferenceContext.
Expand All @@ -411,6 +412,9 @@ def translate_variant_sequence(
sequences disagrees at more than this number of positions before the
variant nucleotides.
protein_sequence_length : int, optional
Truncate protein to be at most this long
Returns either a ProteinSequence object or None if the number of
mismatches between the RNA and reference transcript sequences exceeds the
given threshold.
Expand Down Expand Up @@ -449,6 +453,22 @@ def translate_variant_sequence(
n_ref=len(reference_context.sequence_at_variant_locus),
n_amino_acids=len(variant_amino_acids))

if protein_sequence_length and len(variant_amino_acids) > protein_sequence_length:
if protein_sequence_length <= variant_aa_interval_start:
logging.warn(
("Truncating amino acid sequence %s from variant sequence %s "
"to only %d elements loses all variant residues") % (
variant_amino_acids,
variant_sequence,
protein_sequence_length))
return None
# if the protein is too long then short it, which implies we're no longer
# stopping due to a stop codon and that the variant amino acids might
# need a new stop index
variant_amino_acids = variant_amino_acids[:protein_sequence_length]
variant_aa_interval_end = min(variant_aa_interval_end, protein_sequence_length)
ends_with_stop_codon = False

reference_sequence_before_variant = (
variant_sequence_in_reading_frame.reference_cdna_sequence_before_variant)
# converting sequence objects to str since skbio.DNA objects aren't really
Expand All @@ -473,7 +493,8 @@ def translate_variant_sequence(
def translation_generator(
variant_sequences,
reference_contexts,
max_transcript_mismatches):
max_transcript_mismatches,
protein_sequence_length=None):
"""
Given all detected VariantSequence objects for a particular variant
and all the ReferenceContext objects for that locus, translate
Expand All @@ -490,14 +511,18 @@ def translation_generator(
max_transcript_mismatches : int
protein_sequence_length : int, optional
Truncate protein to be at most this long
Yields a sequence of Translation objects.
"""
for reference_context in reference_contexts:
for variant_sequence in variant_sequences:
translation = translate_variant_sequence(
variant_sequence=variant_sequence,
reference_context=reference_context,
max_transcript_mismatches=max_transcript_mismatches)
max_transcript_mismatches=max_transcript_mismatches,
protein_sequence_length=protein_sequence_length)
if translation is not None:
yield translation

Expand Down Expand Up @@ -549,16 +574,16 @@ def translate_variants(
Translation objects.
"""

# adding 2nt to total RNA sequence length in case we need to clip 1 or 2
# bases of the sequence to match a reference ORF but still want to end up
# with the desired number of amino acids
rna_sequence_length = protein_sequence_length * 3 + 2
# Adding an extra codon to the desired RNA sequence length in case we
# need to clip nucleotides at the start/end of the sequence
rna_sequence_length = (protein_sequence_length + 1) * 3

for variant, variant_sequences in variant_sequences_generator(
variants=variants,
samfile=samfile,
sequence_length=rna_sequence_length,
min_reads=min_reads_supporting_rna_sequence):
min_reads=min_reads_supporting_rna_sequence,
min_mapping_quality=min_mapping_quality):

if len(variant_sequences) == 0:
logging.info(
Expand Down Expand Up @@ -591,7 +616,8 @@ def translate_variants(
translations = translation_generator(
variant_sequences=variant_sequences,
reference_contexts=reference_contexts,
max_transcript_mismatches=max_transcript_mismatches)
max_transcript_mismatches=max_transcript_mismatches,
protein_sequence_length=protein_sequence_length)

yield variant, translations

Expand Down
1 change: 0 additions & 1 deletion isovar/variant_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,6 @@ def variant_reads_generator(
"Chromosome '%s' from variant %s not in alignment file %s" % (
chromosome, variant, samfile.filename))
continue

variant_reads = gather_reads_for_single_variant(
samfile=samfile,
chromosome=chromosome,
Expand Down
9 changes: 7 additions & 2 deletions isovar/variant_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,8 @@ def variant_sequences_dataframe(
variants,
samfile,
sequence_length=VARIANT_CDNA_SEQUENCE_LENGTH,
min_reads=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE):
min_reads=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
min_mapping_quality=MIN_READ_MAPPING_QUALITY):
"""
Creates a dataframe of all detected cDNA sequences for the given variant
collection and alignment file.
Expand All @@ -211,14 +212,18 @@ def variant_sequences_dataframe(
min_reads : int
Minimum number of reads supporting
min_mapping_quality : int
Minimum MAPQ value before a read gets ignored
Returns pandas.DataFrame
"""
df_builder = DataFrameBuilder(VariantSequence)
for variant, variant_sequences, variant_reads in variant_sequences_generator(
variants=variants,
samfile=samfile,
sequence_length=sequence_length,
min_reads=min_reads):
min_reads=min_reads,
min_mapping_quality=min_mapping_quality):
for variant_sequence in variant_sequences:
df_builder.add(variant, variant_sequence)
return df_builder.to_dataframe()
65 changes: 62 additions & 3 deletions test/test_protein_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,69 @@ def test_sort_protein_sequences():
]
eq_(sort_protein_sequences(unsorted_protein_sequences), expected_order)

"""
protein_sequence_length=PROTEIN_SEQUENCE_LEGNTH,
min_reads_supporting_rna_sequence=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
min_transcript_prefix_length=MIN_TRANSCRIPT_PREFIX_LENGTH,
max_transcript_mismatches=MAX_REFERENCE_TRANSCRIPT_MISMATCHES,
max_protein_sequences_per_variant=MAX_PROTEIN_SEQUENCES_PER_VARIANT,
min_mapping_quality=MIN_READ_MAPPING_QUALITY
"""

def test_variants_to_protein_sequences_dataframe_one_sequence_per_variant():
variants = load_vcf("data/b16.f10/b16.vcf")
samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")
df = variants_to_protein_sequences_dataframe(
variants,
samfile,
min_mapping_quality=0,
max_protein_sequences_per_variant=1)
print(df)
eq_(
len(df),
len(variants),
"Expected %d entries, got %d" % (len(variants), len(df)))

def test_variants_to_protein_sequences_dataframe():
def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
# since the B16 BAM has all MAPQ=255 values then all the reads should get dropped
variants = load_vcf("data/b16.f10/b16.vcf")
samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")
df = variants_to_protein_sequences_dataframe(variants, samfile)
df = variants_to_protein_sequences_dataframe(
variants,
samfile,
min_mapping_quality=100,
max_protein_sequences_per_variant=1)
print(df)
assert len(df) == 4, len(df)
eq_(
len(df),
0,
"Expected 0 entries, got %d" % (len(df),))

def test_variants_to_protein_sequences_dataframe_protein_sequence_length():
# since the B16 BAM has all MAPQ=255 values then all the reads should get dropped
variants = load_vcf("data/b16.f10/b16.vcf")
samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")

for desired_length in range(9, 20, 3):
df = variants_to_protein_sequences_dataframe(
variants,
samfile,
max_protein_sequences_per_variant=1,
protein_sequence_length=desired_length)
eq_(
len(df),
len(variants),
"Expected %d entries for protein_sequnece_length=%d, got %d results" % (
len(variants),
desired_length,
len(df)))
protein_sequences = df["amino_acids"]
print(protein_sequences)
protien_sequence_lengths = protein_sequences.str.len()
assert (protien_sequence_lengths == desired_length).all(), protien_sequence_lengths

if __name__ == "__main__":
test_sort_protein_sequences()
test_variants_to_protein_sequences_dataframe_one_sequence_per_variant()
test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality()
test_variants_to_protein_sequences_dataframe_protein_sequence_length()

0 comments on commit a38fe9d

Please sign in to comment.