diff --git a/setup.py b/setup.py
index 9b0046f..dae67b6 100644
--- a/setup.py
+++ b/setup.py
@@ -40,7 +40,7 @@
 setup(
     name='varcode',
     packages=find_packages(),
-    version="0.3.19",
+    version="0.4.0",
     description="Variant annotation in Python",
     long_description=readme,
     url="https://github.com/hammerlab/varcode",
@@ -62,6 +62,5 @@
         'biopython>=1.64',
         'pyvcf>=0.6.7',
         'memoized_property>=1.0.2',
-        'pysam>=0.8.3',
     ],
 )
diff --git a/test/data/reads/gatk_mini_bundle_extract.bam b/test/data/reads/gatk_mini_bundle_extract.bam
deleted file mode 100644
index 0385e19..0000000
Binary files a/test/data/reads/gatk_mini_bundle_extract.bam and /dev/null differ
diff --git a/test/data/reads/gatk_mini_bundle_extract.bam.bai b/test/data/reads/gatk_mini_bundle_extract.bam.bai
deleted file mode 100644
index c408fe9..0000000
Binary files a/test/data/reads/gatk_mini_bundle_extract.bam.bai and /dev/null differ
diff --git a/test/data/reads/rna_chr17_41244936.bam b/test/data/reads/rna_chr17_41244936.bam
deleted file mode 100644
index 0b970b8..0000000
Binary files a/test/data/reads/rna_chr17_41244936.bam and /dev/null differ
diff --git a/test/data/reads/rna_chr17_41244936.bam.bai b/test/data/reads/rna_chr17_41244936.bam.bai
deleted file mode 100644
index 7afd1fb..0000000
Binary files a/test/data/reads/rna_chr17_41244936.bam.bai and /dev/null differ
diff --git a/test/read_evidence_performance.py b/test/read_evidence_performance.py
deleted file mode 100644
index 16e96ad..0000000
--- a/test/read_evidence_performance.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""
-Simple script to measure performance of the read_evidence module.
-
-Usage:
-    %(prog)s /path/to/reads.bam ... --num-loci-per-region X
-
-Example:
-    %(prog)s my.bam 8:101723999-101725999 --num-loci-per-region 2000
-
-"""
-import argparse
-import sys
-import random
-import time
-
-import varcode
-import varcode.read_evidence
-from varcode.locus import Locus
-
-PARSER = argparse.ArgumentParser(usage=__doc__)
-PARSER.add_argument("bam_path")
-PARSER.add_argument("regions", nargs="+")
-PARSER.add_argument("--num-loci-per-region", type=int, default=100)
-
-def parse_region(s):
-    (contig, rest) = s.split(":")
-    (start, end) = rest.split("-")
-    return varcode.Locus.from_inclusive_coordinates(
-        contig, int(start), int(end))
-
-def go(argv):
-    args = PARSER.parse_args(argv)
-
-    loci_regions = [parse_region(s) for s in args.regions]
-
-    loci = []
-    for region in loci_regions:
-        new_loci = random.sample(
-            range(region.start, region.end), args.num_loci_per_region)
-        loci.extend(
-            Locus.from_inclusive_coordinates(region.contig, locus)
-            for locus in new_loci)
-
-    print("Loading pileups for %d loci." % len(loci))
-
-    start = time.time()
-    varcode.read_evidence.PileupCollection.from_bam(args.bam_path, loci)
-    elapsed = time.time() - start
-
-    print("Read pileups for %d loci in %.2f seconds = %f loci / sec" % (
-        len(loci), elapsed, len(loci) / elapsed))
-
-if __name__ == '__main__':
-    go(sys.argv[1:])
diff --git a/test/test_read_evidence.py b/test/test_read_evidence.py
deleted file mode 100644
index 4121e46..0000000
--- a/test/test_read_evidence.py
+++ /dev/null
@@ -1,232 +0,0 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-
-import collections
-from nose.tools import eq_, assert_raises
-import pysam
-from varcode import Variant as VarcodeVariant
-from varcode.read_evidence import PileupCollection
-from varcode import Locus
-from pysam.csamfile import Samfile
-from .data import data_path
-
-# This will be moved into mainline varcode soon. For now, however,
-# these tests use their own simple Variant class.
-Variant = collections.namedtuple("Variant", "locus ref alt")
-
-def filtered_read_names(filtered_evidence):
-    assert filtered_evidence.parent is not None
-    full = set(filtered_evidence.parent.read_attribute('query_name'))
-    filtered = set(filtered_evidence.read_attribute('query_name'))
-    assert filtered - full == set()
-    return full - filtered
-
-def test_read_evidence_rna1_single_base_loci():
-    loci = [
-        Locus.from_inclusive_coordinates("17", 41244936, 41244936), # 0
-        Locus.from_inclusive_coordinates("17", 41244937, 41244937), # 1
-        Locus.from_inclusive_coordinates("17", 41244935, 41244935), # 2
-        Locus.from_inclusive_coordinates("17", 41244933, 41244933), # 3
-        Locus.from_inclusive_coordinates("17", 41244853, 41244853), # 4
-        Locus.from_inclusive_coordinates("17", 41244857, 41244857), # 5
-        Locus.from_inclusive_coordinates("17", 41244864, 41244864), # 6
-        Locus.from_inclusive_coordinates("17", 41244879, 41244879), # 7
-        Locus.from_inclusive_coordinates("17", 41244901, 41244901), # 8
-        Locus.from_inclusive_coordinates("17", 41244910, 41244910), # 9
-        Locus.from_inclusive_coordinates("17", 41244917, 41244917), # 10
-        Locus.from_inclusive_coordinates("17", 41244972, 41244972), # 11
-        Locus.from_inclusive_coordinates("17", 41244973, 41244973), # 12
-        Locus.from_inclusive_coordinates("17", 41245026, 41245026), # 13
-        Locus.from_inclusive_coordinates("17", 41245027, 41245027), # 14
-        Locus.from_inclusive_coordinates("17", 41245019, 41245019), # 15
-        Locus.from_inclusive_coordinates("17", 41245018, 41245018), # 16
-    ]
-    evidence = PileupCollection.from_bam(
-        data_path("reads/rna_chr17_41244936.bam"), loci)
-
-    eq_(evidence.allele_summary(loci[0]), [("A", 11), ("G", 7)])
-    eq_(evidence.allele_summary(loci[1]), [("G", 17), ("A", 1)])
-    eq_(evidence.allele_summary(loci[2]), [("C", 18)])
-    eq_(evidence.allele_summary(loci[3]), [("A", 17), ("G", 1)])
-    eq_(evidence.allele_summary(loci[4]), [("C", 1)])
-    eq_(evidence.allele_summary(loci[5]), [("T", 2)])
-    eq_(evidence.allele_summary(loci[6]), [("T", 4)])
-    eq_(evidence.allele_summary(loci[7]), [("C", 8)])
-    eq_(evidence.allele_summary(loci[8]), [("C", 8)])
-    eq_(evidence.allele_summary(loci[9]), [("C", 9)])
-    eq_(evidence.allele_summary(loci[10]), [("A", 10)])
-    eq_(evidence.allele_summary(loci[11]), [("T", 11)])
-    eq_(evidence.allele_summary(loci[12]), [("T", 11)])
-    eq_(evidence.allele_summary(loci[13]), [("C", 1)])
-    eq_(evidence.allele_summary(loci[14]), [("G", 1)])
-    eq_(evidence.allele_summary(loci[15]), [("T", 8)])
-    eq_(evidence.allele_summary(loci[16]), [("T", 8)])
-
-def test_read_evidence_rna1_multi_base_loci():
-    loci = [
-        Locus.from_inclusive_coordinates("17", 41244853, 41244854), # 0
Locus.from_inclusive_coordinates("17", 41244853, 41244857), # 1 - Locus.from_inclusive_coordinates("17", 41244854, 41244857), # 2 - Locus.from_inclusive_coordinates("17", 41244852, 41244857), # 3 - Locus.from_inclusive_coordinates("17", 41244933, 41244936), # 4 - Locus.from_inclusive_coordinates("17", 41244933, 41244937), # 5 - Locus.from_inclusive_coordinates("17", 41244971, 41244973), # 6 - Locus.from_inclusive_coordinates("17", 41265063, 41265067), # 7 - ] - evidence = PileupCollection.from_bam( - data_path("reads/rna_chr17_41244936.bam"), loci) - eq_(evidence.allele_summary(loci[0]), [("CT", 1)]) - eq_(evidence.allele_summary(loci[1]), [("CTTTT", 1)]) - eq_(evidence.allele_summary(loci[2]), [("TTTT", 1)]) - eq_(evidence.allele_summary(loci[3]), []) - eq_(evidence.allele_summary(loci[4]), - [("AACA", 11), ("AACG", 6), ("GACG", 1)]) - eq_(evidence.allele_summary(loci[5]), - [("AACAG", 10), ("AACGG", 6), ("AACAA", 1), ("GACGG", 1)]) - eq_(evidence.allele_summary(loci[6]), [("ATT", 11)]) - eq_(evidence.allele_summary(loci[7]), [("ACCCG", 1)]) - -def test_read_evidence_gatk_mini_bundle_extract(): - loci = [ - Locus.from_inclusive_coordinates("20", 9999996, 9999996), # 0 - Locus.from_inclusive_coordinates("20", 10260442), # 1 - Locus.from_inclusive_coordinates("20", 10006823), # 2 - Locus.from_inclusive_coordinates("20", 10006819, 10006823), # 3 - Locus.from_inclusive_coordinates("20", 10006819, 10006825), # 4 - Locus.from_inclusive_coordinates("20", 10006822, 10006827), # 5 - Locus.from_inclusive_coordinates("20", 10007175), # 6 - Locus.from_inclusive_coordinates("20", 10007174, 10007176), # 7 - Locus.from_inclusive_coordinates("20", 1, 3), # 8 - Locus.from_inclusive_coordinates("20", 10008796), # 9 - Locus.from_inclusive_coordinates("20", 10008921), # 10 - ] - handle = Samfile(data_path("reads/gatk_mini_bundle_extract.bam")) - evidence = PileupCollection.from_bam(handle, loci) - - eq_(evidence.allele_summary(loci[0]), [("ACT", 9)]) - eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[0]), - [("ACT", 8)]) - eq_(evidence.allele_summary(loci[1]), [("T", 7)]) - eq_(evidence.filter().allele_summary(loci[2]), [("", 6), ("C", 2)]) - eq_(evidence.filter( - drop_duplicates=True, min_base_quality=50).allele_summary(loci[2]), - []) - eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[2]), - [("", 5), ("C", 1)]) - eq_(evidence.filter( - drop_duplicates=True, min_mapping_quality=60).allele_summary( - loci[2]), - [("", 5), ("C", 1)]) - eq_(evidence.filter(drop_duplicates=True, - min_mapping_quality=61).allele_summary(loci[2]), [("", 2)]) - eq_(evidence.filter(drop_duplicates=True, - min_mapping_quality=61).allele_summary(loci[3]), [("A", 2)]) - eq_(evidence.filter(drop_duplicates=True, - min_mapping_quality=61).allele_summary(loci[4]), [("AAA", 2)]) - eq_(evidence.filter(drop_duplicates=True, - min_mapping_quality=61).allele_summary(loci[5]), [("AAAC", 2)]) - eq_(evidence.filter().allele_summary(loci[6]), [("T", 5), ("C", 3)]) - eq_(evidence.filter(min_base_quality=30).allele_summary(loci[6]), - [("T", 4), ("C", 3)]) - eq_(evidence.filter().allele_summary(loci[7]), - [("CTT", 5), ("CCT", 3)]) - eq_(evidence.filter(min_base_quality=30).allele_summary(loci[7]), - [("CTT", 3), ("CCT", 2)]) - eq_(evidence.filter(min_base_quality=32).allele_summary(loci[2]), - [("", 6), ("C", 1)]) - eq_(filtered_read_names(evidence.at(loci[2]).filter(min_base_quality=32)), - {'20GAVAAXX100126:4:3:18352:43857'}) - eq_(evidence.allele_summary(loci[8]), []) - 
-    eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[8]), [])
-    assert_raises(KeyError,
-        evidence.allele_summary,
-        Locus.from_inclusive_coordinates("20", 10009174, 10009176))
-    eq_(filtered_read_names(
-        evidence.at(loci[9]).filter(drop_improper_mate_pairs=True)),
-        {'20FUKAAXX100202:8:68:1530:49310'})
-    eq_(len(evidence.at(loci[8]).read_attribute('mapping_quality')), 0)
-    eq_(list(evidence.at(loci[9]).read_attribute('mapping_quality')),
-        list(evidence.at(loci[9]).read_attributes().mapping_quality))
-    eq_(evidence.filter(drop_duplicates=True).allele_summary(loci[10]),
-        [('C', 2), ('CA', 1), ('CAA', 1)])
-    eq_(evidence.filter(drop_duplicates=True).allele_summary(
-        Locus.from_interbase_coordinates(
-            loci[10].contig, loci[10].start, loci[10].start)),
-        [('', 2), ('A', 1), ('AA', 1)])
-
-
-def test_read_evidence_variant_matching_gatk_mini_bundle_extract():
-    handle = Samfile(data_path("reads/gatk_mini_bundle_extract.bam"))
-
-    loci = [
-        Locus.from_inclusive_coordinates("20", 10008951), # 0
-        Locus.from_inclusive_coordinates("20", 10009053), # 1
-        Locus.from_inclusive_coordinates("20", 10009053, 10009054), # 2
-        Locus.from_inclusive_coordinates("20", 10006822), # 3
-        Locus.from_inclusive_coordinates("20", 10006822, 10006823), # 4
-
-    ]
-    evidence = PileupCollection.from_bam(handle, loci)
-
-    eq_(evidence.match_summary(Variant(loci[0], "A", "C")),
-        [('A', 1), ('C', 4)])
-    eq_(evidence.filter(drop_duplicates=True).match_summary(
-        Variant(loci[0], "A", "C")),
-        [('A', 0), ('C', 3)])
-    eq_(evidence.match_summary(Variant(loci[1], "A", "C")),
-        [('A', 3), ('C', 0)])
-    eq_(evidence.match_summary(Variant(loci[1], "A", "CC")),
-        [('A', 3), ('CC', 0)])
-    eq_(evidence.match_summary(Variant(loci[1], "A", "")),
-        [('A', 3), ('', 0)])
-    eq_(evidence.match_summary(Variant(loci[1], "A", "")),
-        [('A', 3), ('', 0)])
-    eq_(evidence.match_summary(Variant(loci[2], "AT", "")),
-        [('AT', 3), ('', 0)])
-    eq_(evidence.match_summary(Variant(loci[3], "A", "")),
-        [('A', 2), ('', 6)])
-    eq_(evidence.match_summary(Variant(loci[4], "AC", "")),
-        [('AC', 2), ('', 6)])
-    eq_(evidence.match_summary(
-        Variant(loci[4], "AC", ""),
-        lambda e: e.read_attributes().mapping_quality.mean()),
-        [('AC', 60.0), ('', 65.0)])
-
-def test_read_evidence_variant_matching_gatk_bundle_native_varcode_variant():
-    # Try native varcode Variant.
-    handle = Samfile(data_path("reads/gatk_mini_bundle_extract.bam"))
-    locus = Locus.from_inclusive_coordinates("20", 10008951)
-    variant = VarcodeVariant(
-        locus.contig,
-        locus.position + 1, # inclusive not interbase
-        "A",
-        "C")
-    evidence = PileupCollection.from_bam(handle, [variant])
-    eq_(evidence.match_summary(variant),
-        [('A', 1), ('C', 4)])
-
-
-def test_read_evidence_variant_matching_gatk_mini_bundle_extract_warning():
-    filename = data_path("reads/gatk_mini_bundle_extract.bam")
-
-    # Should log a warning but pass.
-    loci = [
-        Locus.from_inclusive_coordinates("20", 10009053, 10009054), # 0
-    ]
-    evidence = PileupCollection.from_bam(filename, loci)
-    eq_(evidence.match_summary(Variant(loci[0], "A", "")),
-        [('A', 0), ('', 0), ('AT', 3)])
-
-
diff --git a/varcode/__init__.py b/varcode/__init__.py
index 582b64a..915b3d4 100644
--- a/varcode/__init__.py
+++ b/varcode/__init__.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .locus import Locus
 from .variant import Variant
 from .collection import Collection
 from .effect_collection import EffectCollection
@@ -52,7 +51,6 @@
 
 __all__ = [
     # basic classes
-    "Locus",
     "Variant",
     "Collection",
    "EffectCollection",
diff --git a/varcode/locus.py b/varcode/locus.py
deleted file mode 100644
index d758f23..0000000
--- a/varcode/locus.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import typechecks
-from collections import namedtuple
-
-class Locus(namedtuple("Locus", "contig start end")):
-    '''
-    A genomic interval in 0-indexed interbase coordinates.
-
-    See this blog post for a discussion on coordinate systems:
-    http://alternateallele.blogspot.com/2012/03/genome-coordinate-conventions.html
-    '''
-
-    @property
-    def positions(self):
-        '''
-        A Python range object giving the bases included in this locus.
-        '''
-        return range(self.start, self.end)
-
-    @property
-    def position(self):
-        '''
-        If this locus spans a single base, this property gives that position.
-        Otherwise, raises a ValueError.
-        '''
-        if self.end != self.start + 1:
-            raise ValueError("Not a single base: %s" % str(self))
-        return self.start
-
-    # Factory functions.
-    @staticmethod
-    def from_inclusive_coordinates(contig, start, end=None):
-        '''
-        Given 1-based coordinates that are inclusive on start and end, return
-        a Locus instance. Locus instances always use 0-based "interbase"
-        coordinates.
-        '''
-        typechecks.require_string(contig)
-        typechecks.require_integer(start)
-        if end is None:
-            end = start
-        typechecks.require_integer(end)
-        return Locus(contig, start - 1, end)
-
-    @staticmethod
-    def from_interbase_coordinates(contig, start, end=None):
-        '''
-        Given coordinates in 0-based interbase coordinates, return a Locus
-        instance.
-        '''
-        typechecks.require_string(contig)
-        typechecks.require_integer(start)
-        if end is None:
-            end = start + 1
-        typechecks.require_integer(end)
-        return Locus(contig, start, end)
diff --git a/varcode/read_evidence/__init__.py b/varcode/read_evidence/__init__.py
deleted file mode 100644
index c89c94c..0000000
--- a/varcode/read_evidence/__init__.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-'''
-This subpackage provides functionality for collecting and filtering aligned
-sequencing reads from a BAM file, determining the alleles they suggest at
-a locus, and assessing the evidence for particular variants.
-
-In this subpackage, the records stored in the BAM file are referred to as
-"alignments," whereas the term "read" may be more familiar. We use the term
-"alignment" for consistency with the SAM specification, and since an
-individual read from the sequencer may generate any number of alignments in
-the case of chimeric alignments and secondary alignments.
-'''
-
-from .util import alignment_key, read_key
-from .pileup import Pileup
-from .pileup_element import PileupElement
-from .pileup_collection import PileupCollection
-
-__all__ = [
-    "PileupCollection",
-    "Pileup",
-    "PileupElement",
-    "alignment_key",
-    "read_key",
-]
diff --git a/varcode/read_evidence/pileup.py b/varcode/read_evidence/pileup.py
deleted file mode 100644
index 6966e9e..0000000
--- a/varcode/read_evidence/pileup.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-
-from collections import OrderedDict
-
-
-class Pileup(object):
-    '''
-    A Pileup is a collection of PileupElement instances at a particular locus.
-
-    Attributes
-    ----------
-    locus : Varcode.Locus
-        The reference locus. Must be length 1, i.e. a single base.
-
-    elements : OrderedDict of PileupElement instances
-        This is logically an ordered set, which we implement as an OrderedDict
-        with all values mapping to None.
-    '''
-    def __init__(self, locus, elements):
-        '''
-        Construct a new Pileup.
-
-        Parameters
-        ----------
-        locus : Varcode.Locus
-            The reference locus. Must be length 1, i.e. a single base.
-
-        elements : iterable of PileupElement
-            The pileup elements. The locus field of these instances must
-            match the locus parameter.
-        '''
-        self.locus = locus
-        self.elements = OrderedDict((e, None) for e in elements)
-        assert all(e.locus == self.locus for e in self.elements)
-
-    def __iter__(self):
-        return iter(self.elements)
-
-    def __len__(self):
-        return len(self.elements)
-
-    def append(self, element):
-        '''
-        Append a PileupElement to this Pileup. If an identical PileupElement is
-        already part of this Pileup, do nothing.
-        '''
-        assert element.locus == self.locus, (
-            "Element locus (%s) != Pileup locus (%s)"
-            % (element.locus, self.locus))
-        self.elements[element] = None
-
-    def update(self, other):
-        '''
-        Add all pileup elements from other into self.
-        '''
-        assert self.locus == other.locus
-        self.elements.update(other.elements)
-
-    def filter(self, filters):
-        '''
-        Apply filters to the pileup elements, and return a new Pileup with the
-        filtered elements removed.
-
-        Parameters
-        ----------
-        filters : list of PileupElement -> bool callables
-            A PileupElement is retained if all filters return True when
-            called on it.
-        '''
-        new_elements = [
-            e for e in self.elements
-            if all(function(e) for function in filters)]
-        return Pileup(self.locus, new_elements)
diff --git a/varcode/read_evidence/pileup_collection.py b/varcode/read_evidence/pileup_collection.py
deleted file mode 100644
index f301faa..0000000
--- a/varcode/read_evidence/pileup_collection.py
+++ /dev/null
@@ -1,624 +0,0 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-
-from collections import namedtuple, defaultdict, OrderedDict
-import logging
-import itertools
-import re
-
-import pandas
-import pysam
-from pysam.csamfile import Samfile
-import typechecks
-import pyensembl
-
-from .. import Locus
-from . import Pileup, PileupElement, alignment_key, read_key
-
-MatchingEvidence = namedtuple("MatchingEvidence", "ref alt other")
-
-class PileupCollection(object):
-    '''
-    A collection of Pileup instances for some loci.
-
-    Attributes
-    ----------
-    pileups (optional) : dict of Locus -> Pileup
-        A map from 1-base Locus instances to the Pileup at that locus.
-
-    parent (optional) : PileupCollection
-        For PileupCollection instances that are derived (e.g. by filtering)
-        from another instance, this attribute tracks the parent instance.
-    '''
-    def __init__(self, pileups=None, parent=None):
-        '''
-        Construct a new PileupCollection.
-        '''
-        if pileups is None:
-            pileups = {}
-        self.pileups = pileups
-        self.parent = parent
-
-    def pileup(self, locus):
-        '''
-        Given a 1-base locus, return the Pileup at that locus.
-
-        Raises a KeyError if this PileupCollection does not have a Pileup at
-        the specified locus.
-        '''
-        locus = to_locus(locus)
-        if len(locus.positions) != 1:
-            raise ValueError("Not a single-base locus: %s" % locus)
-        return self.pileups[locus]
-
-    def at(self, *loci):
-        '''
-        Return a new PileupCollection instance including only pileups for
-        the specified loci.
-        '''
-        loci = [to_locus(obj) for obj in loci]
-        single_position_loci = []
-        for locus in loci:
-            for position in locus.positions:
-                single_position_loci.append(
-                    Locus.from_interbase_coordinates(locus.contig, position))
-        pileups = dict(
-            (locus, self.pileups[locus]) for locus in single_position_loci)
-        return PileupCollection(pileups, self)
-
-    def loci(self):
-        '''
-        Returns the loci included in this instance.
-        '''
-        return list(self.pileups)
-
-    def reads(self):
-        '''
-        The reads in this PileupCollection. All reads will have an alignment
-        that overlaps at least one of the included loci.
-
-        Since SAM (and pysam) have no real notion of a "read", the returned
-        instances are actually pysam.AlignedSegment instances, (i.e.
-        alignments). However, only one alignment will be returned by this
-        method per read.
-
-        Returns
-        ----------
-        List of pysam.AlignedSegment instances. If a particular read has more
-        than one alignment in this PileupCollection (e.g. one primary and one
-        secondary), then the alignment returned is the one with the highest
-        mapping quality.
-        '''
-        # TODO: Optimize this.
-
-        def alignment_precedence(pysam_alignment_record):
-            return pysam_alignment_record.mapping_quality
-
-        result = {}
-        for pileup in self.pileups.values():
-            for e in pileup.elements:
-                key = read_key(e.alignment)
-                if key not in result or (
-                        alignment_precedence(e.alignment) >
-                        alignment_precedence(result[key])):
-                    result[key] = e.alignment
-        return list(result.values())
-
-    def num_reads(self):
-        '''
-        Returns the number of reads in this PileupCollection.
-        '''
-        return len(self.reads())
-
-    def read_attribute(self, attribute):
-        '''
-        Query a read attribute across all reads in this PileupCollection.
-
-        Parameters
-        ----------
-        attribute : string
-            Attribute to query. See `PileupCollection.read_attributes` for
-            possible attributes.
-
-        Returns
-        ----------
-        pandas.Series instance giving values for the attribute in each read.
-        The order of reads is fixed, so multiple calls to this function will
-        return corresponding values.
-        '''
-        return self.read_attributes([attribute])[attribute]
-
-    def read_attributes(self, attributes=None):
-        '''
-        Collect read attributes across reads in this PileupCollection into a
-        pandas.DataFrame.
-
-        Valid attributes are the following properties of a pysam.AlignedSegment
-        instance. See:
-        http://pysam.readthedocs.org/en/latest/api.html#pysam.AlignedSegment
-        for the meaning of these attributes.
-
-        * cigarstring
-        * flag
-        * inferred_length
-        * is_duplicate
-        * is_paired
-        * is_proper_pair
-        * is_qcfail
-        * is_read1
-        * is_read2
-        * is_reverse
-        * is_secondary
-        * is_unmapped
-        * mapping_quality
-        * mate_is_reverse
-        * mate_is_unmapped
-        * next_reference_id
-        * next_reference_start
-        * query_alignment_end
-        * query_alignment_length
-        * query_alignment_qualities
-        * query_alignment_sequence
-        * query_alignment_start
-        * query_length
-        * query_name
-        * reference_end
-        * reference_id
-        * reference_length
-        * reference_start
-        * template_length
-
-        (Note: the above list is parsed into the _READ_ATTRIBUTE_NAMES class
-        variable, so be careful when modifying it.)
-
-        Additionally, for alignment "tags" (arbitrary key values associated
-        with an alignment), a column of the form "TAG_{tag name}" is
-        included.
-
-        Finally, the column "pysam_alignment_record" gives the underlying
-        `pysam.AlignedSegment` instances.
-
-        Parameters
-        ----------
-        attributes (optional): list of strings
-            List of columns to include. If unspecified, all columns are
-            included in the result.
-
-        Returns
-        ----------
-        pandas.DataFrame of read attributes.
-
-        '''
-        def include(attribute):
-            return attributes is None or attribute in attributes
-
-        reads = self.reads()
-
-        possible_column_names = list(PileupCollection._READ_ATTRIBUTE_NAMES)
-        result = OrderedDict(
-            (name, [getattr(read, name) for read in reads])
-            for name in PileupCollection._READ_ATTRIBUTE_NAMES
-            if include(name))
-
-        # Add tag columns.
-        if reads:
-            tag_dicts = [dict(x.get_tags()) for x in reads]
-            tag_keys = set.union(
-                *[set(item.keys()) for item in tag_dicts])
-            for tag_key in sorted(tag_keys):
-                column_name = "TAG_%s" % tag_key
-                possible_column_names.append(column_name)
-                if include(column_name):
-                    result[column_name] = [d.get(tag_key) for d in tag_dicts]
-
-        # Lastly, we include the underlying pysam alignment record.
-        possible_column_names.append("pysam_alignment_record")
-        if include("pysam_alignment_record"):
-            result["pysam_alignment_record"] = reads
-
-        # If particular attributes were requested, check that they're here.
-        if attributes is not None:
-            for attribute in attributes:
-                if attribute not in result:
-                    raise ValueError(
-                        "No such attribute: %s. Valid attributes are: %s"
-                        % (attribute, " ".join(possible_column_names)))
-            assert set(attributes) == set(result)
-
-        return pandas.DataFrame(result)
-
-    # Parse the bulleted list of attributes in the previous method's docstring.
-    _READ_ATTRIBUTE_NAMES = []
-    for line in read_attributes.__doc__.split("\n"):
-        match = re.match("\s+\*\s*(\w+)\s*", line)
-        if match:
-            _READ_ATTRIBUTE_NAMES.append(match.groups()[0])
-
-    def group_by_allele(self, locus):
-        '''
-        Split the PileupCollection by the alleles suggested by the reads at the
-        specified locus.
-
-        If a read has an insertion immediately following the locus, then the
-        insertion is included in the allele. For example, if locus is the
-        1-base range [5,6), one allele might be "AGA", indicating that at
-        locus 5 some read has an "A" followed by a 2-base insertion ("GA"). If
-        a read has a deletion at the specified locus, the allele is the empty
-        string.
-
-        The given locus may include any number of bases. If the locus includes
-        multiple bases, then the alleles consist of all bases aligning to that
-        range in any read. Note that only sequences actually sequenced in a
-        particular read are included. For example, if one read has "ATT" at a
-        locus and another read has "GCC", then the alleles are "ATT" and
-        "GCC", but not "GTT". That is, the bases in each allele are phased. For
-        this reason, only reads that overlap the entire locus are included.
-
-        If the locus is an empty interval (e.g. [5,5) ), then the alleles
-        consist only of inserted bases. In this example, only bases inserted
-        immediately after locus 5 would be included (but *not* the base
-        actually at locus 5). In the previous insertion example, the allele
-        would be "GA", indicating a 2-base insertion. Reads that have no
-        insertion at that position (matches or deletions) would have the empty
-        string as their allele.
-
-        Parameters
-        ----------
-        locus : Locus
-            The reference locus, encompassing 0 or more bases.
-
-        Returns
-        ----------
-        A dict of string -> PileupCollection. The keys are nucleotide strings
-        giving the bases sequenced at the locus, and the values are
-        PileupCollection instances of the alignments that support that allele.
-
-        '''
-        locus = to_locus(locus)
-        read_to_allele = None
-        loci = []
-        if locus.positions:
-            # Our locus includes at least one reference base.
-            for position in locus.positions:
-                base_position = Locus.from_interbase_coordinates(
-                    locus.contig, position)
-                loci.append(base_position)
-                new_read_to_allele = {}
-                for element in self.pileups[base_position]:
-                    allele_prefix = ""
-                    key = alignment_key(element.alignment)
-                    if read_to_allele is not None:
-                        try:
-                            allele_prefix = read_to_allele[key]
-                        except KeyError:
-                            continue
-                    allele = allele_prefix + element.bases
-                    new_read_to_allele[key] = allele
-                read_to_allele = new_read_to_allele
-        else:
-            # Our locus is between reference bases.
-            position_before = Locus.from_interbase_coordinates(
-                locus.contig, locus.start)
-            loci.append(position_before)
-            read_to_allele = {}
-            for element in self.pileups[position_before]:
-                allele = element.bases[1:]
-                read_to_allele[alignment_key(element.alignment)] = allele
-
-        split = defaultdict(lambda: PileupCollection(pileups={}, parent=self))
-        for locus in loci:
-            pileup = self.pileups[locus]
-            for e in pileup.elements:
-                key = read_to_allele.get(alignment_key(e.alignment))
-                if key is not None:
-                    if locus in split[key].pileups:
-                        split[key].pileups[locus].append(e)
-                    else:
-                        split[key].pileups[locus] = Pileup(locus, [e])
-
-        # Sort by number of reads (descending). Break ties with the
-        # lexicographic ordering of the allele string.
-        def sorter(pair):
-            (allele, pileup_collection) = pair
-            return (-1 * pileup_collection.num_reads(), allele)
-        return OrderedDict(sorted(split.items(), key=sorter))
-
-    def allele_summary(self, locus, score=lambda x: x.num_reads()):
-        '''
-        Convenience method to summarize the evidence for each of the alleles
-        present at a locus. Applies a score function to the PileupCollection
-        associated with each allele.
-
-        See also `PileupCollection.group_by_allele`.
-
-        Parameters
-        ----------
-        locus : Locus
-            The reference locus, encompassing 0 or more bases.
-
-        score (optional) : PileupCollection -> object
-            Function to apply to summarize the evidence for each allele.
-            Default: count number of reads.
-
-        Returns
-        ----------
-        List of (allele, score) pairs.
-        '''
-        locus = to_locus(locus)
-        return [
-            (allele, score(x))
-            for (allele, x) in self.group_by_allele(locus).items()
-        ]
-
-    def group_by_match(self, variant):
-        '''
-        Given a variant, split the PileupCollection based on whether the
-        data supports the reference allele, the alternate allele, or neither.
-
-        Parameters
-        ----------
-        variant : Variant
-            The variant. Must have fields 'locus', 'ref', and 'alt'.
-
-        Returns
-        ----------
-        A MatchingEvidence named tuple with fields (ref, alt, other),
-        each of which is a string -> PileupCollection dict mapping alleles
-        to the PileupCollection of evidence supporting them.
-        '''
-        locus = to_locus(variant)
-        if len(variant.ref) != len(locus.positions):
-            logging.warning(
-                "Ref is length %d but locus has %d bases in variant: %s" %
-                (len(variant.ref), len(locus.positions), str(variant)))
-
-        alleles_dict = self.group_by_allele(locus)
-        single_base_loci = [
-            Locus.from_interbase_coordinates(locus.contig, position)
-            for position in locus.positions
-        ]
-        empty_pileups = dict(
-            (locus, Pileup(locus=locus, elements=[]))
-            for locus in single_base_loci)
-        empty_collection = PileupCollection(pileups=empty_pileups, parent=self)
-
-        ref = {variant.ref: alleles_dict.pop(variant.ref, empty_collection)}
-        alt = {variant.alt: alleles_dict.pop(variant.alt, empty_collection)}
-        other = alleles_dict
-
-        # TODO: consider end of read issues for insertions
-        return MatchingEvidence(ref, alt, other)
-
-    def match_summary(self, variant, score=lambda x: x.num_reads()):
-        '''
-        Convenience method to summarize the evidence for and against a variant
-        using a user-specified score function.
-
-        See also `PileupCollection.group_by_match`.
-
-        Parameters
-        ----------
-        variant : Variant
-            The variant. Must have fields 'locus', 'ref', and 'alt'.
-
-        score (optional) : PileupCollection -> object
-            Function to apply to summarize the evidence for each allele.
-            Default: count number of reads.
-
-        Returns
-        ----------
-        List of (allele, score) pairs.
-        This list will always have at least two elements. The first pair in
-        the list is the reference allele. The second pair is the alternate.
-        The subsequent items give the "third" alleles (neither ref nor alt),
-        if any.
-        '''
-        split = self.group_by_match(variant)
-
-        def name(allele_to_pileup_collection):
-            return ",".join(allele_to_pileup_collection)
-
-        def aggregate_and_score(pileup_collections):
-            merged = PileupCollection.merge(*pileup_collections)
-            return score(merged)
-
-        result = [
-            (name(split.ref), aggregate_and_score(split.ref.values())),
-            (name(split.alt), aggregate_and_score(split.alt.values())),
-        ]
-        result.extend(
-            (allele, score(collection))
-            for (allele, collection) in split.other.items())
-        return result
-
-    def filter(self,
-            drop_duplicates=False,
-            drop_improper_mate_pairs=False,
-            min_mapping_quality=None,
-            min_base_quality=None,
-            filters=None):
-        '''
-        Return a new PileupCollection that includes only pileup elements
-        satisfying the specified criteria.
-
-        Parameters
-        ----------
-        drop_duplicates (optional, default False) : boolean
-            Remove alignments with the is_duplicate flag.
-
-        drop_improper_mate_pairs (optional, default False) : boolean
-            Retain only alignments that have mapped mate pairs, where one
-            alignment in the pair is on the forward strand and the other
-            is on the reverse.
-
-        min_mapping_quality (optional) : int
-            If specified, retain only alignments with mapping quality >= the
-            specified threshold.
-
-        min_base_quality (optional) : int
-            If specified, retain only pileup elements where the base quality
-            for the bases aligning to the pileup's locus are >= the specified
-            threshold.
-
-        filters (optional) : list of PileupElement -> bool functions
-            User-specified filter functions to apply. This will be called on
-            each PileupElement, and should return True if the element should
-            be retained.
-
-        Returns
-        ----------
-        A new PileupCollection that includes the subset of the pileup elements
-        matching all the specified filters.
-        '''
-
-        if filters is None:
-            filters = []
-        if drop_duplicates:
-            filters.append(lambda e: not e.alignment.is_duplicate)
-        if drop_improper_mate_pairs:
-            filters.append(lambda e: e.alignment.is_proper_pair)
-        if min_mapping_quality is not None:
-            filters.append(
-                lambda e: e.alignment.mapping_quality >= min_mapping_quality)
-        if min_base_quality is not None:
-            filters.append(
-                lambda e: e.min_base_quality >= min_base_quality)
-        pileups = OrderedDict(
-            (locus, pileup.filter(filters))
-            for (locus, pileup)
-            in self.pileups.items())
-        return PileupCollection(pileups=pileups, parent=self)
-
-    def merge(self, *others):
-        '''
-        Return a new PileupCollection that is the union of self and the other
-        specified collections.
-        '''
-        new_pileups = {}
-        for collection in (self,) + others:
-            for (locus, pileup) in collection.pileups.items():
-                if locus in new_pileups:
-                    new_pileups[locus].update(pileup)
-                else:
-                    new_pileups[locus] = Pileup(locus, pileup.elements)
-        return PileupCollection(new_pileups, parent=self)
-
-    @staticmethod
-    def from_bam(pysam_samfile, loci):
-        '''
-        Create a PileupCollection for a set of loci from a BAM file.
-
-        Parameters
-        ----------
-        pysam_samfile : `pysam.csamfile.Samfile` instance, or filename string
-            to a BAM file. The BAM file must be indexed.
-
-        loci : list of Locus instances
-            Loci to collect pileups for.
-
-        Returns
-        ----------
-        PileupCollection instance containing pileups for the specified loci.
-        All alignments in the BAM file are included (e.g.
-        duplicate reads, secondary alignments, etc.). See
-        `PileupCollection.filter` if these need to be removed.
-        '''
-
-        loci = [to_locus(obj) for obj in loci]
-
-        close_on_completion = False
-        if typechecks.is_string(pysam_samfile):
-            pysam_samfile = Samfile(pysam_samfile)
-            close_on_completion = True
-
-        try:
-            # Map from pyensembl normalized chromosome names used in Variant to
-            # the names used in the BAM file.
-            chromosome_name_map = {}
-            for name in pysam_samfile.references:
-                normalized = pyensembl.locus.normalize_chromosome(name)
-                chromosome_name_map[normalized] = name
-
-            result = PileupCollection({})
-
-            # Optimization: we sort variants so our BAM reads are localized.
-            locus_iterator = itertools.chain.from_iterable(
-                (Locus.from_interbase_coordinates(locus_interval.contig, pos)
-                    for pos
-                    in locus_interval.positions)
-                for locus_interval in sorted(loci))
-            for locus in locus_iterator:
-                result.pileups[locus] = Pileup(locus, [])
-                try:
-                    chromosome = chromosome_name_map[locus.contig]
-                except KeyError:
-                    logging.warn("No such contig in bam: %s" % locus.contig)
-                    continue
-                columns = pysam_samfile.pileup(
-                    chromosome,
-                    locus.position,
-                    locus.position + 1, # exclusive, 0-indexed
-                    truncate=True,
-                    stepper="nofilter")
-                try:
-                    column = next(columns)
-                except StopIteration:
-                    # No reads align to this locus.
-                    continue
-
-                # Note that storing the pileups here is necessary, since the
-                # subsequent assertion will invalidate our column.
-                pileups = column.pileups
-                assert list(columns) == [] # column is invalid after this.
-                for pileup_read in pileups:
-                    if not pileup_read.is_refskip:
-                        element = PileupElement.from_pysam_alignment(
-                            locus, pileup_read)
-                        result.pileups[locus].append(element)
-            return result
-        finally:
-            if close_on_completion:
-                pysam_samfile.close()
-
-def to_locus(variant_or_locus):
-    """
-    Return a Locus object for a Variant instance.
-
-    This is necessary since the read evidence module expects Variant instances
-    to have a locus attribute set to a varcode.Locus instance of interbase
-    genomic coordinates. The rest of varcode uses a different Variant class,
-    but will eventually be transitioned to interbase coordinates.
-
-    See test/test_read_evidence.py for a definition of the Variant class that
-    the read_evidence module is meant to work with.
-
-    This function can be passed a regular varcode.Variant instance (with fields
-    start, end, contig, etc.), a different kind of variant object that has a
-    'locus' field, or a varcode.Locus. It will return a varcode.Locus instance.
-
-    This should all get cleaned up once varcode switches to interbase
-    coordinates and we standardize on a Variant class.
-    """
-    if isinstance(variant_or_locus, Locus):
-        return variant_or_locus
-    try:
-        return variant_or_locus.locus
-    except AttributeError:
-        # IMPORTANT: if varcode someday changes from inclusive to interbase
-        # coordinates, this will need to be updated.
-        return Locus.from_inclusive_coordinates(
-            variant_or_locus.contig,
-            variant_or_locus.start,
-            variant_or_locus.end)
diff --git a/varcode/read_evidence/pileup_element.py b/varcode/read_evidence/pileup_element.py
deleted file mode 100644
index 42cc365..0000000
--- a/varcode/read_evidence/pileup_element.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-
-from . import alignment_key
-
-class PileupElement(object):
-    '''
-    A PileupElement represents the segment of an alignment that aligns to a
-    particular base in the reference.
-
-    Attributes
-    ----------
-    locus : Varcode.Locus
-        The reference locus. Must be length 1, i.e. a single base.
-
-    offset_start : int
-        0-based start offset into the alignment sequence, inclusive
-
-    offset_end : int
-        0-based end offset into the alignment sequence, exclusive
-
-    alignment : pysam.AlignedSegment
-        pysam alignment instance
-
-    alignment_key : tuple
-        value computed from the alignment instance that uniquely specifies its
-        properties. Used for comparisons since pysam.AlignedSegment instances
-        do not support a useful notion of equality (they compare using object
-        identity). See `read_evidence.alignment_key` for the implementation of
-        this key.
-    '''
-    def __init__(self, locus, offset_start, offset_end, alignment):
-        '''
-        Construct a PileupElement object.
-        '''
-        assert offset_end >= offset_start, \
-            "offset_start=%d > offset_end=%d" % (offset_start, offset_end)
-        self.locus = locus
-        self.offset_start = offset_start
-        self.offset_end = offset_end
-        self.alignment = alignment
-        self.alignment_key = alignment_key(self.alignment)
-
-    def fields(self):
-        '''
-        Fields that should be considered for our notion of object equality.
-        '''
-        return (
-            self.locus, self.offset_start, self.offset_end, self.alignment_key)
-
-    def __eq__(self, other):
-        return hasattr(other, "fields") and self.fields() == other.fields()
-
-    def __hash__(self):
-        return hash(self.fields())
-
-    @property
-    def bases(self):
-        '''
-        The sequenced bases in the alignment that align to this locus in the
-        genome, as a string.
-
-        Empty string in the case of a deletion. String of length > 1 if there
-        is an insertion here.
-        '''
-        sequence = self.alignment.query_sequence
-        assert self.offset_end <= len(sequence), \
-            "End offset=%d > sequence length=%d. CIGAR=%s. SEQUENCE=%s" % (
-                self.offset_end,
-                len(sequence),
-                self.alignment.cigarstring,
-                sequence)
-        return sequence[self.offset_start:self.offset_end]
-
-    @property
-    def base_qualities(self):
-        '''
-        The phred-scaled base quality scores corresponding to `self.bases`, as
-        a list.
-        '''
-        return self.alignment.query_qualities[
-            self.offset_start:self.offset_end]
-
-    @property
-    def min_base_quality(self):
-        '''
-        The minimum of the base qualities. In the case of a deletion, where
-        there are no bases in this PileupElement, the minimum is taken over
-        the sequenced bases immediately before and after the deletion.
-        '''
-        try:
-            return min(self.base_qualities)
-        except ValueError:
-            # We are mid-deletion. We return the minimum of the adjacent bases.
-            assert self.offset_start == self.offset_end
-            adjacent_qualities = [
-                self.alignment.query_qualities[offset]
-                for offset in [self.offset_start - 1, self.offset_start]
-                if 0 <= offset < len(self.alignment.query_qualities)
-            ]
-            return min(adjacent_qualities)
-
-    @staticmethod
-    def from_pysam_alignment(locus, pileup_read):
-        '''
-        Factory function to create a new PileupElement from a pysam
-        `PileupRead`.
-
-        Parameters
-        ----------
-        locus : varcode.Locus
-            Reference locus for which to construct a PileupElement. Must
-            include exactly one base.
-
-        pileup_read : pysam.calignmentfile.PileupRead
-            pysam PileupRead instance. Its alignment must overlap the locus.
-
-        Returns
-        ----------
-        PileupElement
-
-        '''
-        assert not pileup_read.is_refskip, (
-            "Can't create a PileupElement in a refskip (typically an intronic "
-            "gap in an RNA alignment)")
-
-        # Pysam has an `aligned_pairs` method that gives a list of
-        # (offset, locus) pairs indicating the correspondence between bases in
-        # the alignment and reference loci. Here we use that to compute
-        # offset_start and offset_end.
-        #
-        # This is slightly tricky in the case of insertions and deletions.
-        # Here are examples of the desired logic.
-        #
-        # Target locus = 1000
-        #
-        # (1) Simple case: matching bases.
-        #
-        # OFFSET   LOCUS
-        # 0        999
-        # 1        1000
-        # 2        1001
-        #
-        # DESIRED RESULT: offset_start=1, offset_end=2.
-        #
-        #
-        # (2) A 1 base insertion at offset 2.
-        #
-        # OFFSET   LOCUS
-        # 0        999
-        # 1        1000
-        # 2        None
-        # 3        1001
-        #
-        # DESIRED RESULT: offset_start = 1, offset_end=3.
-        #
-        #
-        # (3) A 2 base deletion at loci 1000 and 1001.
-        #
-        # OFFSET   LOCUS
-        # 0        999
-        # None     1000
-        # None     1001
-        # 1        1002
-        #
-        # DESIRED RESULT: offset_start = 1, offset_end=1.
-        #
-        offset_start = None
-        offset_end = len(pileup_read.alignment.query_sequence)
-        # TODO: doing this with get_blocks() may be faster.
-        for (offset, position) in pileup_read.alignment.aligned_pairs:
-            if offset is not None and position is not None:
-                if position == locus.position:
-                    offset_start = offset
-                elif position > locus.position:
-                    offset_end = offset
-                    break
-        if offset_start is None:
-            offset_start = offset_end
-
-        assert pileup_read.is_del == (offset_end - offset_start == 0), \
-            "Deletion=%s but | [%d,%d) |=%d for locus %d in: \n%s" % (
-                pileup_read.is_del,
-                offset_start,
-                offset_end,
-                offset_end - offset_start,
-                locus.position,
-                pileup_read.alignment.aligned_pairs)
-
-        assert offset_end >= offset_start
-        result = PileupElement(
-            locus, offset_start, offset_end, pileup_read.alignment)
-        return result
-
diff --git a/varcode/read_evidence/util.py b/varcode/read_evidence/util.py
deleted file mode 100644
index 252d4a8..0000000
--- a/varcode/read_evidence/util.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-
-def alignment_key(pysam_alignment_record):
-    '''
-    Return the identifying attributes of a `pysam.AlignedSegment` instance.
-    This is necessary since these objects do not support a useful notion of
-    equality (they compare on identity by default).
-    '''
-    return (
-        read_key(pysam_alignment_record),
-        pysam_alignment_record.query_alignment_start,
-        pysam_alignment_record.query_alignment_end,
-    )
-
-def read_key(pysam_alignment_record):
-    '''
-    Given a `pysam.AlignedSegment` instance, return the attributes identifying
-    the *read* it comes from (not the alignment). There may be more than one
-    alignment for a read, e.g. chimeric and secondary alignments.
-    '''
-    return (
-        pysam_alignment_record.query_name,
-        pysam_alignment_record.is_duplicate,
-        pysam_alignment_record.is_read1,
-        pysam_alignment_record.is_read2,
-    )
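
Reviewer note: the Locus class deleted above was the public helper for converting between 1-based inclusive and 0-based interbase coordinates. A minimal sketch of the conversion it performed, runnable only against varcode 0.3.x (a checkout from before this patch):

    from varcode import Locus  # removed from the public API by this patch

    # chr17 position 41,244,936 in 1-based inclusive coordinates is the
    # half-open interbase interval [41244935, 41244936).
    a = Locus.from_inclusive_coordinates("17", 41244936)
    b = Locus.from_interbase_coordinates("17", 41244935)
    assert a == b == Locus("17", 41244935, 41244936)
    assert a.position == 41244935  # single-base locus: its interbase start
    assert list(a.positions) == [41244935]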
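
The deleted read_evidence workflow, for reference: build a PileupCollection from an indexed BAM for a set of loci, optionally filter the pileup elements, then summarize the alleles. A sketch against the removed 0.3.x API; the BAM below is also deleted by this patch, and the expected summaries are copied from the deleted tests:

    from varcode import Locus
    from varcode.read_evidence import PileupCollection  # removed by this patch

    bam_path = "test/data/reads/gatk_mini_bundle_extract.bam"  # also removed
    locus = Locus.from_inclusive_coordinates("20", 10006823)
    evidence = PileupCollection.from_bam(bam_path, [locus])

    # Unfiltered: duplicate reads and low-quality bases are included.
    print(evidence.filter().allele_summary(locus))  # [("", 6), ("C", 2)]

    # filter() returns a new PileupCollection whose .parent is the original.
    strict = evidence.filter(drop_duplicates=True, min_mapping_quality=60)
    print(strict.allele_summary(locus))             # [("", 5), ("C", 1)]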
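
Similarly, match_summary() scored reads for and against a specific variant; it accepted any object with locus, ref, and alt fields (or a native varcode.Variant). A sketch mirroring the deleted matching test, under the same pre-patch assumptions:

    import collections

    from varcode import Locus
    from varcode.read_evidence import PileupCollection  # removed by this patch

    # The lightweight duck-typed variant the deleted tests used.
    Variant = collections.namedtuple("Variant", "locus ref alt")

    bam_path = "test/data/reads/gatk_mini_bundle_extract.bam"  # also removed
    locus = Locus.from_inclusive_coordinates("20", 10008951)
    variant = Variant(locus, "A", "C")

    evidence = PileupCollection.from_bam(bam_path, [variant])
    print(evidence.match_summary(variant))
    # [('A', 1), ('C', 4)] in the deleted test: one read supports the ref,
    # four support the alt.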