Merge pull request #7 from hammerlab/cli

CLI
openvax · Apr 15, 2016 · 89b2a4e · 89b2a4e
2 parents 44f020f + a38fe9d
commit 89b2a4e
Show file tree

Hide file tree

Showing 64 changed files with 8,959 additions and 1,155 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,45 @@
+sudo: false  # Use container-based infrastructure
+language: python
+python:
+  - "2.7"
+  - "3.4"
+before_install:
+  # Commands below copied from: http://conda.pydata.org/docs/travis.html
+  # We do this conditionally because it saves us some downloading if the
+  # version is the same.
+  - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
+      wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh;
+    else
+      wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
+    fi
+  - bash miniconda.sh -b -p $HOME/miniconda
+  - export PATH="$HOME/miniconda/bin:$PATH"
+  # reset the shell's lookup table for program name to path mappings
+  - hash -r
+  - conda config --set always_yes yes --set changeps1 no
+  - conda update -q conda
+  # Useful for debugging any issues with conda
+  - conda info -a
+addons:
+  apt:
+    packages:
+    # install pandoc for use with pypandoc for converting the README
+    # from markdown to RST
+    - pandoc
+install:
+  - >
+      conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION
+      numpy scipy nose pandas matplotlib
+  - source activate test-environment
+  - pip install pypandoc
+  - pip install -r requirements.txt
+  - pip install .
+  - pip install coveralls
+script:
+  - pyensembl install --release 75 --species human
+  - pyensembl install --release 83 --species human
+  - pyensembl install --release 83 --species mouse
+  # run tests
+  - nosetests test --with-coverage --cover-package=isovar  && ./lint.sh
+after_success:
+  coveralls
diff --git a/README.md b/README.md
@@ -1,4 +1,51 @@
-[![DOI](https://zenodo.org/badge/18834/hammerlab/isovar.svg)](https://zenodo.org/badge/latestdoi/18834/hammerlab/isovar)
+[![DOI](https://zenodo.org/badge/18834/hammerlab/isovar.svg)](https://zenodo.org/badge/latestdoi/18834/hammerlab/isovar) [![Build Status](https://travis-ci.org/hammerlab/isovar.svg?branch=master)](https://travis-ci.org/hammerlab/isovar)
 
 # isovar
 Abundance quantification of distinct transcript sequences containing somatic variants from cancer RNAseq
+
+## Example
+
+```sh
+$ isovar-protein-sequences.py  \
+    --vcf somatic-variants.vcf  \
+    --bam rnaseq.bam \
+    --genome hg19 \
+    --min-reads 2 \
+    --protein-sequence-length 30 \
+    --output isovar-results.csv
+
+  chr       pos ref alt                      amino_acids  \
+0  22  46931060   A   C   FGVEAVDHGWPSMSSGSSWRASRGPPPPPR
+1  22  46931062   G   A  CFGVEAVDHGWPPMSLAHGGPAVVHRLHPEA
+
+   variant_aa_interval_start  variant_aa_interval_end ends_with_stop_codon  \
+0                         16                       17                False
+1                         16                       17                False
+
+  frameshift  translations_count  supporting_variant_reads_count  \
+0      False                   1                               1
+1      False                   1                               1
+
+   total_variant_reads  supporting_transcripts_count  total_transcripts  \
+0                  130                             2                  2
+1                  127                             2                  2
+
+     gene
+0  CELSR1
+1  CELSR1
+```
+
+## Algorithm/Design
+
+The one line explanation of isovar: `ProteinSequence = VariantSequence + ReferenceContext`.
+
+A little more detail about the algorithm:
+  1. Scan through an RNAseq BAM file and extract sequences overlapping a variant locus (represented by `ReadAtLocus`)
+  2. Make sure that the read contains the variant allele and split its sequence into prefix/alt/suffix string parts (represented by `VariantRead`)
+  3. Combine multiple `VariantRead` records into a `VariantSequence`
+  4. Gather possible reading frames for distinct reference sequences around the variant locus (represented by `ReferenceContext`).
+  5. Use the reading frame from a `ReferenceContext` to translate a `VariantSequence` into a protein fragment (represented by `Translation`).
+  6. Multiple distinct variant sequences and reference contexts can generate the same translations, so we aggregate those equivalent `Translation` objects into a `ProteinSequence`.
+
+Since we may not want to deal with *every* possible translation of *every* distinct sequence detected around a variant, `isovar` sorts the variant sequences by the number of supporting reads and the reference contexts in order of protein length and a configurable number of
+translated protein fragments can be kept from this ordering.
diff --git a/isovar/__init__.py b/isovar/__init__.py
@@ -14,32 +14,4 @@
 
 from __future__ import print_function, division, absolute_import
 
-from .variant_reads import gather_variant_reads
-from .overlapping_reads import gather_overlapping_reads
-from .assembly import assemble_transcript_fragments
-from .nucleotide_counts import most_common_nucleotides
-from .common import (
-    group_unique_sequences,
-    nucleotides,
-    index_to_nucleotide,
-    nucleotide_to_index
-)
-from .sequence_counts import sequence_counts
-from .protein_sequences import (
-    variant_to_protein_sequences,
-    variants_to_protein_sequences,
-)
-
-__all__ = [
-    "gather_variant_reads",
-    "gather_overlapping_reads",
-    "assemble_transcript_fragments",
-    "most_common_nucleotides",
-    "group_unique_sequences",
-    "nucleotides",
-    "index_to_nucleotide",
-    "nucleotide_to_index",
-    "sequence_counts",
-    "variant_to_protein_sequences",
-    "variants_to_protein_sequences"
-]
+__version__ = "0.0.2"
diff --git a/isovar/assembly.py b/isovar/assembly.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function, division, absolute_import
 
-from .common import group_unique_sequences
+from .common import group_unique_sequences, get_variant_nucleotides
 
 
 def sort_by_decreasing_prefix_length(x):
@@ -134,8 +134,7 @@ def assemble_transcript_fragments(
         min_overlap_size=30,
         n_merge_iters=2):
     assert len(variant_reads) > 0
-    variant_seq = variant_reads[0].variant
-    assert all(r.variant == variant_seq for r in variant_reads)
+    variant_seq = get_variant_nucleotides(variant_reads)
     assembly_counts = recursive_assembly(
         variant_reads,
         min_overlap_size=min_overlap_size,

diff --git a/isovar/common.py b/isovar/common.py
@@ -13,13 +13,28 @@
 # limitations under the License.
 
 from __future__ import print_function, division, absolute_import
+import logging
 
 from collections import defaultdict
 
 nucleotides = ["A", "C", "T", "G"]
 nucleotide_to_index = {c: i for (i, c) in enumerate(nucleotides)}
 index_to_nucleotide = {i: c for (i, c) in enumerate(nucleotides)}
 
+def create_logger(
+        name="root",
+        level=logging.DEBUG,
+        format_string="%(levelname)s: %(message)s (%(filename)s:%(lineno)s - %(funcName)s)"):
+    logger = logging.getLogger(name)
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(format_string)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    logger.setLevel(level)
+    return logger
+
+logger = create_logger()
+
 def group_unique_sequences(
         variant_reads,
         max_prefix_size=None,
@@ -62,8 +77,8 @@ def count_unique_sequences(
 
 def get_variant_nucleotides(variant_reads):
     if len(variant_reads) > 0:
-        variant_seq = variant_reads[0].variant
-        if not all(r.variant == variant_seq for r in variant_reads):
+        variant_seq = variant_reads[0].alt
+        if not all(r.alt == variant_seq for r in variant_reads):
             raise ValueError(
                 ("Cannot call `get_variant_nucleotides` on a collection"
                  " of VariantRead objects spanning multiple variants"))

diff --git a/isovar/dataframe_builder.py b/isovar/dataframe_builder.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division, absolute_import
+from collections import OrderedDict
+from six import integer_types, text_type, binary_type
+
+from varcode import Variant
+import pandas as pd
+
+VALID_ELEMENT_TYPES = integer_types + (text_type, binary_type, float, bool)
+
+class DataFrameBuilder(object):
+    """
+    Helper class for constructing a DataFrame which always has fields
+    of a variant (chr/pos/ref/alt) as well as some subset of the fields
+    from a namedtuple.
+    """
+    def __init__(
+            self,
+            element_class,
+            exclude=set([]),
+            converters={},
+            rename_dict={},
+            variant_columns=True):
+        """
+        Parameters
+        ----------
+        element_class : type
+            Expected to a have a class-member named '_fields' which is a list
+            of field names.
+
+        exclude : set
+            Field names from element_class which should be used as columns for
+            the DataFrame we're building
+
+        converters : dict
+            Dictionary of names mapping to functions. These functions will be
+            applied to each element of a column before it's added to the
+            DataFrame.
+
+        rename_dict : dict
+            Dictionary mapping element_class field names to desired column names
+            in the produced DataFrame.
+
+        variant_columns : bool
+            If True, then add four columns for fields of a Variant: chr/pos/ref/alt
+        """
+        self.element_class = element_class
+        self.rename_dict = rename_dict
+        self.converters = converters
+        self.variant_columns = variant_columns
+
+        # remove specified field names without changing the order of the others
+        self.original_field_names = [
+            x
+            for x in element_class._fields
+            if x not in exclude
+        ]
+
+        for name in converters:
+            if name not in self.original_field_names:
+                raise ValueError("No field named '%s', valid names: %s" % (
+                    name,
+                    self.original_field_names))
+
+        self.renamed_field_names = [
+            self.rename_dict.get(x, x)
+            for x in self.original_field_names
+        ]
+        if self.variant_columns:
+            columns_list = [
+                # fields related to variant
+                ("chr", []),
+                ("pos", []),
+                ("ref", []),
+                ("alt", []),
+            ]
+        else:
+            columns_list = []
+
+        for name in self.renamed_field_names:
+            columns_list.append((name, []))
+
+        self.columns_dict = OrderedDict(columns_list)
+
+    def add(self, variant, element):
+        if self.variant_columns:
+            assert isinstance(variant, Variant)
+            self.columns_dict["chr"].append(variant.contig)
+            self.columns_dict["pos"].append(variant.original_start)
+            self.columns_dict["ref"].append(variant.original_ref)
+            self.columns_dict["alt"].append(variant.original_alt)
+        else:
+            assert variant is None
+
+        assert isinstance(element, self.element_class)
+
+        for name in self.original_field_names:
+            value = getattr(element, name)
+
+            if name in self.converters:
+                fn = self.converters[name]
+                value = fn(value)
+
+            if not isinstance(value, VALID_ELEMENT_TYPES):
+                raise ValueError(
+                    "Please provider converter for field '%s' : %s to make a scalar or string" % (
+                        name,
+                        type(value)))
+
+            if name in self.rename_dict:
+                name = self.rename_dict[name]
+            self.columns_dict[name].append(value)
+
+    def to_dataframe(self):
+        return pd.DataFrame(self.columns_dict)
diff --git a/isovar/default_parameters.py b/isovar/default_parameters.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2016. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function, division, absolute_import
+
+"""
+Gathered all the default function parameters in a single module, so that these
+values can be easily shared between modules and also between commandline
+arguments of different scripts.
+"""
+
+# lowest mapping quality (MAPQ) value to allow for RNAseq reads
+MIN_READ_MAPPING_QUALITY = 0
+
+# use a read even if it's been marked as a duplicate?
+USE_DUPLICATE_READS = False
+
+# use a read even at a location that isn't its primary alignment?
+USE_SECONDARY_ALIGNMENTS = True
+
+# number of nucleotides to extract from RNAseq reads around each variant
+VARIANT_CDNA_SEQUENCE_LENGTH = 90
+
+# number of nucleotides to the left and right of a variant to extract,
+# usually gets adjusted for variant length but some functions take this
+# parameter directly
+CDNA_CONTEXT_SIZE = VARIANT_CDNA_SEQUENCE_LENGTH // 2
+
+# only consider variant cDNA sequences with at least this many reads
+MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE = 2
+
+# number of nucleotides shared between reference and variant sequence
+# before variant for reference contexts used to establish ORF
+MIN_TRANSCRIPT_PREFIX_LENGTH = 10
+
+# maximum number of mismatching nucleotides between reference and variant
+# prefix sequences
+MAX_REFERENCE_TRANSCRIPT_MISMATCHES = 2
+
+# number of amino acids / codons we're trying to translate
+PROTEIN_SEQUENCE_LEGNTH = 20
+
+# number of protein sequences we want to return per variant
+MAX_PROTEIN_SEQUENCES_PER_VARIANT = 1