Skip to content

Commit

Permalink
Implemented find_clones and added API.md readme
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Gartland committed Sep 22, 2017
1 parent 7bdc586 commit d3ffbef
Show file tree
Hide file tree
Showing 8 changed files with 573 additions and 79 deletions.
46 changes: 46 additions & 0 deletions api.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# API branch
---


# Single TCR example
Here's an example of how you can process a single nucleotide sequence:

```python
betaNT = 'CGGGGGGGGTACCNTTGNTTAGGTCCTCTACACGGTTAACCTGGTCCCCGAACCGAAGGTCAATAGGGCCTGTATACTGCTGGCACAGAAGTACACAGCTGAGTCCCTGGGTTCTGAGGGCTGGATCTTCAGAGTGGAGTCANN'

betaQuals = '12.12.12.12.12.22.9.8.6.6.6.8.3.0.3.10.3.0.3.10.10.11.20.25.30.37.37.29.27.14.14.15.27.30.41.47.36.50.50.50.42.42.57.57.43.47.53.47.47.47.47.47.47.50.54.57.57.57.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.57.57.57.57.59.59.59.57.57.57.57.57.57.57.57.59.57.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.59.59.59.59.59.57.57.57.59.57.57.43.37.28.28.21.28.23.37.28.30.15.19.17.15.21.20.25.3.0.0'

import tcrdist as td
chain = td.processing.processNT('human', 'B', betaNT, betaQuals)
```

# Pipeline example
Here's an example of a TCR pipeline with `tcrdist`:

```python
import tcrdist as td
psDf = td.datasets.loadPSData('human_pairseqs_v1')
probDf = td.processing.computeProbs(psDf)
psDf = psDf.join(probDf)
clonesDf = td.processing.identifyClones(psDf)
```

# Testing
To run all tests:

`py.test tcrdist`

or to pass all print statements through for debugging:

`py.test tcrdist -s`

Note that importing and running functions in `tcrdist` generates a log file, `tcrdist.log`. You can set the logging `level` by modifying the parameter in the base `__init__.py` file using one of the following:

```
logging.ERROR
logging.WARNING
logging.INFO
logging.DEBUG
```

For details see documentation on the `logging` [module](https://docs.python.org/2/library/logging.html).
4 changes: 3 additions & 1 deletion tcrdist/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
from . import utils
from . import plotting
from .objects import TCRClone, TCRChain
from . import datasets

__all__ = ['processing',
'utils',
'plotting']
'plotting',
'datasets']
73 changes: 2 additions & 71 deletions tcrdist/compute_probs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@
__all__ = ['filterOutRow',
'samplerProb',
'getMaskedSeqs',
'rearrangementProb',
'computeProbs']
'rearrangementProb']

def filterOutRow(r, max_cdr3_length=30, allow_stop_codons=False, allow_X=False):
"""Assesses whether a single row from psDf should be filtered out"""
Expand Down Expand Up @@ -134,72 +133,4 @@ def rearrangementProb(r, chain):
v_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[r.organism][x] for x in v_countreps ] )
j_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[r.organism][x] for x in j_countreps ] )

return v_rep_prob, j_rep_prob

def computeProbs(psDf, add_masked_seqs=True, filterOut=False, max_cdr3_length=30, allow_stop_codons=False, allow_X=False):
    """Compute generation probabilities for each paired TCR clone.

    Probabilities are based on the observed likelihood from an unpaired
    dataset (samplerProb) and on rearrangement likelihood (rearrangementProb).

    Parameters
    ----------
    psDf : pd.DataFrame
        Paired-sequences dataset generated by td.processing.readPairedSequences()
    add_masked_seqs : bool
        If True, also add masked CDR3 sequence/indel columns via getMaskedSeqs().
    filterOut : bool
        If True, rows failing filterOutRow() get NaN in all probability columns.
    max_cdr3_length, allow_stop_codons, allow_X :
        Passed through to filterOutRow() when filterOut is True.

    Returns
    -------
    outDf : pd.DataFrame
        One row per input row (same index) with new columns:
        a_protseq_prob, cdr3a_protseq_prob, va_rep_prob, ja_rep_prob,
        a_nucseq_prob, b_protseq_prob, cdr3b_protseq_prob, vb_rep_prob,
        jb_rep_prob, b_nucseq_prob and, if add_masked_seqs,
        cdr3a_protseq_masked, a_indels, cdr3a_new_nucseq,
        cdr3b_protseq_masked, b_indels, cdr3b_new_nucseq."""

    out = []
    for rowi, row in psDf.iterrows():
        """If iterrows is slow there are potentially ways to speed this up using psDf.apply()"""
        vals = {}
        vals['ind'] = rowi

        if filterOut:
            fo = filterOutRow(row,
                              max_cdr3_length=max_cdr3_length,
                              allow_stop_codons=allow_stop_codons,
                              allow_X=allow_X)
            if fo:
                # Keep a row containing only the index so the output stays
                # aligned with psDf: the missing keys become NaN in outDf.
                # (Previously this `continue`d without appending, dropping the
                # row and tripping the shape assertion below.)
                out.append(vals)
                continue

        aprob_nucseq, aprob_protseq = samplerProb(row, 'a')
        va_rep_prob, ja_rep_prob = rearrangementProb(row, 'a')

        vals['a_protseq_prob'] = aprob_protseq * va_rep_prob * ja_rep_prob
        vals['cdr3a_protseq_prob'] = aprob_protseq
        vals['va_rep_prob'] = va_rep_prob
        vals['ja_rep_prob'] = ja_rep_prob
        vals['a_nucseq_prob'] = aprob_nucseq * va_rep_prob * ja_rep_prob

        bprob_nucseq, bprob_protseq = samplerProb(row, 'b')
        vb_rep_prob, jb_rep_prob = rearrangementProb(row, 'b')

        vals['b_protseq_prob'] = bprob_protseq * vb_rep_prob * jb_rep_prob
        vals['cdr3b_protseq_prob'] = bprob_protseq
        vals['vb_rep_prob'] = vb_rep_prob
        vals['jb_rep_prob'] = jb_rep_prob
        vals['b_nucseq_prob'] = bprob_nucseq * vb_rep_prob * jb_rep_prob

        if add_masked_seqs:
            cdr3a_protseq_masked, ita, cdr3a_new_nucseq = getMaskedSeqs(row, 'a')
            cdr3b_protseq_masked, itb, cdr3b_new_nucseq = getMaskedSeqs(row, 'b')

            vals['cdr3a_protseq_masked'] = cdr3a_protseq_masked
            vals['a_indels'] = ita
            vals['cdr3a_new_nucseq'] = cdr3a_new_nucseq
            vals['cdr3b_protseq_masked'] = cdr3b_protseq_masked
            vals['b_indels'] = itb
            vals['cdr3b_new_nucseq'] = cdr3b_new_nucseq
        out.append(vals)

    if not out:
        # Empty input: pd.DataFrame([]).set_index('ind') would raise KeyError,
        # so build the empty indexed frame explicitly.
        return pd.DataFrame(columns=['ind']).set_index('ind')

    outDf = pd.DataFrame(out).set_index('ind')
    assert outDf.shape[0] == psDf.shape[0]
    return outDf

return v_rep_prob, j_rep_prob
48 changes: 48 additions & 0 deletions tcrdist/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os.path as op
import logging
import inspect

from . import processing

logger = logging.getLogger('datasets.py')

def loadPSData(name=None):
    """Load a bundled paired-sequence dataset by short name.

    Used to load paired sequence data without needing to be aware of the
    full path. Call without parameters to see a list of datasets available.

    Parameters
    ----------
    name : str or None
        Key of the dataset to load; None or '' prints the available
        dataset names and returns None.

    Returns
    -------
    psDf : pd.DataFrame or None
        Output of processing.readPairedSequences() for the requested
        dataset, or None when called without a name.

    Raises
    ------
    KeyError
        If name is not one of the known datasets."""

    datasets = dict(human_pairseqs_v1='human_pairseqs_v1.tsv',
                    mouse_pairseqs_v1='mouse_pairseqs_v1.tsv',
                    test_human_pairseqs='test_human_pairseqs.tsv',
                    test_mouse_pairseqs='test_mouse_pairseqs.tsv')

    if name is None or name == '':
        print('Available datasets:')
        for k in datasets.keys():
            print(' %s' % k)
        return None

    try:
        fn = datasets[name]
    except KeyError:
        # Previously this only logged and fell through to use the undefined
        # `fn`, raising a confusing NameError. Log, then re-raise the KeyError.
        logger.error('Could not find dataset: %s', name)
        raise

    # NOTE(review): the original computed an unused `delimiter` from the file
    # extension here; removed as dead code since readPairedSequences is not
    # passed a delimiter -- confirm it sniffs the separator itself.

    # Locate the datasets directory relative to this module's own file.
    datasetsPath = op.join(op.split(inspect.stack()[0][1])[0], 'datasets')
    filename = op.join(datasetsPath, fn)

    print('Reading from %s' % filename)

    # Organism is inferred from the bundled-file naming convention.
    if 'mouse' in fn:
        organism = 'mouse'
    else:
        organism = 'human'

    psDf = processing.readPairedSequences(organism, filename)
    return psDf



Loading

0 comments on commit d3ffbef

Please sign in to comment.