Skip to content

Commit

Permalink
Implemented find_clones and added API.md readme
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Gartland committed Sep 22, 2017
1 parent 7bdc586 commit d3ffbef
Show file tree
Hide file tree
Showing 8 changed files with 573 additions and 79 deletions.
46 changes: 46 additions & 0 deletions api.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# API branch
---


# Single TCR example
Here's an example of how you can process a single nucleotide sequence:

```python
betaNT = 'CGGGGGGGGTACCNTTGNTTAGGTCCTCTACACGGTTAACCTGGTCCCCGAACCGAAGGTCAATAGGGCCTGTATACTGCTGGCACAGAAGTACACAGCTGAGTCCCTGGGTTCTGAGGGCTGGATCTTCAGAGTGGAGTCANN'

betaQuals = '12.12.12.12.12.22.9.8.6.6.6.8.3.0.3.10.3.0.3.10.10.11.20.25.30.37.37.29.27.14.14.15.27.30.41.47.36.50.50.50.42.42.57.57.43.47.53.47.47.47.47.47.47.50.54.57.57.57.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.57.57.57.57.59.59.59.57.57.57.57.57.57.57.57.59.57.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.68.59.59.59.59.59.57.57.57.59.57.57.43.37.28.28.21.28.23.37.28.30.15.19.17.15.21.20.25.3.0.0'

import tcrdist as td
chain = td.processing.processNT('human', 'B', betaNT, betaQuals)
```

# Pipeline example
Here's an example of a TCR pipeline with `tcrdist`:

```python
import tcrdist as td
psDf = td.datasets.loadPSData('human_pairseqs_v1')
probDf = td.processing.computeProbs(psDf)
psDf = psDf.join(probDf)
clonesDf = td.processing.identifyClones(psDf)
```

# Testing
To run all tests:

`py.test tcrdist`

or to pass all print statements through for debugging:

`py.test tcrdist -s`

Note that importing and running functions in `tcrdist` generates a log file, `tcrdist.log`. You can set the logging `level` by modifying the parameter in the base `__init__.py` file using one of the following:

```
logging.ERROR
logging.WARNING
logging.INFO
logging.DEBUG
```

For details see documentation on the `logging` [module](https://docs.python.org/2/library/logging.html).
4 changes: 3 additions & 1 deletion tcrdist/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
from . import utils
from . import plotting
from .objects import TCRClone, TCRChain
from . import datasets

__all__ = ['processing',
'utils',
'plotting']
'plotting',
'datasets']
73 changes: 2 additions & 71 deletions tcrdist/compute_probs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@
__all__ = ['filterOutRow',
'samplerProb',
'getMaskedSeqs',
'rearrangementProb',
'computeProbs']
'rearrangementProb']

def filterOutRow(r, max_cdr3_length=30, allow_stop_codons=False, allow_X=False):
"""Assesses whether a single row from psDf should be filtered out"""
Expand Down Expand Up @@ -134,72 +133,4 @@ def rearrangementProb(r, chain):
v_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[r.organism][x] for x in v_countreps ] )
j_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[r.organism][x] for x in j_countreps ] )

return v_rep_prob, j_rep_prob

def computeProbs(psDf, add_masked_seqs=True, filterOut=False, max_cdr3_length=30, allow_stop_codons=False, allow_X=False):
    """Compute generation probabilities for each paired TCR clone.

    Probabilities are based on the observed likelihood from an unpaired
    dataset (samplerProb) and on rearrangement likelihood (rearrangementProb).

    Parameters
    ----------
    psDf : pd.DataFrame
        Paired-sequences dataset generated by td.processing.readPairedSequences()
    add_masked_seqs : bool
        If True, also add masked CDR3 sequence/indel columns via getMaskedSeqs().
    filterOut : bool
        If True, rows failing filterOutRow() get NaN in all probability columns.
    max_cdr3_length, allow_stop_codons, allow_X :
        Passed through to filterOutRow() when filterOut is True.

    Returns
    -------
    outDf : pd.DataFrame
        One row per input row (same index) with new columns:
        a_protseq_prob, cdr3a_protseq_prob, va_rep_prob, ja_rep_prob,
        a_nucseq_prob, b_protseq_prob, cdr3b_protseq_prob, vb_rep_prob,
        jb_rep_prob, b_nucseq_prob and, if add_masked_seqs,
        cdr3a_protseq_masked, a_indels, cdr3a_new_nucseq,
        cdr3b_protseq_masked, b_indels, cdr3b_new_nucseq."""

    out = []
    for rowi, row in psDf.iterrows():
        """If iterrows is slow there are potentially ways to speed this up using psDf.apply()"""
        vals = {}
        vals['ind'] = rowi

        if filterOut:
            fo = filterOutRow(row,
                              max_cdr3_length=max_cdr3_length,
                              allow_stop_codons=allow_stop_codons,
                              allow_X=allow_X)
            if fo:
                # Keep a row containing only the index so the output stays
                # aligned with psDf: the missing keys become NaN in outDf.
                # (Previously this `continue`d without appending, dropping the
                # row and tripping the shape assertion below.)
                out.append(vals)
                continue

        aprob_nucseq, aprob_protseq = samplerProb(row, 'a')
        va_rep_prob, ja_rep_prob = rearrangementProb(row, 'a')

        vals['a_protseq_prob'] = aprob_protseq * va_rep_prob * ja_rep_prob
        vals['cdr3a_protseq_prob'] = aprob_protseq
        vals['va_rep_prob'] = va_rep_prob
        vals['ja_rep_prob'] = ja_rep_prob
        vals['a_nucseq_prob'] = aprob_nucseq * va_rep_prob * ja_rep_prob

        bprob_nucseq, bprob_protseq = samplerProb(row, 'b')
        vb_rep_prob, jb_rep_prob = rearrangementProb(row, 'b')

        vals['b_protseq_prob'] = bprob_protseq * vb_rep_prob * jb_rep_prob
        vals['cdr3b_protseq_prob'] = bprob_protseq
        vals['vb_rep_prob'] = vb_rep_prob
        vals['jb_rep_prob'] = jb_rep_prob
        vals['b_nucseq_prob'] = bprob_nucseq * vb_rep_prob * jb_rep_prob

        if add_masked_seqs:
            cdr3a_protseq_masked, ita, cdr3a_new_nucseq = getMaskedSeqs(row, 'a')
            cdr3b_protseq_masked, itb, cdr3b_new_nucseq = getMaskedSeqs(row, 'b')

            vals['cdr3a_protseq_masked'] = cdr3a_protseq_masked
            vals['a_indels'] = ita
            vals['cdr3a_new_nucseq'] = cdr3a_new_nucseq
            vals['cdr3b_protseq_masked'] = cdr3b_protseq_masked
            vals['b_indels'] = itb
            vals['cdr3b_new_nucseq'] = cdr3b_new_nucseq
        out.append(vals)

    if not out:
        # Empty input: pd.DataFrame([]).set_index('ind') would raise KeyError,
        # so build the empty indexed frame explicitly.
        return pd.DataFrame(columns=['ind']).set_index('ind')

    outDf = pd.DataFrame(out).set_index('ind')
    assert outDf.shape[0] == psDf.shape[0]
    return outDf

return v_rep_prob, j_rep_prob
48 changes: 48 additions & 0 deletions tcrdist/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os.path as op
import logging
import inspect

from . import processing

logger = logging.getLogger('datasets.py')

def loadPSData(name=None):
    """Load a bundled paired-sequence dataset by short name.

    Used to load paired sequence data without needing to be aware of the
    full path. Call without parameters to see a list of datasets available.

    Parameters
    ----------
    name : str or None
        Key of the dataset to load; None or '' prints the available
        dataset names and returns None.

    Returns
    -------
    psDf : pd.DataFrame or None
        Output of processing.readPairedSequences() for the requested
        dataset, or None when called without a name.

    Raises
    ------
    KeyError
        If name is not one of the known datasets."""

    datasets = dict(human_pairseqs_v1='human_pairseqs_v1.tsv',
                    mouse_pairseqs_v1='mouse_pairseqs_v1.tsv',
                    test_human_pairseqs='test_human_pairseqs.tsv',
                    test_mouse_pairseqs='test_mouse_pairseqs.tsv')

    if name is None or name == '':
        print('Available datasets:')
        for k in datasets.keys():
            print(' %s' % k)
        return None

    try:
        fn = datasets[name]
    except KeyError:
        # Previously this only logged and fell through to use the undefined
        # `fn`, raising a confusing NameError. Log, then re-raise the KeyError.
        logger.error('Could not find dataset: %s', name)
        raise

    # NOTE(review): the original computed an unused `delimiter` from the file
    # extension here; removed as dead code since readPairedSequences is not
    # passed a delimiter -- confirm it sniffs the separator itself.

    # Locate the datasets directory relative to this module's own file.
    datasetsPath = op.join(op.split(inspect.stack()[0][1])[0], 'datasets')
    filename = op.join(datasetsPath, fn)

    print('Reading from %s' % filename)

    # Organism is inferred from the bundled-file naming convention.
    if 'mouse' in fn:
        organism = 'mouse'
    else:
        organism = 'human'

    psDf = processing.readPairedSequences(organism, filename)
    return psDf



Loading

0 comments on commit d3ffbef

Please sign in to comment.