Skip to content

Commit

Permalink
Merge branch 'master' of github.com:phbradley/tcr-dist into API
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Gartland committed Sep 21, 2017
2 parents c33b1f8 + bd32070 commit 7bdc586
Show file tree
Hide file tree
Showing 36 changed files with 3,585 additions and 889 deletions.
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# TCRdist pipeline, version 0.0.1 beta
# TCRdist pipeline, version 0.0.2 beta

Thanks for your interest in TCRdist! This software is very much a
work in progress, so we welcome any/all feedback and will work to
Expand Down Expand Up @@ -153,3 +153,20 @@ The code uses the command line parsing toolkit `blargs`. See the license and inf
The tables in the .html results can be sorted thanks to `tablesorter`. See the license and info in `external/tablesorter/`

Sequence parsing relies on the BLAST suite, see info in `external/blast-2.2.16/`

---
# UPDATES

## Version 0.0.2 (09/21/2017):

- New sequence database system that makes it easier to work with alternate gene sets

- Preliminary support for gamma-delta TCRs: edit the `db_file` field of the `pipeline_params` dictionary
stored in `basic.py`. (This is a temporary hack; likely will move to switching by command line flag sometime soon).

- New `all_genes` dictionary in `all_genes.py` indexed by organism that holds information on all the genes; it's
read from the `db_file` pointed to by `basic.py`.

- With new minor updates to the probability model and default sequence database we're no longer trying to preserve
exact numerical identity with the results from the paper. To get the classic results, you can check out the
`version_001` branch on github.
260 changes: 260 additions & 0 deletions all_genes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
from basic import *
from amino_acids import amino_acids
from tcr_distances_blosum import blosum
from paths import path_to_db
import translation

cdrs_sep = ';'
gap_character = '.'

all_genes = {}

class TCR_Gene:
def __init__( self, l ):
self.id = l['id']
self.organism = l['organism']
self.chain = l['chain']
self.region = l['region']
self.nucseq = l['nucseq']
self.alseq = l['aligned_protseq']
self.cdrs = l['cdrs'].split(cdrs_sep) if l['cdrs'] else []
## these are still 1-indexed !!!!!!!!!!!!!!
self.cdr_columns = [ map( int,x.split('-')) for x in l['cdr_columns'].split(cdrs_sep) ] if self.cdrs else []
frame = l['frame']
assert frame in ['+1','+2','+3','1','2','3']
self.nucseq_offset = int( frame[-1] )-1 ## 0, 1 or 2 (0-indexed for python)
self.protseq = translation.get_translation( self.nucseq, frame )[0]
assert self.protseq == self.alseq.replace(gap_character,'')
# sanity check
if self.cdrs:
assert self.cdrs == [ self.alseq[ x[0]-1 : x[1] ] for x in self.cdr_columns ]

def trim_allele_to_gene( id ):
return id[: id.index('*') ] #will fail if id doesn't contain '*'

## need to make this a little more configurable (cmdline??)

db_file = path_to_db+'/'+pipeline_params['db_file']
assert exists(db_file)

lines = parse_tsv_file( db_file )

for l in lines:
g = TCR_Gene( l )
if g.organism not in all_genes:
all_genes[g.organism] = {} # map from id to TCR_Gene objects
all_genes[g.organism][g.id] = g


verbose = ( __name__ == '__main__' )

for organism,genes in all_genes.iteritems():

for ab in 'AB':
org_merged_loopseqs = {}
for id,g in genes.iteritems():
if g.chain == ab and g.region == 'V':
loopseqs = g.cdrs[:-1] ## exclude CDR3 Nterm
org_merged_loopseqs[id] = ' '.join( loopseqs )

all_loopseq_nbrs = {}
all_loopseq_nbrs_mm1 = {}
for id1,seq1 in org_merged_loopseqs.iteritems():
g1 = genes[id1]
cpos = g1.cdr_columns[-1][0] - 1 #0-indexed
alseq1 = g1.alseq
minlen = cpos+1
assert len(alseq1) >= minlen
if alseq1[cpos] != 'C' and verbose:
print 'funny cpos:',id1,alseq1,g1.cdrs[-1]

all_loopseq_nbrs[id1] = []
all_loopseq_nbrs_mm1[id1] = []
for id2,seq2 in org_merged_loopseqs.iteritems():
g2 = genes[id2]
alseq2 = g2.alseq
assert len(alseq2) >= minlen
assert len(seq1) == len(seq2)
if seq1 == seq2:
all_loopseq_nbrs[id1].append( id2 )
all_loopseq_nbrs_mm1[id1].append( id2 )
continue

## count mismatches between these two, maybe count as an "_mm1" nbr
loop_mismatches = 0
loop_mismatches_cdrx = 0
loop_mismatch_seqs =[]
spaces=0
for a,b in zip( seq1,seq2):
if a==' ':
spaces+=1
continue
if a!= b:
if a in '*.' or b in '*.':
loop_mismatches += 10
break
else:
assert a in amino_acids and b in amino_acids
if spaces<=1:
loop_mismatches += 1
loop_mismatch_seqs.append( ( a,b ) )
else:
assert spaces==2
loop_mismatches_cdrx += 1
if loop_mismatches>1:
break
if loop_mismatches <=1:
all_mismatches = 0
for a,b in zip( alseq1[:cpos+2],alseq2[:cpos+2]):
if a!= b:
if a in '*.' or b in '*.':
all_mismatches += 10
else:
assert a in amino_acids and b in amino_acids
all_mismatches += 1
#dist = tcr_distances.blosum_sequence_distance( seq1, seq2, gap_penalty=10 )
if loop_mismatches<=1 and loop_mismatches + loop_mismatches_cdrx <= 2 and all_mismatches<=10:
if loop_mismatches == 1:
blscore= blosum[(loop_mismatch_seqs[0][0],loop_mismatch_seqs[0][1])]
else:
blscore = 100
if blscore>=1:
all_loopseq_nbrs_mm1[id1].append( id2 )
if loop_mismatches>0 and verbose:
mmstring = ','.join(['%s/%s'%(x[0],x[1]) for x in loop_mismatch_seqs])
gene1 = trim_allele_to_gene( id1 )
gene2 = trim_allele_to_gene( id2 )
if gene1 != gene2 and verbose:
print 'v_mismatches:',organism,mmstring,blscore,id1,id2,\
loop_mismatches,loop_mismatches_cdrx,all_mismatches,seq1
print 'v_mismatches:',organism,mmstring,blscore,id1,id2,\
loop_mismatches,loop_mismatches_cdrx,all_mismatches,seq2


for id in all_loopseq_nbrs:
rep = min( all_loopseq_nbrs[id] )
assert org_merged_loopseqs[id] == org_merged_loopseqs[ rep ]
genes[id].rep = rep
if verbose:
print 'vrep %s %15s %15s %s'%(organism, id, rep, org_merged_loopseqs[id])


## merge mm1 nbrs to guarantee transitivity
while True:
new_nbrs = False
for id1 in all_loopseq_nbrs_mm1:
new_id1_nbrs = False
for id2 in all_loopseq_nbrs_mm1[id1]:
for id3 in all_loopseq_nbrs_mm1[id2]:
if id3 not in all_loopseq_nbrs_mm1[id1]:
all_loopseq_nbrs_mm1[id1].append( id3 )
if verbose:
print 'new_nbr:',id1,'<--->',id2,'<--->',id3
new_id1_nbrs = True
break
if new_id1_nbrs:
break
if new_id1_nbrs:
new_nbrs = True
if verbose:
print 'new_nbrs:',ab,organism,new_nbrs
if not new_nbrs:
break

for id in all_loopseq_nbrs_mm1:
rep = min( all_loopseq_nbrs_mm1[id] )
genes[id].mm1_rep = rep
if verbose:
print 'mm1vrep %s %15s %15s %s'%(organism, id, rep,org_merged_loopseqs[id])


## setup Jseq reps
for ab in 'AB':
jloopseqs = {}
for id,g in genes.iteritems():
if g.chain == ab and g.region == 'J':
num = len( g.cdrs[0].replace( gap_character, '' ) )
jloopseq = g.protseq[:num+3] ## go all the way up to and including the GXG
jloopseqs[id] = jloopseq
all_jloopseq_nbrs = {}
for id1,seq1 in jloopseqs.iteritems():
all_jloopseq_nbrs[id1] = []
for id2,seq2 in jloopseqs.iteritems():
if seq1 == seq2:
all_jloopseq_nbrs[id1].append( id2 )
for id in all_jloopseq_nbrs:
rep = min( all_jloopseq_nbrs[id] )
genes[id].rep = rep
genes[id].mm1_rep = rep # just so we have an mm1_rep field defined...
assert jloopseqs[id] == jloopseqs[ rep ]
if verbose:
print 'jrep %s %15s %15s %15s'%(organism, id, rep, jloopseqs[id])



## setup a mapping that we can use for counting when allowing mm1s and also ignoring alleles

# allele2mm1_rep_gene_for_counting = {}
# def get_mm1_rep_ignoring_allele( gene, organism ): # helper fxn
# rep = get_mm1_rep( gene, organism )
# rep = rep[:rep.index('*')]
# return rep

#allele2mm1_rep_gene_for_counting[ organism ] = {}

for chain in 'AB':

for vj in 'VJ':
allele_gs = [ (id,g) for (id,g) in all_genes[organism].iteritems() if g.chain==chain and g.region==vj]

gene2rep = {}
gene2alleles = {}
rep_gene2alleles = {}

for allele,g in allele_gs:
#assert allele[2] == chain
gene = trim_allele_to_gene( allele )
rep_gene = trim_allele_to_gene( g.mm1_rep )
if rep_gene not in rep_gene2alleles:
rep_gene2alleles[ rep_gene ] = []
rep_gene2alleles[ rep_gene ].append( allele )

if gene not in gene2rep:
gene2rep[gene] = set()
gene2alleles[gene] = []
gene2rep[ gene ].add( rep_gene )
gene2alleles[gene].append( allele )

merge_rep_genes = {}
for gene,reps in gene2rep.iteritems():
if len(reps)>1:
assert vj=='V'
if verbose:
print 'multireps:',organism, gene, reps
for allele in gene2alleles[gene]:
print ' '.join(all_genes[organism][allele].cdrs), allele, \
all_genes[organism][allele].rep, \
all_genes[organism][allele].mm1_rep

## we are going to merge these reps
## which one should we choose?
l = [ (len(rep_gene2alleles[rep]), rep ) for rep in reps ]
l.sort()
l.reverse()
assert l[0][0] > l[1][0]
toprep = l[0][1]
for (count,rep) in l:
if rep in merge_rep_genes:
assert rep == toprep and merge_rep_genes[rep] == rep
merge_rep_genes[ rep ] = toprep


for allele,g in allele_gs:
count_rep = trim_allele_to_gene( g.mm1_rep ) #get_mm1_rep_ignoring_allele( allele, organism )
if count_rep in merge_rep_genes:
count_rep = merge_rep_genes[ count_rep ]
g.count_rep = count_rep #allele2mm1_rep_gene_for_counting[ organism ][ allele] = count_rep
if verbose:
print 'countrep:',organism, allele, count_rep


9 changes: 5 additions & 4 deletions analyze_epitope_privacy.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import sys
from basic import *
import tcr_distances
import cdr3s_human
import parse_tsv
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial import distance
import util
import html_colors
from all_genes import all_genes


with Parser(locals()) as p:
#p.str('args').unspecified_default().multiple().required()
Expand Down Expand Up @@ -144,9 +145,9 @@ def get_Z( val, mean, sdev ):
## ( va_reps, vb_reps, cdr3a, cdr3b )
index = len( tcrs )
mouse_indices[mouse].append( index )
tcrs.append( ( frozenset( [cdr3s_human.all_loopseq_representative[organism][x] for x in l[0].split(';') ] ),
frozenset( [cdr3s_human.all_loopseq_representative[organism][x] for x in l[1].split(';') ] ),
l[2], l[3] ) )
va_reps = frozenset( ( all_genes[organism][x].rep for x in l[0].split(';') ) )
vb_reps = frozenset( ( all_genes[organism][x].rep for x in l[1].split(';') ) )
tcrs.append( ( va_reps, vb_reps, l[2], l[3] ) )
assert len(l) == 5
dict_from_parse_tsv_line = l[4]
tcr_infos.append( dict_from_parse_tsv_line )
Expand Down
Loading

0 comments on commit 7bdc586

Please sign in to comment.