Skip to content

Commit

Permalink
more code updates for new framework to handle gammadeltas, others
Browse files Browse the repository at this point in the history
  • Loading branch information
phbradley committed Sep 20, 2017
1 parent 8bfa6d6 commit bb1def0
Show file tree
Hide file tree
Showing 20 changed files with 347 additions and 377 deletions.
206 changes: 71 additions & 135 deletions all_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ def __init__( self, l ):
if self.cdrs:
assert self.cdrs == [ self.alseq[ x[0]-1 : x[1] ] for x in self.cdr_columns ]

def trim_allele_to_gene( id ):
return id[: id.index('*') ] #will fail if id doesn't contain '*'

## need to make this a little more configurable (cmdline??)

db_file = path_to_db+'/'+pipeline_params['db_file']
Expand Down Expand Up @@ -119,8 +122,8 @@ def __init__( self, l ):
all_loopseq_nbrs_mm1[id1].append( id2 )
if loop_mismatches>0 and verbose:
mmstring = ','.join(['%s/%s'%(x[0],x[1]) for x in loop_mismatch_seqs])
gene1 = id1[:id1.index('*')]
gene2 = id2[:id2.index('*')]
gene1 = trim_allele_to_gene( id1 )
gene2 = trim_allele_to_gene( id2 )
if gene1 != gene2 and verbose:
print 'v_mismatches:',organism,mmstring,blscore,id1,id2,\
loop_mismatches,loop_mismatches_cdrx,all_mismatches,seq1
Expand Down Expand Up @@ -189,136 +192,69 @@ def __init__( self, l ):



def get_cdr3_and_j_match_counts( organism, ab, qseq, j_gene, min_min_j_matchlen = 3,
extended_cdr3 = False ):
#fasta = all_fasta[organism]
jg = all_genes[organism][j_gene]

errors = []

## qseq starts at CA...
assert qseq[0] == 'C'

num_genome_j_positions_in_loop = len(jg.cdrs[0].replace(gap_character,''))-2
#num_genome_j_positions_in_loop = all_num_genome_j_positions_in_loop[organism][ab][j_gene]

if extended_cdr3: num_genome_j_positions_in_loop += 2 ## up to but not including GXG

## history: was only for alpha
aseq = qseq[:] ## starts at the C position

ja_gene = j_gene
assert ja_gene in fasta
ja_seq = jg.protseq #fasta[ ja_gene ]

min_j_matchlen = min_min_j_matchlen+3

while min_j_matchlen >= min_min_j_matchlen:
ntrim =0
while ntrim+min_j_matchlen<len(ja_seq) and ja_seq[ntrim:ntrim+min_j_matchlen] not in aseq:
ntrim += 1

jatag = ja_seq[ntrim:ntrim+min_j_matchlen]
if jatag in aseq:
break
else:
min_j_matchlen -= 1

if jatag not in aseq:
Log(`( 'whoah',ab,aseq,ja_seq )`)
errors.append( 'j{}tag_not_in_aseq'.format(ab) )
return '-',[100,0],errors
elif ja_seq.count( jatag ) != 1:
Log(`( 'whoah2',ab,aseq,ja_seq )`)
errors.append( 'multiple_j{}tag_in_jseq'.format(ab) )
return '-',[100,0],errors
else:
pos = aseq.find( jatag )
looplen = pos - ntrim + num_genome_j_positions_in_loop
if not extended_cdr3:
aseq = aseq[3:]
looplen -= 3 ## dont count CAX
if len(aseq)<looplen:
Log(`( 'short',ab,aseq,ja_seq )`)
errors.append( ab+'seq_too_short' )
return '-',[100,0],errors

cdrseq = aseq[:looplen ]

## now count mismatches in the J gene, beyond the cdrseq
j_seq = jg.protseq #fasta[ j_gene ] ## not sure why we do this again (old legacy code)
if qseq.count( cdrseq ) > 1:
Log('multiple cdrseq occurrences %s %s'%(qseq,cdrseq))
errors.append('multiple_cdrseq_occ')
return '-',[100,0],errors
assert qseq.count(cdrseq) == 1
start_counting_qseq = qseq.find(cdrseq)+len(cdrseq)
start_counting_jseq = num_genome_j_positions_in_loop
j_match_counts = [0,0]
#assert extended_cdr3 ## otherwise I think this count is not right?
#print 'here',start_counting_qseq,start_counting_jseq,len(qseq)
for qpos in range( start_counting_qseq, len(qseq)):
jpos = start_counting_jseq + (qpos-start_counting_qseq)
#print 'here',qpos,jpos
if jpos>= len(j_seq): break
if qseq[qpos] == j_seq[jpos]:
j_match_counts[1] += 1
else:
j_match_counts[0] += 1

return cdrseq, j_match_counts,errors




def parse_cdr3( organism, ab, qseq, v_gene, j_gene, q2v_align, extended_cdr3 = False ):
## v_align is a mapping from 0-indexed qseq positions to 0-indexed v_gene protseq positions
#fasta = all_fasta[ organism ]
#align_fasta = all_align_fasta[ organism ]
vg = all_genes[organism][v_gene]

errors = []

## what is the C position in this v gene?
v_seq = vg.protseq #fasta[ v_gene ]
v_alseq = vg.alseq #align_fasta[ v_gene ]
assert v_seq == v_alseq.replace(gap_character,'')

alseq_cpos = vg.cdr_columns[-1][0] - 1 ## now 0-indexed
#alseq_cpos = alseq_C_pos[organism][ab] - 1 ## now 0-indexed
numgaps = v_alseq[:alseq_cpos].count(gap_character)

cpos = alseq_cpos - numgaps ## 0-indexed
cpos_match = -1

v_match_counts = [0,0]

qseq_len = len(qseq)
for (qpos,vpos) in sorted( q2v_align.iteritems() ):
#print 'q2v-align:',qpos, vpos, cpos
if qpos == len(qseq):
continue ## from a partial codon at the end
if vpos == cpos:
cpos_match = qpos
elif vpos <= cpos:
## only count v mismatches here
if qseq[qpos] == v_seq[vpos]:
v_match_counts[1] += 1
else:
v_match_counts[0] += 1

if cpos_match<0 or qseq[ cpos_match ] != 'C':
## problemo
Log('failed to find blast match to C position')
errors.append('no_V{}_Cpos_blastmatch'.format(ab))
return '-',[100,0],[100,0],errors

cdrseq, j_match_counts, other_errors = get_cdr3_and_j_match_counts( organism, ab, qseq[ cpos_match: ], j_gene,
extended_cdr3 = extended_cdr3 )

return cdrseq, v_match_counts, j_match_counts, errors+other_errors


if __name__ == '__main__':
## show J alignments
pass
## setup a mapping that we can use for counting when allowing mm1s and also ignoring alleles

# allele2mm1_rep_gene_for_counting = {}
# def get_mm1_rep_ignoring_allele( gene, organism ): # helper fxn
# rep = get_mm1_rep( gene, organism )
# rep = rep[:rep.index('*')]
# return rep

#allele2mm1_rep_gene_for_counting[ organism ] = {}

for chain in 'AB':

for vj in 'VJ':
allele_gs = [ (id,g) for (id,g) in all_genes[organism].iteritems() if g.chain==chain and g.region==vj]

gene2rep = {}
gene2alleles = {}
rep_gene2alleles = {}

for allele,g in allele_gs:
#assert allele[2] == chain
gene = trim_allele_to_gene( allele )
rep_gene = trim_allele_to_gene( g.mm1_rep )
if rep_gene not in rep_gene2alleles:
rep_gene2alleles[ rep_gene ] = []
rep_gene2alleles[ rep_gene ].append( allele )

if gene not in gene2rep:
gene2rep[gene] = set()
gene2alleles[gene] = []
gene2rep[ gene ].add( rep_gene )
gene2alleles[gene].append( allele )

merge_rep_genes = {}
for gene,reps in gene2rep.iteritems():
if len(reps)>1:
assert vj=='V'
if verbose:
print 'multireps:',organism, gene, reps
for allele in gene2alleles[gene]:
print ' '.join(all_genes[organism][allele].cdrs), allele, \
all_genes[organism][allele].rep, \
all_genes[organism][allele].mm1_rep

## we are going to merge these reps
## which one should we choose?
l = [ (len(rep_gene2alleles[rep]), rep ) for rep in reps ]
l.sort()
l.reverse()
assert l[0][0] > l[1][0]
toprep = l[0][1]
for (count,rep) in l:
if rep in merge_rep_genes:
assert rep == toprep and merge_rep_genes[rep] == rep
merge_rep_genes[ rep ] = toprep


for allele,g in allele_gs:
count_rep = trim_allele_to_gene( g.mm1_rep ) #get_mm1_rep_ignoring_allele( allele, organism )
if count_rep in merge_rep_genes:
count_rep = merge_rep_genes[ count_rep ]
g.count_rep = count_rep #allele2mm1_rep_gene_for_counting[ organism ][ allele] = count_rep
if verbose:
print 'countrep:',organism, allele, count_rep


5 changes: 0 additions & 5 deletions analyze_epitope_privacy.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import sys
from basic import *
import tcr_distances
import cdr3s_human # just for debugging
import parse_tsv
import numpy as np
from scipy.cluster import hierarchy
Expand Down Expand Up @@ -146,12 +145,8 @@ def get_Z( val, mean, sdev ):
## ( va_reps, vb_reps, cdr3a, cdr3b )
index = len( tcrs )
mouse_indices[mouse].append( index )
old_va_reps = frozenset( [cdr3s_human.all_loopseq_representative[organism][x] for x in l[0].split(';') ] )
old_vb_reps = frozenset( [cdr3s_human.all_loopseq_representative[organism][x] for x in l[1].split(';') ] )
va_reps = frozenset( ( all_genes[organism][x].rep for x in l[0].split(';') ) )
vb_reps = frozenset( ( all_genes[organism][x].rep for x in l[1].split(';') ) )
assert va_reps == old_va_reps
assert vb_reps == old_vb_reps
tcrs.append( ( va_reps, vb_reps, l[2], l[3] ) )
assert len(l) == 5
dict_from_parse_tsv_line = l[4]
Expand Down
12 changes: 8 additions & 4 deletions analyze_gene_frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ def get_jsd_normed( P, Q ):
segtype = g.region + g.chain
if segtype in segtypes_uppercase:
if segtype not in all_background_tuple_counts:
all_background_tuple_counts[segtype] = {}
tup = tuple( util.get_mm1_rep_gene_for_counting( id ) )
all_background_tuple_counts[segtype] = {logfile:{}}
tup = tuple( util.get_mm1_rep_gene_for_counting( id, organism ) )
all_background_tuple_counts[segtype][logfile][ tup ] = 1 ## flat counts


Expand Down Expand Up @@ -174,6 +174,9 @@ def get_jsd_normed( P, Q ):


for segtype in segtypes_uppercase:
region = segtype[0]
chain = segtype[1]
assert region in 'VJ' and chain in 'AB'

tcr_counts = {}
num_tcrs = len(tcrs)
Expand Down Expand Up @@ -240,7 +243,8 @@ def get_jsd_normed( P, Q ):
l.sort()
l.reverse()

countreps = sorted( ( x for x in all_countrep_pseudoprobs[organism].keys() if x[3]+x[2] == segtype ) )
countreps = sorted( set( g.count_rep for g in all_genes[organism].values() \
if g.chain == chain and g.region == region ) )

num_tcrs = len(tcrs)

Expand All @@ -250,7 +254,7 @@ def get_jsd_normed( P, Q ):
bg_prob = bg_freq.get(rep,0.)
jsd_prob_enrich = 1.0 if bg_prob==0. else jsd_prob / bg_prob
pseudoprob = float( tcr_pseudoprob_counts.get(rep,0))/num_tcrs
bg_pseudoprob = all_countrep_pseudoprobs[ organism ][ rep ]
bg_pseudoprob = all_countrep_pseudoprobs[ organism ][ chain ][ region ][ rep ]
pseudoprob_enrich = 1 if bg_pseudoprob==0. else pseudoprob / bg_pseudoprob
outl = { 'epitope':epitope,
'gene':rep,
Expand Down
9 changes: 0 additions & 9 deletions analyze_overlap_compute_simpsons.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import util
from scipy import stats
from tcr_distances import get_rank
import cdr3s_human # just for debugging
from all_genes import all_genes

import numpy as np
Expand Down Expand Up @@ -137,19 +136,11 @@ def get_prob( logp,k=k,N=N ):
vb_genes = set( l['vb_genes'].split(';') )
jb_genes = set( l['jb_genes'].split(';') )

old_va_reps = set([ cdr3s_human.all_loopseq_representative[ organism ][ x] for x in va_genes ])
#old_ja_reps = set([ cdr3s_human. all_jseq_representative[ organism ][ x] for x in ja_genes ])
#old_vb_reps = set([ cdr3s_human.all_loopseq_representative[ organism ][ x] for x in vb_genes ])
old_jb_reps = set([ cdr3s_human. all_jseq_representative[ organism ][ x] for x in jb_genes ])

va_reps = set(( all_genes[organism][x].rep for x in va_genes ))
ja_reps = set(( all_genes[organism][x].rep for x in ja_genes ))
vb_reps = set(( all_genes[organism][x].rep for x in vb_genes ))
jb_reps = set(( all_genes[organism][x].rep for x in jb_genes ))

assert va_reps == old_va_reps
assert jb_reps == old_jb_reps ## etc.

protprob = { 'A': float(l['a_protseq_prob']),
'B': float(l['b_protseq_prob']),
'AB': float(l['a_protseq_prob']) * float( l['b_protseq_prob'] ) }
Expand Down
5 changes: 0 additions & 5 deletions compute_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from basic import *
from util import get_top_genes
import cdr3s_human #debug
from all_genes import all_genes
#from scipy import stats, optimize
import tcr_distances
Expand Down Expand Up @@ -81,12 +80,8 @@
va_genes = l['va_genes'].split(';')
vb_genes = l['vb_genes'].split(';')

old_va_reps = frozenset( [ cdr3s_human.all_loopseq_representative[ organism ][ x] for x in va_genes ])
old_vb_reps = frozenset( [ cdr3s_human.all_loopseq_representative[ organism ][ x] for x in vb_genes ])
va_reps = frozenset( ( all_genes[organism][x].rep for x in va_genes ))
vb_reps = frozenset( ( all_genes[organism][x].rep for x in vb_genes ))
assert va_reps == old_va_reps
assert vb_reps == old_vb_reps

all_info.append( l )

Expand Down
18 changes: 11 additions & 7 deletions compute_probs.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,8 @@
va_cdr3_protseq,codons = get_translation( va_cdr3_nucseq, '+1' )
ja_cdr3_protseq,codons = get_translation( ja_cdr3_nucseq, '+{}'.format(1+len(ja_cdr3_nucseq)%3))

if no_probabilities: ##all probabilities will be set to 1 if this flag is set
if no_probabilities or not tcr_rearrangement.probs_data_exist( organism,'A'):
##all probabilities will be set to 1 if this flag is set
aprob_nucseq = 1
aprob_protseq = 1
else:
Expand Down Expand Up @@ -148,7 +149,8 @@
vb_cdr3_protseq,codons = get_translation( vb_cdr3_nucseq, '+1' )
jb_cdr3_protseq,codons = get_translation( jb_cdr3_nucseq, '+{}'.format(1+len(jb_cdr3_nucseq)%3))

if no_probabilities: ##all probabilities will be set to 1 if this flag is set
if no_probabilities or not tcr_rearrangement.probs_data_exist( organism,'B'):
##all probabilities will be set to 1 if this flag is set
bprob_nucseq = 1
bprob_protseq = 1
else:
Expand Down Expand Up @@ -197,16 +199,18 @@
vals[ 'cdr3b_new_nucseq' ] = cdr3b_new_nucseq

## there's a little bit of a bias toward guys with more blast hits, ie shorted reads? since we take a max
if no_probabilities: ##all probabilities will be set to 1 if this flag is set
if no_probabilities or not ( tcr_rearrangement.probs_data_exist(organism,'A') and
tcr_rearrangement.probs_data_exist(organism,'B') ):
##all probabilities will be set to 1 if this flag is set
va_rep_prob = 1
ja_rep_prob = 1
vb_rep_prob = 1
jb_rep_prob = 1
elif new_probs:
va_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[organism][x] for x in va_countreps ] )
ja_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[organism][x] for x in ja_countreps ] )
vb_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[organism][x] for x in vb_countreps ] )
jb_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[organism][x] for x in jb_countreps ] )
va_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[organism]['A']['V'][x] for x in va_countreps ] )
ja_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[organism]['A']['J'][x] for x in ja_countreps ] )
vb_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[organism]['B']['V'][x] for x in vb_countreps ] )
jb_rep_prob = max( [ tcr_rearrangement.all_countrep_pseudoprobs[organism]['B']['J'][x] for x in jb_countreps ] )
else:
va_rep_prob = max( [ tcr_rearrangement.all_rep_probs[organism][x] for x in va_reps ] )
ja_rep_prob = max( [ tcr_rearrangement.all_rep_probs[organism][x] for x in ja_reps ] )
Expand Down
Loading

0 comments on commit bb1def0

Please sign in to comment.