Skip to content

Commit

Permalink
database files are now found in directories paired with the corresponding geneset files
Browse files Browse the repository at this point in the history
  • Loading branch information
phbradley committed Sep 14, 2017
1 parent 03dadd0 commit a739f80
Show file tree
Hide file tree
Showing 12 changed files with 172 additions and 362 deletions.
8 changes: 2 additions & 6 deletions all_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ def __init__( self, l ):
assert self.protseq == self.alseq.replace(gap_character,'')
# sanity check
if self.cdrs:
#print self.cdrs
#print [ self.alseq[ x[0]-1 : x[1] ] for x in self.cdr_columns ]
assert self.cdrs == [ self.alseq[ x[0]-1 : x[1] ] for x in self.cdr_columns ]

## need to make this a little more configurable (cmdline??)
Expand Down Expand Up @@ -64,7 +62,7 @@ def __init__( self, l ):
alseq1 = g1.alseq
minlen = cpos+1
assert len(alseq1) >= minlen
if alseq1[cpos] != 'C':
if alseq1[cpos] != 'C' and verbose:
print 'funny cpos:',id1,alseq1,g1.cdrs[-1]

all_loopseq_nbrs[id1] = []
Expand Down Expand Up @@ -123,7 +121,7 @@ def __init__( self, l ):
mmstring = ','.join(['%s/%s'%(x[0],x[1]) for x in loop_mismatch_seqs])
gene1 = id1[:id1.index('*')]
gene2 = id2[:id2.index('*')]
if gene1 != gene2:
if gene1 != gene2 and verbose:
print 'v_mismatches:',organism,mmstring,blscore,id1,id2,\
loop_mismatches,loop_mismatches_cdrx,all_mismatches,seq1
print 'v_mismatches:',organism,mmstring,blscore,id1,id2,\
Expand Down Expand Up @@ -226,8 +224,6 @@ def get_cdr3_and_j_match_counts( organism, ab, qseq, j_gene, min_min_j_matchlen
else:
min_j_matchlen -= 1

#print 'min_j_matchlen:',min_j_matchlen,'jatag:',jatag,'ntrim:',ntrim,'ja_seq:',ja_seq,'qseq',qseq

if jatag not in aseq:
Log(`( 'whoah',ab,aseq,ja_seq )`)
errors.append( 'j{}tag_not_in_aseq'.format(ab) )
Expand Down
52 changes: 31 additions & 21 deletions analyze_gene_frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#from util import get_rep, get_mm1_rep, get_mm1_rep_gene_for_counting
import util
from tcr_rearrangement_new import all_countrep_pseudoprobs
from paths import path_to_db
from all_genes import all_genes

with Parser(locals()) as p:
#p.str('args').unspecified_default().multiple().required()
Expand All @@ -22,8 +22,6 @@
probs_tsvfile = clones_file[:-4] + '_gene_probs.tsv'
probs_fields = ['epitope','gene','label_prob','jsd_prob','jsd_prob_enrich','pseudoprob','pseudoprob_enrich']



def get_freq_from_tuple_counts_and_bias( counts, bias ):
reps = set()

Expand Down Expand Up @@ -104,26 +102,38 @@ def get_jsd_normed( P, Q ):


## read the background gene-tuple counts
tuplecountsfile = path_to_db+'/nextgen_tuple_counts_v2_{}_max10M.log'.format(organism)
assert exists( tuplecountsfile )

all_background_tuple_counts = {}

Log('reading {}'.format( tuplecountsfile ))
for line in open( tuplecountsfile,'r'):
if line.startswith('TUPLE_COUNT'):
l = line.split()
count = int(l[1] )
tup = tuple( sorted( l[2].split(',') ) )
segtype = tup[0][3] + tup[0][2] ## go from 'TRAV*' to 'VA'
assert segtype in segtypes_uppercase
if segtype not in all_background_tuple_counts:
all_background_tuple_counts[segtype] = {}
logfile = l[-1]
if logfile not in all_background_tuple_counts[segtype]:
all_background_tuple_counts[segtype][logfile] = {}
assert tup not in all_background_tuple_counts[segtype][logfile] ## should not be any repeats in the output
all_background_tuple_counts[segtype][logfile][ tup ] = count
## Background gene-tuple counts live next to the current db file.
## Real data: lines starting with 'TUPLE_COUNT' hold <count> <comma-sep gene ids> ... <logfile>.
tuplecountsfile = path_to_current_db_files()+'/nextgen_tuple_counts_v2_{}_max10M.log'.format(organism)

if exists( tuplecountsfile ):
    Log('reading {}'.format( tuplecountsfile ))
    for line in open( tuplecountsfile,'r'):
        if line.startswith('TUPLE_COUNT'):
            l = line.split()
            count = int(l[1] )
            tup = tuple( sorted( l[2].split(',') ) )
            # derive the segment type (e.g. 'VA') from the gene record rather than
            # parsing the id string, so non-standard gene names still work
            g0 = all_genes[organism][tup[0]]
            segtype = g0.region + g0.chain
            assert segtype in segtypes_uppercase
            logfile = l[-1]
            counts_by_logfile = all_background_tuple_counts.setdefault( segtype, {} )
            tuple_counts = counts_by_logfile.setdefault( logfile, {} )
            assert tup not in tuple_counts ## should not be any repeats in the output
            tuple_counts[ tup ] = count
else:
    ## fall back to flat (uninformative) counts so downstream code can still run
    Log('WARNING: making up fake tuple counts since dbfile ({}) is missing'.format(tuplecountsfile))
    logfile = 'fake_logfile'
    for id,g in all_genes[organism].iteritems():
        segtype = g.region + g.chain
        if segtype in segtypes_uppercase:
            # BUGFIX: the per-logfile dict was never initialized here, so the
            # assignment below raised KeyError on the first gene
            counts_by_logfile = all_background_tuple_counts.setdefault( segtype, {} )
            tuple_counts = counts_by_logfile.setdefault( logfile, {} )
            # NOTE(review): if get_mm1_rep_gene_for_counting returns a plain string,
            # tuple() turns it into a tuple of characters -- confirm it returns a sequence of ids
            tup = tuple( util.get_mm1_rep_gene_for_counting( id ) )
            tuple_counts[ tup ] = 1 ## flat counts


## here we compute J-S divergences of the gene distributions to background
Expand Down
12 changes: 12 additions & 0 deletions basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,18 @@
############################################################################################
############################################################################################

def path_to_current_db_files():
    """Return the directory holding auxiliary db files paired with the current db_file.

    The directory is named '<db_file>_files' and sits next to the geneset
    (db) file configured in pipeline_params; it is created on first use.
    Returned without a trailing '/'.
    """
    db_file = paths.path_to_db+'/'+pipeline_params['db_file']
    assert exists(db_file)
    db_files_dir = db_file+'_files'
    # create unconditionally and tolerate 'already exists' rather than
    # check-then-create, which is racy if two pipeline jobs start together
    try:
        mkdir(db_files_dir)
    except OSError:
        if not exists(db_files_dir):
            raise
    return db_files_dir




## you could modify this function if you have a different cmdline tool for converting svg to png
## like inkscape or cairosvg
##
Expand Down
9 changes: 5 additions & 4 deletions find_cdr3_motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
import cdr3s_human #debug
from all_genes import all_genes
import sys
from paths import path_to_db
import util


with Parser(locals()) as p:
p.str('clones_file').required()
p.str('organism')
Expand Down Expand Up @@ -214,8 +212,11 @@ def extend_motif( oldmotif, oldshowmotif, old_chi_squared, seqs, seq_indices, ra
## index these by the v_rep and the j_rep
ng_tcrs = { 'A':{}, 'B':{} }
for ab in 'AB':
ng_logfile = '{}/new_nextgen_chains_{}_{}.tsv'.format(path_to_db,organism,ab)
assert exists(ng_logfile)
ng_logfile = '{}/new_nextgen_chains_{}_{}.tsv'.format(path_to_current_db_files(),organism,ab)
if not exists( ng_logfile ):
Log('WARNING:: find_cdr3_motifs.py: missing next-gen chains file {}'.format(ng_logfile))
continue

counter=0
num_chains=0
ab_chains = {}
Expand Down
4 changes: 0 additions & 4 deletions parse_cdr3.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
from basic import *
#from amino_acids import amino_acids
#from tcr_distances_blosum import blosum
#from paths import path_to_db
#import translation
from all_genes import all_genes, gap_character


Expand Down
46 changes: 22 additions & 24 deletions plot_sharing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from basic import *
import html_colors
import util
from paths import path_to_db

with Parser(locals()) as p:
p.str('clones_file').required()
Expand Down Expand Up @@ -456,25 +455,26 @@
epitope_diversity[chains][epitope][2] = avg_nbrdist

## load the random values for avg_nbrdist
randfile = '{}/{}_rand_divs_new.txt'.format(path_to_db,organism)
assert exists(randfile)
rand_divs = {}
for chains in ['A','B','AB']: rand_divs[chains] = [[],[],[],[]]

for line in open(randfile,'r'):
l = line.split()
if line.startswith("GAUSSDIV SM1 SE1"):
chains = l[5]
div = float(l[7])
rand_divs[chains][0].append( div )
elif line.startswith("GAUSSDIVSHANNON"):
chains = l[2]
div = float( l[4] )
rand_divs[chains][1].append( div )
elif line.startswith('avg_nbrdist:'):
fake_epitope,chains = l[1:3]
avg_nbrdist = float( l[3] )
rand_divs[chains][2].append( avg_nbrdist )
## random-divergence values for this organism live next to the current db file
randfile = '{}/{}_rand_divs_new.txt'.format(path_to_current_db_files(),organism)
if not exists(randfile):
    # missing db file is non-fatal: rand_divs stays empty and the reference
    # lines are simply omitted from the plots
    Log('WARNING:: plot_sharing.py: missing random divergences file: {}'.format(randfile))
else:
    # each line type feeds a different slot of rand_divs[chains]:
    #   [0] gaussian diversity, [1] shannon diversity, [2] avg_nbrdist
    for line in open(randfile,'r'):
        l = line.split()
        if line.startswith("GAUSSDIV SM1 SE1"):
            chains = l[5]
            div = float(l[7])
            rand_divs[chains][0].append( div )
        elif line.startswith("GAUSSDIVSHANNON"):
            chains = l[2]
            div = float( l[4] )
            rand_divs[chains][1].append( div )
        elif line.startswith('avg_nbrdist:'):
            fake_epitope,chains = l[1:3]
            avg_nbrdist = float( l[3] )
            rand_divs[chains][2].append( avg_nbrdist )

## let's try to get epitope diversity information from the aucs for discrimination from random tcrs
desired_nbrdist_tag_suffix = 'wtd_nbrdist10'
Expand Down Expand Up @@ -537,12 +537,10 @@
plt.title('{} {}'.format(chains, divtype))

rdivs = rand_divs[chains][divindex]
rdivs.sort()
#if divindex==0:
# rdivs = rdivs[:len(rdivs)/4] ## take bottom third

for val in rdivs:
plt.plot( [0,len(l)], [val,val], ':r' )
if rdivs:
rdivs.sort()
for val in rdivs:
plt.plot( [0,len(l)], [val,val], ':r' )

locs,labels = plt.yticks()
minval = min( ( x[0] for x in l ) )
Expand Down
63 changes: 33 additions & 30 deletions random_tcr_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import cdr3s_human #debug
from all_genes import all_genes
import parse_tsv
from paths import path_to_db

with Parser(locals()) as p:
p.str('organism').required()
Expand All @@ -26,38 +25,42 @@
random_tcrs = []
random_info = []

if organism == 'mouse':
random_chains = {}
for ab in 'AB':
random_chains[ab] = []
random_tcrs_file = '{}/new_nextgen_chains_{}_{}.tsv'.format(path_to_db,organism,ab)
assert exists( random_tcrs_file )
Log('reading {}'.format( random_tcrs_file ))
header = []
for line in open(random_tcrs_file,'r'):
l = line[:-1].split('\t')
if not header:
header = l[:]
assert header == ['v_reps','j_reps','cdr3','cdr3_nucseq']
else:
random_chains[ab].append( ( frozenset( l[0].split(',') ), l[2] ) ) ## (v_reps, cdr3)
random.shuffle( random_chains[ab] )
assert len(random_chains[ab])>=nrandom
for a,b in zip( random_chains['A'][:nrandom], random_chains['B'][:nrandom] ):
random_tcrs.append( ( a[0],b[0],a[1],b[1] ) ) #va_reps, vb_reps, cdr3a, cdr3b
random_info.append( { 'va_reps': ';'.join( a[0] ),
'vb_reps': ';'.join( b[0] ),
'cdr3a':a[1],
'cdr3b':b[1] } )

else:
##
human_paired_tcrs_file = path_to_db+'/randhuman_paired.tsv'
assert exists( human_paired_tcrs_file )
rtcrs = parse_tsv.parse_tsv_file( human_paired_tcrs_file, [], ['va_reps','vb_reps','cdr3a','cdr3b'] )
## sometimes we have a paired chains file which is probably better
paired_tcrs_file = '{}/rand{}_paired.tsv'.format( path_to_current_db_files(), organism )
if exists( paired_tcrs_file):
    # preferred source: real paired alpha/beta chains
    rtcrs = parse_tsv.parse_tsv_file( paired_tcrs_file, [], ['va_reps','vb_reps','cdr3a','cdr3b'] )
    for l in rtcrs:
        # tcr tuple: (va_reps frozenset, vb_reps frozenset, cdr3a, cdr3b)
        random_tcrs.append( ( frozenset( l[0].split(';') ), frozenset( l[1].split(';') ), l[2], l[3] ) )
        random_info.append( { 'va_reps': l[0], 'vb_reps': l[1], 'cdr3a': l[2], 'cdr3b': l[3] } )
else:
    # fallback: build synthetic pairs by randomly matching single A and B chains
    random_tcrs_files = glob('{}/new_nextgen_chains_{}_[AB].tsv'.format( path_to_current_db_files(),organism))
    if len(random_tcrs_files) == 2:
        random_chains = {}
        for ab in 'AB':
            random_chains[ab] = []
            random_tcrs_file = '{}/new_nextgen_chains_{}_{}.tsv'.format(path_to_current_db_files(),organism,ab)
            assert exists( random_tcrs_file )
            Log('reading {}'.format( random_tcrs_file ))
            header = []
            for line in open(random_tcrs_file,'r'):
                l = line[:-1].split('\t')
                if not header:
                    # first line must be the expected 4-column header
                    header = l[:]
                    assert header == ['v_reps','j_reps','cdr3','cdr3_nucseq']
                else:
                    random_chains[ab].append( ( frozenset( l[0].split(',') ), l[2] ) ) ## (v_reps, cdr3)
            # shuffle so that zipping A with B below gives random pairings
            random.shuffle( random_chains[ab] )
            assert len(random_chains[ab])>=nrandom
        for a,b in zip( random_chains['A'][:nrandom], random_chains['B'][:nrandom] ):
            random_tcrs.append( ( a[0],b[0],a[1],b[1] ) ) #va_reps, vb_reps, cdr3a, cdr3b
            random_info.append( { 'va_reps': ';'.join( a[0] ),
                                  'vb_reps': ';'.join( b[0] ),
                                  'cdr3a':a[1],
                                  'cdr3b':b[1] } )
    else:
        # neither data source available: nothing useful can be computed
        Log('WARNING:: random_tcr_distances.py: no nextgen TCR chains files ({}) -- nothing to do'\
            .format( len(random_tcrs_files) ) )
        exit()


print 'precomputing v-region distances'
Expand Down
8 changes: 5 additions & 3 deletions read_motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import svg_basic
import tcr_sampler
import util
from paths import path_to_db
import parse_tsv
from basic import *
import random
Expand Down Expand Up @@ -251,8 +250,11 @@ def get_amino_acid_consensus( seqs ):
## index these by the v_rep and the j_rep

for ab in 'AB':
ng_logfile = '{}/new_nextgen_chains_{}_{}.tsv'.format( path_to_db, organism, ab )
assert exists(ng_logfile)
ng_logfile = '{}/new_nextgen_chains_{}_{}.tsv'.format( path_to_current_db_files(), organism, ab )
if not exists(ng_logfile):
Log('WARNING:: read_motifs.py: missing nextgen TCR chains file: {}'.format(ng_logfile))
continue

counter=0
num_chains=0
ab_chains = {}
Expand Down
Loading

0 comments on commit a739f80

Please sign in to comment.