Skip to content

Commit

Permalink
database files are now found in directories paired with the corresponding geneset files
Browse files Browse the repository at this point in the history
  • Loading branch information
phbradley committed Sep 14, 2017
1 parent 03dadd0 commit a739f80
Show file tree
Hide file tree
Showing 12 changed files with 172 additions and 362 deletions.
8 changes: 2 additions & 6 deletions all_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ def __init__( self, l ):
assert self.protseq == self.alseq.replace(gap_character,'')
# sanity check
if self.cdrs:
#print self.cdrs
#print [ self.alseq[ x[0]-1 : x[1] ] for x in self.cdr_columns ]
assert self.cdrs == [ self.alseq[ x[0]-1 : x[1] ] for x in self.cdr_columns ]

## need to make this a little more configurable (cmdline??)
Expand Down Expand Up @@ -64,7 +62,7 @@ def __init__( self, l ):
alseq1 = g1.alseq
minlen = cpos+1
assert len(alseq1) >= minlen
if alseq1[cpos] != 'C':
if alseq1[cpos] != 'C' and verbose:
print 'funny cpos:',id1,alseq1,g1.cdrs[-1]

all_loopseq_nbrs[id1] = []
Expand Down Expand Up @@ -123,7 +121,7 @@ def __init__( self, l ):
mmstring = ','.join(['%s/%s'%(x[0],x[1]) for x in loop_mismatch_seqs])
gene1 = id1[:id1.index('*')]
gene2 = id2[:id2.index('*')]
if gene1 != gene2:
if gene1 != gene2 and verbose:
print 'v_mismatches:',organism,mmstring,blscore,id1,id2,\
loop_mismatches,loop_mismatches_cdrx,all_mismatches,seq1
print 'v_mismatches:',organism,mmstring,blscore,id1,id2,\
Expand Down Expand Up @@ -226,8 +224,6 @@ def get_cdr3_and_j_match_counts( organism, ab, qseq, j_gene, min_min_j_matchlen
else:
min_j_matchlen -= 1

#print 'min_j_matchlen:',min_j_matchlen,'jatag:',jatag,'ntrim:',ntrim,'ja_seq:',ja_seq,'qseq',qseq

if jatag not in aseq:
Log(`( 'whoah',ab,aseq,ja_seq )`)
errors.append( 'j{}tag_not_in_aseq'.format(ab) )
Expand Down
52 changes: 31 additions & 21 deletions analyze_gene_frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#from util import get_rep, get_mm1_rep, get_mm1_rep_gene_for_counting
import util
from tcr_rearrangement_new import all_countrep_pseudoprobs
from paths import path_to_db
from all_genes import all_genes

with Parser(locals()) as p:
#p.str('args').unspecified_default().multiple().required()
Expand All @@ -22,8 +22,6 @@
probs_tsvfile = clones_file[:-4] + '_gene_probs.tsv'
probs_fields = ['epitope','gene','label_prob','jsd_prob','jsd_prob_enrich','pseudoprob','pseudoprob_enrich']



def get_freq_from_tuple_counts_and_bias( counts, bias ):
reps = set()

Expand Down Expand Up @@ -104,26 +102,38 @@ def get_jsd_normed( P, Q ):


## read the background gene-tuple counts
tuplecountsfile = path_to_db+'/nextgen_tuple_counts_v2_{}_max10M.log'.format(organism)
assert exists( tuplecountsfile )

all_background_tuple_counts = {}

Log('reading {}'.format( tuplecountsfile ))
for line in open( tuplecountsfile,'r'):
if line.startswith('TUPLE_COUNT'):
l = line.split()
count = int(l[1] )
tup = tuple( sorted( l[2].split(',') ) )
segtype = tup[0][3] + tup[0][2] ## go from 'TRAV*' to 'VA'
assert segtype in segtypes_uppercase
if segtype not in all_background_tuple_counts:
all_background_tuple_counts[segtype] = {}
logfile = l[-1]
if logfile not in all_background_tuple_counts[segtype]:
all_background_tuple_counts[segtype][logfile] = {}
assert tup not in all_background_tuple_counts[segtype][logfile] ## should not be any repeats in the output
all_background_tuple_counts[segtype][logfile][ tup ] = count
## Background gene-tuple counts live next to the current db file.
## Real data: lines starting with 'TUPLE_COUNT' hold <count> <comma-sep gene ids> ... <logfile>.
tuplecountsfile = path_to_current_db_files()+'/nextgen_tuple_counts_v2_{}_max10M.log'.format(organism)

if exists( tuplecountsfile ):
    Log('reading {}'.format( tuplecountsfile ))
    for line in open( tuplecountsfile,'r'):
        if line.startswith('TUPLE_COUNT'):
            l = line.split()
            count = int(l[1] )
            tup = tuple( sorted( l[2].split(',') ) )
            # derive the segment type (e.g. 'VA') from the gene record rather than
            # parsing the id string, so non-standard gene names still work
            g0 = all_genes[organism][tup[0]]
            segtype = g0.region + g0.chain
            assert segtype in segtypes_uppercase
            logfile = l[-1]
            counts_by_logfile = all_background_tuple_counts.setdefault( segtype, {} )
            tuple_counts = counts_by_logfile.setdefault( logfile, {} )
            assert tup not in tuple_counts ## should not be any repeats in the output
            tuple_counts[ tup ] = count
else:
    ## fall back to flat (uninformative) counts so downstream code can still run
    Log('WARNING: making up fake tuple counts since dbfile ({}) is missing'.format(tuplecountsfile))
    logfile = 'fake_logfile'
    for id,g in all_genes[organism].iteritems():
        segtype = g.region + g.chain
        if segtype in segtypes_uppercase:
            # BUGFIX: the per-logfile dict was never initialized here, so the
            # assignment below raised KeyError on the first gene
            counts_by_logfile = all_background_tuple_counts.setdefault( segtype, {} )
            tuple_counts = counts_by_logfile.setdefault( logfile, {} )
            # NOTE(review): if get_mm1_rep_gene_for_counting returns a plain string,
            # tuple() turns it into a tuple of characters -- confirm it returns a sequence of ids
            tup = tuple( util.get_mm1_rep_gene_for_counting( id ) )
            tuple_counts[ tup ] = 1 ## flat counts


## here we compute J-S divergences of the gene distributions to background
Expand Down
12 changes: 12 additions & 0 deletions basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,18 @@
############################################################################################
############################################################################################

def path_to_current_db_files():
    """Return the directory holding auxiliary db files paired with the current db_file.

    The directory is named '<db_file>_files' and sits next to the geneset
    (db) file configured in pipeline_params; it is created on first use.
    Returned without a trailing '/'.
    """
    db_file = paths.path_to_db+'/'+pipeline_params['db_file']
    assert exists(db_file)
    db_files_dir = db_file+'_files'
    # create unconditionally and tolerate 'already exists' rather than
    # check-then-create, which is racy if two pipeline jobs start together
    try:
        mkdir(db_files_dir)
    except OSError:
        if not exists(db_files_dir):
            raise
    return db_files_dir




## you could modify this function if you have a different cmdline tool for converting svg to png
## like inkscape or cairosvg
##
Expand Down
9 changes: 5 additions & 4 deletions find_cdr3_motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@
import cdr3s_human #debug
from all_genes import all_genes
import sys
from paths import path_to_db
import util


with Parser(locals()) as p:
p.str('clones_file').required()
p.str('organism')
Expand Down Expand Up @@ -214,8 +212,11 @@ def extend_motif( oldmotif, oldshowmotif, old_chi_squared, seqs, seq_indices, ra
## index these by the v_rep and the j_rep
ng_tcrs = { 'A':{}, 'B':{} }
for ab in 'AB':
ng_logfile = '{}/new_nextgen_chains_{}_{}.tsv'.format(path_to_db,organism,ab)
assert exists(ng_logfile)
ng_logfile = '{}/new_nextgen_chains_{}_{}.tsv'.format(path_to_current_db_files(),organism,ab)
if not exists( ng_logfile ):
Log('WARNING:: find_cdr3_motifs.py: missing next-gen chains file {}'.format(ng_logfile))
continue

counter=0
num_chains=0
ab_chains = {}
Expand Down
4 changes: 0 additions & 4 deletions parse_cdr3.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
from basic import *
#from amino_acids import amino_acids
#from tcr_distances_blosum import blosum
#from paths import path_to_db
#import translation
from all_genes import all_genes, gap_character


Expand Down
46 changes: 22 additions & 24 deletions plot_sharing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from basic import *
import html_colors
import util
from paths import path_to_db

with Parser(locals()) as p:
p.str('clones_file').required()
Expand Down Expand Up @@ -456,25 +455,26 @@
epitope_diversity[chains][epitope][2] = avg_nbrdist

## load the random values for avg_nbrdist
randfile = '{}/{}_rand_divs_new.txt'.format(path_to_db,organism)
assert exists(randfile)
rand_divs = {}
for chains in ['A','B','AB']: rand_divs[chains] = [[],[],[],[]]

for line in open(randfile,'r'):
l = line.split()
if line.startswith("GAUSSDIV SM1 SE1"):
chains = l[5]
div = float(l[7])
rand_divs[chains][0].append( div )
elif line.startswith("GAUSSDIVSHANNON"):
chains = l[2]
div = float( l[4] )
rand_divs[chains][1].append( div )
elif line.startswith('avg_nbrdist:'):
fake_epitope,chains = l[1:3]
avg_nbrdist = float( l[3] )
rand_divs[chains][2].append( avg_nbrdist )
## random-divergence values for this organism live next to the current db file
randfile = '{}/{}_rand_divs_new.txt'.format(path_to_current_db_files(),organism)
if not exists(randfile):
    # missing db file is non-fatal: rand_divs stays empty and the reference
    # lines are simply omitted from the plots
    Log('WARNING:: plot_sharing.py: missing random divergences file: {}'.format(randfile))
else:
    # each line type feeds a different slot of rand_divs[chains]:
    #   [0] gaussian diversity, [1] shannon diversity, [2] avg_nbrdist
    for line in open(randfile,'r'):
        l = line.split()
        if line.startswith("GAUSSDIV SM1 SE1"):
            chains = l[5]
            div = float(l[7])
            rand_divs[chains][0].append( div )
        elif line.startswith("GAUSSDIVSHANNON"):
            chains = l[2]
            div = float( l[4] )
            rand_divs[chains][1].append( div )
        elif line.startswith('avg_nbrdist:'):
            fake_epitope,chains = l[1:3]
            avg_nbrdist = float( l[3] )
            rand_divs[chains][2].append( avg_nbrdist )

## let's try to get epitope diversity information from the aucs for discrimination from random tcrs
desired_nbrdist_tag_suffix = 'wtd_nbrdist10'
Expand Down Expand Up @@ -537,12 +537,10 @@
plt.title('{} {}'.format(chains, divtype))

rdivs = rand_divs[chains][divindex]
rdivs.sort()
#if divindex==0:
# rdivs = rdivs[:len(rdivs)/4] ## take bottom third

for val in rdivs:
plt.plot( [0,len(l)], [val,val], ':r' )
if rdivs:
rdivs.sort()
for val in rdivs:
plt.plot( [0,len(l)], [val,val], ':r' )

locs,labels = plt.yticks()
minval = min( ( x[0] for x in l ) )
Expand Down
63 changes: 33 additions & 30 deletions random_tcr_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import cdr3s_human #debug
from all_genes import all_genes
import parse_tsv
from paths import path_to_db

with Parser(locals()) as p:
p.str('organism').required()
Expand All @@ -26,38 +25,42 @@
random_tcrs = []
random_info = []

if organism == 'mouse':
random_chains = {}
for ab in 'AB':
random_chains[ab] = []
random_tcrs_file = '{}/new_nextgen_chains_{}_{}.tsv'.format(path_to_db,organism,ab)
assert exists( random_tcrs_file )
Log('reading {}'.format( random_tcrs_file ))
header = []
for line in open(random_tcrs_file,'r'):
l = line[:-1].split('\t')
if not header:
header = l[:]
assert header == ['v_reps','j_reps','cdr3','cdr3_nucseq']
else:
random_chains[ab].append( ( frozenset( l[0].split(',') ), l[2] ) ) ## (v_reps, cdr3)
random.shuffle( random_chains[ab] )
assert len(random_chains[ab])>=nrandom
for a,b in zip( random_chains['A'][:nrandom], random_chains['B'][:nrandom] ):
random_tcrs.append( ( a[0],b[0],a[1],b[1] ) ) #va_reps, vb_reps, cdr3a, cdr3b
random_info.append( { 'va_reps': ';'.join( a[0] ),
'vb_reps': ';'.join( b[0] ),
'cdr3a':a[1],
'cdr3b':b[1] } )

else:
##
human_paired_tcrs_file = path_to_db+'/randhuman_paired.tsv'
assert exists( human_paired_tcrs_file )
rtcrs = parse_tsv.parse_tsv_file( human_paired_tcrs_file, [], ['va_reps','vb_reps','cdr3a','cdr3b'] )
## sometimes we have a paired chains file which is probably better
paired_tcrs_file = '{}/rand{}_paired.tsv'.format( path_to_current_db_files(), organism )
if exists( paired_tcrs_file):
    # preferred source: real paired alpha/beta chains
    rtcrs = parse_tsv.parse_tsv_file( paired_tcrs_file, [], ['va_reps','vb_reps','cdr3a','cdr3b'] )
    for l in rtcrs:
        # tcr tuple: (va_reps frozenset, vb_reps frozenset, cdr3a, cdr3b)
        random_tcrs.append( ( frozenset( l[0].split(';') ), frozenset( l[1].split(';') ), l[2], l[3] ) )
        random_info.append( { 'va_reps': l[0], 'vb_reps': l[1], 'cdr3a': l[2], 'cdr3b': l[3] } )
else:
    # fallback: build synthetic pairs by randomly matching single A and B chains
    random_tcrs_files = glob('{}/new_nextgen_chains_{}_[AB].tsv'.format( path_to_current_db_files(),organism))
    if len(random_tcrs_files) == 2:
        random_chains = {}
        for ab in 'AB':
            random_chains[ab] = []
            random_tcrs_file = '{}/new_nextgen_chains_{}_{}.tsv'.format(path_to_current_db_files(),organism,ab)
            assert exists( random_tcrs_file )
            Log('reading {}'.format( random_tcrs_file ))
            header = []
            for line in open(random_tcrs_file,'r'):
                l = line[:-1].split('\t')
                if not header:
                    # first line must be the expected 4-column header
                    header = l[:]
                    assert header == ['v_reps','j_reps','cdr3','cdr3_nucseq']
                else:
                    random_chains[ab].append( ( frozenset( l[0].split(',') ), l[2] ) ) ## (v_reps, cdr3)
            # shuffle so that zipping A with B below gives random pairings
            random.shuffle( random_chains[ab] )
            assert len(random_chains[ab])>=nrandom
        for a,b in zip( random_chains['A'][:nrandom], random_chains['B'][:nrandom] ):
            random_tcrs.append( ( a[0],b[0],a[1],b[1] ) ) #va_reps, vb_reps, cdr3a, cdr3b
            random_info.append( { 'va_reps': ';'.join( a[0] ),
                                  'vb_reps': ';'.join( b[0] ),
                                  'cdr3a':a[1],
                                  'cdr3b':b[1] } )
    else:
        # neither data source available: nothing useful can be computed
        Log('WARNING:: random_tcr_distances.py: no nextgen TCR chains files ({}) -- nothing to do'\
            .format( len(random_tcrs_files) ) )
        exit()


print 'precomputing v-region distances'
Expand Down
8 changes: 5 additions & 3 deletions read_motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import svg_basic
import tcr_sampler
import util
from paths import path_to_db
import parse_tsv
from basic import *
import random
Expand Down Expand Up @@ -251,8 +250,11 @@ def get_amino_acid_consensus( seqs ):
## index these by the v_rep and the j_rep

for ab in 'AB':
ng_logfile = '{}/new_nextgen_chains_{}_{}.tsv'.format( path_to_db, organism, ab )
assert exists(ng_logfile)
ng_logfile = '{}/new_nextgen_chains_{}_{}.tsv'.format( path_to_current_db_files(), organism, ab )
if not exists(ng_logfile):
Log('WARNING:: read_motifs.py: missing nextgen TCR chains file: {}'.format(ng_logfile))
continue

counter=0
num_chains=0
ab_chains = {}
Expand Down
Loading

0 comments on commit a739f80

Please sign in to comment.