Add the new options --make_fake_alpha and --make_fake_beta for proces…

…sing pair_seqs files that only have a single chain
phbradley · Jul 11, 2017 · 36a9152 · 36a9152
1 parent d353e54
commit 36a9152
Show file tree

Hide file tree

Showing 10 changed files with 95 additions and 12 deletions.
diff --git a/analyze_overlap_compute_simpsons.py b/analyze_overlap_compute_simpsons.py
@@ -28,6 +28,8 @@
 print 'making:',outlogfile
 outlog =open( outlogfile,'w')
 
+fake_chains = util.detect_fake_chains( clones_file )
+
 import matplotlib
 if not show: matplotlib.use('Agg')
 import matplotlib.pyplot as plt
@@ -726,6 +728,7 @@ def safe_inverse(p):
                     all_div = {}
 
                     for chains in ['A','B','AB']:
+                        if chains in fake_chains: continue
 
                         overlaps = []
 
@@ -764,7 +767,7 @@ def safe_inverse(p):
                         all_div[chains] = diversity
 
                         if chains == 'AB':
-                            est = all_div['A']*all_div['B']
+                            est = all_div.get('A',1.0)*all_div.get('B',1.0)
                             estimate = ' est_unpaired: {:9.1f} ratio: {:9.3f} '.format( est, 0 if diversity==0 else est/diversity)
                         else:
                             estimate = ' '

diff --git a/make_really_tall_trees.py b/make_really_tall_trees.py
@@ -23,6 +23,7 @@
     p.multiword('extra_color_schemes').shorthand('colors').cast(lambda x:x.split())
 
 if constant_seed: random.seed(1)
+fake_chains = util.detect_fake_chains( clones_file )
 
 probs_cs            = 'probs'
 sharing_cs          = 'sharing'
@@ -44,7 +45,15 @@
 gap_character = '-' ## different from some other places
 min_cluster_size = 1
 
-cluster_radius = {'AB':1.0, 'A':0.5, 'B':0.5}
+#cluster_radius = {'AB':1.0, 'A':0.5, 'B':0.5}
+distance_threshold_25_scaled = distance_scale_factor * pipeline_params[ 'distance_threshold_25' ]
+
+cluster_radius = {
+    'A' : distance_threshold_25_scaled*2, ## was 0.5
+    'B' : distance_threshold_25_scaled*2, ## was 0.5
+    'AB': distance_threshold_25_scaled*4  ## was 1.0
+}
+
 
 tree_width = 750
 ymargin = 30 ## right now we dont want top text to get cut off
@@ -336,6 +345,8 @@ def same_tcr( t1, t2, chains ): ## t1 = (subject,genes,reps,cdr3a,cdr3b,...)
 
 
     for ab in ['A','B','AB']:
+        if ab in fake_chains: continue
+
         radius = cluster_radius[ab]
         distfile = '{}_{}_{}.dist'.format( clones_file[:-4], ab, epitope )
         assert exists(distfile)

diff --git a/make_summary_table.py b/make_summary_table.py
@@ -14,6 +14,7 @@
     p.str('clones_file').required()
     p.str('organism')
 
+fake_chains = util.detect_fake_chains( clones_file )
 
 table1_file_prefix = clones_file[:-4]+'_summary_table'
 table2_file_prefix = clones_file[:-4]+'_CDR3_table'
@@ -288,6 +289,7 @@ def get_hp2( cdr3 ):
 plt.figure(1,figsize=(12,12))
 plotno=0
 for ab in ['a','b','ab']:
+    if ab.upper() in fake_chains: continue
     for suf in scoretag_suffixes:
         plotno += 1
         plt.subplot(nrows,ncols,plotno)

diff --git a/make_tall_trees.py b/make_tall_trees.py
@@ -51,6 +51,15 @@
 if paper_figs and ABs == None:
     ABs = ['AB']
 
+if ABs == None:
+    ABs = ['A','B','AB']
+
+fake_chains = util.detect_fake_chains( clones_file )
+for ch in fake_chains:
+    if ch in ABs:
+        del ABs[ ABs.index(ch)]
+
+
 font_family = "Droid Sans Mono"
 
 greek_alpha = '&#x3b1;'
@@ -245,8 +254,6 @@ def get_safe_log10(f ):
         color_score_range['AB'] = ( -20.0, -14.0 )
 
 
-if ABs == None:
-    ABs = ['A','B','AB']
 
 for epitope in epitopes:
     if hacking and epitope != 'M158': continue

diff --git a/plot_nbrdist_distributions.py b/plot_nbrdist_distributions.py
@@ -26,6 +26,8 @@
 clones_file_with_nbrdists = '{}_nbrdists.tsv'.format(clones_file[:-4])
 assert exists( clones_file_with_nbrdists )
 
+fake_chains = util.detect_fake_chains( clones_file )
+
 ##
 all_tcrs = {}
 all_info = []
@@ -91,7 +93,8 @@
             tag = '{}_{}{}'.format( ep, chains, suffix )
             all_vals.append( [ float( x[tag] ) for x in all_info ] )
 
-
+        if chains in fake_chains:
+            continue
 
         A = np.zeros( ( len(epitopes),len(epitopes) ) )
         D = np.zeros( ( len(epitopes),len(epitopes) ) )
@@ -386,6 +389,8 @@
         plotno += 1
         plt.subplot( nrows, ncols, plotno )
 
+        if chains in fake_chains: continue
+
         mn,mx = all_min_max[chains ]
 
         sortl = []
@@ -454,6 +459,8 @@
             plotno += 1
             plt.subplot( nrows, ncols, plotno )
 
+            if chains in fake_chains: continue
+
             mn,mx = all_min_max[chains ]
 
             sortl = []

diff --git a/plot_sharing.py b/plot_sharing.py
@@ -17,7 +17,7 @@
 logfile = clones_file[:-4]+'_sharing.log'
 auc_logfile= clones_file[:-4]+'_random_aucs.log'
 
-
+fake_chains = util.detect_fake_chains( clones_file )
 
 if not exists( logfile ):
     print 'Sorry, you need to run analyze_overlap_compute_simpsons.py before this script'
@@ -506,6 +506,7 @@
 plotno=0
 
 for chains in ['A','B','AB']:
+    if chains in fake_chains: continue
     #for ( divindex, divtype ) in [ ( 0, 'inv_simpson_gaussdist' ), (1, 'shannon_gaussdist'), (2, 'avg_nbrdist') ]:
     for ( divindex, divtype ) in [ ( 0, 'TCRdiv' ),
                                    ( 2, 'avg_nbrdist10p aka NN-distance'),

diff --git a/read_pair_seqs.py b/read_pair_seqs.py
@@ -8,10 +8,15 @@
     p.str('infile').required()
     p.str('outfile').required()
     p.str('organism').required()
+    #p.str('Achain').default('A').described_as("Use 'G' for gamma delta") ### coming soon!
+    #p.str('Bchain').default('B').described_as("Use 'D' for gamma delta") ### coming soon!
+    p.str('single_chain').described_as("To analyze only a single chain; e.g. \"--single_chain alpha\" ")
     p.flag('verbose')       # --flag_arg  (no argument passed)
     p.flag('clobber').shorthand('c')       # --flag_arg  (no argument passed)
     p.flag('dry_run')       # --flag_arg  (no argument passed)
     p.flag('save_results')       # --flag_arg  (no argument passed)
+    p.flag('make_fake_alpha').described_as("Create a fictitious alpha chain sequence")
+    p.flag('make_fake_beta').described_as("Create a fictitious beta chain sequence")
     p.flag('make_fake_quals').described_as("Create a fictitious quality string for each nucleotide sequence")
     p.flag('make_fake_ids').described_as("Create an id for each line based on index in the file")
     p.set_help_prefix("""
@@ -64,6 +69,20 @@
 if exists(outfile): assert clobber
 
 
+fake_nucseqs = {
+    'human': { 'A':'CGTACATAAGNTATTTGTCTGTGATATACACATCAGAATCCTTACTTTGTGACACATTTGTTTGAGAATCAAAATCGGTGAATAGGCAGACAGACTTGTCACTGGATTTAGAGTCTCTCAGCTGGTACACGGCAGGGTCAGGCTTCTGGATATTGGGCTTCACCACCAGCTGAGTTCCATCTCCAAACATGAGTCTGGCATTGTTATTCCGGACAGCACAGAAGTACAAAGCGGAGTCGCTCACAAGGGCAGATGGTTTCTTCAGGTGGAAGGAGGTTTGACTCTAGTTAAATTCAGCTTCAAAGCCATAGCTGCCTTTAACCAGGTTATCCCCTGTGATGTATTTCAGAAGGAACTGGAGGCCTCGGTTGGGGTATTGAACATACCAAAAAAGATAAGGGTTTCCAGAGACTGAATATGTGCATAAAA',
+               'B':'CCGGGGGTGAAACCTTGATTAGGTCCTCTACAACTGTGAGTCTGGTGCCTTGTCCAAAGAAAGCTTCGTTTACAATGCTGCTGGCACAGAAGTACACAGCTGAGTCCCTGGGTTCTGAGGGCTGGATCTTCAGAGTGGAAACAAN',
+               'G':'',
+               'D':'',
+               },
+    'mouse': {'A':'GGGAAAGCGTAGGGGTGCTGTCCTGAGACCGAGGATCTTTTAACTGGTACACAGCAGGTTCTGGGTTCTGGATGTCTGGCTTTATAATTAGCTTGGTCCCAGAGCCCCAGATCAACTGATAGTTGCTATCCATGTCAGTAGCACAGAAGTACACAGCGGTGTCTTCACACCGAGTAGCAGTGATGGACAAGGAGCTGCTCTGGCTGGAGGTGTCAAGGGTGGCTCTAAAAN',
+              'B':'CACCGGGGGGGTCGGGGAAGAAGCCCCTNGGCCAGCACACGAGGGTAGCCTTTTGTTTGTTTGCAATCTCTGCTTTTGATGGCTCAAACAAGGAGACCTTGGGTGGAGTCACATTTCTCAGATCCTCCAGGACAGACAGCTTGGTTCCATGACCGAAAAATAATCTTTCGTTGGCCCCCATACTGCTGGCACAGAGAAAAACGGCCATCTCGTTCTTCTGGGCAGATGTCACAGTGAGAGAAAAAGATGACTTCTTCTCTCGAGACGCATCATAGCCTTCAGATAGATCGCCTTTTTGAAGATCGTTTTCAGTTATTGAATAGTAGATCAGTCTCAATCCTTTCCCTGAATCCTGTCGGTACCAGN',
+              'G':'',
+              'D':''
+              }
+    }
+
+
 def get_qualstring( cdr3seqtag, nucseq_in, quals_in ):
     nucseq = nucseq_in[:].upper()
     quals = quals_in[:]
@@ -142,14 +161,27 @@ def get_qualstring( cdr3seqtag, nucseq_in, quals_in ):
         id = l[ 'id' ]
     epitope = l[ 'epitope' ]
     mouse = l[ 'subject' ]
-    aseq = l[ 'a_nucseq' ]
-    bseq = l[ 'b_nucseq' ]
-    if make_fake_quals:
+
+    ## nucseqs and quals
+    if make_fake_alpha:
+        aseq = fake_nucseqs[organism]['A']
         aquals = [60]*len(aseq)
+    else:
+        aseq = l[ 'a_nucseq' ]
+        if make_fake_quals:
+            aquals = [60]*len(aseq)
+        else:
+            aquals = map( int, l[ 'a_quals' ].split('.') )
+
+    if make_fake_beta:
+        bseq = fake_nucseqs[organism]['B']
         bquals = [60]*len(bseq)
     else:
-        aquals = map( int, l[ 'a_quals' ].split('.') )
-        bquals = map( int, l[ 'b_quals' ].split('.') )
+        bseq = l[ 'b_nucseq' ]
+        if make_fake_quals:
+            bquals = [60]*len(bseq)
+        else:
+            bquals = map( int, l[ 'b_quals' ].split('.') )
 
     assert len(aseq) == len(aquals)
     assert len(bseq) == len(bquals)

diff --git a/read_random_tcr_distances.py b/read_random_tcr_distances.py
@@ -29,6 +29,7 @@
 def greek_seg( seg ):
     return seg.replace( 'A', r'$\alpha$' ).replace( 'B', r'$\beta$' )
 
+fake_chains = util.detect_fake_chains( clones_file )
 
 ## don't exclude xr guys just for discrimination vs random
 ##
@@ -237,6 +238,8 @@ def same_tcr( a, b ):
             mn = min( positive_nbrdists + negative_nbrdists_random + negative_nbrdists_others )
             mx = max( positive_nbrdists + negative_nbrdists_random + negative_nbrdists_others )
 
+            if chains in fake_chains:
+                continue ## the next line will fail if all positive_nbrdists are 0
             positive_density = gaussian_kde( positive_nbrdists )
             negative_density_random = gaussian_kde( negative_nbrdists_random )
             if other_epitopes: negative_density_others = gaussian_kde( negative_nbrdists_others )

diff --git a/run_basic_analysis.py b/run_basic_analysis.py
@@ -29,6 +29,8 @@
     p.float('seed_threshold_for_motifs')
     p.flag('make_fake_quals').described_as("(for --pair_seqs_file I/O, arg is passed to read_pair_seqs.py) Create a fictitious quality string for each nucleotide sequence")
     p.flag('make_fake_ids').described_as("(for --pair_seqs_file I/O, arg passed to read_pair_seqs.py) Create an id for each line based on index in the pair_seqs file")
+    p.flag('make_fake_alpha').described_as("Create a fictitious alpha chain sequence")
+    p.flag('make_fake_beta').described_as("Create a fictitious beta chain sequence")
     p.flag('force')
     p.flag('webstatus')
     p.flag('dry_run')
@@ -234,10 +236,12 @@ def run(cmd):
 
 
 if pair_seqs_file and ( force or not exists( parsed_seqs_file ) ):
-    cmd = 'python {}/read_pair_seqs.py {} {} --organism {} --infile {} --outfile {} -c > {}.log 2> {}.err'\
+    cmd = 'python {}/read_pair_seqs.py {} {} {} {} --organism {} --infile {} --outfile {} -c > {}.log 2> {}.err'\
           .format( path_to_scripts,
                    ' --make_fake_ids ' if make_fake_ids else '',
                    ' --make_fake_quals ' if make_fake_quals else '',
+                   ' --make_fake_alpha ' if make_fake_alpha else '',
+                   ' --make_fake_beta ' if make_fake_beta else '',
                    organism, pair_seqs_file, parsed_seqs_file, parsed_seqs_file, parsed_seqs_file )
     run( cmd )
 

diff --git a/util.py b/util.py
@@ -4,6 +4,7 @@
 from scipy.spatial import distance
 import os
 import html_colors
+import parse_tsv
 
 verbose = __name__ == '__main__'
 
@@ -218,6 +219,18 @@ def assign_label_reps_and_colors_based_on_most_common_genes_in_repertoire( tcr_i
     return ## we modified the elements of the tcr_infos list in place
 
 
+## this is not exactly perfect, but probably OK to start with...
+##
+def detect_fake_chains( clones_file, Achain='A', Bchain='B' ):
+    tcrs = parse_tsv.parse_tsv_file( clones_file, key_fields = [], store_fields = ['va_gene','cdr3a','vb_gene','cdr3b'] )
+    fake_chains = []
+    if len( set( [ (x[0],x[1]) for x in tcrs ] ) )==1:
+        fake_chains.append( Achain )
+    if len( set( [ (x[2],x[3]) for x in tcrs ] ) )==1:
+        fake_chains.append( Bchain )
+    if fake_chains:
+        print 'Fake sequence data detected for chains: {}'.format( ' '.join( fake_chains ) )
+    return fake_chains
 
 
 # if __name__ == '__main__':