minor i/o tweaks on branch

phbradley · Oct 6, 2019 · 47391cb · 47391cb
1 parent 4f83df3
commit 47391cb
Show file tree

Hide file tree

Showing 5 changed files with 31 additions and 6 deletions.
diff --git a/compute_distances.py b/compute_distances.py
@@ -21,10 +21,12 @@
     p.str('organism')
     p.flag('intrasubject_nbrdists')
     p.multiword('clones_files').cast(lambda x:x.split())
+    p.multiword('dist_chains').cast(lambda x:x.split()).default('A B AB').described_as('The chains over which the distance calcn will be performed; default is all three ("A B AB")')
     p.multiword('epitope_prefixes').cast(lambda x:x.split())
     p.multiword('nbrdist_percentiles').cast(lambda x: [int(val) for val in x.split()]).default("5 10 25")#"5 10 25 -1 -5 -10")
     p.str('distance_params')
 
+
 #internal legacy hack
 new_nbrdists = not intrasubject_nbrdists
 
@@ -104,7 +106,7 @@
 
 
 
-for chains in ['A','B','AB']:
+for chains in dist_chains:
     epitopes = list( set( [x['epitope'] for x in all_info ] ) )
     epitopes.sort()
 

diff --git a/file_converter.py b/file_converter.py
@@ -353,6 +353,10 @@ def map_field_names( l, input_format, output_format ):
                 continue
             outl[ field ] = l[ field ]
 
+            if field == 'cdr3a_nucseq' or field == 'cdr3b_nucseq':
+                outl[ field ] = outl[ field ].lower()
+
+
         # check the genes
         if check_genes:
             bad_genes = False

diff --git a/make_10x_clones_file.py b/make_10x_clones_file.py
@@ -132,6 +132,9 @@ def make_clones_file( organism, outfile, clonotype2tcrs, clonotype2barcodes ):
     '''
 
     tmpfile = outfile+'.tmp' # a temporary intermediate file
+    bc_mapfile = outfile+'.barcode_mapping.tsv'
+    outmap = open(bc_mapfile,'w')
+    outmap.write('clone_id\tbarcodes\n')
 
     out = open(tmpfile,'w')
     outfields = 'clone_id subject clone_size va_gene ja_gene vb_gene jb_gene cdr3a cdr3a_nucseq cdr3b cdr3b_nucseq'\
@@ -169,7 +172,9 @@ def make_clones_file( organism, outfile, clonotype2tcrs, clonotype2barcodes ):
             outl['beta_umi']     = str(btcr_umi)
             outl['num_betas']    = str(len(btcrs))
             out.write( make_tsv_line(outl,outfields)+'\n' )
+            outmap.write('{}\t{}\n'.format(clonotype,','.join(clonotype2barcodes[clonotype])))
     out.close()
+    outmap.close()
 
 
     cmd = 'python {}/file_converter.py --input_format clones --output_format clones --input_file {} --output_file {}  --organism {} --clobber --epitope UNK_E --check_genes --extra_fields {} '\

diff --git a/make_kpca_plots.py b/make_kpca_plots.py
@@ -20,6 +20,7 @@
     p.flag('paper_figs')
     p.flag('vertical') ## each epitope is a vertical stack of scatter plots
     p.flag('showmotifs')
+    p.flag('use_tsne')
     p.multiword('epitopes').cast(lambda x: x.split())
 
 if pngfile_prefix is None:
@@ -92,9 +93,14 @@
     D = np.minimum( D, np.full( D.shape, Dmax ) )
     print 'true_Dmax:',old_Dmax,'using_Dmax:',Dmax,epitope
 
-    pca = KernelPCA(kernel='precomputed')
-    gram = 1 - ( D / Dmax )
-    xy = pca.fit_transform(gram)
+    if use_tsne:
+        from sklearn.manifold import TSNE
+        tsne = TSNE(n_components=2, metric='precomputed')
+        xy = tsne.fit_transform(D)
+    else:
+        pca = KernelPCA(kernel='precomputed')
+        gram = 1 - ( D / Dmax )
+        xy = pca.fit_transform(gram)
 
     xs = xy[:,0]
     ys = xy[:,1]

diff --git a/read_motifs.py b/read_motifs.py
@@ -162,6 +162,8 @@ def get_amino_acid_consensus( seqs ):
 ## new approach
 all_tcr_infos = parse_tsv.parse_tsv_file( clones_file, ['epitope'], [], True )
 
+fake_chains = util.detect_fake_chains( clones_file )
+
 if epitopes is None:
     epitopes = all_tcr_infos.keys()[:]
     epitopes.sort()
@@ -192,8 +194,8 @@ def get_amino_acid_consensus( seqs ):
         jb = l['jb_rep']
         cdr3a = l['cdr3a']
         cdr3b = l['cdr3b']
-        cdr3a_nucseq = l['cdr3a_nucseq']
-        cdr3b_nucseq = l['cdr3b_nucseq']
+        cdr3a_nucseq = l['cdr3a_nucseq'].lower()
+        cdr3b_nucseq = l['cdr3b_nucseq'].lower()
 
         ## note-- we are using mm1 reps here, same as in find_cdr3_motifs.py
         ##
@@ -244,6 +246,8 @@ def get_amino_acid_consensus( seqs ):
 ## index these by the v_rep and the j_rep
 
 for ab in 'AB':
+    if ab in fake_chains:
+        continue
     ng_logfile = '{}/new_nextgen_chains_{}_{}.tsv'.format( path_to_current_db_files(), organism, ab )
     if not exists(ng_logfile):
         Log('WARNING:: read_motifs.py: missing nextgen TCR chains file: {}'.format(ng_logfile))
@@ -313,7 +317,10 @@ def get_amino_acid_consensus( seqs ):
     all_distances = {}
     all_neighbors = {}
     for ab in 'AB':
+        if ab in fake_chains:
+            continue
         distfile = '{}_{}_{}.dist'.format(clones_file[:-4],ab,epitope)
+        print 'reading:',distfile
         assert exists(distfile)
         N=0
         all_nbrs = []
@@ -557,6 +564,7 @@ def analyze_matches_using_ngseqs( matches, matched_tcrs, ab ):
         if target_num != None and target_num != num: continue
 
         if ABs != None and ab not in ABs: continue
+        if ab in fake_chains: continue
 
         expected_fraction = max(expect_random,expect_nextgen)/num_tcrs