Skip to content

Commit

Permalink
Add the new options --make_fake_alpha and --make_fake_beta for proces…
Browse files Browse the repository at this point in the history
…sing pair_seqs files that only have a single chain
  • Loading branch information
phbradley committed Jul 11, 2017
1 parent d353e54 commit 36a9152
Show file tree
Hide file tree
Showing 10 changed files with 95 additions and 12 deletions.
5 changes: 4 additions & 1 deletion analyze_overlap_compute_simpsons.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
print 'making:',outlogfile
outlog =open( outlogfile,'w')

fake_chains = util.detect_fake_chains( clones_file )

import matplotlib
if not show: matplotlib.use('Agg')
import matplotlib.pyplot as plt
Expand Down Expand Up @@ -726,6 +728,7 @@ def safe_inverse(p):
all_div = {}

for chains in ['A','B','AB']:
if chains in fake_chains: continue

overlaps = []

Expand Down Expand Up @@ -764,7 +767,7 @@ def safe_inverse(p):
all_div[chains] = diversity

if chains == 'AB':
est = all_div['A']*all_div['B']
est = all_div.get('A',1.0)*all_div.get('B',1.0)
estimate = ' est_unpaired: {:9.1f} ratio: {:9.3f} '.format( est, 0 if diversity==0 else est/diversity)
else:
estimate = ' '
Expand Down
13 changes: 12 additions & 1 deletion make_really_tall_trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
p.multiword('extra_color_schemes').shorthand('colors').cast(lambda x:x.split())

if constant_seed: random.seed(1)
fake_chains = util.detect_fake_chains( clones_file )

probs_cs = 'probs'
sharing_cs = 'sharing'
Expand All @@ -44,7 +45,15 @@
gap_character = '-' ## different from some other places
min_cluster_size = 1

cluster_radius = {'AB':1.0, 'A':0.5, 'B':0.5}
#cluster_radius = {'AB':1.0, 'A':0.5, 'B':0.5}
distance_threshold_25_scaled = distance_scale_factor * pipeline_params[ 'distance_threshold_25' ]

cluster_radius = {
'A' : distance_threshold_25_scaled*2, ## was 0.5
'B' : distance_threshold_25_scaled*2, ## was 0.5
'AB': distance_threshold_25_scaled*4 ## was 1.0
}


tree_width = 750
ymargin = 30 ## right now we dont want top text to get cut off
Expand Down Expand Up @@ -336,6 +345,8 @@ def same_tcr( t1, t2, chains ): ## t1 = (subject,genes,reps,cdr3a,cdr3b,...)


for ab in ['A','B','AB']:
if ab in fake_chains: continue

radius = cluster_radius[ab]
distfile = '{}_{}_{}.dist'.format( clones_file[:-4], ab, epitope )
assert exists(distfile)
Expand Down
2 changes: 2 additions & 0 deletions make_summary_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
p.str('clones_file').required()
p.str('organism')

fake_chains = util.detect_fake_chains( clones_file )

table1_file_prefix = clones_file[:-4]+'_summary_table'
table2_file_prefix = clones_file[:-4]+'_CDR3_table'
Expand Down Expand Up @@ -288,6 +289,7 @@ def get_hp2( cdr3 ):
plt.figure(1,figsize=(12,12))
plotno=0
for ab in ['a','b','ab']:
if ab.upper() in fake_chains: continue
for suf in scoretag_suffixes:
plotno += 1
plt.subplot(nrows,ncols,plotno)
Expand Down
11 changes: 9 additions & 2 deletions make_tall_trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,15 @@
if paper_figs and ABs == None:
ABs = ['AB']

if ABs == None:
ABs = ['A','B','AB']

fake_chains = util.detect_fake_chains( clones_file )
for ch in fake_chains:
if ch in ABs:
del ABs[ ABs.index(ch)]


font_family = "Droid Sans Mono"

greek_alpha = 'α'
Expand Down Expand Up @@ -245,8 +254,6 @@ def get_safe_log10(f ):
color_score_range['AB'] = ( -20.0, -14.0 )


if ABs == None:
ABs = ['A','B','AB']

for epitope in epitopes:
if hacking and epitope != 'M158': continue
Expand Down
9 changes: 8 additions & 1 deletion plot_nbrdist_distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
clones_file_with_nbrdists = '{}_nbrdists.tsv'.format(clones_file[:-4])
assert exists( clones_file_with_nbrdists )

fake_chains = util.detect_fake_chains( clones_file )

##
all_tcrs = {}
all_info = []
Expand Down Expand Up @@ -91,7 +93,8 @@
tag = '{}_{}{}'.format( ep, chains, suffix )
all_vals.append( [ float( x[tag] ) for x in all_info ] )


if chains in fake_chains:
continue

A = np.zeros( ( len(epitopes),len(epitopes) ) )
D = np.zeros( ( len(epitopes),len(epitopes) ) )
Expand Down Expand Up @@ -386,6 +389,8 @@
plotno += 1
plt.subplot( nrows, ncols, plotno )

if chains in fake_chains: continue

mn,mx = all_min_max[chains ]

sortl = []
Expand Down Expand Up @@ -454,6 +459,8 @@
plotno += 1
plt.subplot( nrows, ncols, plotno )

if chains in fake_chains: continue

mn,mx = all_min_max[chains ]

sortl = []
Expand Down
3 changes: 2 additions & 1 deletion plot_sharing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
logfile = clones_file[:-4]+'_sharing.log'
auc_logfile= clones_file[:-4]+'_random_aucs.log'


fake_chains = util.detect_fake_chains( clones_file )

if not exists( logfile ):
print 'Sorry, you need to run analyze_overlap_compute_simpsons.py before this script'
Expand Down Expand Up @@ -506,6 +506,7 @@
plotno=0

for chains in ['A','B','AB']:
if chains in fake_chains: continue
#for ( divindex, divtype ) in [ ( 0, 'inv_simpson_gaussdist' ), (1, 'shannon_gaussdist'), (2, 'avg_nbrdist') ]:
for ( divindex, divtype ) in [ ( 0, 'TCRdiv' ),
( 2, 'avg_nbrdist10p aka NN-distance'),
Expand Down
42 changes: 37 additions & 5 deletions read_pair_seqs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,15 @@
p.str('infile').required()
p.str('outfile').required()
p.str('organism').required()
#p.str('Achain').default('A').described_as("Use 'G' for gamma delta") ### coming soon!
#p.str('Bchain').default('B').described_as("Use 'D' for gamma delta") ### coming soon!
p.str('single_chain').described_as("To analyze only a single chain; e.g. \"--single_chain alpha\" ")
p.flag('verbose') # --flag_arg (no argument passed)
p.flag('clobber').shorthand('c') # --flag_arg (no argument passed)
p.flag('dry_run') # --flag_arg (no argument passed)
p.flag('save_results') # --flag_arg (no argument passed)
p.flag('make_fake_alpha').described_as("Create a fictitious alpha chain sequence")
p.flag('make_fake_beta').described_as("Create a fictitious beta chain sequence")
p.flag('make_fake_quals').described_as("Create a fictitious quality string for each nucleotide sequence")
p.flag('make_fake_ids').described_as("Create an id for each line based on index in the file")
p.set_help_prefix("""
Expand Down Expand Up @@ -64,6 +69,20 @@
if exists(outfile): assert clobber


fake_nucseqs = {
'human': { 'A':'CGTACATAAGNTATTTGTCTGTGATATACACATCAGAATCCTTACTTTGTGACACATTTGTTTGAGAATCAAAATCGGTGAATAGGCAGACAGACTTGTCACTGGATTTAGAGTCTCTCAGCTGGTACACGGCAGGGTCAGGCTTCTGGATATTGGGCTTCACCACCAGCTGAGTTCCATCTCCAAACATGAGTCTGGCATTGTTATTCCGGACAGCACAGAAGTACAAAGCGGAGTCGCTCACAAGGGCAGATGGTTTCTTCAGGTGGAAGGAGGTTTGACTCTAGTTAAATTCAGCTTCAAAGCCATAGCTGCCTTTAACCAGGTTATCCCCTGTGATGTATTTCAGAAGGAACTGGAGGCCTCGGTTGGGGTATTGAACATACCAAAAAAGATAAGGGTTTCCAGAGACTGAATATGTGCATAAAA',
'B':'CCGGGGGTGAAACCTTGATTAGGTCCTCTACAACTGTGAGTCTGGTGCCTTGTCCAAAGAAAGCTTCGTTTACAATGCTGCTGGCACAGAAGTACACAGCTGAGTCCCTGGGTTCTGAGGGCTGGATCTTCAGAGTGGAAACAAN',
'G':'',
'D':'',
},
'mouse': {'A':'GGGAAAGCGTAGGGGTGCTGTCCTGAGACCGAGGATCTTTTAACTGGTACACAGCAGGTTCTGGGTTCTGGATGTCTGGCTTTATAATTAGCTTGGTCCCAGAGCCCCAGATCAACTGATAGTTGCTATCCATGTCAGTAGCACAGAAGTACACAGCGGTGTCTTCACACCGAGTAGCAGTGATGGACAAGGAGCTGCTCTGGCTGGAGGTGTCAAGGGTGGCTCTAAAAN',
'B':'CACCGGGGGGGTCGGGGAAGAAGCCCCTNGGCCAGCACACGAGGGTAGCCTTTTGTTTGTTTGCAATCTCTGCTTTTGATGGCTCAAACAAGGAGACCTTGGGTGGAGTCACATTTCTCAGATCCTCCAGGACAGACAGCTTGGTTCCATGACCGAAAAATAATCTTTCGTTGGCCCCCATACTGCTGGCACAGAGAAAAACGGCCATCTCGTTCTTCTGGGCAGATGTCACAGTGAGAGAAAAAGATGACTTCTTCTCTCGAGACGCATCATAGCCTTCAGATAGATCGCCTTTTTGAAGATCGTTTTCAGTTATTGAATAGTAGATCAGTCTCAATCCTTTCCCTGAATCCTGTCGGTACCAGN',
'G':'',
'D':''
}
}


def get_qualstring( cdr3seqtag, nucseq_in, quals_in ):
nucseq = nucseq_in[:].upper()
quals = quals_in[:]
Expand Down Expand Up @@ -142,14 +161,27 @@ def get_qualstring( cdr3seqtag, nucseq_in, quals_in ):
id = l[ 'id' ]
epitope = l[ 'epitope' ]
mouse = l[ 'subject' ]
aseq = l[ 'a_nucseq' ]
bseq = l[ 'b_nucseq' ]
if make_fake_quals:

## nucseqs and quals
if make_fake_alpha:
aseq = fake_nucseqs[organism]['A']
aquals = [60]*len(aseq)
else:
aseq = l[ 'a_nucseq' ]
if make_fake_quals:
aquals = [60]*len(aseq)
else:
aquals = map( int, l[ 'a_quals' ].split('.') )

if make_fake_beta:
bseq = fake_nucseqs[organism]['B']
bquals = [60]*len(bseq)
else:
aquals = map( int, l[ 'a_quals' ].split('.') )
bquals = map( int, l[ 'b_quals' ].split('.') )
bseq = l[ 'b_nucseq' ]
if make_fake_quals:
bquals = [60]*len(bseq)
else:
bquals = map( int, l[ 'b_quals' ].split('.') )

assert len(aseq) == len(aquals)
assert len(bseq) == len(bquals)
Expand Down
3 changes: 3 additions & 0 deletions read_random_tcr_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
def greek_seg( seg ):
return seg.replace( 'A', r'$\alpha$' ).replace( 'B', r'$\beta$' )

fake_chains = util.detect_fake_chains( clones_file )

## don't exclude xr guys just for discrimination vs random
##
Expand Down Expand Up @@ -237,6 +238,8 @@ def same_tcr( a, b ):
mn = min( positive_nbrdists + negative_nbrdists_random + negative_nbrdists_others )
mx = max( positive_nbrdists + negative_nbrdists_random + negative_nbrdists_others )

if chains in fake_chains:
continue ## the next line will fail if all positive_nbrdists are 0
positive_density = gaussian_kde( positive_nbrdists )
negative_density_random = gaussian_kde( negative_nbrdists_random )
if other_epitopes: negative_density_others = gaussian_kde( negative_nbrdists_others )
Expand Down
6 changes: 5 additions & 1 deletion run_basic_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
p.float('seed_threshold_for_motifs')
p.flag('make_fake_quals').described_as("(for --pair_seqs_file I/O, arg is passed to read_pair_seqs.py) Create a fictitious quality string for each nucleotide sequence")
p.flag('make_fake_ids').described_as("(for --pair_seqs_file I/O, arg passed to read_pair_seqs.py) Create an id for each line based on index in the pair_seqs file")
p.flag('make_fake_alpha').described_as("Create a fictitious alpha chain sequence")
p.flag('make_fake_beta').described_as("Create a fictitious beta chain sequence")
p.flag('force')
p.flag('webstatus')
p.flag('dry_run')
Expand Down Expand Up @@ -234,10 +236,12 @@ def run(cmd):


if pair_seqs_file and ( force or not exists( parsed_seqs_file ) ):
cmd = 'python {}/read_pair_seqs.py {} {} --organism {} --infile {} --outfile {} -c > {}.log 2> {}.err'\
cmd = 'python {}/read_pair_seqs.py {} {} {} {} --organism {} --infile {} --outfile {} -c > {}.log 2> {}.err'\
.format( path_to_scripts,
' --make_fake_ids ' if make_fake_ids else '',
' --make_fake_quals ' if make_fake_quals else '',
' --make_fake_alpha ' if make_fake_alpha else '',
' --make_fake_beta ' if make_fake_beta else '',
organism, pair_seqs_file, parsed_seqs_file, parsed_seqs_file, parsed_seqs_file )
run( cmd )

Expand Down
13 changes: 13 additions & 0 deletions util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from scipy.spatial import distance
import os
import html_colors
import parse_tsv

verbose = __name__ == '__main__'

Expand Down Expand Up @@ -218,6 +219,18 @@ def assign_label_reps_and_colors_based_on_most_common_genes_in_repertoire( tcr_i
return ## we modified the elements of the tcr_infos list in place


## this is not exactly perfect, but probably OK to start with...
##
def detect_fake_chains( clones_file, Achain='A', Bchain='B' ):
tcrs = parse_tsv.parse_tsv_file( clones_file, key_fields = [], store_fields = ['va_gene','cdr3a','vb_gene','cdr3b'] )
fake_chains = []
if len( set( [ (x[0],x[1]) for x in tcrs ] ) )==1:
fake_chains.append( Achain )
if len( set( [ (x[2],x[3]) for x in tcrs ] ) )==1:
fake_chains.append( Bchain )
if fake_chains:
print 'Fake sequence data detected for chains: {}'.format( ' '.join( fake_chains ) )
return fake_chains


# if __name__ == '__main__':
Expand Down

0 comments on commit 36a9152

Please sign in to comment.