Skip to content

Commit

Permalink
add constant_seed flag for use when reproducibility is desired, e.g. …
Browse files Browse the repository at this point in the history
…when running the testing datasets to validate successful install
  • Loading branch information
phbradley committed Jun 21, 2017
1 parent 91b1ba4 commit d7d9707
Show file tree
Hide file tree
Showing 8 changed files with 44 additions and 33 deletions.
2 changes: 1 addition & 1 deletion create_and_run_testing_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@

## use intrasubject_nbrdists here because these mini-repertoires may contain only a single subject
## if there's only one subject, then we can't compute a nbrdist if we exclude intra-subject distances
cmd = 'python run_basic_analysis.py --intrasubject_nbrdists --organism {} --pair_seqs_file {} > {}.log 2> {}.err &'\
cmd = 'python run_basic_analysis.py --constant_seed --intrasubject_nbrdists --organism {} --pair_seqs_file {} > {}.log 2> {}.err &'\
.format( organism, newfile, newfile, newfile )
print cmd
system(cmd)
Expand Down
3 changes: 3 additions & 0 deletions find_cdr3_motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
p.float('min_extended_count_ratio').default(0.6)
p.flag('use_fake_seqs')
p.flag('big')
p.flag('constant_seed')
p.flag('force_random_len')
p.flag('verbose')
p.flag('nofilter')
Expand All @@ -33,6 +34,8 @@
p.flag('hacking')
p.multiword('epitopes').cast(lambda x:x.split())

if constant_seed: random.seed(1)

assert big

#assert force_random_len
Expand Down
6 changes: 5 additions & 1 deletion html_colors.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,11 @@ def rgb_from_hex( hex ):


## get some colors for objects sorted by frequency
## fiddle with random seed/state to get consistent colors; legacy silliness...
##
import random
initial_random_state = random.getstate()

random.seed(2)

if COLOR_BREWER:
Expand Down Expand Up @@ -252,7 +256,7 @@ def rgb_from_hex( hex ):
random.shuffle( other_colors_no_lights )
rank_colors_no_lights = starter_colors + other_colors_no_lights

random.seed() ## restore randomness
random.setstate(initial_random_state)

def get_rank_colors( num ):
n_rank_colors = len(rank_colors)
Expand Down
12 changes: 8 additions & 4 deletions make_tall_trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
#p.int('int_arg').shorthand('i')
#p.float('float_arg') # --float_arg 9.6
p.flag('verbose') # --flag_arg (no argument passed)
p.int('constant_seed') # --flag_arg (no argument passed)
p.flag('constant_seed') # --flag_arg (no argument passed)
p.int('random_seed') # --flag_arg (no argument passed)
p.flag('hacking') # --flag_arg (no argument passed)
p.flag('paper_figs') # --flag_arg (no argument passed)
p.flag('junction_bars') # --flag_arg (no argument passed)
Expand All @@ -40,9 +41,12 @@
if outfile_prefix is None:
outfile_prefix = clones_file[:-4]

if constant_seed != None:
print 'constant_seed:',constant_seed
random.seed(constant_seed)
if constant_seed:
random.seed(1)

if random_seed != None:
print 'random_seed:',random_seed
random.seed(random_seed)

if paper_figs and ABs == None:
ABs = ['AB']
Expand Down
5 changes: 5 additions & 0 deletions read_motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from paths import path_to_db
import parse_tsv
from basic import *
import random


nbr_distance_default = pipeline_params[ 'distance_threshold_25' ]
Expand All @@ -24,6 +25,7 @@
p.float('nbr_distance').default( nbr_distance_default ) ## single-chain nbr distance threshold
p.float('motifs_clustering_threshold').default(0.3)
p.flag('verbose')
p.flag('constant_seed')
p.flag('paper_figs')
p.flag('paper_supp')
p.flag('junction_bars')
Expand All @@ -32,6 +34,9 @@
p.multiword('epitopes').cast(lambda x:x.split())
p.multiword('ABs').cast(lambda x:x.split())

if constant_seed:
random.seed(1)

if outfile_prefix is None:
outfile_prefix = clones_file[:-4]

Expand Down
30 changes: 19 additions & 11 deletions run_basic_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
p.flag('only_parsed_seqs')
p.flag('find_cdr3_motifs_in_parallel').described_as('Will spawn multiple simultaneous CDR3 motif finding jobs')
p.flag('only_clones')
p.flag('constant_seed')
p.str('extra_make_really_tall_trees_args').shorthand('mrtt_args')
p.str('extra_find_clones_args').shorthand('fc_args')
p.str('organism').required()
Expand Down Expand Up @@ -167,7 +168,12 @@
if distance_params:
distance_params_args = ' --distance_params {} '.format( distance_params )
else:
distance_params_args = ''
distance_params_args = ' '

if constant_seed:
constant_seed_args = ' --constant_seed '
else:
constant_seed_args = ' '

if not webdir:
webdir = '{}_web'.format(clones_file[:-4])
Expand Down Expand Up @@ -300,8 +306,9 @@ def run(cmd):
## compare with random tcrs
random_nbrdists_file = '{}_random_nbrdists.tsv'.format(clones_file[:-4] )
if not exists( random_nbrdists_file ):
cmd = 'python {}/random_tcr_distances.py {} --organism {} --clones_file {} > {}_rtd.log 2> {}_rtd.err'\
.format( path_to_scripts, distance_params_args, organism, clones_file, clones_file, clones_file )
cmd = 'python {}/random_tcr_distances.py {} {} --organism {} --clones_file {} > {}_rtd.log 2> {}_rtd.err'\
.format( path_to_scripts, constant_seed_args, distance_params_args, organism,
clones_file, clones_file, clones_file )
run(cmd)

## now read the output of the random nbrdists
Expand All @@ -326,13 +333,14 @@ def run(cmd):


## make tall trees
cmd = 'python {}/make_tall_trees.py --organism {} --clones_file {} --junction_bars > {}_mtt.log 2> {}_mtt.err'\
.format( path_to_scripts, organism, clones_file, clones_file, clones_file )
cmd = 'python {}/make_tall_trees.py {} --organism {} --clones_file {} --junction_bars > {}_mtt.log 2> {}_mtt.err'\
.format( path_to_scripts, constant_seed_args, organism, clones_file, clones_file, clones_file )
run(cmd)

## analyze intra-subject privacy
cmd = 'python {}/analyze_epitope_privacy.py {} --organism {} --clones_file {} --all_chains AB --nrepeat 1000 --tree_height_inches 5.0 --nbrdist_percentile 10 > {}_aep.log 2> {}_aep.err'\
.format( path_to_scripts, distance_params_args, organism, clones_file, clones_file, clones_file )
cmd = 'python {}/analyze_epitope_privacy.py {} {} --organism {} --clones_file {} --all_chains AB --nrepeat 1000 --tree_height_inches 5.0 --nbrdist_percentile 10 > {}_aep.log 2> {}_aep.err'\
.format( path_to_scripts, constant_seed_args, distance_params_args, organism,
clones_file, clones_file, clones_file )
run(cmd)

## find motifs #################################################
Expand Down Expand Up @@ -374,8 +382,8 @@ def run(cmd):
extra_args = ' --chi_squared_threshold_for_seeds {} '.format( my_seed_threshold_for_motifs ) \
if my_seed_threshold_for_motifs else ''

cmd = 'python {}/find_cdr3_motifs.py --organism {} {} --clones_file {} --min_count {} --epitopes {} {} {} --verbose --big --nsamples {} --max_motif_len {} --max_ng_lines {} > {} 2> {}'\
.format( path_to_scripts, organism, extra_args, clones_file, min_count, ep,
cmd = 'python {}/find_cdr3_motifs.py {} --organism {} {} --clones_file {} --min_count {} --epitopes {} {} {} --verbose --big --nsamples {} --max_motif_len {} --max_ng_lines {} > {} 2> {}'\
.format( path_to_scripts, constant_seed_args, organism, extra_args, clones_file, min_count, ep,
' --nofilter '*nofilter,
' --force_random_len '*fixlen,
nsamples, max_motif_len, max_ng_lines,
Expand Down Expand Up @@ -411,8 +419,8 @@ def run(cmd):

## make motifs summary

cmd = 'python {}/read_motifs.py --junction_bars --max_ng_lines {} --organism {} --clones_file {} > {}_rm.log 2> {}_rm.err'\
.format( path_to_scripts, max_ng_lines, organism, clones_file, clones_file, clones_file )
cmd = 'python {}/read_motifs.py {} --junction_bars --max_ng_lines {} --organism {} --clones_file {} > {}_rm.log 2> {}_rm.err'\
.format( path_to_scripts, constant_seed_args, max_ng_lines, organism, clones_file, clones_file, clones_file )
run(cmd)

## make kpca landscape plots
Expand Down
1 change: 0 additions & 1 deletion score_trees_devel.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import string
from os import popen,system,getcwd
import sys
from random import random
from math import floor,log,exp
#from popen2 import popen2
#from os.path import exists
Expand Down
18 changes: 3 additions & 15 deletions tcr_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import cdr3s_human
from genetic_code import genetic_code, reverse_genetic_code
import logo_tools
#from math import log2
import random

if pipeline_params['new_probs']:
print 'tcr_sampler: new_probs'
Expand Down Expand Up @@ -154,9 +154,7 @@ def analyze_junction( organism, v_gene, j_gene, cdr3_protseq, cdr3_nucseq, force
if num_matched_v + num_matched_j > len(cdr3_nucseq):
## some overlap!
extra = num_matched_v + num_matched_j - len(cdr3_nucseq )
#print 'TRIM extra',extra
#fake_v_trim = random.randint(0,extra)
fake_v_trim = extra/2 ## now dterministic
fake_v_trim = extra/2 ## now deterministic
fake_j_trim = extra - fake_v_trim
num_matched_v -= fake_v_trim
num_matched_j -= fake_j_trim
Expand Down Expand Up @@ -231,19 +229,11 @@ def analyze_junction( organism, v_gene, j_gene, cdr3_protseq, cdr3_nucseq, force


else:
best_d_id = 0 #random.randint( 1, 2)
best_d_id = 0
n_vd_insert = 0
n_dj_insert = 0
d0_trim = 0
d1_trim = 0
# pos = random.randint( 0, len(nseq) )
# best_overlap_seq = ''
# nucseq = trbd_nucseq[best_id]
# start = random.randint( 0, len(nucseq) )
# stop = start-1
# best_trim = (start,len(nucseq)-1-stop)

#d_info = ' %d D%d %d '%(best_trim[0],best_id,best_trim[1])



Expand Down Expand Up @@ -362,7 +352,6 @@ def alpha_cdr3_protseq_probability( organism, v_gene, j_gene, cdr3_protseq,
## some overlap!
extra = max_v + max_j - len(cdr3_nucseq )
#print 'TRIM extra',extra
#fake_v_trim = random.randint(0,extra)
fake_v_trim = extra/2 ## now dterministic
fake_j_trim = extra - fake_v_trim
max_v -= fake_v_trim
Expand Down Expand Up @@ -504,7 +493,6 @@ def beta_cdr3_protseq_probability( organism, v_gene, j_gene, cdr3_protseq,
## some overlap!
extra = max_v + max_j - len(cdr3_nucseq )
#print 'TRIM extra',extra
#fake_v_trim = random.randint(0,extra)
fake_v_trim = extra/2 ## now dterministic
fake_j_trim = extra - fake_v_trim
max_v -= fake_v_trim
Expand Down

0 comments on commit d7d9707

Please sign in to comment.