various updates to sweep pipeline (#116)
* latest

* pca of sims

* stuff & diploshic/vcf bug fix

* update diploSHIC from hdf5 to h5; remove any variants at the last site

* updates
mufernando authored Nov 12, 2024
1 parent e8b1f61 commit 8ad0c0a
Showing 10 changed files with 11,899 additions and 488 deletions.
37 changes: 32 additions & 5 deletions boundary.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions environment.yml
@@ -39,10 +39,12 @@ dependencies:
- r-patchwork
- r-scales
- r-ggnewscale
- r-quantreg
- libprotobuf=3.21.12
- pip:
- git+https://github.com/popsim-consortium/stdpopsim.git
- git+https://github.com/popgenmethods/smcpp
- scikit-allel
- git+https://github.com/xin-huang/dadi-cli
- git+https://github.com/kr-colab/diploSHIC.git@refs/pull/56/merge
- diploSHIC
328 changes: 328 additions & 0 deletions pca_sims_training.ipynb

Large diffs are not rendered by default.

11,892 changes: 11,454 additions & 438 deletions power_analysis.ipynb

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions workflows/config/snakemake/oregon_profile_simple/config.yaml
@@ -8,19 +8,19 @@ cluster:
--time={resources.time}
--job-name=smk-{rule}%j
--output=logs/{rule}/{rule}%j.out
--parsable
default-resources:
- time=60
- mem_mb=12000
- threads=1
cluster-status: "status-sacct.sh"
restart-times: 3
max-jobs-per-second: 10
max-status-checks-per-second: 1
local-cores: 1
latency-wait: 60
jobs: 500
keep-going: True
max-jobs-per-second: 1000
max-status-checks-per-second: 1000
jobs: 3500
rerun-incomplete: True
printshellcmds: True
latency-wait: 30
scheduler: greedy
use-conda: True
jobscript: "jobscript-wo-properties.sh"
24 changes: 24 additions & 0 deletions workflows/config/snakemake/oregon_profile_simple/status-sacct.sh
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

# Check status of Slurm job

jobid="$1"

if [[ "$jobid" == Submitted ]]
then
echo smk-simple-slurm: Invalid job ID: "$jobid" >&2
echo smk-simple-slurm: Did you remember to add the flag --parsable to your sbatch call? >&2
exit 1
fi

output=`sacct -j "$jobid" --format State --noheader | head -n 1 | awk '{print $1}'`

if [[ $output =~ ^(COMPLETED).* ]]
then
echo success
elif [[ $output =~ ^(RUNNING|PENDING|COMPLETING|CONFIGURING|SUSPENDED).* ]]
then
echo running
else
echo failed
fi
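
For reference, a small sketch of how this script is consumed (assuming Snakemake's cluster-status behavior: the hook is called with a single Slurm job id and reads "success", "running", or "failed" from stdout; the job id below is hypothetical):

import subprocess

def slurm_job_status(jobid: str) -> str:
    """Call the status script the way Snakemake's cluster-status hook would."""
    out = subprocess.run(
        ["./status-sacct.sh", jobid],
        capture_output=True, text=True, check=True,
    )
    return out.stdout.strip()  # one of: "success", "running", "failed"

print(slurm_job_status("12345678"))  # hypothetical Slurm job id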
2 changes: 1 addition & 1 deletion workflows/config/snakemake/sweep_config.yaml
@@ -1,6 +1,6 @@
# General configs
seed: 12345
replicates: 200
replicates: 1_000
output_dir: results

# Contig configs
29 changes: 18 additions & 11 deletions workflows/diploshic.snake
@@ -62,7 +62,7 @@ rule all:
expand("{tDir}/neut_0.fvec", tDir = ["train", "test"]),
expand("trainingSets/{cl}.fvec", cl = ["hard", "linkedHard", "soft", "linkedSoft", "neut"]),
"trained_model.json",
"trained_model.weights.hdf5"
"trained_model.weights.h5"

rule clone_discoal:
output:
@@ -74,6 +74,7 @@ rule clone_discoal:
cd ext/
git clone https://github.com/kr-colab/discoal.git
cd discoal/
sed -i -e 's/SITES 220020/SITES 1100100/' discoal.h
make discoal
cd ..
"""
@@ -84,6 +85,7 @@ rule discoal_neutral_simulation:
discoal_exec
output:
"{tDir}/discoal.neut.{i}.out"
resources: time=6000, mem_mb=256000
shell:
neutString + " > {output}"

@@ -94,6 +96,7 @@ rule discoal_hard_simulation:
discoal_exec
output:
"{tDir}/discoal.hard.{x}.{i}.out"
resources: time=6000, mem_mb=256000
run:
cmd = selStr + " -x "+str(sweep_locs[int(wildcards.x)])+" > {output}"
shell(cmd)
@@ -104,6 +107,7 @@ rule discoal_soft_simulation:
discoal_exec
output:
"{tDir}/discoal.soft.{x}.{i}.out"
resources: time=6000, mem_mb=256000
run:
cmd = selStr + softStr + " -x "+str(sweep_locs[int(wildcards.x)])+" > {output}"
shell(cmd)
@@ -126,14 +130,17 @@ rule concat_fvecs:
run:
for t in ["train", "test"]:
cmd = "cat {t}/discoal.neut.0.0.out.fvec > {t}/neut_0.fvec"
shell(cmd)
for ii in range(1, totSimRuns):
cmd += f" && tail -n +2 {t}/discoal.neut.0.{ii}.out.fvec >> {t}/neut_0.fvec"
cmd = f"tail -n +2 {t}/discoal.neut.0.{ii}.out.fvec >> {t}/neut_0.fvec"
shell(cmd)
for model in ["hard", "soft"]:
for xx in range(11):
cmd += f" && cat {t}/discoal.{model}.{xx}.0.out.fvec > {t}/{model}_{xx}.fvec"
for i in range(1,totSimRuns):
cmd += f" && tail -n +2 {t}/discoal.{model}.{xx}.{ii}.out.fvec >> {t}/{model}_{xx}.fvec"
shell(cmd)
cmd = f"cat {t}/discoal.{model}.{xx}.0.out.fvec > {t}/{model}_{xx}.fvec"
shell(cmd)
for ii in range(1,totSimRuns):
cmd = f"tail -n +2 {t}/discoal.{model}.{xx}.{ii}.out.fvec >> {t}/{model}_{xx}.fvec"
shell(cmd)

rule make_training_sets:
input:
@@ -149,13 +156,13 @@ rule train_classifier:
rules.make_training_sets.output
output:
"trained_model.json",
"trained_model.weights.hdf5"
"trained_model.weights.h5"
run:
# cpu training below
cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && {diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=100"
shell(cmd)

rule clean:
shell:
"rm -rf test/ train/ trained_model* \
.snakemake slurm*"
#rule clean:
# shell:
# "rm -rf test/ train/ trained_model* \
# .snakemake slurm*"
12 changes: 6 additions & 6 deletions workflows/diploshic_params.yaml
@@ -2,16 +2,16 @@
trainSampleNumber: 20 #the number of simulation replicates we want to generate for each file in our training set
testSampleNumber: 20 #the number of simulations to create for each file in the test set
sampleSize: 20 #the number of individuals in our population sample
numSites: 110_000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
u: 1e-8 #per-site mutation rate (used to calculate mean theta)
numSites: 1_100_000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
u: 2e-8 #per-site mutation rate (used to calculate mean theta)
maxSoftSweepInitFreq: 0.1 #maximum initial selected frequency for soft sweeps
tauHigh: 0.02 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
tauHigh: 0.01 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
rhoOverTheta: 1.0 #crossover rate over mut rate (used to calculate mean rho)

ne0: 10_000
thetaMean: 4*N0*u*numSites
rhoMean: thetaMean * rhoOverTheta

seln_coeff_max: 0.05
seln_coeff_min: 0.001
totSimRuns: 20
seln_coeff_max: 0.10
seln_coeff_min: 0.01
totSimRuns: 100
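
As a quick sanity check (not part of the commit), the values above imply the following discoal means and S/HIC subwindow size:

# Back-of-the-envelope check of the updated YAML values above.
ne0 = 10_000          # ancestral population size (the YAML's ne0 / N0)
u = 2e-8              # per-site mutation rate
numSites = 1_100_000  # simulated window = 11 S/HIC subwindows
rhoOverTheta = 1.0

thetaMean = 4 * ne0 * u * numSites  # 880.0
rhoMean = thetaMean * rhoOverTheta  # 880.0
subwindow = numSites // 11          # 100_000 bp; consistent with the 1_100_000 winSize
                                    # passed to `diploSHIC fvecVcf` further down
print(thetaMean, rhoMean, subwindow)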
49 changes: 28 additions & 21 deletions workflows/sweep_simulate.snake
@@ -96,9 +96,12 @@ def write_rec_rates(fname, ratemap, windows):
last = 0
fh = open(fname, 'w')
for start, end in windows:
cum_cM = ratemap.get_cumulative_mass(end)
print(f'{start}\t{end}\t{cum_cM}\t{cum_cM-last}', file=fh)
last = cum_cM
wsize = end - start
start_center = start + (wsize*0.4)
end_center = start + (wsize*0.6)
cM_center = ratemap.get_cumulative_mass(end_center) - ratemap.get_cumulative_mass(start_center)
cM = ratemap.get_cumulative_mass(end) - ratemap.get_cumulative_mass(start)
print(f'{start}\t{end}\t{cM}\t{cM_center}', file=fh)
fh.close()

def write_annot_density(fname, chrom, annot, windows):
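
For context, a minimal sketch (assuming msprime's RateMap API; toy values, not pipeline code) of the per-window and window-center recombination mass computed above:

import msprime

# Toy rate map: 1 Mb split into two segments with different crossover rates.
ratemap = msprime.RateMap(position=[0, 500_000, 1_000_000], rate=[1e-8, 2e-8])

windows = [(0, 200_000), (200_000, 400_000)]
for start, end in windows:
    wsize = end - start
    # recombination mass over the whole window ("cM" following the snake file's naming) ...
    cM = ratemap.get_cumulative_mass(end) - ratemap.get_cumulative_mass(start)
    # ... and over the central 20% of the window (40%-60% of its span)
    cM_center = (ratemap.get_cumulative_mass(start + 0.6 * wsize)
                 - ratemap.get_cumulative_mass(start + 0.4 * wsize))
    print(start, end, cM, cM_center)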
@@ -410,7 +413,11 @@ def dump_results(input, output, params_dict, target_pops, num_subwins=1):
if len(del_intervals) > 0:
tss = tss.delete_intervals(del_intervals)
tss = tss.trim()
tss.write_vcf(fh_vcf, position_transform = lambda x: np.fmax(1, np.round(x)))
# because we are shifting from 0-based to 1-based, I need to remove any sites that may have happened at the last position
sites_at_last = np.where(np.round(tss.sites_position)==config["focal_size"])[0]
assert sites_at_last.shape[0] < 4 # realistically we shouldn't get more than two or three hits there
tss = tss.delete_sites(sites_at_last)
tss.write_vcf(fh_vcf, position_transform = lambda x: 1 + np.round(x))
fh_vcf.close()
# write seqlen of shortened ts
with open(output[2], 'w') as f:
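
A self-contained sketch (assuming msprime/tskit; hypothetical parameters, not the pipeline code) of why sites that round up to the sequence length are dropped before the 1-based VCF export:

import numpy as np
import msprime

L = 1_000  # stand-in for config["focal_size"]
ts = msprime.sim_ancestry(5, sequence_length=L, recombination_rate=1e-4, random_seed=7)
ts = msprime.sim_mutations(ts, rate=1e-3, random_seed=7, discrete_genome=False)

# With continuous coordinates a site can sit within 0.5 of the right edge; after the
# 0-based -> 1-based transform (1 + round(pos)) its POS would exceed the contig length,
# so such sites are removed before writing.
at_edge = np.where(np.round(ts.sites_position) == L)[0]
ts = ts.delete_sites(at_edge)

with open("sim.vcf", "w") as fh:
    ts.write_vcf(fh, position_transform=lambda x: 1 + np.round(x))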
Expand Down Expand Up @@ -535,7 +542,7 @@ shic_outs3 = [file_prefix+".stats.tsv.shic" for file_prefix in sw_outs_prefix_po

rule all:
input:
rules.diploshic_all.input,
rules.diploshic_all.input,
boundary_outs + trees_outs + stats_outs + vcf_outs + [output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv', output_dir+f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'] + annot_outs +anc_outs + fv_outs + pred_outs + shic_outs1 + shic_outs2 + shic_outs3
default_target: True

@@ -556,7 +563,7 @@ rule boundary_sims:
input:
output:
output_dir + "/simulated_data/sweeps/boundary_sims/sim_{seed}_{region_size}.trees"
resources: time=6000, mem_mb=6000
resources: time=60, mem_mb=6000
run:
model = species.get_demographic_model(demo_model["id"])
mut_rate = model.mutation_rate
@@ -585,7 +592,7 @@ rule neutral:
input:
output:
output_dir + f"/simulated_data/sweeps/neutral/{demo_model['id']}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees",
resources: time=3000, mem_mb=8000
resources: time=30, mem_mb=7000
run:
model = species.get_demographic_model(demo_model["id"])
mutation_rate = model.mutation_rate
@@ -617,7 +624,7 @@ rule bgs:
input:
output:
output_dir + f"/simulated_data/sweeps/bgs/{demo_model['id']}/{{annot}}/{{dfe}}/{{seed}}/sim_{chrom}_{{left}}_{{right}}.trees",
resources: time=3000, mem_mb=3000
resources: time=30, mem_mb=8000
run:
model = species.get_demographic_model(demo_model["id"])
mutation_rate = model.mutation_rate
@@ -656,7 +663,7 @@ rule sweep:
input:
output:
output_dir + f"/simulated_data/sweeps/sweep/{demo_model['id']}/{{popu}}/{{annot}}/{{dfe}}/{{coeff}}/{{tmult}}/{{seed}}/sim_{{chrom}}_{{left}}_{{right}}.trees",
resources: time=3000, mem_mb=16000
resources: time=30, mem_mb=12000
run:
model = species.get_demographic_model(demo_model["id"])
mutation_rate = model.mutation_rate
@@ -751,7 +758,7 @@ rule get_stats:
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.samples"

resources: time=3000, mem_mb=2000
resources: time=30, mem_mb=4000
run:
params_dict, target_pops = _get_params_dict_from_wildcards(wildcards)
dump_results(input, output, params_dict, target_pops, config["num_subwins"])
@@ -766,11 +773,11 @@ rule diploshic_fvs:

output:
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.fv'
resources: time=30, mem_mb=1200
resources: time=40, mem_mb=5000
run:
with open(input[0],'r') as f:
seq_len = f.read().strip()
cmd = f"diploSHIC fvecVcf haploid {input[1]} 1 {int(seq_len)} {output[0]} --targetPop {wildcards.popu} --sampleToPopFileName {input[3]} --winSize 2200000 --ancFileName {input[2]}"
cmd = f"diploSHIC fvecVcf haploid {input[1]} 1 {int(seq_len)} {output[0]} --targetPop {wildcards.popu} --sampleToPopFileName {input[3]} --winSize 1100000 --ancFileName {input[2]}"
shell(cmd)


@@ -780,17 +787,17 @@ rule diploshic_pred:
rules.diploshic_train_classifier.output
output:
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.preds'
resources: time=30, mem_mb=1200
resources: time=30, mem_mb=3000
run:
cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && diploSHIC predict trained_model.json trained_model.weights.hdf5 {input[0]} {output[0]}"
cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && diploSHIC predict trained_model.json trained_model.weights.h5 {input[0]} {output[0]}"
shell(cmd)


rule add_diploshic_preds_to_stats:
input: output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.trees",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.diploshic.preds"
output: output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}_{popu}.stats.tsv.shic"
resources: time=30, mem_mb=600
resources: time=30, mem_mb=6000
run:
params_dict, target_pops = _get_params_dict_from_wildcards(wildcards)
target_pops = [wildcards.popu]
@@ -806,7 +813,7 @@ rule add_diploshic_preds_to_stats:
assert np.all(df.classifiedWinEnd <= ts.metadata['windowing']['window_right'])
assert np.all(df.classifiedWinStart > ts.metadata['windowing']['window_left']+1)
windows = df.iloc[:,[1,2]].to_numpy().astype(np.int32)
stats = 1 - df.iloc[:,5].to_numpy().astype(np.float32) # prob non neutral
stats = (df.iloc[:,8].to_numpy() + df.iloc[:,9].to_numpy()).astype(np.float32) # prob soft+hard
stats = np.expand_dims(stats, axis = 1) # adding another dim going from (n_wins,) to (n_wins,1)
fh = open(output[0], 'w')
_print_stat_line("diploshic", stats, windows, target_pops, meta_str, fh)
@@ -817,7 +824,7 @@ rule merge_stats:
input: stats_outs
output:
output_dir + f'/simulated_data/sweeps/all_sims.tmp.stats.tsv'
resources: time=3000, mem_mb=350000, disk_mb=350000
resources: time=1500, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
@@ -827,7 +834,7 @@ rule merge_stats_shic1:
input: shic_outs1
output:
output_dir + f'/simulated_data/sweeps/all_sims1.shic.stats.tsv'
resources: time=3000, mem_mb=150000, disk_mb=150000
resources: time=3000, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
@@ -837,7 +844,7 @@ rule merge_stats_shic2:
input: shic_outs2
output:
output_dir + f'/simulated_data/sweeps/all_sims2.shic.stats.tsv'
resources: time=3000, mem_mb=150000, disk_mb=150000
resources: time=3000, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
@@ -847,7 +854,7 @@ rule merge_stats_shic3:
input: shic_outs3
output:
output_dir + f'/simulated_data/sweeps/all_sims3.shic.stats.tsv'
resources: time=3000, mem_mb=150000, disk_mb=150000
resources: time=3000, mem_mb=350000, disk_mb=350000
run:
#print(input, flush=True)
#import pdb; pdb.set_trace()
@@ -857,7 +864,7 @@ rule merge_all_stats:
rule merge_all_stats:
input: [output_dir + f'/simulated_data/sweeps/all_sims.tmp.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims3.shic.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims2.shic.stats.tsv', output_dir + f'/simulated_data/sweeps/all_sims1.shic.stats.tsv']
output: output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv'
resources: time=3000, mem_mb=150000, disk_mb=150000
resources: time=3000, mem_mb=350000, disk_mb=350000
shell:
"cat {input} > {output}"

