Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Summary stat fix #120

Merged
merged 9 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,122 changes: 1,122 additions & 0 deletions notebooks/DFE_figures.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion workflows/config/snakemake/production_config_PhoSin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"seed": 12345
"replicates": 3
"output_dir": "PhoSin_results"
"workflow_path": "workflows/"
"workflow_path": "./workflows/"

# species-specific settings
"species": "PhoSin"
Expand Down
30 changes: 16 additions & 14 deletions workflows/dfe.snake
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ np.random.seed(config["seed"])
replicates = config["replicates"]

#workflow path in case it's not the current directory
workflow_path = config.get("workflow_path", "./")
workflow_path = config.get("workflow_path", "./workflows")

# Where you would like all output files from analysis to live
output_dir = os.path.abspath(config["output_dir"])
Expand Down Expand Up @@ -85,7 +85,7 @@ rule all:
expand(output_dir + "/plots/{demog}/{dfes}/{annots}/dfe.inference.benchmark.pdf",
demog=demo_model_id_list,
dfes=dfe_list,
annots=annotation_list )
annots=annotation_list)


# ###############################################################################
Expand Down Expand Up @@ -165,7 +165,7 @@ rule dadi_infer_dfe:
opts = 100,
prefix = output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.two_epoch.gamma"
resources:
time=4000
time=10080
threads: 8
shell:
"""
Expand Down Expand Up @@ -290,7 +290,7 @@ rule generate_dfe_alpha_fs:
ts_dict[chrm] = fp
index = int(wildcards.ids)
ts2fs.generate_fs(ts_dict, index, mask_file, wildcards.annots,
species, output, format='DFE-alpha', is_folded=True,
species, output, format='DFE-alpha', is_folded=False,
data_path_1=dfe_alpha_data_path_1, data_path_2=dfe_alpha_data_path_2,
sfs_input_file=output[2], est_dfe_results_dir=params.output,
est_dfe_demography_results_file=params.output+"/neu/est_dfe.out")
Expand Down Expand Up @@ -404,13 +404,13 @@ rule plot_results:
annots=annotation_list,
ids=pids,
)),
dfe_alpha_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit",
demog=demo_model_id_list,
seeds=seed_list,
dfes=dfe_list,
annots=annotation_list,
ids=pids,
)),
#dfe_alpha_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit",
# demog=demo_model_id_list,
# seeds=seed_list,
# dfes=dfe_list,
# annots=annotation_list,
# ids=pids,
#)),
grapes_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/grapes/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.grapes.bestfit",
demog=demo_model_id_list,
seeds=seed_list,
Expand Down Expand Up @@ -438,7 +438,7 @@ rule plot_results:

dadi_bestfits = [ b for b in input.dadi_res if wildcards.demog in b]
polydfe_bestfits = [ b for b in input.polydfe_res if wildcards.demog in b]
dfe_alpha_bestfits = [ b for b in input.dfe_alpha_res if wildcards.demog in b]
#dfe_alpha_bestfits = [ b for b in input.dfe_alpha_res if wildcards.demog in b]
grapes_bestfits = [ b for b in input.grapes_res if wildcards.demog in b ]

if wildcards.demog == 'Constant':
Expand All @@ -447,7 +447,7 @@ rule plot_results:

dadi_bestfits = [ b for b in dadi_bestfits if 'pop0' in b ]
polydfe_bestfits = [ b for b in polydfe_bestfits if 'pop0' in b ]
dfe_alpha_bestfits = [ b for b in dfe_alpha_bestfits if 'pop0' in b ]
#dfe_alpha_bestfits = [ b for b in dfe_alpha_bestfits if 'pop0' in b ]
grapes_bestfits = [ b for b in grapes_bestfits if 'pop0' in b ]
else:
model = species.get_demographic_model(wildcards.demog)
Expand All @@ -456,4 +456,6 @@ rule plot_results:
mutation_rate = species.genome.mean_mutation_rate

pop_names = [ model.populations[i].name for i in range(len(model.populations)) ]
plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, dfe_alpha_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names)
#plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, dfe_alpha_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names)
plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names)

24 changes: 12 additions & 12 deletions workflows/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,8 @@ def plot_all_dfe_results(input, output, mu, seq_len, nonneu_prop, pop_names=None
"""
dadi_bestfits = _read_bestfits(input[0], len(pop_names))
polydfe_bestfits = _read_bestfits(input[1], len(pop_names))
dfe_alpha_bestfits = _read_bestfits(input[2], len(pop_names))
grapes_bestfits = _read_bestfits(input[3], len(pop_names))
#dfe_alpha_bestfits = _read_bestfits(input[2], len(pop_names))
grapes_bestfits = _read_bestfits(input[2], len(pop_names))

# SLiM assumes genotype fitnesses: 1, 1+sh, 1+s
# dadi and polyDFE assumes genotype fitnesses: 1, 1+2sh, 1+2s
Expand All @@ -273,10 +273,10 @@ def plot_all_dfe_results(input, output, mu, seq_len, nonneu_prop, pop_names=None

# Per https://doi.org/10.1534/genetics.107.080663 under Model
# DFE alpha assumes genotype fitnesses are 1, 1-s/2, 1-s, so like SLiM
dfe_alpha_shapes = [dfe_alpha_bestfits[i]['b']
for i in range(len(dfe_alpha_bestfits))]
dfe_alpha_Es = [abs(dfe_alpha_bestfits[i]['Es'])
for i in range(len(dfe_alpha_bestfits))]
#dfe_alpha_shapes = [dfe_alpha_bestfits[i]['b']
# for i in range(len(dfe_alpha_bestfits))]
#dfe_alpha_Es = [abs(dfe_alpha_bestfits[i]['Es'])
# for i in range(len(dfe_alpha_bestfits))]

# Per https://github.com/BioPP/grapes/blob/master/README.md,
# GRAPES implements the model of DFE alpha, so we expect genotype fitnesses 1, 1-s/2, and 1-s.
Expand Down Expand Up @@ -311,12 +311,12 @@ def plot_all_dfe_results(input, output, mu, seq_len, nonneu_prop, pop_names=None
plt.subplot(4, 2, 4)
_plot_boxplots(plt, 'polyDFE', polydfe_Es, 0.014, xtick_label, '|E(s)|')

plt.subplot(4, 2, 5)
_plot_boxplots(plt, 'DFE-alpha', dfe_alpha_shapes,
0.19, xtick_label, 'Shape')
plt.subplot(4, 2, 6)
_plot_boxplots(plt, 'DFE-alpha', dfe_alpha_Es,
0.014, xtick_label, '|E(s)|')
#plt.subplot(4, 2, 5)
#_plot_boxplots(plt, 'DFE-alpha', dfe_alpha_shapes,
# 0.19, xtick_label, 'Shape')
#plt.subplot(4, 2, 6)
#_plot_boxplots(plt, 'DFE-alpha', dfe_alpha_Es,
# 0.014, xtick_label, '|E(s)|')

plt.subplot(4, 2, 7)
_plot_boxplots(plt, 'grapes', grapes_shapes, 0.19,
Expand Down
12 changes: 12 additions & 0 deletions workflows/summary_stats.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#config=workflows/config/snakemake/production_config_HomSap.yml
config=workflows/config/snakemake/production_config_PhoSin.yml

for i in {1..20};
do
snakemake \
--snakefile workflows/summary_stats.snake \
--profile workflows/config/snakemake/oregon_profile/ \
--configfile $config \
--batch all=$i/20 \
--groups run_diploshic_fvs=group0
done
28 changes: 6 additions & 22 deletions workflows/summary_stats.snake
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ output_dir = os.path.abspath(config["output_dir"])
# The analysis species
species = stdpopsim.get_species(config["species"])


# The names of all chromosomes to simulate, separated by commas
# Use "all" to simulate all chromsomes for the genome
chrm_list = [chrom.id for chrom in species.genome.chromosomes]
Expand All @@ -47,22 +46,19 @@ for x in demo_model_array:
else:
model = species.get_demographic_model(x["id"])
demo_sample_size_dict[x["id"]] = {f"{model.populations[i].name}": m for i, m in enumerate(x["num_samples_per_population"])}
demo_pop_ids[x["id"]] = [x.name for x in model.populations]
demo_pop_ids[x["id"]] = [x.name for x in model.populations[:len(x["num_samples_per_population"])]]

# Select DFE model from catalog
dfe_list = config["dfe_list"]
annotation_list = config["annotation_list"]
mask_file = config["mask_file"]



nchunks=100
chunks = np.arange(nchunks)
# ###############################################################################
# GENERAL RULES & GLOBALS
# ###############################################################################


rule all:
input:
expand(
Expand Down Expand Up @@ -92,14 +88,9 @@ rule all:
chrms=chrm_list,
),





rule make_vcf:
input:
output_dir + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees"

output:
output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.vcf"
run:
Expand Down Expand Up @@ -128,10 +119,9 @@ def write_diploshic_ancestralAllelesFile(ts, filename):
rule make_diploshic_inputs:
input:
output_dir + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees"

output:
temp(output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.sampleToPopFile"),
temp(output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.ancestralAllelesFile")
output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.sampleToPopFile",
output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.ancestralAllelesFile"
run:
ts = tskit.load(input[0])
write_diploshic_sampleToPopFile(ts, output[0], wildcards.demog)
Expand All @@ -156,22 +146,17 @@ rule run_diploshic_fvs:
input:
rules.make_vcf.output,
rules.make_diploshic_inputs.output

output:
temp(
expand(output_dir + "/summaries/{{demog}}/{{dfes}}/{{annots}}/{{seeds}}/sim_{{chrms}}.{{popid}}.{{chunk}}.diploshic.{ext}",
chunk=chunks,
ext=["fvec", "stats"],
)
)

params:
seq_len = lambda wildcards, input: int(species.get_contig(wildcards.chrms).length),
start = lambda wildcards, input: chunk_coords(wildcards.chrms, int(wildcards.chunk), nchunks)[0],
end = lambda wildcards, input: chunk_coords(wildcards.chrms, int(wildcards.chunk), nchunks)[1],
test = lambda wildcards, input: chunk_coords(wildcards.chrms, int(wildcards.chunk), nchunks)[2],
popid = lambda wildcards, input: wildcards.popid,

shell:
"""
if [[ {params.test} -eq 1 && `diploSHIC fvecVcf haploid {input[0]} 1 {params.seq_len} {output[0]} --sampleToPopFileName {input[1]} --ancFileName {input[2]} --targetPop {params.popid} --statFileName {output[1]} --segmentStart {params.start} --segmentEnd {params.end}` == 0 ]];
Expand All @@ -181,8 +166,7 @@ rule run_diploshic_fvs:
touch {output[0]} && touch {output[1]}
fi
"""



rule gather_diploshic_fvs:
input:
expand(output_dir + "/summaries/{{demog}}/{{dfes}}/{{annots}}/{{seeds}}/sim_{{chrms}}.{{popid}}.{chunk}.diploshic.{ext}",
Expand Down Expand Up @@ -222,7 +206,7 @@ rule plot_pi_individual:
sns.lineplot(x="mid", y="pi", data=df, ax=ax, linewidth=2.5, palette="tab10")

# plot annotations as rugplot
if wildcards.annots != "none":
if wildcards.annots not in ["none", "all_sites"]:
exons = species.get_annotations(wildcards.annots)
exon_intervals = exons.get_chromosome_annotations(wildcards.chrms)
sns.rugplot(pd.DataFrame(data={'exons':exon_intervals[:,0]}), ax=ax, color="g", lw=1, alpha=0.05)
Expand Down Expand Up @@ -266,7 +250,7 @@ rule plot_pi_allseeds:
for i, stat in enumerate(stat_names):
sns.lineplot(data=stacked, x="mid", y=stat, hue="seed", alpha=0.5, ax=axs[i])
# plot annotations as rugplot
if wildcards.annots != "none":
if wildcards.annots not in ["none", "all_sites"]:
exons = species.get_annotations(wildcards.annots)
exon_intervals = exons.get_chromosome_annotations(wildcards.chrms)
sns.rugplot(pd.DataFrame(data={'exons':exon_intervals[:,0]}),
Expand Down
Loading
Loading