Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

edits related to production pipeline #114

Merged
merged 8 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion notebooks/manuscript_figures.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
"source": [
"### Auxiliary code to generate figures for manuscript. \n",
"\n",
"A lot of the inputs are generated from `workflows/summary_stats.snake`. Some of the code below is still hardcoded instead of drawing from the output of that pipeline directly. *Updating this is still in progress*"
"A lot of the inputs are generated from `workflows/summary_stats.snake`. Some of the code below is still hardcoded instead of drawing from the output of that pipeline directly. *Updating this is still in progress*\n",
"\n",
"\n",
"Full pipeline command \n",
"`snakemake --snakefile ../Snakefile --profile ../workflows/config/snakemake/oregon_profile/ --configfile ../workflows/config/snakemake/production_config_PhoSin.yml -np` "
]
},
{
Expand Down
34 changes: 17 additions & 17 deletions workflows/config/snakemake/production_config_HomSap.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# snakemake settings
"seed": 12345
"replicates": 3
"output_dir": "results"
"output_dir": "HomSap_results"

# species-specific settings
"species": "HomSap"
Expand All @@ -10,32 +10,32 @@
"num_samples_per_population": [100],
},
{"id":"OutOfAfricaArchaicAdmixture_5R19",
"num_samples_per_population": 3*[100],
"num_samples_per_population": [100, 100, 100],
}
]
"genetic_map": "HapMapII_GRCh38"
"chrm_list": "chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22"
"chrm_list": "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22"
"dfe_list": ["none", "Gamma_K17"]
"annotation_list": ["all_sites", "ensembl_havana_104_exons", "ensembl_havana_104_exons"]
"mask_file": "workflows/masks/HapmapII_GRCh38.mask.bed"
"stairway_annot_mask" : "" # set this or any of the below to 'none' to skip annot masking
"msmc_annot_mask" : ""
"gone_annot_mask" : ""
"smcpp_annot_mask" : ""
"msmc_annot_mask": ""
"gone_annot_mask": ""
"smcpp_annot_mask": ""

# slim settings
"slim_scaling_factor": 1
"slim_burn_in": 10

# n(t) specific configs
"methods" : ["stairwayplot", "gone", "smcpp", "msmc"]
"num_sampled_genomes_msmc" : [6]
"num_msmc_iterations" : 20
"gone_phase" : 1 # 0 for pseudohaploid, 1 for phased, 2 for unknown phase
"gone_max_snps" : 50000 # default=50000
"gone_threads" : 8
"gone_num_gens" : 2000 # default=2000
"gone_num_bins" : 400 # default=400
"methods": ["stairwayplot", "gone", "smcpp", "msmc"]
"num_sampled_genomes_msmc": [6]
"num_msmc_iterations": 20
"gone_phase": 1 # 0 for pseudohaploid, 1 for phased, 2 for unknown phase
"gone_max_snps": 50000 # default=50000
"gone_threads": 8
"gone_num_gens": 2000 # default=2000
"gone_num_bins": 400 # default=400


# exe paths
Expand All @@ -44,6 +44,6 @@
"dfe_alpha_data_path_1": "ext/dfe-alpha-release-2.16/data"
"dfe_alpha_data_path_2": "three-epoch"
"grapes_exec": "ext/grapes/multi_grapes"
"msmc_exec" : "ext/msmc2/build/release/msmc2"
"stairwayplot_code" : "ext/stairwayplot/swarmops.jar"
"gone_code" : "ext/GONE/Linux"
"msmc_exec": "ext/msmc2/build/release/msmc2"
"stairwayplot_code": "ext/stairwayplot/swarmops.jar"
"gone_code": "ext/GONE/Linux"
44 changes: 21 additions & 23 deletions workflows/config/snakemake/production_config_PhoSin.yml
Original file line number Diff line number Diff line change
@@ -1,41 +1,38 @@
# snakemake settings
"seed": 12345
"replicates": 3
"output_dir": "results"
"output_dir": "PhoSin_results"
"workflow_path": "workflows/"

# species-specific settings
"species": "PhoSin"
"demo_models": [
{"id":"Constant",
"num_samples_per_population": [100],
},
{"id":"Vaquita2Epoch_1R22",
"num_samples_per_population": 3*[100],
}
{"id": "Constant", "num_samples_per_population": [100]},
{"id": "Vaquita2Epoch_1R22", "num_samples_per_population": [100]}
]
"genetic_map": null
"chrm_list": "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21"
"chrm_list": "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21"
"dfe_list": ["none", "Gamma_R22"]
"annotation_list": ["all_sites", "Phocoena_sinus.mPhoSin1.pri.110_exons"]
"mask_file": "workflows/masks/PhoSin_fake.mask.bed"
"stairway_annot_mask" : "" # set this or any of the below to 'none' to skip annot masking
"msmc_annot_mask" : ""
"gone_annot_mask" : ""
"smcpp_annot_mask" : ""
"stairway_annot_mask": "" # set this or any of the below to 'none' to skip annot masking
"msmc_annot_mask": ""
"gone_annot_mask": ""
"smcpp_annot_mask": ""

# slim settings
"slim_scaling_factor": 1
"slim_burn_in": 10

# n(t) specific configs
"methods" : ["stairwayplot", "gone", "smcpp", "msmc"]
"num_sampled_genomes_msmc" : [6]
"num_msmc_iterations" : 20
"gone_phase" : 1 # 0 for pseudohaploid, 1 for phased, 2 for unknown phase
"gone_max_snps" : 50000 # default=50000
"gone_threads" : 8
"gone_num_gens" : 2000 # default=2000
"gone_num_bins" : 400 # default=400
"methods": ["stairwayplot", "gone", "smcpp", "msmc"]
"num_sampled_genomes_msmc": [6]
"num_msmc_iterations": 20
"gone_phase": 1 # 0 for pseudohaploid, 1 for phased, 2 for unknown phase
"gone_max_snps": 50000 # default=50000
"gone_threads": 8
"gone_num_gens": 2000 # default=2000
"gone_num_bins": 400 # default=400


# exe paths
Expand All @@ -44,6 +41,7 @@
"dfe_alpha_data_path_1": "ext/dfe-alpha-release-2.16/data"
"dfe_alpha_data_path_2": "three-epoch"
"grapes_exec": "ext/grapes/multi_grapes"
"msmc_exec" : "ext/msmc2/build/release/msmc2"
"stairwayplot_code" : "ext/stairwayplot/swarmops.jar"
"gone_code" : "ext/GONE/Linux"
"msmc_exec": "ext/msmc2/build/release/msmc2"
"stairwayplot_code": "ext/stairwayplot/swarmops.jar"
"gone_code": "ext/GONE/Linux"

38 changes: 24 additions & 14 deletions workflows/dfe.snake
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@ import ts2fs
import plots
import masks

configfile: "workflows/config/snakemake/tiny_config.yaml"
#quick fix for excluding "Constant" with "pop1" and "pop2" from the analysis
def drop_constant_populations(paths):
return [path for path in paths if not ("Constant" in path and ("pop1" in path or "pop2" in path))]


configfile: "workflows/config/snakemake/tiny_config.yaml"

np.random.seed(config["seed"])

Expand All @@ -22,6 +26,9 @@ np.random.seed(config["seed"])
# The number of replicates of each analysis you would like to run
replicates = config["replicates"]

#workflow path in case it's not the current directory
workflow_path = config.get("workflow_path", "./")

# Where you would like all output files from analysis to live
output_dir = os.path.abspath(config["output_dir"])

Expand Down Expand Up @@ -73,7 +80,6 @@ if "none" in annotation_dict:
del annotation_dict["none"]
annotation_list = list(annotation_dict)


rule all:
input:
expand(output_dir + "/plots/{demog}/{dfes}/{annots}/dfe.inference.benchmark.pdf",
Expand Down Expand Up @@ -158,11 +164,12 @@ rule dadi_infer_dfe:
ratio = 2.31,
opts = 100,
prefix = output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.two_epoch.gamma"
resources:
time=4000
threads: 8
shell:
"""
dadi-cli InferDFE --fs {input[0]} --cache1d {input[1]} --demo-popt {input[2]} --output-prefix {params.prefix} --pdf1d {params.dfe} --p0 {params.dfe_p0} --ubounds {params.dfe_ubounds} --lbounds {params.dfe_lbounds} --ratio {params.ratio} --optimizations {params.opts} --cpus {threads} --nomisid --force-convergence 1

"""

rule get_dadi_dfe_bestfits:
Expand Down Expand Up @@ -220,10 +227,11 @@ rule run_polydfe:
output:
output_dir + "/inference/{demog}/polyDFE/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.polyDFE.out"
params:
exec=poly_dfe_exec
exec=poly_dfe_exec,
workflow_path = workflow_path
shell:
"""
'{params.exec}' -d {input} -m C -i workflows/config/polyDFE/polyDFE_init_models.txt 1 -e > {output[0]}
'{params.exec}' -d {input} -m C -i {params.workflow_path}/config/polyDFE/polyDFE_init_models.txt 1 -e > {output[0]}
"""

rule get_polydfe_bestfit:
Expand Down Expand Up @@ -382,34 +390,34 @@ rule get_grapes_bestfits:

rule plot_results:
input:
dadi_res = expand(output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.dadi.bestfit",
dadi_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.dadi.bestfit",
demog=demo_model_id_list,
seeds=seed_list,
dfes=dfe_list,
annots=annotation_list,
ids=pids,
),
polydfe_res = expand(output_dir + "/inference/{demog}/polyDFE/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.polyDFE.bestfit",
)),
polydfe_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/polyDFE/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.polyDFE.bestfit",
demog=demo_model_id_list,
seeds=seed_list,
dfes=dfe_list,
annots=annotation_list,
ids=pids,
),
dfe_alpha_res = expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit",
)),
dfe_alpha_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit",
demog=demo_model_id_list,
seeds=seed_list,
dfes=dfe_list,
annots=annotation_list,
ids=pids,
),
grapes_res = expand(output_dir + "/inference/{demog}/grapes/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.grapes.bestfit",
)),
grapes_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/grapes/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.grapes.bestfit",
demog=demo_model_id_list,
seeds=seed_list,
dfes=dfe_list,
annots=annotation_list,
ids=pids,
)
))
output:
output_dir + "/plots/{demog}/{dfes}/{annots}/dfe.inference.benchmark.pdf"
run:
Expand All @@ -435,7 +443,7 @@ rule plot_results:

if wildcards.demog == 'Constant':
model = stdpopsim.PiecewiseConstantSize(species.population_size)
mutation_rate = 1.29e-08
mutation_rate = species.genome.mean_mutation_rate

dadi_bestfits = [ b for b in dadi_bestfits if 'pop0' in b ]
polydfe_bestfits = [ b for b in polydfe_bestfits if 'pop0' in b ]
Expand All @@ -444,6 +452,8 @@ rule plot_results:
else:
model = species.get_demographic_model(wildcards.demog)
mutation_rate = model.mutation_rate
if mutation_rate is None:
mutation_rate = species.genome.mean_mutation_rate

pop_names = [ model.populations[i].name for i in range(len(model.populations)) ]
plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, dfe_alpha_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names)
10 changes: 7 additions & 3 deletions workflows/masks.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,12 @@ def get_mask_from_chrom_annotation(speciesID, chrom_annotation, chromID):
:chromID: chromosome ID (e.g., "chr1")
"""
species = stdpopsim.get_species(speciesID)
a = species.get_annotations(chrom_annotation)
mask = [a.get_chromosome_annotations(chrom) for chrom in chromID]
if chrom_annotation in [s.id for s in species.annotations]:
a = species.get_annotations(chrom_annotation)
mask = [a.get_chromosome_annotations(chrom) for chrom in chromID]
else:
#if annotation is anything but a stdpopsim annotation, don't mask anything
mask = np.array([], dtype=int).reshape(0, 2)
return mask


Expand Down Expand Up @@ -107,7 +111,7 @@ def get_combined_masks(species, mask_file, chromID, chrom_annotation=None):
else:
mask = np.array([], dtype=int).reshape(0, 2)
# get annotations for chrom
if chrom_annotation is not None:
if chrom_annotation in [s.id for s in stdpopsim.get_species(species).annotations]:
an_mask = get_mask_from_chrom_annotation(species, chrom_annotation, chromID)
mask = [np.concatenate((m, am)) for m, am in zip(mask, an_mask)]
# merge overlapping and adjacent intervals
Expand Down
56 changes: 28 additions & 28 deletions workflows/masks/HapmapII_GRCh38.mask.bed
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
chr10 38861078 41705569
chr1 121569984 143276353
chr11 50795892 54555633
chr12 34700914 37463389
chr13 1 18447064
chr14 1 19198131
chr15 1 19809716
chr16 36034945 46428208
chr17 22738887 26950488
chr18 15409995 20934306
chr19 24373307 27244449
chr20 26328611 30187700
chr21 10646501 12968298
chr21 1 10326653
chr22 1 15287921
chr2 90236658 91433610
chr2 92117097 94663732
chr3 90427071 93808853
chr4 49627542 51818712
chr5 46363876 50291681
chr6 58452161 60342543
chr7 57870656 61072218
chr8 43936537 46006821
chr9 38743712 40906821
chr9 41438036 62926704
chr9 66183582 68223187
chrX 1 2785350
chrX 58456814 62611346
chr1 121569983 143276353
chr2 90236657 91433610
chr2 92117096 94663732
chr3 90427070 93808853
chr4 49627541 51818712
chr5 46363875 50291681
chr6 58452160 60342543
chr7 57870655 61072218
chr8 43936536 46006821
chr9 38743711 40906821
chr9 41438035 62926704
chr9 66183581 68223187
chr10 38861077 41705569
chr11 50795891 54555633
chr12 34700913 37463389
chr13 0 18447064
chr14 0 19198131
chr15 0 19809716
chr16 36034944 46428208
chr17 22738886 26950488
chr18 15409994 20934306
chr19 24373306 27244449
chr20 26328610 30187700
chr21 0 10326653
chr21 10646500 12968298
chr22 0 15287921
chrX 0 2785350
chrX 58456813 62611346
9 changes: 7 additions & 2 deletions workflows/msmc.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,19 @@ def run_msmc_estimate(input_files, output_file, num_genomes, msmc_exec_loc, tota
The final estimates will be written to
input_file.final.txt
"""

#check that there are snps in the input or msmc2 will crash with an unhelpful error
input_list = input_files.split(" ")
non_emtpy_input_files = " ".join([file for file in input_list if os.path.getsize(file) > 0])
if non_emtpy_input_files == "":
raise ValueError("All input files are empty")
num_genomes = int(num_genomes)
assert num_genomes < total_samples * 2
subset = np.random.choice(range(total_samples*2), num_genomes, replace=False)
haplotypes = ",".join(map(str, sorted(subset)))
cmd = (f"{msmc_exec_loc} -r 0.25 -I {haplotypes} -i {iterations} -o {output_file}{num_genomes}.trees.multihep.txt -t {ncores} {input_files}")
cmd = (f"{msmc_exec_loc} -r 0.25 -I {haplotypes} -i {iterations} -o {output_file}{num_genomes}.trees.multihep.txt -t {ncores} {non_emtpy_input_files}")
subprocess.run(cmd, shell=True, check=True)


def convert_msmc_output(results_file, outfile, mutation_rate, generation_time):
"""
This function converts the output from msmc into a csv the will be read in
Expand Down
Loading
Loading