diff --git a/notebooks/manuscript_figures.ipynb b/notebooks/manuscript_figures.ipynb index 9d2064d..b3f852b 100644 --- a/notebooks/manuscript_figures.ipynb +++ b/notebooks/manuscript_figures.ipynb @@ -6,7 +6,11 @@ "source": [ "### Auxiliary code to generate figures for manuscript. \n", "\n", - "A lot of the inputs are generated from `workflows/summary_stats.snake`. Some of the code below is still hardcoded instead of drawing from the output of that pipeline directly. *Updating this is still in progress*" + "A lot of the inputs are generated from `workflows/summary_stats.snake`. Some of the code below is still hardcoded instead of drawing from the output of that pipeline directly. *Updating this is still in progress*\n", + "\n", + "\n", + "Full pipeline command \n", + "`snakemake --snakefile ../Snakefile --profile ../workflows/config/snakemake/oregon_profile/ --configfile ../workflows/config/snakemake/production_config_PhoSin.yml -np` " ] }, { diff --git a/workflows/config/snakemake/production_config_HomSap.yml b/workflows/config/snakemake/production_config_HomSap.yml index 21b0d09..0784f94 100644 --- a/workflows/config/snakemake/production_config_HomSap.yml +++ b/workflows/config/snakemake/production_config_HomSap.yml @@ -1,7 +1,7 @@ # snakemake settings "seed": 12345 "replicates": 3 -"output_dir": "results" +"output_dir": "HomSap_results" # species-specific settings "species": "HomSap" @@ -10,32 +10,32 @@ "num_samples_per_population": [100], }, {"id":"OutOfAfricaArchaicAdmixture_5R19", - "num_samples_per_population": 3*[100], + "num_samples_per_population": [100, 100, 100], } ] "genetic_map": "HapMapII_GRCh38" -"chrm_list": "chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22" +"chrm_list": "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22" "dfe_list": ["none", "Gamma_K17"] "annotation_list": ["all_sites", "ensembl_havana_104_exons", "ensembl_havana_104_exons"] "mask_file": "workflows/masks/HapmapII_GRCh38.mask.bed" "stairway_annot_mask" : "" # set this or any of the below to 'none' to skip annot masking -"msmc_annot_mask" : "" -"gone_annot_mask" : "" -"smcpp_annot_mask" : "" +"msmc_annot_mask": "" +"gone_annot_mask": "" +"smcpp_annot_mask": "" # slim settings "slim_scaling_factor": 1 "slim_burn_in": 10 # n(t) specific configs -"methods" : ["stairwayplot", "gone", "smcpp", "msmc"] -"num_sampled_genomes_msmc" : [6] -"num_msmc_iterations" : 20 -"gone_phase" : 1 # 0 for pseudohaploid, 1 for phased, 2 for unknown phase -"gone_max_snps" : 50000 # default=50000 -"gone_threads" : 8 -"gone_num_gens" : 2000 # default=2000 -"gone_num_bins" : 400 # default=400 +"methods": ["stairwayplot", "gone", "smcpp", "msmc"] +"num_sampled_genomes_msmc": [6] +"num_msmc_iterations": 20 +"gone_phase": 1 # 0 for pseudohaploid, 1 for phased, 2 for unknown phase +"gone_max_snps": 50000 # default=50000 +"gone_threads": 8 +"gone_num_gens": 2000 # default=2000 +"gone_num_bins": 400 # default=400 # exe paths @@ -44,6 +44,6 @@ "dfe_alpha_data_path_1": "ext/dfe-alpha-release-2.16/data" "dfe_alpha_data_path_2": "three-epoch" "grapes_exec": "ext/grapes/multi_grapes" -"msmc_exec" : "ext/msmc2/build/release/msmc2" -"stairwayplot_code" : "ext/stairwayplot/swarmops.jar" -"gone_code" : "ext/GONE/Linux" +"msmc_exec": "ext/msmc2/build/release/msmc2" +"stairwayplot_code": "ext/stairwayplot/swarmops.jar" +"gone_code": "ext/GONE/Linux" diff --git a/workflows/config/snakemake/production_config_PhoSin.yml b/workflows/config/snakemake/production_config_PhoSin.yml index c029e44..bae4dd1 100644 --- a/workflows/config/snakemake/production_config_PhoSin.yml +++ b/workflows/config/snakemake/production_config_PhoSin.yml @@ -1,41 +1,38 @@ # snakemake settings "seed": 12345 "replicates": 3 -"output_dir": "results" +"output_dir": "PhoSin_results" +"workflow_path": "workflows/" # species-specific settings "species": "PhoSin" "demo_models": [ - {"id":"Constant", - "num_samples_per_population": [100], - }, - {"id":"Vaquita2Epoch_1R22", - "num_samples_per_population": 3*[100], - } + {"id": "Constant", "num_samples_per_population": [100]}, + {"id": "Vaquita2Epoch_1R22", "num_samples_per_population": [100]} ] "genetic_map": null -"chrm_list": "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21" +"chrm_list": "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21" "dfe_list": ["none", "Gamma_R22"] "annotation_list": ["all_sites", "Phocoena_sinus.mPhoSin1.pri.110_exons"] "mask_file": "workflows/masks/PhoSin_fake.mask.bed" -"stairway_annot_mask" : "" # set this or any of the below to 'none' to skip annot masking -"msmc_annot_mask" : "" -"gone_annot_mask" : "" -"smcpp_annot_mask" : "" +"stairway_annot_mask": "" # set this or any of the below to 'none' to skip annot masking +"msmc_annot_mask": "" +"gone_annot_mask": "" +"smcpp_annot_mask": "" # slim settings "slim_scaling_factor": 1 "slim_burn_in": 10 # n(t) specific configs -"methods" : ["stairwayplot", "gone", "smcpp", "msmc"] -"num_sampled_genomes_msmc" : [6] -"num_msmc_iterations" : 20 -"gone_phase" : 1 # 0 for pseudohaploid, 1 for phased, 2 for unknown phase -"gone_max_snps" : 50000 # default=50000 -"gone_threads" : 8 -"gone_num_gens" : 2000 # default=2000 -"gone_num_bins" : 400 # default=400 +"methods": ["stairwayplot", "gone", "smcpp", "msmc"] +"num_sampled_genomes_msmc": [6] +"num_msmc_iterations": 20 +"gone_phase": 1 # 0 for pseudohaploid, 1 for phased, 2 for unknown phase +"gone_max_snps": 50000 # default=50000 +"gone_threads": 8 +"gone_num_gens": 2000 # default=2000 +"gone_num_bins": 400 # default=400 # exe paths @@ -44,6 +41,7 @@ "dfe_alpha_data_path_1": "ext/dfe-alpha-release-2.16/data" "dfe_alpha_data_path_2": "three-epoch" "grapes_exec": "ext/grapes/multi_grapes" -"msmc_exec" : "ext/msmc2/build/release/msmc2" -"stairwayplot_code" : "ext/stairwayplot/swarmops.jar" -"gone_code" : "ext/GONE/Linux" +"msmc_exec": "ext/msmc2/build/release/msmc2" +"stairwayplot_code": "ext/stairwayplot/swarmops.jar" +"gone_code": "ext/GONE/Linux" + diff --git a/workflows/dfe.snake b/workflows/dfe.snake index ead89a9..81b9db3 100644 --- a/workflows/dfe.snake +++ b/workflows/dfe.snake @@ -10,8 +10,12 @@ import ts2fs import plots import masks -configfile: "workflows/config/snakemake/tiny_config.yaml" +#quick fix for excluding "Constant" with "pop1" and "pop2" from the analysis +def drop_constant_populations(paths): + return [path for path in paths if not ("Constant" in path and ("pop1" in path or "pop2" in path))] + +configfile: "workflows/config/snakemake/tiny_config.yaml" np.random.seed(config["seed"]) @@ -22,6 +26,9 @@ np.random.seed(config["seed"]) # The number of replicates of each analysis you would like to run replicates = config["replicates"] +#workflow path in case it's not the current directory +workflow_path = config.get("workflow_path", "./") + # Where you would like all output files from analysis to live output_dir = os.path.abspath(config["output_dir"]) @@ -73,7 +80,6 @@ if "none" in annotation_dict: del annotation_dict["none"] annotation_list = list(annotation_dict) - rule all: input: expand(output_dir + "/plots/{demog}/{dfes}/{annots}/dfe.inference.benchmark.pdf", @@ -158,11 +164,12 @@ rule dadi_infer_dfe: ratio = 2.31, opts = 100, prefix = output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.two_epoch.gamma" + resources: + time=4000 threads: 8 shell: """ dadi-cli InferDFE --fs {input[0]} --cache1d {input[1]} --demo-popt {input[2]} --output-prefix {params.prefix} --pdf1d {params.dfe} --p0 {params.dfe_p0} --ubounds {params.dfe_ubounds} --lbounds {params.dfe_lbounds} --ratio {params.ratio} --optimizations {params.opts} --cpus {threads} --nomisid --force-convergence 1 - """ rule get_dadi_dfe_bestfits: @@ -220,10 +227,11 @@ rule run_polydfe: output: output_dir + "/inference/{demog}/polyDFE/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.polyDFE.out" params: - exec=poly_dfe_exec + exec=poly_dfe_exec, + workflow_path = workflow_path shell: """ - '{params.exec}' -d {input} -m C -i workflows/config/polyDFE/polyDFE_init_models.txt 1 -e > {output[0]} + '{params.exec}' -d {input} -m C -i {params.workflow_path}/config/polyDFE/polyDFE_init_models.txt 1 -e > {output[0]} """ rule get_polydfe_bestfit: @@ -382,34 +390,34 @@ rule get_grapes_bestfits: rule plot_results: input: - dadi_res = expand(output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.dadi.bestfit", + dadi_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.dadi.bestfit", demog=demo_model_id_list, seeds=seed_list, dfes=dfe_list, annots=annotation_list, ids=pids, - ), - polydfe_res = expand(output_dir + "/inference/{demog}/polyDFE/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.polyDFE.bestfit", + )), + polydfe_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/polyDFE/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.polyDFE.bestfit", demog=demo_model_id_list, seeds=seed_list, dfes=dfe_list, annots=annotation_list, ids=pids, - ), - dfe_alpha_res = expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit", + )), + dfe_alpha_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit", demog=demo_model_id_list, seeds=seed_list, dfes=dfe_list, annots=annotation_list, ids=pids, - ), - grapes_res = expand(output_dir + "/inference/{demog}/grapes/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.grapes.bestfit", + )), + grapes_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/grapes/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.grapes.bestfit", demog=demo_model_id_list, seeds=seed_list, dfes=dfe_list, annots=annotation_list, ids=pids, - ) + )) output: output_dir + "/plots/{demog}/{dfes}/{annots}/dfe.inference.benchmark.pdf" run: @@ -435,7 +443,7 @@ rule plot_results: if wildcards.demog == 'Constant': model = stdpopsim.PiecewiseConstantSize(species.population_size) - mutation_rate = 1.29e-08 + mutation_rate = species.genome.mean_mutation_rate dadi_bestfits = [ b for b in dadi_bestfits if 'pop0' in b ] polydfe_bestfits = [ b for b in polydfe_bestfits if 'pop0' in b ] @@ -444,6 +452,8 @@ rule plot_results: else: model = species.get_demographic_model(wildcards.demog) mutation_rate = model.mutation_rate + if mutation_rate is None: + mutation_rate = species.genome.mean_mutation_rate pop_names = [ model.populations[i].name for i in range(len(model.populations)) ] plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, dfe_alpha_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names) diff --git a/workflows/masks.py b/workflows/masks.py index 2990c05..57b1d38 100644 --- a/workflows/masks.py +++ b/workflows/masks.py @@ -44,8 +44,12 @@ def get_mask_from_chrom_annotation(speciesID, chrom_annotation, chromID): :chromID: chromosome ID (e.g., "chr1") """ species = stdpopsim.get_species(speciesID) - a = species.get_annotations(chrom_annotation) - mask = [a.get_chromosome_annotations(chrom) for chrom in chromID] + if chrom_annotation in [s.id for s in species.annotations]: + a = species.get_annotations(chrom_annotation) + mask = [a.get_chromosome_annotations(chrom) for chrom in chromID] + else: + #if annotation is anything but a stdpopsim annotation, don't mask anything + mask = np.array([], dtype=int).reshape(0, 2) return mask @@ -107,7 +111,7 @@ def get_combined_masks(species, mask_file, chromID, chrom_annotation=None): else: mask = np.array([], dtype=int).reshape(0, 2) # get annotations for chrom - if chrom_annotation is not None: + if chrom_annotation in [s.id for s in stdpopsim.get_species(species).annotations]: an_mask = get_mask_from_chrom_annotation(species, chrom_annotation, chromID) mask = [np.concatenate((m, am)) for m, am in zip(mask, an_mask)] # merge overlapping and adjacent intervals diff --git a/workflows/masks/HapmapII_GRCh38.mask.bed b/workflows/masks/HapmapII_GRCh38.mask.bed index bdae41b..107e1ef 100644 --- a/workflows/masks/HapmapII_GRCh38.mask.bed +++ b/workflows/masks/HapmapII_GRCh38.mask.bed @@ -1,28 +1,28 @@ -chr10 38861078 41705569 -chr1 121569984 143276353 -chr11 50795892 54555633 -chr12 34700914 37463389 -chr13 1 18447064 -chr14 1 19198131 -chr15 1 19809716 -chr16 36034945 46428208 -chr17 22738887 26950488 -chr18 15409995 20934306 -chr19 24373307 27244449 -chr20 26328611 30187700 -chr21 10646501 12968298 -chr21 1 10326653 -chr22 1 15287921 -chr2 90236658 91433610 -chr2 92117097 94663732 -chr3 90427071 93808853 -chr4 49627542 51818712 -chr5 46363876 50291681 -chr6 58452161 60342543 -chr7 57870656 61072218 -chr8 43936537 46006821 -chr9 38743712 40906821 -chr9 41438036 62926704 -chr9 66183582 68223187 -chrX 1 2785350 -chrX 58456814 62611346 +chr1 121569983 143276353 +chr2 90236657 91433610 +chr2 92117096 94663732 +chr3 90427070 93808853 +chr4 49627541 51818712 +chr5 46363875 50291681 +chr6 58452160 60342543 +chr7 57870655 61072218 +chr8 43936536 46006821 +chr9 38743711 40906821 +chr9 41438035 62926704 +chr9 66183581 68223187 +chr10 38861077 41705569 +chr11 50795891 54555633 +chr12 34700913 37463389 +chr13 0 18447064 +chr14 0 19198131 +chr15 0 19809716 +chr16 36034944 46428208 +chr17 22738886 26950488 +chr18 15409994 20934306 +chr19 24373306 27244449 +chr20 26328610 30187700 +chr21 0 10326653 +chr21 10646500 12968298 +chr22 0 15287921 +chrX 0 2785350 +chrX 58456813 62611346 diff --git a/workflows/msmc.py b/workflows/msmc.py index fec27ea..d769056 100644 --- a/workflows/msmc.py +++ b/workflows/msmc.py @@ -80,14 +80,19 @@ def run_msmc_estimate(input_files, output_file, num_genomes, msmc_exec_loc, tota The final estimates will be written to input_file.final.txt """ + + #check that there are snps in the input or msmc2 will crash with an unhelpful error + input_list = input_files.split(" ") + non_emtpy_input_files = " ".join([file for file in input_list if os.path.getsize(file) > 0]) + if non_emtpy_input_files == "": + raise ValueError("All input files are empty") num_genomes = int(num_genomes) assert num_genomes < total_samples * 2 subset = np.random.choice(range(total_samples*2), num_genomes, replace=False) haplotypes = ",".join(map(str, sorted(subset))) - cmd = (f"{msmc_exec_loc} -r 0.25 -I {haplotypes} -i {iterations} -o {output_file}{num_genomes}.trees.multihep.txt -t {ncores} {input_files}") + cmd = (f"{msmc_exec_loc} -r 0.25 -I {haplotypes} -i {iterations} -o {output_file}{num_genomes}.trees.multihep.txt -t {ncores} {non_emtpy_input_files}") subprocess.run(cmd, shell=True, check=True) - def convert_msmc_output(results_file, outfile, mutation_rate, generation_time): """ This function converts the output from msmc into a csv the will be read in diff --git a/workflows/n_t.snake b/workflows/n_t.snake index 9c0eb4d..8e39a43 100644 --- a/workflows/n_t.snake +++ b/workflows/n_t.snake @@ -160,6 +160,8 @@ def mutation_rate_helper(demog, species): mutation_rate = species.genome.mean_mutation_rate else: mutation_rate = species.get_demographic_model(demog).mutation_rate + if mutation_rate is None: + mutation_rate = species.genome.mean_mutation_rate return mutation_rate rule write_bdd: @@ -216,10 +218,10 @@ rule run_stairwayplot: chrms=chrm_list) output: output_dir + "/inference/{demog}/stairwayplot/{dfes}/{annots}/{seeds}/{pops}/stairwayplot_estimated_Ne.txt" - threads: 20 resources: - mem_mb=120000, + time = 3_000, + mem_mb=120_000, run: inputs = expand( output_dir @@ -315,7 +317,8 @@ rule ts_to_multihep: output_dir + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees" output: output_dir + "/inference/{demog}/msmc/{dfes}/{annots}/{seeds}/{pops}/{chrms}.trees.multihep.txt" - + resources: + time=2_000, run: print(input[0], num_sampled_genomes_msmc, mask_file) if wildcards.annots == "none" or msmc_mask == "none": @@ -345,7 +348,9 @@ rule run_msmc: output_dir + "/inference/{demog}/msmc/{dfes}/{annots}/{seeds}/{pops}/{samps}.trees.multihep.txt.final.txt" threads: 8 resources: - mem_mb=24000 + time = lambda wildcards, attempt: attempt * 1_000, + mem_mb = lambda wildcards, attempt: attempt * 200_000 + run: inputs = expand(output_dir + "/inference/{demog}/msmc/{dfes}/{annots}/{seeds}/{pops}/{chrms}.trees.multihep.txt", demog=wildcards.demog, @@ -459,6 +464,8 @@ rule gone_prep_inputs: output_dir + "/inference/{demog}/gone/{dfes}/{annots}/{seeds}/{pops}/gone.ped", output_dir + "/inference/{demog}/gone/{dfes}/{annots}/{seeds}/{pops}/gone.map", threads: 1 + resources: + mem_mb=36000 run: inputs = expand( output_dir @@ -500,7 +507,7 @@ rule gone_run: output: output_dir + "/inference/{demog}/gone/{dfes}/{annots}/{seeds}/{pops}/gone_estimated_Ne.txt", threads: 8 - resources: time=180 + resources: time=360 shell: """ cwd=$PWD diff --git a/workflows/simulation.snake b/workflows/simulation.snake index 002864c..fdce3ce 100644 --- a/workflows/simulation.snake +++ b/workflows/simulation.snake @@ -60,32 +60,33 @@ rule simulation: input: output: output_dir + "/simulated_data/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.trees" - resources: time=3000, mem_mb=10000 + resources: + time = lambda wildcards, attempt: attempt * 6000, + mem_mb = lambda wildcards, attempt: attempt * 100000 run: if wildcards.demog == 'Constant': model = stdpopsim.PiecewiseConstantSize(species.population_size) - mutation_rate = 1.29e-08 # where is this from? - #samples = model.get_samples(*demo_sample_size_dict[wildcards.demog]) + mutation_rate = species.genome.mean_mutation_rate else: model = species.get_demographic_model(wildcards.demog) mutation_rate = model.mutation_rate - #samples = model.get_samples(*demo_sample_size_dict[wildcards.demog]) # YRI, CEU, CHB + if mutation_rate is None: + mutation_rate = species.genome.mean_mutation_rate + samples = {f"{model.populations[i].name}": m for i, m in enumerate(demo_sample_size_dict[wildcards.demog])} genetic_map_id = config["genetic_map"] contig = species.get_contig(wildcards.chrms, genetic_map=genetic_map_id) if wildcards.dfes != "none": # Load dfe only if provided dfe = species.get_dfe(wildcards.dfes) - if wildcards.annots == "all_sites": - # Adding selection to the whole contig - contig.add_dfe(intervals=np.array([[0, int(contig.length)]]), DFE=dfe) - elif wildcards.annots == "none": - contig = species.get_contig(wildcards.chrms, genetic_map=genetic_map_id) - else: - ## Adding annotation only seletion on exon region - annot = species.get_annotations(wildcards.annots) - annot_intervals = annot.get_chromosome_annotations(wildcards.chrms) - contig.add_dfe(intervals=annot_intervals, DFE=dfe) + if wildcards.annots == "all_sites": + # Adding selection to the whole contig + contig.add_dfe(intervals=np.array([[0, int(contig.length)]]), DFE=dfe) + else: + ## Adding annotation only seletion on exon region + annot = species.get_annotations(wildcards.annots) + annot_intervals = annot.get_chromosome_annotations(wildcards.chrms) + contig.add_dfe(intervals=annot_intervals, DFE=dfe) contig.mutation_rate = mutation_rate engine = stdpopsim.get_engine("slim") diff --git a/workflows/summary_stats.snake b/workflows/summary_stats.snake index 3a662e4..97da9e6 100644 --- a/workflows/summary_stats.snake +++ b/workflows/summary_stats.snake @@ -191,9 +191,11 @@ rule gather_diploshic_fvs: ) output: output_dir + "/summaries/{demog}/{dfes}/{annots}/{seeds}/sim_{chrms}.{popid}.diploshic.stats", + params: + output_dir = output_dir shell: """ - cat results/summaries/{wildcards.demog}/{wildcards.dfes}/{wildcards.annots}/{wildcards.seeds}/sim_{wildcards.chrms}.{wildcards.popid}.*.diploshic.stats | sort -nk 2 | uniq >> {output} + cat {params.output_dir}/summaries/{wildcards.demog}/{wildcards.dfes}/{wildcards.annots}/{wildcards.seeds}/sim_{wildcards.chrms}.{wildcards.popid}.*.diploshic.stats | sort -nk 2 | uniq >> {output} """ rule plot_pi_individual: