popsim-consortium · andrewkern · Jul 18, 2024 · May 14, 2024 · May 16, 2024 · Jun 5, 2024
diff --git a/notebooks/manuscript_figures.ipynb b/notebooks/manuscript_figures.ipynb
@@ -6,7 +6,11 @@
    "source": [
     "### Auxiliary code to generate figures for manuscript. \n",
     "\n",
-    "A lot of the inputs are generated from `workflows/summary_stats.snake`. Some of the code below is still hardcoded instead of drawing from the output of that pipeline directly. *Updating this is still in progress*"
+    "A lot of the inputs are generated from `workflows/summary_stats.snake`. Some of the code below is still hardcoded instead of drawing from the output of that pipeline directly. *Updating this is still in progress*\n",
+    "\n",
+    "\n",
+    "Full pipeline command \n",
+    "`snakemake --snakefile ../Snakefile --profile ../workflows/config/snakemake/oregon_profile/ --configfile ../workflows/config/snakemake/production_config_PhoSin.yml -np` "
    ]
   },
   {

diff --git a/workflows/config/snakemake/production_config_HomSap.yml b/workflows/config/snakemake/production_config_HomSap.yml
@@ -1,7 +1,7 @@
 # snakemake settings
 "seed": 12345
 "replicates": 3
-"output_dir": "results"
+"output_dir": "HomSap_results"
 
 # species-specific settings
 "species": "HomSap"
@@ -10,32 +10,32 @@
     "num_samples_per_population": [100],
     },
     {"id":"OutOfAfricaArchaicAdmixture_5R19",
-    "num_samples_per_population": 3*[100],
+    "num_samples_per_population": [100, 100, 100],
     }
 ]
 "genetic_map": "HapMapII_GRCh38"
-"chrm_list": "chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22"
+"chrm_list": "chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22"
 "dfe_list": ["none", "Gamma_K17"]
 "annotation_list": ["all_sites", "ensembl_havana_104_exons", "ensembl_havana_104_exons"]
 "mask_file": "workflows/masks/HapmapII_GRCh38.mask.bed"
 "stairway_annot_mask" : "" # set this or any of the below to 'none' to skip annot masking
-"msmc_annot_mask" : ""
-"gone_annot_mask" : ""
-"smcpp_annot_mask" : ""
+"msmc_annot_mask": ""
+"gone_annot_mask": ""
+"smcpp_annot_mask": ""
 
 # slim settings
 "slim_scaling_factor": 1
 "slim_burn_in": 10
 
 # n(t) specific configs
-"methods" : ["stairwayplot", "gone", "smcpp", "msmc"]
-"num_sampled_genomes_msmc" : [6]
-"num_msmc_iterations" : 20
-"gone_phase" : 1  # 0 for pseudohaploid, 1 for phased, 2 for unknown phase
-"gone_max_snps" : 50000  # default=50000
-"gone_threads" : 8
-"gone_num_gens" : 2000  # default=2000
-"gone_num_bins" : 400  # default=400
+"methods": ["stairwayplot", "gone", "smcpp", "msmc"]
+"num_sampled_genomes_msmc": [6]
+"num_msmc_iterations": 20
+"gone_phase": 1  # 0 for pseudohaploid, 1 for phased, 2 for unknown phase
+"gone_max_snps": 50000  # default=50000
+"gone_threads": 8
+"gone_num_gens": 2000  # default=2000
+"gone_num_bins": 400  # default=400
 
 
 # exe paths
@@ -44,6 +44,6 @@
 "dfe_alpha_data_path_1": "ext/dfe-alpha-release-2.16/data"
 "dfe_alpha_data_path_2": "three-epoch"
 "grapes_exec": "ext/grapes/multi_grapes"
-"msmc_exec" : "ext/msmc2/build/release/msmc2"
-"stairwayplot_code" : "ext/stairwayplot/swarmops.jar"
-"gone_code" : "ext/GONE/Linux"
+"msmc_exec": "ext/msmc2/build/release/msmc2"
+"stairwayplot_code": "ext/stairwayplot/swarmops.jar"
+"gone_code": "ext/GONE/Linux"
diff --git a/workflows/config/snakemake/production_config_PhoSin.yml b/workflows/config/snakemake/production_config_PhoSin.yml
@@ -1,41 +1,38 @@
 # snakemake settings
 "seed": 12345
 "replicates": 3
-"output_dir": "results"
+"output_dir": "PhoSin_results"
+"workflow_path": "workflows/"
 
 # species-specific settings
 "species": "PhoSin"
 "demo_models": [
-    {"id":"Constant", 
-    "num_samples_per_population": [100],
-    },
-    {"id":"Vaquita2Epoch_1R22",
-    "num_samples_per_population": 3*[100],
-    }
+    {"id": "Constant", "num_samples_per_population": [100]},
+    {"id": "Vaquita2Epoch_1R22", "num_samples_per_population": [100]}
 ]
 "genetic_map": null
-"chrm_list": "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21"
+"chrm_list": "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21"
 "dfe_list": ["none", "Gamma_R22"]
 "annotation_list": ["all_sites", "Phocoena_sinus.mPhoSin1.pri.110_exons"]
 "mask_file": "workflows/masks/PhoSin_fake.mask.bed"
-"stairway_annot_mask" : "" # set this or any of the below to 'none' to skip annot masking
-"msmc_annot_mask" : ""
-"gone_annot_mask" : ""
-"smcpp_annot_mask" : ""
+"stairway_annot_mask": "" # set this or any of the below to 'none' to skip annot masking
+"msmc_annot_mask": ""
+"gone_annot_mask": ""
+"smcpp_annot_mask": ""
 
 # slim settings
 "slim_scaling_factor": 1
 "slim_burn_in": 10
 
 # n(t) specific configs
-"methods" : ["stairwayplot", "gone", "smcpp", "msmc"]
-"num_sampled_genomes_msmc" : [6]
-"num_msmc_iterations" : 20
-"gone_phase" : 1  # 0 for pseudohaploid, 1 for phased, 2 for unknown phase
-"gone_max_snps" : 50000  # default=50000
-"gone_threads" : 8
-"gone_num_gens" : 2000  # default=2000
-"gone_num_bins" : 400  # default=400
+"methods": ["stairwayplot", "gone", "smcpp", "msmc"]
+"num_sampled_genomes_msmc": [6]
+"num_msmc_iterations": 20
+"gone_phase": 1  # 0 for pseudohaploid, 1 for phased, 2 for unknown phase
+"gone_max_snps": 50000  # default=50000
+"gone_threads": 8
+"gone_num_gens": 2000  # default=2000
+"gone_num_bins": 400  # default=400
 
 
 # exe paths
@@ -44,6 +41,7 @@
 "dfe_alpha_data_path_1": "ext/dfe-alpha-release-2.16/data"
 "dfe_alpha_data_path_2": "three-epoch"
 "grapes_exec": "ext/grapes/multi_grapes"
-"msmc_exec" : "ext/msmc2/build/release/msmc2"
-"stairwayplot_code" : "ext/stairwayplot/swarmops.jar"
-"gone_code" : "ext/GONE/Linux"
+"msmc_exec": "ext/msmc2/build/release/msmc2"
+"stairwayplot_code": "ext/stairwayplot/swarmops.jar"
+"gone_code": "ext/GONE/Linux"
+
diff --git a/workflows/dfe.snake b/workflows/dfe.snake
@@ -10,8 +10,12 @@ import ts2fs
 import plots
 import masks
 
-configfile: "workflows/config/snakemake/tiny_config.yaml"
+#quick fix for excluding "Constant" with "pop1" and "pop2" from the analysis
+def drop_constant_populations(paths):
+    return [path for path in paths if not ("Constant" in path and ("pop1" in path or "pop2" in path))]
+
 
+configfile: "workflows/config/snakemake/tiny_config.yaml"
 
 np.random.seed(config["seed"])
 
@@ -22,6 +26,9 @@ np.random.seed(config["seed"])
 # The number of replicates of each analysis you would like to run
 replicates = config["replicates"]
 
+#workflow path in case it's not the current directory
+workflow_path = config.get("workflow_path", "./")
+
 # Where you would like all output files from analysis to live
 output_dir = os.path.abspath(config["output_dir"])
 
@@ -73,7 +80,6 @@ if "none" in annotation_dict:
     del annotation_dict["none"]
 annotation_list = list(annotation_dict)
 
-
 rule all:
     input: 
         expand(output_dir + "/plots/{demog}/{dfes}/{annots}/dfe.inference.benchmark.pdf",
@@ -158,11 +164,12 @@ rule dadi_infer_dfe:
         ratio = 2.31,
         opts = 100,
         prefix = output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.two_epoch.gamma"
+    resources:
+        time=4000
     threads: 8
     shell:
         """
         dadi-cli InferDFE --fs {input[0]} --cache1d {input[1]} --demo-popt {input[2]} --output-prefix {params.prefix} --pdf1d {params.dfe} --p0 {params.dfe_p0} --ubounds {params.dfe_ubounds} --lbounds {params.dfe_lbounds} --ratio {params.ratio} --optimizations {params.opts} --cpus {threads} --nomisid --force-convergence 1
-
         """
 
 rule get_dadi_dfe_bestfits:
@@ -220,10 +227,11 @@ rule run_polydfe:
     output: 
         output_dir + "/inference/{demog}/polyDFE/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.polyDFE.out"
     params:
-        exec=poly_dfe_exec
+        exec=poly_dfe_exec,
+        workflow_path = workflow_path
     shell:
         """
-        '{params.exec}' -d {input} -m C -i workflows/config/polyDFE/polyDFE_init_models.txt 1 -e > {output[0]}
+        '{params.exec}' -d {input} -m C -i {params.workflow_path}/config/polyDFE/polyDFE_init_models.txt 1 -e > {output[0]}
         """
 
 rule get_polydfe_bestfit:
@@ -382,34 +390,34 @@ rule get_grapes_bestfits:
 
 rule plot_results:
     input:
-        dadi_res = expand(output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.dadi.bestfit", 
+        dadi_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/dadi/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.dadi.bestfit", 
             demog=demo_model_id_list,
             seeds=seed_list,
             dfes=dfe_list,
             annots=annotation_list,
             ids=pids,
-        ),
-        polydfe_res = expand(output_dir + "/inference/{demog}/polyDFE/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.polyDFE.bestfit",
+        )),
+        polydfe_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/polyDFE/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.polyDFE.bestfit",
             demog=demo_model_id_list,
             seeds=seed_list,
             dfes=dfe_list,
             annots=annotation_list,
             ids=pids,
-        ),
-        dfe_alpha_res = expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit",
+        )),
+        dfe_alpha_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/DFE-alpha/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.DFE-alpha.bestfit",
             demog=demo_model_id_list,
             seeds=seed_list,
             dfes=dfe_list,
             annots=annotation_list,
             ids=pids,
-        ),
-        grapes_res = expand(output_dir + "/inference/{demog}/grapes/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.grapes.bestfit",
+        )),
+        grapes_res = drop_constant_populations(expand(output_dir + "/inference/{demog}/grapes/{dfes}/{annots}/{seeds}/pop{ids}/pop{ids}.grapes.bestfit",
             demog=demo_model_id_list,
             seeds=seed_list,
             dfes=dfe_list,
             annots=annotation_list,
             ids=pids,
-        )
+        ))
     output:
         output_dir + "/plots/{demog}/{dfes}/{annots}/dfe.inference.benchmark.pdf"
     run:
@@ -435,7 +443,7 @@ rule plot_results:
 
         if wildcards.demog == 'Constant': 
             model = stdpopsim.PiecewiseConstantSize(species.population_size)
-            mutation_rate = 1.29e-08
+            mutation_rate = species.genome.mean_mutation_rate
 
             dadi_bestfits = [ b for b in dadi_bestfits if 'pop0' in b ]
             polydfe_bestfits = [ b for b in polydfe_bestfits if 'pop0' in b ]
@@ -444,6 +452,8 @@ rule plot_results:
         else: 
             model = species.get_demographic_model(wildcards.demog)
             mutation_rate = model.mutation_rate
+            if mutation_rate is None:
+                mutation_rate = species.genome.mean_mutation_rate
 
         pop_names = [ model.populations[i].name for i in range(len(model.populations)) ]
         plots.plot_all_dfe_results([dadi_bestfits, polydfe_bestfits, dfe_alpha_bestfits, grapes_bestfits], output, mutation_rate, seq_len, 0.7, pop_names)
diff --git a/workflows/masks.py b/workflows/masks.py
@@ -44,8 +44,12 @@ def get_mask_from_chrom_annotation(speciesID, chrom_annotation, chromID):
     :chromID: chromosome ID (e.g., "chr1")
     """
     species = stdpopsim.get_species(speciesID)
-    a = species.get_annotations(chrom_annotation)
-    mask = [a.get_chromosome_annotations(chrom) for chrom in chromID]
+    if chrom_annotation in [s.id for s in species.annotations]:
+        a = species.get_annotations(chrom_annotation)
+        mask = [a.get_chromosome_annotations(chrom) for chrom in chromID]        
+    else:
+        #if annotation is anything but a stdpopsim annotation, don't mask anything
+        mask = np.array([], dtype=int).reshape(0, 2)
     return mask
 
 
@@ -107,7 +111,7 @@ def get_combined_masks(species, mask_file, chromID, chrom_annotation=None):
     else:
         mask = np.array([], dtype=int).reshape(0, 2)
     # get annotations for chrom
-    if chrom_annotation is not None:
+    if chrom_annotation in [s.id for s in stdpopsim.get_species(species).annotations]:
         an_mask = get_mask_from_chrom_annotation(species, chrom_annotation, chromID)
         mask = [np.concatenate((m, am)) for m, am in zip(mask, an_mask)]
     # merge overlapping and adjacent intervals

diff --git a/workflows/masks/HapmapII_GRCh38.mask.bed b/workflows/masks/HapmapII_GRCh38.mask.bed
@@ -1,28 +1,28 @@
-chr10	 38861078	 41705569	
-chr1	 121569984	 143276353	
-chr11	 50795892	 54555633	
-chr12	 34700914	 37463389	
-chr13	 1	 18447064	
-chr14	 1	 19198131	
-chr15	 1	 19809716	
-chr16	 36034945	 46428208	
-chr17	 22738887	 26950488	
-chr18	 15409995	 20934306	
-chr19	 24373307	 27244449	
-chr20	 26328611	 30187700	
-chr21	 10646501	 12968298	
-chr21	 1	 10326653	
-chr22	 1	 15287921	
-chr2	 90236658	 91433610	
-chr2	 92117097	 94663732	
-chr3	 90427071	 93808853	
-chr4	 49627542	 51818712	
-chr5	 46363876	 50291681	
-chr6	 58452161	 60342543	
-chr7	 57870656	 61072218	
-chr8	 43936537	 46006821	
-chr9	 38743712	 40906821	
-chr9	 41438036	 62926704	
-chr9	 66183582	 68223187	
-chrX	 1	 2785350	
-chrX	 58456814	 62611346	
+chr1	121569983	143276353
+chr2	90236657	91433610
+chr2	92117096	94663732
+chr3	90427070	93808853
+chr4	49627541	51818712
+chr5	46363875	50291681
+chr6	58452160	60342543
+chr7	57870655	61072218
+chr8	43936536	46006821
+chr9	38743711	40906821
+chr9	41438035	62926704
+chr9	66183581	68223187
+chr10	38861077	41705569
+chr11	50795891	54555633
+chr12	34700913	37463389
+chr13	0	18447064
+chr14	0	19198131
+chr15	0	19809716
+chr16	36034944	46428208
+chr17	22738886	26950488
+chr18	15409994	20934306
+chr19	24373306	27244449
+chr20	26328610	30187700
+chr21	0	10326653
+chr21	10646500	12968298
+chr22	0	15287921
+chrX	0	2785350
+chrX	58456813	62611346
diff --git a/workflows/msmc.py b/workflows/msmc.py
@@ -80,14 +80,19 @@ def run_msmc_estimate(input_files, output_file, num_genomes, msmc_exec_loc, tota
     The final estimates will be written to
     input_file.final.txt
     """
+
+    #check that there are snps in the input or msmc2 will crash with an unhelpful error
+    input_list = input_files.split(" ")
+    non_emtpy_input_files = " ".join([file for file in input_list if os.path.getsize(file) > 0])
+    if non_emtpy_input_files == "":
+        raise ValueError("All input files are empty")
     num_genomes = int(num_genomes)
     assert num_genomes < total_samples * 2
     subset = np.random.choice(range(total_samples*2), num_genomes, replace=False)
     haplotypes = ",".join(map(str, sorted(subset)))
-    cmd = (f"{msmc_exec_loc} -r 0.25 -I {haplotypes} -i {iterations} -o {output_file}{num_genomes}.trees.multihep.txt -t {ncores} {input_files}")
+    cmd = (f"{msmc_exec_loc} -r 0.25 -I {haplotypes} -i {iterations} -o {output_file}{num_genomes}.trees.multihep.txt -t {ncores} {non_emtpy_input_files}")
     subprocess.run(cmd, shell=True, check=True)
 
-
 def convert_msmc_output(results_file, outfile, mutation_rate, generation_time):
     """
     This function converts the output from msmc into a csv the will be read in