fleshes out the summary stats calcs and starts evo hx

raywray · Oct 23, 2024 · c66b617 · c66b617
1 parent 311fd0b
commit c66b617
Show file tree

Hide file tree

Showing 20 changed files with 69 additions and 4 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "evolutionary_history/supercomputer_scripts"]
 	path = evolutionary_history/supercomputer_scripts
 	url = https://github.com/raywray/supercomputer_scripts.git
+[submodule "evolutionary_history/CoalMiner"]
+	path = evolutionary_history/CoalMiner
+	url = https://github.com/raywray/CoalMiner.git
diff --git a/evolutionary_history/CoalMiner b/evolutionary_history/CoalMiner
diff --git a/evolutionary_history/README.md b/evolutionary_history/README.md
@@ -1 +1,4 @@
-Next, the pipeline uses the SFS(s) created for fastsimcoal analyses, as well as a user-generated parameter `yaml` file, to feed into a fastsimcoal wrapper (citation here) that generates thousands of random coalescent models. This was run on a cluster. The wrapper identified the best model and parameters, ran a bootstrap analysis, and generated images. The results of the best model are included here in the `results/fastsimcoal` folder. 
+After the summary statistics were generated, we used the SFS(s) that statMix created for fastsimcoal analyses, together with a user-generated parameter `yaml` file, as input to CoalMiner (citation here), a random coalescent topology generator, to create 1000 coalescent models. This was run on a cluster. The wrapper identified the best model and parameters, ran a bootstrap analysis, and generated images. The results of the best model are included here in the `results/fastsimcoal` folder. 
+
+First, we ran CoalMiner. The output can be found at `/data/output/evolutionary_history/coalminer_output`
+
diff --git a/...ated_cluster_commands/find_best_models.py → ...ated_cluster_commands/find_best_models.py b/...ated_cluster_commands/find_best_models.py → ...ated_cluster_commands/find_best_models.py
diff --git a/..._commands/ucr/best_models_job_template.sh → ..._commands/ucr/best_models_job_template.sh b/..._commands/ucr/best_models_job_template.sh → ..._commands/ucr/best_models_job_template.sh
diff --git a/..._cluster_commands/ucr/find_best_models.py → ..._cluster_commands/ucr/find_best_models.py b/..._cluster_commands/ucr/find_best_models.py → ..._cluster_commands/ucr/find_best_models.py
diff --git a/evolutionary_history/supercomputer_scripts → ...ory/cluster_scripts/supercomputer_scripts b/evolutionary_history/supercomputer_scripts → ...ory/cluster_scripts/supercomputer_scripts
diff --git a/evolutionary_history/generate_evolutionary_history.py b/evolutionary_history/generate_evolutionary_history.py
@@ -0,0 +1,40 @@
+import os
+
+from utilities import basic_utilities
+
+def generate_models_with_coalminer():
+    # define paths
+    project_path = "/home/raya/Documents/Projects/hops_pipeline"
+    coalminer_path = os.path.join(project_path, "evolutionary_history/CoalMiner")
+    coalminer_input_folder_path = os.path.join(project_path, "data/input/evolutionary_history/coal_miner_input_files")
+    coalminer_input_yml = "user_input_hops_k4.yml"
+
+    # copy observed SFS and .yml into the CoalMiner project
+    copy_sfs_cmd = [
+        "cp",
+        "-r",
+        coalminer_input_folder_path,
+        coalminer_path
+    ]
+    basic_utilities.execute_command(copy_sfs_cmd)
+
+    # run coalminer
+    # change into the coalminer dir
+    os.chdir(coalminer_path)
+    run_coalminer_cmd = [
+        "python3",
+        "coalminer.py",
+        coalminer_input_yml
+    ]
+    basic_utilities.execute_command(run_coalminer_cmd)
+
+
+def run_models_on_cluster():
+    print("cluster")
+
+def find_best_model():
+    print("best")
+
+def run_bootstrap():
+    print("boot")
+    # @ARUN
diff --git a/utilities/utilities.py → ...nary_history/utilities/basic_utilities.py b/utilities/utilities.py → ...nary_history/utilities/basic_utilities.py
diff --git a/utilities/ParFileViewer.r → general_utilities/ParFileViewer.r b/utilities/ParFileViewer.r → general_utilities/ParFileViewer.r
diff --git a/utilities/SFStools.r → general_utilities/SFStools.r b/utilities/SFStools.r → general_utilities/SFStools.r
diff --git a/utilities/bootstrap_utils.py → general_utilities/bootstrap_utils.py b/utilities/bootstrap_utils.py → general_utilities/bootstrap_utils.py
diff --git a/utilities/calculate_AIC.sh → general_utilities/calculate_AIC.sh b/utilities/calculate_AIC.sh → general_utilities/calculate_AIC.sh
diff --git a/utilities/cluster_commands.py → general_utilities/cluster_commands.py b/utilities/cluster_commands.py → general_utilities/cluster_commands.py
diff --git a/utilities/log.py → general_utilities/log.py b/utilities/log.py → general_utilities/log.py
diff --git a/utilities/plotModel.r → general_utilities/plotModel.r b/utilities/plotModel.r → general_utilities/plotModel.r
diff --git a/general_utilities/utilities.py b/general_utilities/utilities.py
@@ -0,0 +1,14 @@
+import os
+
+def execute_command(command_list):
+    command = " ".join(command_list)
+    print(command)
+    result = os.system(command)
+    if result != 0:
+        print("Command Failed to Execute")
+    else:
+        print("Command Successfully Executed")
+
+def create_directory(dir_path):
+    if not os.path.exists(dir_path):
+        os.makedirs(dir_path)
diff --git a/utilities/visualization.py → general_utilities/visualization.py b/utilities/visualization.py → general_utilities/visualization.py
diff --git a/summary_statistics/README.md b/summary_statistics/README.md
@@ -1,4 +1,4 @@
-First, we created a reusable pipeline that generates several summary statistics (statMix) and fed our *hops* `vcf` data through the pipeline. The summary statistics and analysis results we generated for *hops* are as follows: 
+We created a reusable pipeline (statMix) that generates several summary statistics and fed our *hops* `vcf` data through the pipeline. The summary statistics and analysis results we generated for *hops* are as follows: 
 - Hardy Weinberg Equilibrium
 - Full population structure analysis using admixture 
 - SFS based on the population structure results
@@ -12,4 +12,8 @@ First, we created a reusable pipeline that generates several summary statistics
 - Fit
 - Fis
 - allele frequency
-- SFS(s) compatible for fastsimcoal analyses
+- SFS(s) compatible for fastsimcoal analyses
+
+The commands we used to generate the summary statistics are found in `/summary_statistics/generate_summary_statistics.py`
+
+The results of these analyses are found in `/data/output/summary_statistics/statmix_output`
diff --git a/main.py → ...statistics/generate_summary_statistics.py b/main.py → ...statistics/generate_summary_statistics.py
@@ -9,7 +9,7 @@ def execute_command(command_list):
 def get_statmix_stats():
     stats = ["hwe", "pop_structure", "sfs", "generic_stats", "fsc"]
     statmix_path = os.path.join("/home/raya/Documents/Projects/hops_pipeline/statMix", "statmix.py")
-    vcf_path = "/home/raya/Documents/Projects/hops_pipeline/input_data/hops.vcf"
+    vcf_path = "/home/raya/Documents/Projects/hops_pipeline/data/input/summary_statistics/hops.vcf"
     output_prefix = "hops"
 
     command = [