improved logic and updated README

griffithlab · Jan 28, 2025 · 65591d6 · 65591d6
1 parent 1de1b2e
commit 65591d6
Show file tree

Hide file tree

Showing 4 changed files with 103 additions and 116 deletions.
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 # pVACcompare
-pVACcompare is a file comparison suite built for comparing results generated by [pVACtools](https://github.com/griffithlab/pVACtools) or the [GriffithLab](https://github.com/griffithlab) immuno pipeline. Comparison can be specified for MHC Class I results, MHC Class II results, or both. In its current state the following files are included in the comparison:
+pVACcompare is a file comparison suite built for comparing results generated by [pVACtools](https://github.com/griffithlab/pVACtools). Comparison can be specified for MHC Class I results, MHC Class II results, or both. In its current state the following files are included in the comparison:
 - log/inputs.yml
 - all_epitopes.tsv
 - all_epitopes.aggregated.tsv
@@ -19,10 +19,10 @@ python3 run.py -h
 ```
 An example of running the tool looks like the following:<br>
 ```bash
-python3 run.py --pvactools_release --output_dir path/to/output/directory --mhc_class 1 --aggregated_columns 'Best Peptide', 'Best Transcript' version1/result version2/result
+python3 run.py --output_dir path/to/output/directory --mhc_class 1 --aggregated_columns 'Best Peptide', 'Best Transcript' version1/result version2/result
 ```
-**Note**: You must specify if the results are from pVACtools or the immuno pipeline. All columns specified must be in quotes and comma separated. If you do not specify MHC Class, the tool will include both in the report. A list of available columns is displayed in the help menu.<br><br>
-The above command will perform a MHC Class I output comparison between two result folders generated by pVACtools only with the specified columns included in the aggregated tsv comparison. Columns for the unaggregated tsv comparison and reference match tsv comparison were not specified, so the default columns will be used. Results will be generated in the specified output directory. If an output directory is not specified, one will be created inside ```pvaccompare/```.
+**Note**: All columns specified must be in quotes and comma separated. If you do not specify MHC Class, the tool will include both in the report. A list of available columns is displayed in the help menu.<br><br>
+The above command will perform an MHC Class I output comparison, with only the specified columns included in the aggregated tsv comparison. Columns for the unaggregated tsv comparison and reference match tsv comparison were not specified, so the default columns will be used. Results will be generated in the specified output directory. If an output directory is not specified, one will be created inside ```pvaccompare/```.
 ## Viewing Results
 After completing a run, a results folder containing the JSON files generated by the tool will be created in the output directory. pVACcompare provides an organized HTML report for efficient parsing and visualization of results. To view the HTML report:
 1. Navigate to the ```pvaccompare/``` directory.
@@ -33,4 +33,5 @@ python3 server.py
 3. The server will output a link where you can access the report. Open this link in your browser to view the available results.<br><br>
 Once on the report page:
     - Select the results directory you'd like to explore.
-    - To return to the directory selection screen, click **pVACcompare** in the navigation bar.
+    - To return to the directory selection screen, click **pVACcompare** in the navigation bar.
+    - Click the MHC Class dropdown in the navigation bar to switch between classes if both were included.
diff --git a/pvaccompare/compare_tools/__init__.py b/pvaccompare/compare_tools/__init__.py
@@ -1,2 +1,2 @@
-from .comparison_router import run_comparison, prepare_results_folder
+from .comparison_router import run_comparison
 from .validators import *
diff --git a/pvaccompare/compare_tools/comparison_router.py b/pvaccompare/compare_tools/comparison_router.py
@@ -1,8 +1,6 @@
 import glob
 import os
-import shutil
 import logging
-from datetime import datetime
 from runners import *
 
 
@@ -17,23 +15,23 @@ def find_file(results_folder, subfolder, pattern):
     return files[0] if files else None
 
 
-def prepare_results_folder(classes, base_output_dir):
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    unique_output_dir = f"{base_output_dir}/results_{timestamp}"
-
-    os.makedirs(unique_output_dir)
-
-    if "1" in classes:
-        os.makedirs(f"{unique_output_dir}/mhc_class_i")
-    if "2" in classes:
-        os.makedirs(f"{unique_output_dir}/mhc_class_ii")
-
-    return unique_output_dir
+def get_prefix(class_type, results_folder):
+    if os.path.exists(os.path.join(results_folder, "MHC_Class_I")) or os.path.exists(
+        os.path.join(results_folder, "MHC_Class_II")
+    ):
+        return "MHC_Class_I" if class_type == "1" else "MHC_Class_II"
+    elif os.path.exists(
+        os.path.join(results_folder, "pVACseq/mhc_i")
+    ) or os.path.exists(os.path.join(results_folder, "pVACseq/mhc_ii")):
+        return "pVACseq/mhc_i" if class_type == "1" else "pVACseq/mhc_ii"
+    else:
+        raise FileNotFoundError(
+            f"Could not locate result files for folder: {results_folder}"
+        )
 
 
 def run_comparison(
     class_type,
-    prefix,
     results_folder1,
     results_folder2,
     output_dir,
@@ -46,44 +44,42 @@ def run_comparison(
     Modifies:   Nothing
     Returns:    None
     """
+    folder1_prefix = get_prefix(class_type, results_folder1)
+    folder2_prefix = get_prefix(class_type, results_folder2)
     output_path = (
         f'{output_dir}/{"mhc_class_i" if class_type == "1" else "mhc_class_ii"}'
     )
 
-    if "pVACseq" not in prefix:
-        yml1_path = find_file(results_folder1, prefix + "/log", "inputs.yml")
-        yml2_path = find_file(results_folder2, prefix + "/log", "inputs.yml")
-        if yml1_path and yml2_path:
-            logging.info("Running the input YML comparison tool...")
-            run_compare_yml(yml1_path, yml2_path, output_path, class_type)
-            logging.info("\u2713 Comparison completed successfully.")
+    yml1_path = find_file(results_folder1, folder1_prefix + "/log", "inputs.yml")
+    yml2_path = find_file(results_folder2, folder2_prefix + "/log", "inputs.yml")
+    if yml1_path and yml2_path:
+        logging.info("Running the input YML comparison tool...")
+        run_compare_yml(yml1_path, yml2_path, output_path, class_type)
+        logging.info("\u2713 Comparison completed successfully.")
+    else:
+        if yml1_path:
+            logging.error(
+                "ERROR: Could not locate the input YML file in results folder 2 for MHC Class %s.",
+                "I" if class_type == "1" else "II",
+            )
+        elif yml2_path:
+            logging.error(
+                "ERROR: Could not locate the input YML file in results folder 1 for MHC Class %s.",
+                "I" if class_type == "1" else "II",
+            )
         else:
-            if yml1_path:
-                logging.error(
-                    "ERROR: Could not locate the input YML file in results folder 2 for %s.",
-                    prefix,
-                )
-            elif yml2_path:
-                logging.error(
-                    "ERROR: Could not locate the input YML file in results folder 1 for %s.",
-                    prefix,
-                )
-            else:
-                logging.error(
-                    "ERROR: Could not locate the input YML file in either results folder for %s.",
-                    prefix,
-                )
+            logging.error(
+                "ERROR: Could not locate the input YML file in either results folder for MHC Class %s.",
+                "I" if class_type == "1" else "II",
+            )
 
-            logging.info("\u2716 Comparison skipped.")
-    else:
-        logging.info("Input YML files are not included in immuno pipeline results")
         logging.info("\u2716 Comparison skipped.")
 
     json1_path = find_file(
-        results_folder1, prefix + "/", "*all_epitopes.aggregated.metrics.json"
+        results_folder1, folder1_prefix + "/", "*all_epitopes.aggregated.metrics.json"
     )
     json2_path = find_file(
-        results_folder2, prefix + "/", "*all_epitopes.aggregated.metrics.json"
+        results_folder2, folder2_prefix + "/", "*all_epitopes.aggregated.metrics.json"
     )
     if json1_path and json2_path:
         logging.info("\nRunning the metrics JSON comparison tool...")
@@ -92,26 +88,26 @@ def run_comparison(
     else:
         if json1_path:
             logging.error(
-                "ERROR: Could not locate the metrics JSON file in results folder 2 for %s.",
-                prefix,
+                "ERROR: Could not locate the metrics JSON file in results folder 2 for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         elif json2_path:
             logging.error(
-                "ERROR: Could not locate the metrics JSON file in results folder 1 for %s.",
-                prefix,
+                "ERROR: Could not locate the metrics JSON file in results folder 1 for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         else:
             logging.error(
-                "ERROR: Could not locate the metrics JSON file in either results folder for %s.",
-                prefix,
+                "ERROR: Could not locate the metrics JSON file in either results folder for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         logging.info("\u2716 Comparison skipped.")
 
     agg_tsv1_path = find_file(
-        results_folder1, prefix + "/", "*all_epitopes.aggregated.tsv"
+        results_folder1, folder1_prefix + "/", "*all_epitopes.aggregated.tsv"
     )
     agg_tsv2_path = find_file(
-        results_folder2, prefix + "/", "*all_epitopes.aggregated.tsv"
+        results_folder2, folder2_prefix + "/", "*all_epitopes.aggregated.tsv"
     )
     if agg_tsv1_path and agg_tsv2_path:
         logging.info("\nRunning the aggregated TSV comparison tool...")
@@ -122,23 +118,27 @@ def run_comparison(
     else:
         if agg_tsv1_path:
             logging.error(
-                "ERROR: Could not locate the aggregated TSV file in results folder 2 for %s.",
-                prefix,
+                "ERROR: Could not locate the aggregated TSV file in results folder 2 for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         elif agg_tsv2_path:
             logging.error(
-                "ERROR: Could not locate the aggregated TSV file in results folder 1 for %s.",
-                prefix,
+                "ERROR: Could not locate the aggregated TSV file in results folder 1 for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         else:
             logging.error(
-                "ERROR: Could not locate the aggregated TSV file in either results folder for %s.",
-                prefix,
+                "ERROR: Could not locate the aggregated TSV file in either results folder for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         logging.info("\u2716 Comparison skipped.")
 
-    unagg_tsv1_path = find_file(results_folder1, prefix + "/", "*all_epitopes.tsv")
-    unagg_tsv2_path = find_file(results_folder2, prefix + "/", "*all_epitopes.tsv")
+    unagg_tsv1_path = find_file(
+        results_folder1, folder1_prefix + "/", "*all_epitopes.tsv"
+    )
+    unagg_tsv2_path = find_file(
+        results_folder2, folder2_prefix + "/", "*all_epitopes.tsv"
+    )
     if unagg_tsv1_path and unagg_tsv2_path:
         logging.info("\nRunning the unaggregated TSV comparison tool...")
         run_compare_unaggregated_tsv(
@@ -152,23 +152,27 @@ def run_comparison(
     else:
         if unagg_tsv1_path:
             logging.error(
-                "ERROR: Could not locate the unaggregated TSV file in results folder 2 for %s.",
-                prefix,
+                "ERROR: Could not locate the unaggregated TSV file in results folder 2 for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         elif unagg_tsv2_path:
             logging.error(
-                "ERROR: Could not locate the unaggregated TSV file in results folder 1 for %s.",
-                prefix,
+                "ERROR: Could not locate the unaggregated TSV file in results folder 1 for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         else:
             logging.error(
-                "ERROR: Could not locate the unaggregated TSV file in either results folder for %s.",
-                prefix,
+                "ERROR: Could not locate the unaggregated TSV file in either results folder for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         logging.info("\u2716 Comparison skipped.")
 
-    refmatch_tsv1_path = find_file(results_folder1, prefix + "/", "*.reference_matches")
-    refmatch_tsv2_path = find_file(results_folder2, prefix + "/", "*.reference_matches")
+    refmatch_tsv1_path = find_file(
+        results_folder1, folder1_prefix + "/", "*.reference_matches"
+    )
+    refmatch_tsv2_path = find_file(
+        results_folder2, folder2_prefix + "/", "*.reference_matches"
+    )
     if refmatch_tsv1_path and refmatch_tsv2_path:
         logging.info("\nRunning the reference match TSV comparison tool...")
         run_compare_reference_matches_tsv(
@@ -182,20 +186,23 @@ def run_comparison(
     else:
         if refmatch_tsv1_path:
             logging.error(
-                "ERROR: Could not locate the reference match TSV file in results folder 2 for %s.",
-                prefix,
+                "ERROR: Could not locate the reference match TSV file in results folder 2 for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         elif refmatch_tsv2_path:
             logging.error(
-                "ERROR: Could not locate the reference match TSV file in results folder 1 for %s.",
-                prefix,
+                "ERROR: Could not locate the reference match TSV file in results folder 1 for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         else:
             logging.error(
-                "ERROR: Could not locate the reference match TSV file in either results folder for %s.",
-                prefix,
+                "ERROR: Could not locate the reference match TSV file in either results folder for MHC Class %s.",
+                "I" if class_type == "1" else "II",
             )
         logging.info("\u2716 Comparison skipped.")
     logging.info("\n" + "\u2500" * 55)
-    logging.info("Successfully generated %s comparison report.", prefix)
+    logging.info(
+        "Successfully generated MHC Class %s comparison report.",
+        "I" if class_type == "1" else "II",
+    )
     logging.info("\u2500" * 55)
diff --git a/pvaccompare/run.py b/pvaccompare/run.py
@@ -1,6 +1,7 @@
 from compare_tools import *
 import argparse
 import logging
+from datetime import datetime
 import os
 
 logging.basicConfig(level=logging.DEBUG, format="%(message)s")
@@ -152,24 +153,18 @@ def define_parser():
     return parser
 
 
-def determine_release_type(folder):
-    """
-    Purpose:    Determines the release type based on the presence of specific subdirectories
-    Modifies:   Nothing
-    Returns:    String, release type
-    """
-    if os.path.exists(os.path.join(folder, "MHC_Class_I")) or os.path.exists(
-        os.path.join(folder, "MHC_Class_II")
-    ):
-        return "immuno_release"
-    elif os.path.exists(os.path.join(folder, "pVACseq/mhc_i")) or os.path.exists(
-        os.path.join(folder, "pVACseq/mhc_ii")
-    ):
-        return "pvactools_release"
-    else:
-        raise FileNotFoundError(
-            f"Could not determine release type for folder: {folder}"
-        )
+def prepare_results_folder(classes, base_output_dir):
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    unique_output_dir = f"{base_output_dir}/results_{timestamp}"
+
+    os.makedirs(unique_output_dir)
+
+    if "1" in classes:
+        os.makedirs(f"{unique_output_dir}/mhc_class_i")
+    if "2" in classes:
+        os.makedirs(f"{unique_output_dir}/mhc_class_ii")
+
+    return unique_output_dir
 
 
 def main():
@@ -185,28 +180,12 @@ def main():
     validate_unaggregated_columns(args.unaggregated_columns, parser)
     validate_reference_match_columns(args.reference_match_columns, parser)
 
-    release_type1 = determine_release_type(args.results_folder1)
-    release_type2 = determine_release_type(args.results_folder2)
-
-    if release_type1 != release_type2:
-        raise ValueError(
-            "ERROR: You are trying to compare a pVACtools release with an immuno release"
-        )
-
-    release_type = release_type1
-
-    classes_to_run = [args.mhc_class] if args.mhc_class else ["1", "2"]
-
-    output_dir = prepare_results_folder(classes_to_run, args.output_dir)
+    classes = [args.mhc_class] if args.mhc_class else ["1", "2"]
+    output_dir = prepare_results_folder(classes, args.output_dir)
 
-    for class_type in classes_to_run:
-        if release_type == "immuno_release":
-            prefix = "MHC_Class_I" if class_type == "1" else "MHC_Class_II"
-        else:
-            prefix = "pVACseq/mhc_i" if class_type == "1" else "pVACseq/mhc_ii"
+    for class_type in classes:
         run_comparison(
             class_type,
-            prefix,
             args.results_folder1,
             args.results_folder2,
             output_dir,