transfer from ldhtnp/pVACcompare

griffithlab · Jan 17, 2025 · 2c36bdd · 2c36bdd
1 parent 5f668a5
commit 2c36bdd
Show file tree

Hide file tree

Showing 55 changed files with 21,639 additions and 0 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,41 @@
+name: pVACcompare tests
+
+on:  # Triggers: pushes to main and pull requests targeting main
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['3.9', '3.10']  # The job runs once per listed Python version
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}  # Version taken from the matrix entry above
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install coverage
+      
+      - name: Change working directory and run tests with coverage
+        run: |
+          cd pvaccompare
+          coverage run -m unittest discover tests/
+      
+      - name: Generate coverage report
+        run: |
+          cd pvaccompare
+          coverage report
diff --git a/README.md b/README.md
@@ -1 +1,36 @@
 # pVACcompare
+pVACcompare is a file comparison suite built for comparing results generated by [pVACtools](https://github.com/griffithlab/pVACtools) or the [GriffithLab](https://github.com/griffithlab) immuno pipeline. Comparison can be specified for MHC Class I results, MHC Class II results, or both. In its current state the following files are included in the comparison:
+- log/inputs.yml
+- all_epitopes.tsv
+- all_epitopes.aggregated.tsv
+- all_epitopes.aggregated.tsv.reference_matches
+- all_epitopes.aggregated.metrics.json (inputs only)
+## Installation
+pVACcompare is written for Linux; other operating systems have not been tested. pVACcompare has been tested with Python 3.9 and 3.10. Other versions may work, but have not been tested.
+### Dependencies
+To install dependencies, navigate inside the repository after cloning and run:<br>
+```bash
+pip3 install -r requirements.txt
+```
+## Usage
+pVACcompare offers several parameters that allow the user to have control of the comparisons. Running the following in the terminal will display the help menu with all of the available parameters and options:<br>
+```bash
+python3 run.py -h
+```
+An example of running the tool looks like the following:<br>
+```bash
+python3 run.py --pvactools_release --output_dir path/to/output/directory --mhc_class 1 --aggregated_columns 'Best Peptide', 'Best Transcript' version1/result version2/result
+```
+**Note**: You must specify whether the results are from pVACtools or the immuno pipeline. All columns specified must be in quotes and comma-separated. If you do not specify MHC Class, the tool will include both in the report. A list of available columns is displayed in the help menu.<br><br>
+The above command will perform an MHC Class I output comparison between two result folders generated by pVACtools, with only the specified columns included in the aggregated TSV comparison. Columns for the unaggregated TSV comparison and reference match TSV comparison were not specified, so the default columns will be used. Results will be generated in the specified output directory. If an output directory is not specified, one will be created inside ```pvaccompare/```.
+## Viewing Results
+After completing a run, a results folder containing the JSON files generated by the tool will be created in the output directory. pVACcompare provides an organized HTML report for efficient parsing and visualization of results. To view the HTML report:
+1. Navigate to the ```pvaccompare/``` directory.
+2. Run the following command to start the local server:
+```bash
+python3 server.py
+```
+3. The server will output a link where you can access the report. Open this link in your browser to view the available results.<br><br>
+Once on the report page:
+    - Select the results directory you'd like to explore.
+    - To return to the directory selection screen, click **pVACcompare** in the navigation bar.
diff --git a/pvaccompare/compare_tools/__init__.py b/pvaccompare/compare_tools/__init__.py
@@ -0,0 +1,2 @@
+from .comparison_router import run_comparison, prepare_results_folder
+from .validators import *
diff --git a/pvaccompare/compare_tools/comparison_router.py b/pvaccompare/compare_tools/comparison_router.py
@@ -0,0 +1,201 @@
+import glob
+import os
+import shutil
+import logging
+from datetime import datetime
+from runners import *
+
+
+def find_file(results_folder, subfolder, pattern):
+    """
+    Purpose:    Attempts to locate the files needed for each comparison
+    Modifies:   Nothing
+    Returns:    A string of the file path
+    """
+    search_path = os.path.join(results_folder, subfolder, pattern)
+    files = glob.glob(search_path, recursive=True)
+    return files[0] if files else None
+
+
+def prepare_results_folder(classes, base_output_dir):
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    unique_output_dir = f"{base_output_dir}/results_{timestamp}"
+
+    os.makedirs(unique_output_dir)
+
+    if "1" in classes:
+        os.makedirs(f"{unique_output_dir}/mhc_class_i")
+    if "2" in classes:
+        os.makedirs(f"{unique_output_dir}/mhc_class_ii")
+
+    return unique_output_dir
+
+
+def run_comparison(
+    class_type,
+    prefix,
+    results_folder1,
+    results_folder2,
+    output_dir,
+    aggregated_columns,
+    unaggregated_columns,
+    reference_match_columns,
+):
+    """
+    Purpose:    Runs all of the different comparisons
+    Modifies:   Nothing
+    Returns:    None
+    """
+    output_path = (
+        f'{output_dir}/{"mhc_class_i" if class_type == "1" else "mhc_class_ii"}'
+    )
+
+    if "pVACseq" not in prefix:
+        yml1_path = find_file(results_folder1, prefix + "/log", "inputs.yml")
+        yml2_path = find_file(results_folder2, prefix + "/log", "inputs.yml")
+        if yml1_path and yml2_path:
+            logging.info("Running the input YML comparison tool...")
+            run_compare_yml(yml1_path, yml2_path, output_path, class_type)
+            logging.info("\u2713 Comparison completed successfully.")
+        else:
+            if yml1_path:
+                logging.error(
+                    "ERROR: Could not locate the input YML file in results folder 2 for %s.",
+                    prefix,
+                )
+            elif yml2_path:
+                logging.error(
+                    "ERROR: Could not locate the input YML file in results folder 1 for %s.",
+                    prefix,
+                )
+            else:
+                logging.error(
+                    "ERROR: Could not locate the input YML file in either results folder for %s.",
+                    prefix,
+                )
+
+            logging.info("\u2716 Comparison skipped.")
+    else:
+        logging.info("Input YML files are not included in immuno pipeline results")
+        logging.info("\u2716 Comparison skipped.")
+
+    json1_path = find_file(
+        results_folder1, prefix + "/", "*all_epitopes.aggregated.metrics.json"
+    )
+    json2_path = find_file(
+        results_folder2, prefix + "/", "*all_epitopes.aggregated.metrics.json"
+    )
+    if json1_path and json2_path:
+        logging.info("\nRunning the metrics JSON comparison tool...")
+        run_compare_json(json1_path, json2_path, output_path, class_type)
+        logging.info("\u2713 Comparison completed successfully.")
+    else:
+        if json1_path:
+            logging.error(
+                "ERROR: Could not locate the metrics JSON file in results folder 2 for %s.",
+                prefix,
+            )
+        elif json2_path:
+            logging.error(
+                "ERROR: Could not locate the metrics JSON file in results folder 1 for %s.",
+                prefix,
+            )
+        else:
+            logging.error(
+                "ERROR: Could not locate the metrics JSON file in either results folder for %s.",
+                prefix,
+            )
+        logging.info("\u2716 Comparison skipped.")
+
+    agg_tsv1_path = find_file(
+        results_folder1, prefix + "/", "*all_epitopes.aggregated.tsv"
+    )
+    agg_tsv2_path = find_file(
+        results_folder2, prefix + "/", "*all_epitopes.aggregated.tsv"
+    )
+    if agg_tsv1_path and agg_tsv2_path:
+        logging.info("\nRunning the aggregated TSV comparison tool...")
+        run_compare_aggregated_tsv(
+            agg_tsv1_path, agg_tsv2_path, aggregated_columns, output_path, class_type
+        )
+        logging.info("\u2713 Comparison completed successfully.")
+    else:
+        if agg_tsv1_path:
+            logging.error(
+                "ERROR: Could not locate the aggregated TSV file in results folder 2 for %s.",
+                prefix,
+            )
+        elif agg_tsv2_path:
+            logging.error(
+                "ERROR: Could not locate the aggregated TSV file in results folder 1 for %s.",
+                prefix,
+            )
+        else:
+            logging.error(
+                "ERROR: Could not locate the aggregated TSV file in either results folder for %s.",
+                prefix,
+            )
+        logging.info("\u2716 Comparison skipped.")
+
+    unagg_tsv1_path = find_file(results_folder1, prefix + "/", "*all_epitopes.tsv")
+    unagg_tsv2_path = find_file(results_folder2, prefix + "/", "*all_epitopes.tsv")
+    if unagg_tsv1_path and unagg_tsv2_path:
+        logging.info("\nRunning the unaggregated TSV comparison tool...")
+        run_compare_unaggregated_tsv(
+            unagg_tsv1_path,
+            unagg_tsv2_path,
+            unaggregated_columns,
+            output_path,
+            class_type,
+        )
+        logging.info("\u2713 Comparison completed successfully.")
+    else:
+        if unagg_tsv1_path:
+            logging.error(
+                "ERROR: Could not locate the unaggregated TSV file in results folder 2 for %s.",
+                prefix,
+            )
+        elif unagg_tsv2_path:
+            logging.error(
+                "ERROR: Could not locate the unaggregated TSV file in results folder 1 for %s.",
+                prefix,
+            )
+        else:
+            logging.error(
+                "ERROR: Could not locate the unaggregated TSV file in either results folder for %s.",
+                prefix,
+            )
+        logging.info("\u2716 Comparison skipped.")
+
+    refmatch_tsv1_path = find_file(results_folder1, prefix + "/", "*.reference_matches")
+    refmatch_tsv2_path = find_file(results_folder2, prefix + "/", "*.reference_matches")
+    if refmatch_tsv1_path and refmatch_tsv2_path:
+        logging.info("\nRunning the reference match TSV comparison tool...")
+        run_compare_reference_matches_tsv(
+            refmatch_tsv1_path,
+            refmatch_tsv2_path,
+            reference_match_columns,
+            output_path,
+            class_type,
+        )
+        logging.info("\u2713 Comparison completed successfully.")
+    else:
+        if refmatch_tsv1_path:
+            logging.error(
+                "ERROR: Could not locate the reference match TSV file in results folder 2 for %s.",
+                prefix,
+            )
+        elif refmatch_tsv2_path:
+            logging.error(
+                "ERROR: Could not locate the reference match TSV file in results folder 1 for %s.",
+                prefix,
+            )
+        else:
+            logging.error(
+                "ERROR: Could not locate the reference match TSV file in either results folder for %s.",
+                prefix,
+            )
+        logging.info("\u2716 Comparison skipped.")
+    logging.info("\n" + "\u2500" * 55)
+    logging.info("Successfully generated %s comparison report.", prefix)
+    logging.info("\u2500" * 55)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .comparison_router import run_comparison, prepare_results_folder
		from .validators import *