Skip to content

Commit

Permalink
improved logic and updated README
Browse files Browse the repository at this point in the history
  • Loading branch information
ldhtnp committed Jan 28, 2025
1 parent 1de1b2e commit 65591d6
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 116 deletions.
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# pVACcompare
pVACcompare is a file comparison suite built for comparing results generated by [pVACtools](https://github.com/griffithlab/pVACtools) or the [GriffithLab](https://github.com/griffithlab) immuno pipeline. Comparison can be specified for MHC Class I results, MHC Class II results, or both. In its current state the following files are included in the comparison:
pVACcompare is a file comparison suite built for comparing results generated by [pVACtools](https://github.com/griffithlab/pVACtools). Comparison can be specified for MHC Class I results, MHC Class II results, or both. In its current state the following files are included in the comparison:
- log/inputs.yml
- all_epitopes.tsv
- all_epitopes.aggregated.tsv
Expand All @@ -19,10 +19,10 @@ python3 run.py -h
```
An example of running the tool looks like the following:<br>
```bash
python3 run.py --pvactools_release --output_dir path/to/output/directory --mhc_class 1 --aggregated_columns 'Best Peptide', 'Best Transcript' version1/result version2/result
python3 run.py --output_dir path/to/output/directory --mhc_class 1 --aggregated_columns 'Best Peptide', 'Best Transcript' version1/result version2/result
```
**Note**: You must specify if the results are from pVACtools or the immuno pipeline. All columns specified must be in quotes and comma separated. If you do not specify MHC Class, the tool will include both in the report. A list of available columns is displayed in the help menu.<br><br>
The above command will perform a MHC Class I output comparison between two result folders generated by pVACtools only with the specified columns included in the aggregated tsv comparison. Columns for the unaggregated tsv comparison and reference match tsv comparison were not specified, so the default columns will be used. Results will be generated in the specified output directory. If an output directory is not specified, one will be created inside ```pvaccompare/```.
**Note**: All columns specified must be in quotes and comma separated. If you do not specify MHC Class, the tool will include both in the report. A list of available columns is displayed in the help menu.<br><br>
The above command will perform an MHC Class I output comparison only with the specified columns included in the aggregated tsv comparison. Columns for the unaggregated tsv comparison and reference match tsv comparison were not specified, so the default columns will be used. Results will be generated in the specified output directory. If an output directory is not specified, one will be created inside ```pvaccompare/```.
## Viewing Results
After completing a run, a results folder containing the JSON files generated by the tool will be created in the output directory. pVACcompare provides an organized HTML report for efficient parsing and visualization of results. To view the HTML report:
1. Navigate to the ```pvaccompare/``` directory.
Expand All @@ -33,4 +33,5 @@ python3 server.py
3. The server will output a link where you can access the report. Open this link in your browser to view the available results.<br><br>
Once on the report page:
- Select the results directory you'd like to explore.
- To return to the directory selection screen, click **pVACcompare** in the navigation bar.
- To return to the directory selection screen, click **pVACcompare** in the navigation bar.
- Click the MHC Class dropdown in the navigation bar to switch between classes if both were included.
2 changes: 1 addition & 1 deletion pvaccompare/compare_tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .comparison_router import run_comparison, prepare_results_folder
from .comparison_router import run_comparison
from .validators import *
153 changes: 80 additions & 73 deletions pvaccompare/compare_tools/comparison_router.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import glob
import os
import shutil
import logging
from datetime import datetime
from runners import *


Expand All @@ -17,23 +15,23 @@ def find_file(results_folder, subfolder, pattern):
return files[0] if files else None


def prepare_results_folder(classes, base_output_dir):
    """
    Purpose:  Creates a timestamped results directory inside base_output_dir,
              with one subdirectory per requested MHC class
    Modifies: The filesystem (creates the results directory and subdirectories)
    Returns:  String, path of the newly created results directory
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # os.path.join avoids a doubled separator when base_output_dir ends with
    # "/" and keeps the path relative when base_output_dir is empty, rather
    # than resolving to "/results_<timestamp>" at the filesystem root.
    unique_output_dir = os.path.join(base_output_dir, f"results_{timestamp}")

    # No exist_ok: os.makedirs raises FileExistsError if this directory already
    # exists (e.g. two runs started within the same second).
    os.makedirs(unique_output_dir)

    if "1" in classes:
        os.makedirs(os.path.join(unique_output_dir, "mhc_class_i"))
    if "2" in classes:
        os.makedirs(os.path.join(unique_output_dir, "mhc_class_ii"))

    return unique_output_dir
def get_prefix(class_type, results_folder):
    """
    Purpose:  Detects which directory layout a results folder uses and returns
              the subfolder prefix for the requested MHC class
    Modifies: Nothing
    Returns:  String, "MHC_Class_I" / "MHC_Class_II" or
              "pVACseq/mhc_i" / "pVACseq/mhc_ii"
    Raises:   FileNotFoundError if neither layout is found in results_folder
    """
    # (class I prefix, class II prefix) for each known layout, checked in order.
    layouts = (
        ("MHC_Class_I", "MHC_Class_II"),
        ("pVACseq/mhc_i", "pVACseq/mhc_ii"),
    )

    for class_i_prefix, class_ii_prefix in layouts:
        # A layout is recognised when either of its class directories exists.
        # isdir (rather than exists) so that a regular file that happens to
        # carry one of these names is not mistaken for a results directory.
        if any(
            os.path.isdir(os.path.join(results_folder, candidate))
            for candidate in (class_i_prefix, class_ii_prefix)
        ):
            # The returned prefix is chosen by class_type only; the directory
            # for that specific class is not guaranteed to exist.
            return class_i_prefix if class_type == "1" else class_ii_prefix

    raise FileNotFoundError(
        f"Could not locate result files for folder: {results_folder}"
    )


def run_comparison(
class_type,
prefix,
results_folder1,
results_folder2,
output_dir,
Expand All @@ -46,44 +44,42 @@ def run_comparison(
Modifies: Nothing
Returns: None
"""
folder1_prefix = get_prefix(class_type, results_folder1)
folder2_prefix = get_prefix(class_type, results_folder2)
output_path = (
f'{output_dir}/{"mhc_class_i" if class_type == "1" else "mhc_class_ii"}'
)

if "pVACseq" not in prefix:
yml1_path = find_file(results_folder1, prefix + "/log", "inputs.yml")
yml2_path = find_file(results_folder2, prefix + "/log", "inputs.yml")
if yml1_path and yml2_path:
logging.info("Running the input YML comparison tool...")
run_compare_yml(yml1_path, yml2_path, output_path, class_type)
logging.info("\u2713 Comparison completed successfully.")
yml1_path = find_file(results_folder1, folder1_prefix + "/log", "inputs.yml")
yml2_path = find_file(results_folder2, folder2_prefix + "/log", "inputs.yml")
if yml1_path and yml2_path:
logging.info("Running the input YML comparison tool...")
run_compare_yml(yml1_path, yml2_path, output_path, class_type)
logging.info("\u2713 Comparison completed successfully.")
else:
if yml1_path:
logging.error(
"ERROR: Could not locate the input YML file in results folder 2 for MHC Class %s.",
"I" if class_type == "1" else "II",
)
elif yml2_path:
logging.error(
"ERROR: Could not locate the input YML file in results folder 1 for MHC Class %s.",
"I" if class_type == "1" else "II",
)
else:
if yml1_path:
logging.error(
"ERROR: Could not locate the input YML file in results folder 2 for %s.",
prefix,
)
elif yml2_path:
logging.error(
"ERROR: Could not locate the input YML file in results folder 1 for %s.",
prefix,
)
else:
logging.error(
"ERROR: Could not locate the input YML file in either results folder for %s.",
prefix,
)
logging.error(
"ERROR: Could not locate the input YML file in either results folder for MHC Class %s.",
"I" if class_type == "1" else "II",
)

logging.info("\u2716 Comparison skipped.")
else:
logging.info("Input YML files are not included in immuno pipeline results")
logging.info("\u2716 Comparison skipped.")

json1_path = find_file(
results_folder1, prefix + "/", "*all_epitopes.aggregated.metrics.json"
results_folder1, folder1_prefix + "/", "*all_epitopes.aggregated.metrics.json"
)
json2_path = find_file(
results_folder2, prefix + "/", "*all_epitopes.aggregated.metrics.json"
results_folder2, folder2_prefix + "/", "*all_epitopes.aggregated.metrics.json"
)
if json1_path and json2_path:
logging.info("\nRunning the metrics JSON comparison tool...")
Expand All @@ -92,26 +88,26 @@ def run_comparison(
else:
if json1_path:
logging.error(
"ERROR: Could not locate the metrics JSON file in results folder 2 for %s.",
prefix,
"ERROR: Could not locate the metrics JSON file in results folder 2 for MHC Class %s.",
"I" if class_type == "1" else "II",
)
elif json2_path:
logging.error(
"ERROR: Could not locate the metrics JSON file in results folder 1 for %s.",
prefix,
"ERROR: Could not locate the metrics JSON file in results folder 1 for MHC Class %s.",
"I" if class_type == "1" else "II",
)
else:
logging.error(
"ERROR: Could not locate the metrics JSON file in either results folder for %s.",
prefix,
"ERROR: Could not locate the metrics JSON file in either results folder for MHC Class %s.",
"I" if class_type == "1" else "II",
)
logging.info("\u2716 Comparison skipped.")

agg_tsv1_path = find_file(
results_folder1, prefix + "/", "*all_epitopes.aggregated.tsv"
results_folder1, folder1_prefix + "/", "*all_epitopes.aggregated.tsv"
)
agg_tsv2_path = find_file(
results_folder2, prefix + "/", "*all_epitopes.aggregated.tsv"
results_folder2, folder2_prefix + "/", "*all_epitopes.aggregated.tsv"
)
if agg_tsv1_path and agg_tsv2_path:
logging.info("\nRunning the aggregated TSV comparison tool...")
Expand All @@ -122,23 +118,27 @@ def run_comparison(
else:
if agg_tsv1_path:
logging.error(
"ERROR: Could not locate the aggregated TSV file in results folder 2 for %s.",
prefix,
"ERROR: Could not locate the aggregated TSV file in results folder 2 for MHC Class %s.",
"I" if class_type == "1" else "II",
)
elif agg_tsv2_path:
logging.error(
"ERROR: Could not locate the aggregated TSV file in results folder 1 for %s.",
prefix,
"ERROR: Could not locate the aggregated TSV file in results folder 1 for MHC Class %s.",
"I" if class_type == "1" else "II",
)
else:
logging.error(
"ERROR: Could not locate the aggregated TSV file in either results folder for %s.",
prefix,
"ERROR: Could not locate the aggregated TSV file in either results folder for MHC Class %s.",
"I" if class_type == "1" else "II",
)
logging.info("\u2716 Comparison skipped.")

unagg_tsv1_path = find_file(results_folder1, prefix + "/", "*all_epitopes.tsv")
unagg_tsv2_path = find_file(results_folder2, prefix + "/", "*all_epitopes.tsv")
unagg_tsv1_path = find_file(
results_folder1, folder1_prefix + "/", "*all_epitopes.tsv"
)
unagg_tsv2_path = find_file(
results_folder2, folder2_prefix + "/", "*all_epitopes.tsv"
)
if unagg_tsv1_path and unagg_tsv2_path:
logging.info("\nRunning the unaggregated TSV comparison tool...")
run_compare_unaggregated_tsv(
Expand All @@ -152,23 +152,27 @@ def run_comparison(
else:
if unagg_tsv1_path:
logging.error(
"ERROR: Could not locate the unaggregated TSV file in results folder 2 for %s.",
prefix,
"ERROR: Could not locate the unaggregated TSV file in results folder 2 for MHC Class %s.",
"I" if class_type == "1" else "II",
)
elif unagg_tsv2_path:
logging.error(
"ERROR: Could not locate the unaggregated TSV file in results folder 1 for %s.",
prefix,
"ERROR: Could not locate the unaggregated TSV file in results folder 1 for MHC Class %s.",
"I" if class_type == "1" else "II",
)
else:
logging.error(
"ERROR: Could not locate the unaggregated TSV file in either results folder for %s.",
prefix,
"ERROR: Could not locate the unaggregated TSV file in either results folder for MHC Class %s.",
"I" if class_type == "1" else "II",
)
logging.info("\u2716 Comparison skipped.")

refmatch_tsv1_path = find_file(results_folder1, prefix + "/", "*.reference_matches")
refmatch_tsv2_path = find_file(results_folder2, prefix + "/", "*.reference_matches")
refmatch_tsv1_path = find_file(
results_folder1, folder1_prefix + "/", "*.reference_matches"
)
refmatch_tsv2_path = find_file(
results_folder2, folder2_prefix + "/", "*.reference_matches"
)
if refmatch_tsv1_path and refmatch_tsv2_path:
logging.info("\nRunning the reference match TSV comparison tool...")
run_compare_reference_matches_tsv(
Expand All @@ -182,20 +186,23 @@ def run_comparison(
else:
if refmatch_tsv1_path:
logging.error(
"ERROR: Could not locate the reference match TSV file in results folder 2 for %s.",
prefix,
"ERROR: Could not locate the reference match TSV file in results folder 2 for MHC Class %s.",
"I" if class_type == "1" else "II",
)
elif refmatch_tsv2_path:
logging.error(
"ERROR: Could not locate the reference match TSV file in results folder 1 for %s.",
prefix,
"ERROR: Could not locate the reference match TSV file in results folder 1 for MHC Class %s.",
"I" if class_type == "1" else "II",
)
else:
logging.error(
"ERROR: Could not locate the reference match TSV file in either results folder for %s.",
prefix,
"ERROR: Could not locate the reference match TSV file in either results folder for MHC Class %s.",
"I" if class_type == "1" else "II",
)
logging.info("\u2716 Comparison skipped.")
logging.info("\n" + "\u2500" * 55)
logging.info("Successfully generated %s comparison report.", prefix)
logging.info(
"Successfully generated MHC Class %s comparison report.",
"I" if class_type == "1" else "II",
)
logging.info("\u2500" * 55)
53 changes: 16 additions & 37 deletions pvaccompare/run.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from compare_tools import *
import argparse
import logging
from datetime import datetime
import os

logging.basicConfig(level=logging.DEBUG, format="%(message)s")
Expand Down Expand Up @@ -152,24 +153,18 @@ def define_parser():
return parser


def determine_release_type(folder):
    """
    Purpose:  Determines the release type based on the presence of specific subdirectories
    Modifies: Nothing
    Returns:  String, release type ("immuno_release" or "pvactools_release")
    Raises:   FileNotFoundError if none of the expected subdirectories exist
    """
    # NOTE(review): the MHC_Class_* layout is labelled "immuno_release" and the
    # pVACseq/ layout "pvactools_release"; confirm these labels are not swapped.
    # The labels themselves are returned unchanged.
    layouts = (
        ("immuno_release", ("MHC_Class_I", "MHC_Class_II")),
        ("pvactools_release", ("pVACseq/mhc_i", "pVACseq/mhc_ii")),
    )

    for release_type, subdirectories in layouts:
        # isdir (rather than exists) so that a regular file that happens to
        # carry one of these names does not count as a results directory.
        if any(
            os.path.isdir(os.path.join(folder, subdirectory))
            for subdirectory in subdirectories
        ):
            return release_type

    raise FileNotFoundError(
        f"Could not determine release type for folder: {folder}"
    )
def prepare_results_folder(classes, base_output_dir):
    """
    Purpose:  Creates a timestamped results directory inside base_output_dir,
              with one subdirectory per requested MHC class
    Modifies: The filesystem (creates the results directory and subdirectories)
    Returns:  String, path of the newly created results directory
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # os.path.join avoids a doubled separator when base_output_dir ends with
    # "/" and keeps the path relative when base_output_dir is empty, rather
    # than resolving to "/results_<timestamp>" at the filesystem root.
    unique_output_dir = os.path.join(base_output_dir, f"results_{timestamp}")

    # No exist_ok: os.makedirs raises FileExistsError if this directory already
    # exists (e.g. two runs started within the same second).
    os.makedirs(unique_output_dir)

    if "1" in classes:
        os.makedirs(os.path.join(unique_output_dir, "mhc_class_i"))
    if "2" in classes:
        os.makedirs(os.path.join(unique_output_dir, "mhc_class_ii"))

    return unique_output_dir


def main():
Expand All @@ -185,28 +180,12 @@ def main():
validate_unaggregated_columns(args.unaggregated_columns, parser)
validate_reference_match_columns(args.reference_match_columns, parser)

release_type1 = determine_release_type(args.results_folder1)
release_type2 = determine_release_type(args.results_folder2)

if release_type1 != release_type2:
raise ValueError(
"ERROR: You are trying to compare a pVACtools release with an immuno release"
)

release_type = release_type1

classes_to_run = [args.mhc_class] if args.mhc_class else ["1", "2"]

output_dir = prepare_results_folder(classes_to_run, args.output_dir)
classes = [args.mhc_class] if args.mhc_class else ["1", "2"]
output_dir = prepare_results_folder(classes, args.output_dir)

for class_type in classes_to_run:
if release_type == "immuno_release":
prefix = "MHC_Class_I" if class_type == "1" else "MHC_Class_II"
else:
prefix = "pVACseq/mhc_i" if class_type == "1" else "pVACseq/mhc_ii"
for class_type in classes:
run_comparison(
class_type,
prefix,
args.results_folder1,
args.results_folder2,
output_dir,
Expand Down

0 comments on commit 65591d6

Please sign in to comment.