From c7382e87fcd647cd4455af9c4b2e7b9e4c177920 Mon Sep 17 00:00:00 2001 From: Tianhao-Gu Date: Mon, 19 Aug 2024 08:06:00 -0500 Subject: [PATCH] include run_time --- src/loaders/compute_tools/checkm2/checkm2.py | 15 ++++++++----- .../compute_tools/checkm2/versions.yaml | 1 + src/loaders/compute_tools/eggnog/eggnog.py | 11 ++++++++++ .../compute_tools/eggnog/versions.yaml | 1 + src/loaders/compute_tools/gtdb_tk/gtdb_tk.py | 15 +++++++------ .../compute_tools/gtdb_tk/versions.yaml | 1 + src/loaders/compute_tools/mash/mash.py | 15 ++++++++++++- src/loaders/compute_tools/mash/versions.yaml | 3 ++- .../compute_tools/microtrait/microtrait.py | 21 +++++++++++++++++++ .../compute_tools/microtrait/versions.yaml | 7 ++++++- 10 files changed, 76 insertions(+), 14 deletions(-) diff --git a/src/loaders/compute_tools/checkm2/checkm2.py b/src/loaders/compute_tools/checkm2/checkm2.py index fb043d51..3cb8a8be 100644 --- a/src/loaders/compute_tools/checkm2/checkm2.py +++ b/src/loaders/compute_tools/checkm2/checkm2.py @@ -46,10 +46,6 @@ def _run_checkm2( # checkm2 will clear output_dir before it starts, which will delete any log files log_dir = output_dir.parent / ("checkm2_log_" + output_dir.parts[-1]) run_command(command, log_dir if debug else None) - end_time = time.time() - print(f"Used {round((end_time - start) / 60, 2)} minutes to execute checkM2 predict " - + f"for {size} genomes" - ) tool_file_name, genome_id_col = 'quality_report.tsv', 'Name' genome_attri_docs = process_genome_attri_result(output_dir, @@ -70,6 +66,12 @@ def _run_checkm2( fatal_tuples.append(fatal_tuple) write_fatal_tuples_to_dict(fatal_tuples, output_dir) + end_time = time.time() + run_time = end_time - start + print(f"Used {round(run_time / 60, 2)} minutes to execute checkM2 predict " + + f"for {size} genomes" + ) + metadata = {'tool_name': 'checkm2', 'version': '1.0.1', 'command': command, @@ -77,7 +79,10 @@ def _run_checkm2( "version": None, "comment": "diamond_db, ver unknown", }, - 'ids_to_files': make_json_serializable(ids_to_files)} + 'ids_to_files': make_json_serializable(ids_to_files), + 'run_time': run_time, + 'batch_size': size, + } create_tool_metadata(output_dir, metadata) diff --git a/src/loaders/compute_tools/checkm2/versions.yaml b/src/loaders/compute_tools/checkm2/versions.yaml index 8e0ec7b2..21240e14 100644 --- a/src/loaders/compute_tools/checkm2/versions.yaml +++ b/src/loaders/compute_tools/checkm2/versions.yaml @@ -38,6 +38,7 @@ versions: - Create metadata file after running CheckM2 - Fix a typo for 'tool_name' metadata field - Add method to ensure 'ids_to_files' is JSON serializable + - Include execution time in metadata reference_db_version: 1.0.1 #Please keep this reminder at the end of this file diff --git a/src/loaders/compute_tools/eggnog/eggnog.py b/src/loaders/compute_tools/eggnog/eggnog.py index b133b20d..37f76112 100644 --- a/src/loaders/compute_tools/eggnog/eggnog.py +++ b/src/loaders/compute_tools/eggnog/eggnog.py @@ -5,6 +5,7 @@ Therefore, the parser program is not compatible with data generated by this tool. """ +import time from pathlib import Path from src.loaders.common.loader_common_names import TOOL_METADATA @@ -21,6 +22,9 @@ def _run_eggnog_single( threads_per_tool_run: int, debug: bool) -> None: + start = time.time() + print(f'Start executing EggNog for {data_id}') + metadata_file = output_dir / TOOL_METADATA if metadata_file.exists(): print(f"Skipping {source_file} as it has already been processed.") @@ -41,6 +45,11 @@ def _run_eggnog_single( run_command(command, output_dir if debug else None) + end_time = time.time() + run_time = end_time - start + print( + f'Used {round(run_time / 60, 2)} minutes to execute EggNog for {data_id}') + # Save run info to a metadata file in the output directory for parsing later metadata = {'source_file': str(source_file), 'input_type': INPUT_TYPE, @@ -51,6 +60,8 @@ def _run_eggnog_single( "reference_db": { "version": "5.0.2", }, + 'run_time': run_time, + 'batch_size': 1, } create_tool_metadata(output_dir, metadata) diff --git a/src/loaders/compute_tools/eggnog/versions.yaml b/src/loaders/compute_tools/eggnog/versions.yaml index 67c8b179..e9db90c4 100644 --- a/src/loaders/compute_tools/eggnog/versions.yaml +++ b/src/loaders/compute_tools/eggnog/versions.yaml @@ -27,6 +27,7 @@ versions: date: 2024-08-16 notes: | - Create metadata file after running Eggnog + - Include execution time in metadata reference_db_version: 5.0.2 #Please keep this reminder at the end of this file diff --git a/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py b/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py index ba218833..6d1d6c51 100644 --- a/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py +++ b/src/loaders/compute_tools/gtdb_tk/gtdb_tk.py @@ -98,11 +98,6 @@ def _run_gtdb_tk( print(f'running {" ".join(command)}') run_command(command, output_dir / "classify_wf_log" if debug else None) - end_time = time.time() - print( - f'Used {round((end_time - start) / 60, 2)} minutes to execute gtdbtk classify_wf for ' - f'{len(ids_to_files)} genomes') - summary_files = find_gtdbtk_summary_files(output_dir) if not summary_files: raise ValueError(f"No summary files exist for gtdb-tk in the specified " @@ -152,13 +147,21 @@ def _run_gtdb_tk( summary_files, ) + end_time = time.time() + run_time = end_time - start + print( + f'Used {round(run_time / 60, 2)} minutes to execute gtdbtk classify_wf for ' + f'{size} genomes') + metadata = {'tool_name': 'gtdb_tk', 'version': '2.3.2', 'command': command, "reference_db": { "version": "release214", }, - 'ids_to_files': make_json_serializable(ids_to_files)} + 'ids_to_files': make_json_serializable(ids_to_files), + 'run_time': run_time, + 'batch_size': size,} create_tool_metadata(output_dir, metadata) diff --git a/src/loaders/compute_tools/gtdb_tk/versions.yaml b/src/loaders/compute_tools/gtdb_tk/versions.yaml index 6235414d..28a0391d 100644 --- a/src/loaders/compute_tools/gtdb_tk/versions.yaml +++ b/src/loaders/compute_tools/gtdb_tk/versions.yaml @@ -47,6 +47,7 @@ versions: - Create metadata file after running GTDB-Tk - Fix a typo for 'tool_name' metadata field - Add method to ensure 'ids_to_files' is JSON serializable + - Include execution time in metadata reference_db_version: release214 #Please keep this reminder at the end of this file diff --git a/src/loaders/compute_tools/mash/mash.py b/src/loaders/compute_tools/mash/mash.py index a2987f29..53244704 100644 --- a/src/loaders/compute_tools/mash/mash.py +++ b/src/loaders/compute_tools/mash/mash.py @@ -1,6 +1,7 @@ """ Run Mash on a set of assemblies. """ +import time from pathlib import Path from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata @@ -18,6 +19,10 @@ def _run_mash_single( debug: bool, kmer_size: int = KMER_SIZE, sketch_size: int = SKETCH_SIZE) -> None: + + start = time.time() + print(f'Start executing Mash for {data_id}') + # RUN mash sketch for a single genome command = ['mash', 'sketch', '-o', source_file, # Output prefix. @@ -29,6 +34,11 @@ def _run_mash_single( run_command(command, output_dir if debug else None) + end_time = time.time() + run_time = end_time - start + print( + f'Used {round(run_time / 60, 2)} minutes to execute Mash for {data_id}') + # Save run info to a metadata file in the output directory for parsing later metadata = {'source_file': str(source_file), # Append '.msh' to the source file name to generate the sketch file name (default by Mash sketch) @@ -38,7 +48,10 @@ def _run_mash_single( 'data_id': data_id, 'tool_name': 'mash', 'version': '2.0', - 'command': command} + 'command': command, + 'run_time': run_time, + 'batch_size': 1, + } create_tool_metadata(output_dir, metadata) diff --git a/src/loaders/compute_tools/mash/versions.yaml b/src/loaders/compute_tools/mash/versions.yaml index f9b41945..ac29def2 100644 --- a/src/loaders/compute_tools/mash/versions.yaml +++ b/src/loaders/compute_tools/mash/versions.yaml @@ -20,4 +20,5 @@ versions: - version: 0.1.5 date: 2024-08-16 notes: | - - Create metadata file after running Mash \ No newline at end of file + - Create metadata file after running Mash + - Include execution time in metadata \ No newline at end of file diff --git a/src/loaders/compute_tools/microtrait/microtrait.py b/src/loaders/compute_tools/microtrait/microtrait.py index d2ae2144..3776fb40 100644 --- a/src/loaders/compute_tools/microtrait/microtrait.py +++ b/src/loaders/compute_tools/microtrait/microtrait.py @@ -2,6 +2,7 @@ Runs microtrait on a set of assemblies. """ import os +import time import uuid from pathlib import Path from typing import Any @@ -28,6 +29,7 @@ FatalTuple, ToolRunner, write_fatal_tuples_to_dict, + create_tool_metadata, ) from src.loaders.compute_tools.tool_result_parser import ( create_jsonl_files, @@ -208,6 +210,9 @@ def _run_microtrait( # since extract_traits function doesn't take the number of threads as an argument # https://github.com/ukaraoz/microtrait/blob/master/R/extract_traits.R#L22-L26 + start = time.time() + print(f'Start executing Microtrait for {data_id}') + # Load the R script as an R function r_script = """ library(microtrait) @@ -262,6 +267,22 @@ def _run_microtrait( create_jsonl_files(genome_dir / MICROTRAIT_CELLS, cells_meta) create_jsonl_files(genome_dir / MICROTRAIT_DATA, heatmap_row) + end_time = time.time() + run_time = end_time - start + print( + f'Used {round(run_time / 60, 2)} minutes to execute Microtrait for {data_id}') + + # Save run info to a metadata file in the output directory for parsing later + metadata = {'source_file': str(fna_file), + 'data_id': data_id, + 'tool_name': 'microtrait', + 'version': 'None', + 'command': 'None - R script', + 'run_time': run_time, + 'batch_size': 1, + } + create_tool_metadata(genome_dir, metadata) + def main(): runner = ToolRunner("microtrait") diff --git a/src/loaders/compute_tools/microtrait/versions.yaml b/src/loaders/compute_tools/microtrait/versions.yaml index 16c272aa..cd2e752c 100644 --- a/src/loaders/compute_tools/microtrait/versions.yaml +++ b/src/loaders/compute_tools/microtrait/versions.yaml @@ -22,4 +22,9 @@ versions: - version: 0.1.5 date: 2024-06-25 notes: | - - Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names \ No newline at end of file + - Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names + - version: 0.1.6 + date: 2024-08-16 + notes: | + - Create metadata file after running Microtrait + - Include execution time in metadata \ No newline at end of file