Skip to content

Commit

Permalink
include run_time
Browse files Browse the repository at this point in the history
  • Loading branch information
Tianhao-Gu committed Aug 19, 2024
1 parent 765ad5e commit c7382e8
Show file tree
Hide file tree
Showing 10 changed files with 76 additions and 14 deletions.
15 changes: 10 additions & 5 deletions src/loaders/compute_tools/checkm2/checkm2.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,6 @@ def _run_checkm2(
# checkm2 will clear output_dir before it starts, which will delete any log files
log_dir = output_dir.parent / ("checkm2_log_" + output_dir.parts[-1])
run_command(command, log_dir if debug else None)
- end_time = time.time()
- print(f"Used {round((end_time - start) / 60, 2)} minutes to execute checkM2 predict "
- + f"for {size} genomes"
- )

tool_file_name, genome_id_col = 'quality_report.tsv', 'Name'
genome_attri_docs = process_genome_attri_result(output_dir,
Expand All @@ -70,14 +66,23 @@ def _run_checkm2(
fatal_tuples.append(fatal_tuple)
write_fatal_tuples_to_dict(fatal_tuples, output_dir)

+ end_time = time.time()
+ run_time = end_time - start
+ print(f"Used {round(run_time / 60, 2)} minutes to execute checkM2 predict "
+ + f"for {size} genomes"
+ )

metadata = {'tool_name': 'checkm2',
'version': '1.0.1',
'command': command,
"reference_db": {
"version": None,
"comment": "diamond_db, ver unknown",
},
- 'ids_to_files': make_json_serializable(ids_to_files)}
+ 'ids_to_files': make_json_serializable(ids_to_files),
+ 'run_time': run_time,
+ 'batch_size': size,
+ }
create_tool_metadata(output_dir, metadata)


Expand Down
1 change: 1 addition & 0 deletions src/loaders/compute_tools/checkm2/versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ versions:
- Create metadata file after running CheckM2
- Fix a typo for 'tool_name' metadata field
- Add method to ensure 'ids_to_files' is JSON serializable
- Include execution time in metadata
reference_db_version: 1.0.1

#Please keep this reminder at the end of this file
Expand Down
11 changes: 11 additions & 0 deletions src/loaders/compute_tools/eggnog/eggnog.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Therefore, the parser program is not compatible with data generated by this tool.
"""
import time
from pathlib import Path

from src.loaders.common.loader_common_names import TOOL_METADATA
Expand All @@ -21,6 +22,9 @@ def _run_eggnog_single(
threads_per_tool_run: int,
debug: bool) -> None:

start = time.time()
print(f'Start executing EggNog for {data_id}')

metadata_file = output_dir / TOOL_METADATA
if metadata_file.exists():
print(f"Skipping {source_file} as it has already been processed.")
Expand All @@ -41,6 +45,11 @@ def _run_eggnog_single(

run_command(command, output_dir if debug else None)

end_time = time.time()
run_time = end_time - start
print(
f'Used {round(run_time / 60, 2)} minutes to execute EggNog for {data_id}')

# Save run info to a metadata file in the output directory for parsing later
metadata = {'source_file': str(source_file),
'input_type': INPUT_TYPE,
Expand All @@ -51,6 +60,8 @@ def _run_eggnog_single(
"reference_db": {
"version": "5.0.2",
},
'run_time': run_time,
'batch_size': 1,
}
create_tool_metadata(output_dir, metadata)

Expand Down
1 change: 1 addition & 0 deletions src/loaders/compute_tools/eggnog/versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ versions:
date: 2024-08-16
notes: |
- Create metadata file after running Eggnog
- Include execution time in metadata
reference_db_version: 5.0.2

#Please keep this reminder at the end of this file
Expand Down
15 changes: 9 additions & 6 deletions src/loaders/compute_tools/gtdb_tk/gtdb_tk.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,6 @@ def _run_gtdb_tk(
print(f'running {" ".join(command)}')
run_command(command, output_dir / "classify_wf_log" if debug else None)

- end_time = time.time()
- print(
- f'Used {round((end_time - start) / 60, 2)} minutes to execute gtdbtk classify_wf for '
- f'{len(ids_to_files)} genomes')

summary_files = find_gtdbtk_summary_files(output_dir)
if not summary_files:
raise ValueError(f"No summary files exist for gtdb-tk in the specified "
Expand Down Expand Up @@ -152,13 +147,21 @@ def _run_gtdb_tk(
summary_files,
)

+ end_time = time.time()
+ run_time = end_time - start
+ print(
+ f'Used {round(run_time / 60, 2)} minutes to execute gtdbtk classify_wf for '
+ f'{size} genomes')

metadata = {'tool_name': 'gtdb_tk',
'version': '2.3.2',
'command': command,
"reference_db": {
"version": "release214",
},
- 'ids_to_files': make_json_serializable(ids_to_files)}
+ 'ids_to_files': make_json_serializable(ids_to_files),
+ 'run_time': run_time,
+ 'batch_size': size,}
create_tool_metadata(output_dir, metadata)


Expand Down
1 change: 1 addition & 0 deletions src/loaders/compute_tools/gtdb_tk/versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ versions:
- Create metadata file after running GTDB-Tk
- Fix a typo for 'tool_name' metadata field
- Add method to ensure 'ids_to_files' is JSON serializable
- Include execution time in metadata
reference_db_version: release214

#Please keep this reminder at the end of this file
Expand Down
15 changes: 14 additions & 1 deletion src/loaders/compute_tools/mash/mash.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Run Mash on a set of assemblies.
"""
import time
from pathlib import Path

from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata
Expand All @@ -18,6 +19,10 @@ def _run_mash_single(
debug: bool,
kmer_size: int = KMER_SIZE,
sketch_size: int = SKETCH_SIZE) -> None:

start = time.time()
print(f'Start executing Mash for {data_id}')

# RUN mash sketch for a single genome
command = ['mash', 'sketch',
'-o', source_file, # Output prefix.
Expand All @@ -29,6 +34,11 @@ def _run_mash_single(

run_command(command, output_dir if debug else None)

end_time = time.time()
run_time = end_time - start
print(
f'Used {round(run_time / 60, 2)} minutes to execute Mash for {data_id}')

# Save run info to a metadata file in the output directory for parsing later
metadata = {'source_file': str(source_file),
# Append '.msh' to the source file name to generate the sketch file name (default by Mash sketch)
Expand All @@ -38,7 +48,10 @@ def _run_mash_single(
'data_id': data_id,
'tool_name': 'mash',
'version': '2.0',
- 'command': command}
+ 'command': command,
+ 'run_time': run_time,
+ 'batch_size': 1,
+ }
create_tool_metadata(output_dir, metadata)


Expand Down
3 changes: 2 additions & 1 deletion src/loaders/compute_tools/mash/versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@ versions:
- version: 0.1.5
date: 2024-08-16
notes: |
- - Create metadata file after running Mash
+ - Create metadata file after running Mash
+ - Include execution time in metadata
21 changes: 21 additions & 0 deletions src/loaders/compute_tools/microtrait/microtrait.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Runs microtrait on a set of assemblies.
"""
import os
import time
import uuid
from pathlib import Path
from typing import Any
Expand All @@ -28,6 +29,7 @@
FatalTuple,
ToolRunner,
write_fatal_tuples_to_dict,
create_tool_metadata,
)
from src.loaders.compute_tools.tool_result_parser import (
create_jsonl_files,
Expand Down Expand Up @@ -208,6 +210,9 @@ def _run_microtrait(
# since extract_traits function doesn't take the number of threads as an argument
# https://github.com/ukaraoz/microtrait/blob/master/R/extract_traits.R#L22-L26

start = time.time()
print(f'Start executing Microtrait for {data_id}')

# Load the R script as an R function
r_script = """
library(microtrait)
Expand Down Expand Up @@ -262,6 +267,22 @@ def _run_microtrait(
create_jsonl_files(genome_dir / MICROTRAIT_CELLS, cells_meta)
create_jsonl_files(genome_dir / MICROTRAIT_DATA, heatmap_row)

end_time = time.time()
run_time = end_time - start
print(
f'Used {round(run_time / 60, 2)} minutes to execute Microtrait for {data_id}')

# Save run info to a metadata file in the output directory for parsing later
metadata = {'source_file': str(fna_file),
'data_id': data_id,
'tool_name': 'microtrait',
'version': 'None',
'command': 'None - R script',
'run_time': run_time,
'batch_size': 1,
}
create_tool_metadata(genome_dir, metadata)


def main():
runner = ToolRunner("microtrait")
Expand Down
7 changes: 6 additions & 1 deletion src/loaders/compute_tools/microtrait/versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,9 @@ versions:
- version: 0.1.5
date: 2024-06-25
notes: |
- - Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
+ - Bug fix: tool_common.py - Converted Data IDs to string format to ensure proper comparison with associated folder names
+ - version: 0.1.6
+ date: 2024-08-16
+ notes: |
+ - Create metadata file after running Microtrait
+ - Include execution time in metadata

0 comments on commit c7382e8

Please sign in to comment.