Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add unique log files for each mongo worker #2016

Merged
7 commits merged on Apr 5, 2024
Merged
15 changes: 10 additions & 5 deletions n3fit/src/n3fit/hyper_optimization/mongofiletrials.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Hyperopt trial object for parallel hyperoptimization with MongoDB.
Data are fetched from MongoDB databases and stored in the form of json and tar.gz files within the nnfit folder.
"""

import json
import logging
import os
Expand Down Expand Up @@ -154,6 +155,7 @@ def __init__(
f"mongo://{self.db_host}:{self.db_port}/{self._process_db_name(self.db_name)}/jobs"
)
self.workers = []
self.output_folder_name = replica_path.parts[-3]

self._store_trial = False
self._json_file = replica_path / "tries.json"
Expand Down Expand Up @@ -195,7 +197,6 @@ def refresh(self):

# write json to disk
if self._store_trial:
log.info("Storing scan in %s", self._json_file)
local_trials = []
for idx, t in enumerate(self._dynamic_trials):
local_trials.append(t)
Expand Down Expand Up @@ -264,10 +265,14 @@ def start_mongo_workers(
# avoid memory fragmentation issues?
# my_env["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

# run mongo workers
# we could use stdout=subprocess.DEVNULL and stderr=subprocess.DEVNULL in Popen to suppress output info
worker = subprocess.Popen(args, env=my_env)
self.workers.append(worker)
# create log files to redirect the mongo-workers output
mongo_workers_logfile = f"mongo-worker_{i+1}_{self.output_folder_name}.log"
with open(mongo_workers_logfile, mode='w', encoding="utf-8") as log_file:
# run mongo workers
worker = subprocess.Popen(
args, env=my_env, stdout=log_file, stderr=subprocess.STDOUT
)
self.workers.append(worker)
log.info(f"Started mongo worker {i+1}/{self.num_workers}")
except OSError as err:
msg = f"Failed to execute {args}. Make sure you have MongoDB installed."
Expand Down