From 900d04ef33a6554c5d1c7677c1778b7a59f7d081 Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Mon, 26 Feb 2024 14:27:57 +0100
Subject: [PATCH 01/12] upload logs to mlflow

---
 luxonis_train/core/trainer.py | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/luxonis_train/core/trainer.py b/luxonis_train/core/trainer.py
index cb2c5a2c..fc739a08 100644
--- a/luxonis_train/core/trainer.py
+++ b/luxonis_train/core/trainer.py
@@ -3,6 +3,7 @@
 from typing import Any, Literal
 
 from lightning.pytorch.utilities import rank_zero_only  # type: ignore
+from luxonis_ml.utils import LuxonisFileSystem
 
 from luxonis_train.models import LuxonisModel
 from luxonis_train.utils.config import Config
@@ -46,15 +47,28 @@ def train(self, new_thread: bool = False) -> None:
         """Runs training.
 
         @param new_thread: Runs training in new thread if set to True.
         """
         if not new_thread:
-            logger.info(f"Checkpoints will be saved in: {self.get_save_dir()}")
-            logger.info("Starting training...")
-            self.pl_trainer.fit(
-                self.lightning_module,
-                self.pytorch_loader_train,
-                self.pytorch_loader_val,
-            )
-            logger.info("Training finished")
-            logger.info(f"Checkpoints saved in: {self.get_save_dir()}")
+            try:
+                logger.info(f"Checkpoints will be saved in: {self.get_save_dir()}")
+                logger.info("Starting training...")
+                self.pl_trainer.fit(
+                    self.lightning_module,
+                    self.pytorch_loader_train,
+                    self.pytorch_loader_val,
+                )
+                logger.info("Training finished")
+                logger.info(f"Checkpoints saved in: {self.get_save_dir()}")
+            finally:
+                if self.cfg.tracker.is_mlflow:
+                    self.fs = LuxonisFileSystem(
+                        "mlflow://",
+                        allow_active_mlflow_run=True,
+                        allow_local=False,
+                    )
+                    self.fs.put_file(
+                        local_path="luxonis_train.log",
+                        remote_path="luxonis_train.log",
+                    )
 
         else:
             # Every time exception happens in the Thread, this hook will activate
             def thread_exception_hook(args):

From b7f421216749a6b3c899212c5180d1221bb8813a Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Mon, 26 Feb 2024 15:07:52 +0100
Subject: [PATCH 02/12] added mlflow instance

---
 luxonis_train/core/trainer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/luxonis_train/core/trainer.py b/luxonis_train/core/trainer.py
index fc739a08..8dbd2847 100644
--- a/luxonis_train/core/trainer.py
+++ b/luxonis_train/core/trainer.py
@@ -67,6 +67,7 @@ def train(self, new_thread: bool = False) -> None:
                     self.fs.put_file(
                         local_path="luxonis_train.log",
                         remote_path="luxonis_train.log",
+                        mlflow_instance=self.tracker.experiment.get("mlflow", None),
                     )
 
         else:

From e8c323386c310d1fa955bbea1161ba65f479e621 Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Mon, 26 Feb 2024 16:42:21 +0100
Subject: [PATCH 03/12] multithread log upload

---
 luxonis_train/core/trainer.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/luxonis_train/core/trainer.py b/luxonis_train/core/trainer.py
index 8dbd2847..e0c0d72c 100644
--- a/luxonis_train/core/trainer.py
+++ b/luxonis_train/core/trainer.py
@@ -40,6 +40,19 @@
             input_shape=self.loader_train.input_shape,
         )
 
+    def _upload_logs(self) -> None:
+        if self.cfg.tracker.is_mlflow:
+            self.fs = LuxonisFileSystem(
+                "mlflow://",
+                allow_active_mlflow_run=True,
+                allow_local=False,
+            )
+            self.fs.put_file(
+                local_path="luxonis_train.log",
+                remote_path="luxonis_train.log",
+                mlflow_instance=self.tracker.experiment.get("mlflow", None),
+            )
+
     def train(self, new_thread: bool = False) -> None:
         """Runs training.
 
@@ -58,22 +71,13 @@ def train(self, new_thread: bool = False) -> None:
         @param new_thread: Runs training in new thread if set to True.
         """
         if not new_thread:
             try:
                 logger.info(f"Checkpoints will be saved in: {self.get_save_dir()}")
                 logger.info("Starting training...")
                 self.pl_trainer.fit(
                     self.lightning_module,
                     self.pytorch_loader_train,
                     self.pytorch_loader_val,
                 )
                 logger.info("Training finished")
                 logger.info(f"Checkpoints saved in: {self.get_save_dir()}")
             finally:
-                if self.cfg.tracker.is_mlflow:
-                    self.fs = LuxonisFileSystem(
-                        "mlflow://",
-                        allow_active_mlflow_run=True,
-                        allow_local=False,
-                    )
-                    self.fs.put_file(
-                        local_path="luxonis_train.log",
-                        remote_path="luxonis_train.log",
-                        mlflow_instance=self.tracker.experiment.get("mlflow", None),
-                    )
+                self._upload_logs()
 
         else:
             # Every time exception happens in the Thread, this hook will activate
             def thread_exception_hook(args):
                 self.error_message = str(args.exc_value)
+                self._upload_logs()
 
             threading.excepthook = thread_exception_hook

From f17afdc8267058d105a77d37abc89ff9fd7e1ef0 Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Tue, 27 Feb 2024 08:30:59 +0100
Subject: [PATCH 04/12] fixed upload logs

---
 luxonis_train/core/trainer.py | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/luxonis_train/core/trainer.py b/luxonis_train/core/trainer.py
index e0c0d72c..e0d7e93e 100644
--- a/luxonis_train/core/trainer.py
+++ b/luxonis_train/core/trainer.py
@@ -53,6 +53,12 @@ def _upload_logs(self) -> None:
                 mlflow_instance=self.tracker.experiment.get("mlflow", None),
             )
 
+    def _traner_fit(self, *args, **kwargs):
+        try:
+            self.pl_trainer.fit(*args, **kwargs)
+        finally:
+            self._upload_logs()
+
     def train(self, new_thread: bool = False) -> None:
         """Runs training.
 
@@ -60,29 +66,25 @@ def train(self, new_thread: bool = False) -> None:
         @param new_thread: Runs training in new thread if set to True.
         """
         if not new_thread:
-            try:
-                logger.info(f"Checkpoints will be saved in: {self.get_save_dir()}")
-                logger.info("Starting training...")
-                self.pl_trainer.fit(
-                    self.lightning_module,
-                    self.pytorch_loader_train,
-                    self.pytorch_loader_val,
-                )
-                logger.info("Training finished")
-                logger.info(f"Checkpoints saved in: {self.get_save_dir()}")
-            finally:
-                self._upload_logs()
+            logger.info(f"Checkpoints will be saved in: {self.get_save_dir()}")
+            logger.info("Starting training...")
+            self._traner_fit(
+                self.lightning_module,
+                self.pytorch_loader_train,
+                self.pytorch_loader_val,
+            )
+            logger.info("Training finished")
+            logger.info(f"Checkpoints saved in: {self.get_save_dir()}")
 
         else:
             # Every time exception happens in the Thread, this hook will activate
             def thread_exception_hook(args):
                 self.error_message = str(args.exc_value)
-                self._upload_logs()
 
             threading.excepthook = thread_exception_hook
 
             self.thread = threading.Thread(
-                target=self.pl_trainer.fit,
+                target=self._traner_fit,
                 args=(
                     self.lightning_module,
                     self.pytorch_loader_train,

From 817c382518906e2c59888cda4a2bf75a92192d39 Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Tue, 27 Feb 2024 08:41:24 +0100
Subject: [PATCH 05/12] fixed log file path

---
 luxonis_train/core/core.py    | 3 ++-
 luxonis_train/core/trainer.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/luxonis_train/core/core.py b/luxonis_train/core/core.py
index 75bd1d2a..64712260 100644
--- a/luxonis_train/core/core.py
+++ b/luxonis_train/core/core.py
@@ -81,10 +81,11 @@ def __init__(
         )
         # NOTE: to add the file handler (we only get the save dir now,
         # but we want to use the logger before)
+        self.log_file = osp.join(self.run_save_dir, "luxonis_train.log")
         reset_logging()
         setup_logging(
             use_rich=self.cfg.use_rich_text,
-            file=osp.join(self.run_save_dir, "luxonis_train.log"),
+            file=self.log_file,
         )
         # NOTE: overriding logger in pl so it uses our logger to log device
         # info
diff --git a/luxonis_train/core/trainer.py b/luxonis_train/core/trainer.py
index e0d7e93e..bf80fa83 100644
--- a/luxonis_train/core/trainer.py
+++ b/luxonis_train/core/trainer.py
@@ -42,13 +42,14 @@ def __init__(
 
     def _upload_logs(self) -> None:
         if self.cfg.tracker.is_mlflow:
+            logger.info("Uploading logs to MLFlow.")
             self.fs = LuxonisFileSystem(
                 "mlflow://",
                 allow_active_mlflow_run=True,
                 allow_local=False,
             )
             self.fs.put_file(
-                local_path="luxonis_train.log",
+                local_path=self.log_file,
                 remote_path="luxonis_train.log",
                 mlflow_instance=self.tracker.experiment.get("mlflow", None),
             )

From a984f76416fbc01da8510bc57dad0d385ce8864b Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Tue, 27 Feb 2024 08:43:55 +0100
Subject: [PATCH 06/12] removed exceptions

---
 luxonis_train/callbacks/export_on_train_end.py | 16 ++++++++++------
 luxonis_train/models/luxonis_model.py          |  7 ++++++-
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/luxonis_train/callbacks/export_on_train_end.py b/luxonis_train/callbacks/export_on_train_end.py
index 923267c1..e74cfcf7 100644
--- a/luxonis_train/callbacks/export_on_train_end.py
+++ b/luxonis_train/callbacks/export_on_train_end.py
@@ -8,6 +8,8 @@
 from luxonis_train.utils.registry import CALLBACKS
 from luxonis_train.utils.tracker import LuxonisTrackerPL
 
+logger = logging.getLogger(__name__)
+
 
 @CALLBACKS.register_module()
 class ExportOnTrainEnd(pl.Callback):
@@ -41,11 +43,13 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
         # NOTE: assume that first checkpoint callback is based on val loss
         best_model_path = model_checkpoint_callbacks[0].best_model_path
         if not best_model_path:
-            raise RuntimeError(
-                "No best model path found. "
-                "Please make sure that ModelCheckpoint callback is present "
-                "and at least one validation epoch has been performed."
+            logger.error(
+                "No model checkpoint found. "
+                "Make sure that `ModelCheckpoint` callback is present "
+                "and at least one validation epoch has been performed. "
+                "Skipping model export."
             )
+            return
         cfg: Config = pl_module.cfg
         cfg.model.weights = best_model_path
         if self.upload_to_mlflow:
@@ -54,9 +58,9 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
             new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
             cfg.exporter.upload_url = new_upload_url
         else:
-            logging.getLogger(__name__).warning(
+            logger.warning(
                 "`upload_to_mlflow` is set to True, "
                 "but there is no MLFlow active run, skipping."
             )
         exporter = Exporter(cfg=cfg)
         onnx_path = str(Path(best_model_path).parent.with_suffix(".onnx"))
diff --git a/luxonis_train/models/luxonis_model.py b/luxonis_train/models/luxonis_model.py
index 88d4fa28..4193189b 100644
--- a/luxonis_train/models/luxonis_model.py
+++ b/luxonis_train/models/luxonis_model.py
@@ -681,7 +681,12 @@ def load_checkpoint(self, path: str | None) -> None:
         """
         if path is None:
             return
-        checkpoint = torch.load(path, map_location=self.device)
+
+        try:
+            checkpoint = torch.load(path, map_location=self.device)
+        except Exception:
+            logger.error(f"Could not load checkpoint from '{path}'.")
+            return
         if "state_dict" not in checkpoint:
             raise ValueError("Checkpoint does not contain state_dict.")
         state_dict = {}

From 4654d79f2d0839d794374a6c58ce2e6b98babe45 Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Tue, 27 Feb 2024 08:49:32 +0100
Subject: [PATCH 07/12] logging exceptions

---
 luxonis_train/core/trainer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/luxonis_train/core/trainer.py b/luxonis_train/core/trainer.py
index bf80fa83..3aba1bec 100644
--- a/luxonis_train/core/trainer.py
+++ b/luxonis_train/core/trainer.py
@@ -57,6 +57,8 @@ def _upload_logs(self) -> None:
     def _traner_fit(self, *args, **kwargs):
         try:
             self.pl_trainer.fit(*args, **kwargs)
+        except Exception:
+            logger.exception("Encountered exception during training.")
         finally:
             self._upload_logs()
 

From c5e7a57660431871043801d93fa009819ed16d27 Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Tue, 27 Feb 2024 10:29:28 +0100
Subject: [PATCH 08/12] fixed typo

---
 luxonis_train/core/trainer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/luxonis_train/core/trainer.py b/luxonis_train/core/trainer.py
index 3aba1bec..67b1f794 100644
--- a/luxonis_train/core/trainer.py
+++ b/luxonis_train/core/trainer.py
@@ -54,7 +54,7 @@ def _upload_logs(self) -> None:
                 mlflow_instance=self.tracker.experiment.get("mlflow", None),
             )
 
-    def _traner_fit(self, *args, **kwargs):
+    def _trainer_fit(self, *args, **kwargs):
         try:
             self.pl_trainer.fit(*args, **kwargs)
         except Exception:
@@ -71,7 +71,7 @@ def train(self, new_thread: bool = False) -> None:
         if not new_thread:
             logger.info(f"Checkpoints will be saved in: {self.get_save_dir()}")
             logger.info("Starting training...")
-            self._traner_fit(
+            self._trainer_fit(
                 self.lightning_module,
                 self.pytorch_loader_train,
                 self.pytorch_loader_val,
@@ -87,7 +87,7 @@ def thread_exception_hook(args):
             threading.excepthook = thread_exception_hook
 
             self.thread = threading.Thread(
-                target=self._traner_fit,
+                target=self._trainer_fit,
                 args=(
                     self.lightning_module,
                     self.pytorch_loader_train,

From a151a5cc60f73318f521b96adc380a40edec27a5 Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Tue, 27 Feb 2024 10:41:02 +0100
Subject: [PATCH 09/12] reverted exception

---
 luxonis_train/models/luxonis_model.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/luxonis_train/models/luxonis_model.py b/luxonis_train/models/luxonis_model.py
index 4193189b..7cd396f9 100644
--- a/luxonis_train/models/luxonis_model.py
+++ b/luxonis_train/models/luxonis_model.py
@@ -682,11 +682,8 @@ def load_checkpoint(self, path: str | None) -> None:
         if path is None:
             return
 
-        try:
-            checkpoint = torch.load(path, map_location=self.device)
-        except Exception:
-            logger.error(f"Could not load checkpoint from '{path}'.")
-            return
+        checkpoint = torch.load(path, map_location=self.device)
+
         if "state_dict" not in checkpoint:
             raise ValueError("Checkpoint does not contain state_dict.")
         state_dict = {}

From 6080f7322a7e9c9be0784075d8f90e3fca283ccc Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Tue, 27 Feb 2024 10:43:13 +0100
Subject: [PATCH 10/12] moved line

---
 luxonis_train/core/core.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/luxonis_train/core/core.py b/luxonis_train/core/core.py
index 64712260..86b63600 100644
--- a/luxonis_train/core/core.py
+++ b/luxonis_train/core/core.py
@@ -79,9 +79,10 @@ def __init__(
         self.run_save_dir = os.path.join(
             self.cfg.tracker.save_directory, self.tracker.run_name
         )
+        self.log_file = osp.join(self.run_save_dir, "luxonis_train.log")
+
         # NOTE: to add the file handler (we only get the save dir now,
         # but we want to use the logger before)
-        self.log_file = osp.join(self.run_save_dir, "luxonis_train.log")
         reset_logging()
         setup_logging(
             use_rich=self.cfg.use_rich_text,

From 69bd044a0a534b9776841dd6157705017613a946 Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Tue, 27 Feb 2024 10:45:37 +0100
Subject: [PATCH 11/12] replaced warning with error log

---
 luxonis_train/callbacks/export_on_train_end.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/luxonis_train/callbacks/export_on_train_end.py b/luxonis_train/callbacks/export_on_train_end.py
index e74cfcf7..5d7bf6da 100644
--- a/luxonis_train/callbacks/export_on_train_end.py
+++ b/luxonis_train/callbacks/export_on_train_end.py
@@ -58,7 +58,7 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
             new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
             cfg.exporter.upload_url = new_upload_url
         else:
-            logger.warning(
+            logger.error(
                 "`upload_to_mlflow` is set to True, "
                 "but there is no MLFlow active run, skipping."
             )

From 986680beeb6b971ca774c8586cef1d18a033ce94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Kozlovsk=C3=BD?=
Date: Wed, 28 Feb 2024 16:21:49 +0100
Subject: [PATCH 12/12] Update trainer.py

---
 luxonis_train/core/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/luxonis_train/core/trainer.py b/luxonis_train/core/trainer.py
index 67b1f794..2b3d6a78 100644
--- a/luxonis_train/core/trainer.py
+++ b/luxonis_train/core/trainer.py
@@ -43,12 +43,12 @@ def _upload_logs(self) -> None:
         if self.cfg.tracker.is_mlflow:
             logger.info("Uploading logs to MLFlow.")
-            self.fs = LuxonisFileSystem(
+            fs = LuxonisFileSystem(
                 "mlflow://",
                 allow_active_mlflow_run=True,
                 allow_local=False,
             )
-            self.fs.put_file(
+            fs.put_file(
                 local_path=self.log_file,
                 remote_path="luxonis_train.log",
                 mlflow_instance=self.tracker.experiment.get("mlflow", None),
             )
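
Taken together, the twelve patches leave luxonis_train/core/trainer.py with two small helpers: `_upload_logs()` pushes the run's log file to the active MLflow run, and `_trainer_fit()` wraps `pl_trainer.fit()` so the upload happens whether training succeeds, raises, or runs in the background thread. The sketch below is the net state of the two methods reconstructed from the hunks above; the enclosing class, its `cfg`, `tracker`, `log_file`, and `pl_trainer` attributes, and the module-level `logger` are assumed from the surrounding code rather than shown.

    from luxonis_ml.utils import LuxonisFileSystem

    def _upload_logs(self) -> None:
        # No-op unless the configured tracker is MLflow (PATCH 01/03).
        if self.cfg.tracker.is_mlflow:
            logger.info("Uploading logs to MLFlow.")
            # PATCH 12: a local variable instead of self.fs, so the file
            # system handle does not linger on the instance after the upload.
            fs = LuxonisFileSystem(
                "mlflow://",
                allow_active_mlflow_run=True,
                allow_local=False,
            )
            fs.put_file(
                local_path=self.log_file,  # full path, set in core.py (PATCH 05/10)
                remote_path="luxonis_train.log",
                mlflow_instance=self.tracker.experiment.get("mlflow", None),  # PATCH 02
            )

    def _trainer_fit(self, *args, **kwargs):
        # Single wrapper used by both the blocking train() path and the
        # threading.Thread target (PATCH 04); the exception is logged
        # (PATCH 07) and the upload runs in all cases via finally.
        try:
            self.pl_trainer.fit(*args, **kwargs)
        except Exception:
            logger.exception("Encountered exception during training.")
        finally:
            self._upload_logs()

Routing both code paths through `_trainer_fit` is what lets PATCH 04 drop the try/finally from `train()` itself and remove the `_upload_logs()` call from the thread exception hook.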