Updated Tuner #26

Merged
merged 31 commits from fix/tuner into dev on Jun 13, 2024
Changes from 7 commits

Commits (31)
7f6aa1f
updated optuna to newer version, added continue_existing_study parame…
klemen1999 May 7, 2024
e66d325
added logging to Tuner
klemen1999 May 13, 2024
02e6e99
Merge branch 'dev' into fix/tuner
klemen1999 May 13, 2024
074214a
formatting
kozlov721 May 15, 2024
b167cab
fixed nested view for mlflow
klemen1999 May 16, 2024
bc9560e
Merge branch 'fix/tuner' of https://github.com/luxonis/luxonis-train …
klemen1999 May 16, 2024
98a6309
Merge branch 'dev' into fix/tuner
klemen1999 May 16, 2024
75221ee
removed unused code
klemen1999 May 16, 2024
be26b89
removed unused code, added note
klemen1999 May 19, 2024
2cf7898
added finalize() to LuxonisTrackerPL for graceful tracker exit
klemen1999 May 23, 2024
88b5eab
Merge branch 'dev' into fix/tuner
kozlov721 May 23, 2024
4041c99
Multi-input test case - building a complex multi-input POSET model wi…
CaptainTrojan Jun 1, 2024
6809f98
Added a test for new collate_fn, which collates dicts of tensors inst…
CaptainTrojan Jun 1, 2024
f400246
Added a config for a multi-input model
CaptainTrojan Jun 1, 2024
f546608
Added necessary changes to config: ModelNodeConfig has a parameter in…
CaptainTrojan Jun 1, 2024
769aaab
Updated input_shape type and the return type of loader, the LuxonisLo…
CaptainTrojan Jun 1, 2024
9cf7ebb
Added images_name property to BaseNode class, removed redundant batch…
CaptainTrojan Jun 1, 2024
4284fba
Implemented multi-input support in node building and model export
CaptainTrojan Jun 1, 2024
0e1ceff
Compatibility changes due to the new way shapes work in loaders, maki…
CaptainTrojan Jun 1, 2024
951b981
[Automated] Updated coverage badge
actions-user Jun 1, 2024
1cdcb9f
Moved 'images_name' setting from loader implementation to config due …
CaptainTrojan Jun 9, 2024
6c69875
Merge branch 'dev' into feature/multi-input
kozlov721 Jun 12, 2024
6b07cd8
removed macos tests
kozlov721 Jun 12, 2024
63c6ad0
removed images_name from BaseNode
kozlov721 Jun 13, 2024
a4906e7
renamed get_shape_pocket to to_shape_pocket
kozlov721 Jun 13, 2024
25d1758
simplified input source handling
kozlov721 Jun 13, 2024
9987d0d
renamed images_name to image_source
kozlov721 Jun 13, 2024
1f47e12
Merge branch 'feature/multi-input' into fix/tuner
kozlov721 Jun 13, 2024
eef4203
Merge branch 'fix/tuner' of github.com:luxonis/luxonis-train into fix…
kozlov721 Jun 13, 2024
590bd31
docformat
kozlov721 Jun 13, 2024
6a9bceb
Merge branch 'dev' into fix/tuner
kozlov721 Jun 13, 2024
15 changes: 8 additions & 7 deletions configs/README.md
@@ -241,13 +241,14 @@ Option specific for ONNX export.

Here you can specify options for tuning.

| Key | Type | Default value | Description |
| ---------- | ----------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| study_name | str | "test-study" | Name of the study. |
| use_pruner | bool | True | Whether to use the MedianPruner. |
| n_trials | int \| None | 15 | Number of trials for each process. `None` represents no limit in terms of numbner of trials. |
| timeout | int \| None | None | Stop study after the given number of seconds. |
| params | dict\[str, list\] | {} | Which parameters to tune. The keys should be in the format `key1.key2.key3_<type>`. Type can be one of `[categorical, float, int, longuniform, uniform]`. For more information about the types, visit [Optuna documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html). |
| Key | Type | Default value | Description |
| ----------------------- | ----------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| study_name | str | "test-study" | Name of the study. |
| continue_existing_study | bool              | True          | Whether to continue an existing study if `study_name` already exists. |
| use_pruner | bool | True | Whether to use the MedianPruner. |
| n_trials                | int \| None       | 15            | Number of trials for each process. `None` represents no limit on the number of trials. |
| timeout | int \| None | None | Stop study after the given number of seconds. |
| params                  | dict\[str, list\] | {}            | Which parameters to tune. The keys should be in the format `key1.key2.key3_<type>`. Type can be one of `[categorical, float, int, loguniform, uniform]`. For more information about the types, visit the [Optuna documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html). |

Example of params for tuner block:

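To make the `key1.key2.key3_<type>` format above concrete, here is a rough Python sketch of how such keys could be turned into Optuna suggestions. The parameter names are made up for illustration, and this is not the repository's actual `_get_trial_params` implementation:

import optuna

# Hypothetical `tuner.params` block after the YAML config has been loaded.
params = {
    "trainer.optimizer.params.lr_float": [1e-5, 1e-3],
    "trainer.batch_size_int": [4, 16],
    "model.predefined_model.params.variant_categorical": ["light", "heavy"],
}

def suggest_from_params(trial: optuna.Trial) -> dict:
    """Map `<dotted.path>_<type>` keys to the matching Optuna suggest call."""
    suggested = {}
    for key, bounds in params.items():
        path, suffix = key.rsplit("_", 1)  # split off the `<type>` suffix
        if suffix == "float":
            suggested[path] = trial.suggest_float(path, *bounds)
        elif suffix == "int":
            suggested[path] = trial.suggest_int(path, *bounds)
        elif suffix == "categorical":
            suggested[path] = trial.suggest_categorical(path, bounds)
    return suggested

def objective(trial: optuna.Trial) -> float:
    overrides = suggest_from_params(trial)
    return overrides["trainer.optimizer.params.lr"]  # dummy objective value

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=3)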
2 changes: 1 addition & 1 deletion configs/example_tuning.yaml
@@ -22,7 +22,7 @@ trainer:
active: True

batch_size: 4
epochs: &epochs 1
epochs: &epochs 10
validation_interval: 1
num_log_images: 8

81 changes: 67 additions & 14 deletions luxonis_train/core/tuner.py
@@ -1,4 +1,5 @@
import os.path as osp
from logging import getLogger
from typing import Any

import lightning.pytorch as pl
@@ -13,6 +14,8 @@

from .core import Core

logger = getLogger(__name__)


class Tuner(Core):
def __init__(self, cfg: str | dict, args: list[str] | tuple[str, ...] | None):
@@ -30,8 +33,23 @@ def __init__(self, cfg: str | dict, args: list[str] | tuple[str, ...] | None):
raise ValueError("You have to specify the `tuner` section in config.")
self.tune_cfg = self.cfg.tuner

# Parent tracker that only logs the best study parameters at the end
rank = rank_zero_only.rank
cfg_tracker = self.cfg.tracker
tracker_params = cfg_tracker.model_dump()
self.parent_tracker = LuxonisTrackerPL(
rank=rank,
mlflow_tracking_uri=self.cfg.ENVIRON.MLFLOW_TRACKING_URI,
is_sweep=False,
**tracker_params,
)
if self.parent_tracker.is_mlflow:
run = self.parent_tracker.experiment["mlflow"].active_run()
Collaborator

Will parent run remain active long enough for this to not cause any issues?

Collaborator Author

Yes, this run stays active until the very end, when the whole study finishes. Basically, every trial in the study is its own training run (as if we ran `luxonis_train train ...`), and every trial gets its own tracker. To nest them together we need one top-level tracker, the parent tracker. In the backend, mlflow keeps a stack of all active runs, and when a run is started as nested it is bound to the previous run on the stack. So this top-level tracker is only used for binding all trials together and for logging the best hyperparameters of the whole study, for ease of use, so you can quickly get the results of the study.
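For context, a minimal sketch of the nesting behavior described above, using the plain mlflow client rather than LuxonisTrackerPL (the experiment and run names are made up):

import mlflow

mlflow.set_experiment("tuning-demo")  # hypothetical experiment name

# The parent run is opened once and stays active for the whole study.
parent = mlflow.start_run(run_name="study")

for trial_idx in range(3):
    # nested=True attaches each child run to the innermost active run (the parent).
    with mlflow.start_run(run_name=f"trial-{trial_idx}", nested=True):
        mlflow.log_param("trial", trial_idx)

# The best parameters are logged on the parent before it is finally closed.
mlflow.log_params({"trainer.optimizer.params.lr": 1e-3})
mlflow.end_run()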

self.parent_run_id = run.info.run_id
Collaborator

Is this used somewhere? What is this set to if MLFlow is not used?

Collaborator Author

If MLFlow is not used, this first tracker will still report the best hyperparameters, but nesting won't be done because it is not supported by e.g. TensorBoard. So every trial will have its own TensorBoard logs with no connection between them. That's why MLFlow is recommended when tuning.
The whole part in this if block seems like it is not actually needed, but if I remove it then nesting is not done for some reason. Looking into it and I'll update you.

Collaborator

Ok, sounds good. Not critical. I think it looks good; maybe we just add a comment in the code to note that this is required for MLFlow to work correctly, then we can merge.

Collaborator Author

I figured it out: the way the tracker is written, it creates the actual mlflow run only once it is interacted with. And since in our case we need the parent tracker to be created first (to then nest all child runs under it), we need to keep this line in the code. I added a note to the code.
CC: @kozlov721 I think we can merge now.

Collaborator

Is this specific to "mlflow" then, or should this be present for other underlying trackers as well? @klemen1999

Collaborator Author
klemen1999, May 20, 2024

It's the case for all trackers. LuxonisTracker is structured very similarly to the other default PL loggers (e.g. MLFlowLogger), where the actual objects are created when the .experiment property is first called. This ensures that LuxonisTracker integrates nicely with PL, but we have to be mindful of it when using the tracker like this. Although I don't think there will be many cases where one would create just a placeholder tracker, like we are doing for this specific case, so it shouldn't cause too many problems.
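As an illustration of the lazy `.experiment` behavior described above, a small sketch of the pattern (the class and names are invented; this is not the actual LuxonisTracker code):

from functools import cached_property

class LazyLogger:
    """Toy logger whose backend run is only created on first access."""

    def __init__(self, run_name: str):
        self.run_name = run_name

    @cached_property
    def experiment(self) -> dict:
        # The real run/client objects would be created here, on first access.
        # This is why the placeholder parent tracker has to be touched once
        # before any child trackers are created and nested under it.
        print(f"creating backend run for {self.run_name}")
        return {"mlflow": object()}  # stand-in for the real client objects

parent = LazyLogger("parent")
_ = parent.experiment  # forces the parent run to exist before children nest under it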

Collaborator

So should we always call this, instead of only under `if self.parent_tracker.is_mlflow`?

Collaborator Author

MLFlow is the only tracker that this nesting applies to, so that is why I'm doing it just for this case.

Collaborator

Sounds good, I think we can proceed with the merge in this case.


def tune(self) -> None:
"""Runs Optuna tunning of hyperparameters."""
logger.info("Starting tuning...")

pruner = (
optuna.pruners.MedianPruner()
@@ -57,7 +75,7 @@ def tune(self) -> None:
storage=storage,
direction="minimize",
pruner=pruner,
load_if_exists=True,
load_if_exists=self.tune_cfg.continue_existing_study,
)

study.optimize(
@@ -66,25 +84,30 @@
timeout=self.tune_cfg.timeout,
)

best_study_params = study.best_params
logger.info(f"Best study parameters: {best_study_params}")
self.parent_tracker.log_hyperparams(best_study_params)

def _objective(self, trial: optuna.trial.Trial) -> float:
"""Objective function used to optimize Optuna study."""
rank = rank_zero_only.rank
cfg_tracker = self.cfg.tracker
tracker_params = cfg_tracker.model_dump()
tracker = LuxonisTrackerPL(
child_tracker = LuxonisTrackerPL(
rank=rank,
mlflow_tracking_uri=self.cfg.ENVIRON.MLFLOW_TRACKING_URI,
is_sweep=True,
**tracker_params,
)
run_save_dir = osp.join(cfg_tracker.save_directory, tracker.run_name)

run_save_dir = osp.join(cfg_tracker.save_directory, child_tracker.run_name)

curr_params = self._get_trial_params(trial)
curr_params["model.predefined_model"] = None
Config.clear_instance()
cfg = Config.get_config(self.cfg.model_dump(), curr_params)

tracker.log_hyperparams(curr_params)
child_tracker.log_hyperparams(curr_params)

cfg.save_data(osp.join(run_save_dir, "config.yaml"))

@@ -94,14 +117,15 @@ def _objective(self, trial: optuna.trial.Trial) -> float:
save_dir=run_save_dir,
input_shape=self.loader_train.input_shape,
)
pruner_callback = PyTorchLightningPruningCallback(
trial, monitor="val_loss/loss"
)
callbacks: list[pl.Callback] = (
[LuxonisProgressBar()] if self.cfg.use_rich_text else []
)
pruner_callback = PyTorchLightningPruningCallback(trial, monitor="val/loss")
callbacks.append(pruner_callback)

tracker_end_run = TrackerEndRun()
callbacks.append(tracker_end_run)

deterministic = False
if self.cfg.trainer.seed:
pl.seed_everything(cfg.trainer.seed, workers=True)
@@ -111,7 +135,7 @@ def _objective(self, trial: optuna.trial.Trial) -> float:
accelerator=cfg.trainer.accelerator,
devices=cfg.trainer.devices,
strategy=cfg.trainer.strategy,
logger=tracker, # type: ignore
logger=child_tracker, # type: ignore
max_epochs=cfg.trainer.epochs,
accumulate_grad_batches=cfg.trainer.accumulate_grad_batches,
check_val_every_n_epoch=cfg.trainer.validation_interval,
@@ -121,12 +145,20 @@ def _objective(self, trial: optuna.trial.Trial) -> float:
deterministic=deterministic,
)

pl_trainer.fit(
lightning_module, # type: ignore
self.pytorch_loader_train,
self.pytorch_loader_val,
)
pruner_callback.check_pruned()
try:
pl_trainer.fit(
lightning_module, # type: ignore
self.pytorch_loader_train,
self.pytorch_loader_val,
)

pruner_callback.check_pruned()

except optuna.TrialPruned as e:
# Pruning is signalled by the callback raising optuna.TrialPruned
# When .fit() raises, we still have to end the trackers gracefully
tracker_end_run.end_trackers(child_tracker)
logger.info(e)

if "val/loss" not in pl_trainer.callback_metrics:
raise ValueError(
@@ -177,3 +209,24 @@ def _get_trial_params(self, trial: optuna.trial.Trial) -> dict[str, Any]:
"No paramteres to tune. Specify them under `tuner.params`."
)
return new_params


class TrackerEndRun(pl.Callback):
"""Callback that ends trackers of child processes during tuning study"""

def teardown(
self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str
) -> None:
self.end_trackers(trainer.logger) # type: ignore
return super().teardown(trainer, pl_module, stage)

def end_trackers(self, tracker: LuxonisTrackerPL) -> None:
"""Ends WandB and MLFlow trackers

Args:
tracker (LuxonisTrackerPL): Currently active tracker
"""
if tracker.is_wandb:
tracker.experiment["wandb"].finish()
if tracker.is_mlflow:
tracker.experiment["mlflow"].end_run()
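For readers unfamiliar with the pruning mechanics behind the `try`/`except` and `TrackerEndRun` above: Optuna stops an unpromising trial by raising `optuna.TrialPruned`, which in the Lightning integration propagates out of `.fit()`, so per-trial cleanup has to run in a callback's `teardown` or an `except`/`finally` block. A standalone sketch of that control flow, independent of LuxonisTrackerPL:

import optuna

def objective(trial: optuna.Trial) -> float:
    value = float("inf")
    try:
        for epoch in range(5):
            # Later trials report worse values so the MedianPruner has something to prune.
            value = (trial.number + 1) / (epoch + 1)  # stand-in for the monitored val loss
            trial.report(value, step=epoch)
            if trial.should_prune():
                # In the Lightning integration this raise happens inside .fit(),
                # via PyTorchLightningPruningCallback.
                raise optuna.TrialPruned()
        return value
    finally:
        # Runs whether the trial completed or was pruned; this is the role the
        # TrackerEndRun callback / except-block plays in the tuner above.
        print(f"trial {trial.number}: ending trackers")

study = optuna.create_study(
    direction="minimize",
    pruner=optuna.pruners.MedianPruner(n_startup_trials=2),
)
study.optimize(objective, n_trials=5)
print(sum(t.state == optuna.trial.TrialState.PRUNED for t in study.trials), "trials pruned")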
1 change: 1 addition & 0 deletions luxonis_train/utils/config.py
@@ -285,6 +285,7 @@ class StorageConfig(CustomBaseModel):

class TunerConfig(CustomBaseModel):
study_name: str = "test-study"
continue_existing_study: bool = True
use_pruner: bool = True
n_trials: int | None = 15
timeout: int | None = None
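The new `continue_existing_study` flag maps directly onto Optuna's `load_if_exists` argument (see the `load_if_exists=self.tune_cfg.continue_existing_study` change above). A minimal sketch of the difference, assuming a local SQLite storage file:

import optuna

storage = "sqlite:///example-study.db"  # assumed local storage; the tuner constructs its own

# First call creates the study.
optuna.create_study(study_name="test-study", storage=storage)

# continue_existing_study: true  ->  load_if_exists=True reopens the existing
# study and new trials are appended to it.
study = optuna.create_study(study_name="test-study", storage=storage, load_if_exists=True)

# continue_existing_study: false  ->  load_if_exists=False raises instead of
# silently continuing an older study under the same name.
try:
    optuna.create_study(study_name="test-study", storage=storage, load_if_exists=False)
except optuna.exceptions.DuplicatedStudyError:
    print("study already exists; pick a new study_name or allow continuation")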
4 changes: 2 additions & 2 deletions requirements.txt
@@ -5,8 +5,8 @@ luxonis-ml[all]@git+https://github.com/luxonis/luxonis-ml.git@dev
onnx>=1.12.0
onnxruntime>=1.13.1
onnxsim>=0.4.10
optuna>=3.2.0
optuna_integration>=3.6.0
optuna>=3.6.0
optuna-integration>=3.6.0
parameterized>=0.9.0
psycopg2-binary>=2.9.1
pycocotools>=2.0.7