diff --git a/configs/README.md b/configs/README.md
index 01d1ebd3..e6eb0bad 100644
--- a/configs/README.md
+++ b/configs/README.md
@@ -241,13 +241,14 @@ Option specific for ONNX export.
 
 Here you can specify options for tuning.
 
-| Key        | Type              | Default value | Description |
-| ---------- | ----------------- | ------------- | ----------- |
-| study_name | str               | "test-study"  | Name of the study. |
-| use_pruner | bool              | True          | Whether to use the MedianPruner. |
-| n_trials   | int \| None       | 15            | Number of trials for each process. `None` represents no limit in terms of numbner of trials. |
-| timeout    | int \| None       | None          | Stop study after the given number of seconds. |
-| params     | dict\[str, list\] | {}            | Which parameters to tune. The keys should be in the format `key1.key2.key3_<type>`. Type can be one of `[categorical, float, int, longuniform, uniform]`. For more information about the types, visit [Optuna documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html). |
+| Key                     | Type              | Default value | Description |
+| ----------------------- | ----------------- | ------------- | ----------- |
+| study_name              | str               | "test-study"  | Name of the study. |
+| continue_existing_study | bool              | True          | Whether to continue an existing study if `study_name` already exists. |
+| use_pruner              | bool              | True          | Whether to use the MedianPruner. |
+| n_trials                | int \| None       | 15            | Number of trials for each process. `None` represents no limit on the number of trials. |
+| timeout                 | int \| None       | None          | Stop the study after the given number of seconds. |
+| params                  | dict\[str, list\] | {}            | Which parameters to tune. The keys should be in the format `key1.key2.key3_<type>`. Type can be one of `[categorical, float, int, loguniform, uniform]`. For more information about the types, visit the [Optuna documentation](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html). |
 
 Example of params for tuner block:
 
diff --git a/configs/example_tuning.yaml b/configs/example_tuning.yaml
index 41c4d8a8..009abc41 100755
--- a/configs/example_tuning.yaml
+++ b/configs/example_tuning.yaml
@@ -23,7 +23,7 @@ trainer:
   active: True
   batch_size: 4
 
-  epochs: &epochs 1
+  epochs: &epochs 10
   validation_interval: 1
   num_log_images: 8
 
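As context for the `key1.key2.key3_<type>` convention documented in the table above, here is a minimal sketch of how such keys could be dispatched to Optuna's suggestion API. The helper name `suggest_from_key` is illustrative only (it is not part of this repository), and only three of the listed types are covered:

```python
import optuna


def suggest_from_key(trial: optuna.trial.Trial, key: str, values: list):
    # The suffix after the last underscore selects the suggestion type,
    # mirroring the `key1.key2.key3_<type>` format described above.
    name, _, kind = key.rpartition("_")
    if kind == "categorical":
        return trial.suggest_categorical(name, values)
    if kind == "int":
        return trial.suggest_int(name, values[0], values[1])
    if kind == "float":
        return trial.suggest_float(name, values[0], values[1])
    raise ValueError(f"Unsupported suggestion type: `{kind}`")
```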
diff --git a/luxonis_train/core/tuner.py b/luxonis_train/core/tuner.py
index 4635789c..344e016a 100644
--- a/luxonis_train/core/tuner.py
+++ b/luxonis_train/core/tuner.py
@@ -1,4 +1,5 @@
 import os.path as osp
+from logging import getLogger
 from typing import Any
 
 import lightning.pytorch as pl
@@ -13,6 +14,8 @@
 
 from .core import Core
 
+logger = getLogger(__name__)
+
 
 class Tuner(Core):
     def __init__(self, cfg: str | dict, args: list[str] | tuple[str, ...] | None):
@@ -30,8 +33,26 @@ def __init__(self, cfg: str | dict, args: list[str] | tuple[str, ...] | None):
             raise ValueError("You have to specify the `tuner` section in config.")
         self.tune_cfg = self.cfg.tuner
 
+        # Parent tracker that only logs the best study parameters at the end
+        rank = rank_zero_only.rank
+        cfg_tracker = self.cfg.tracker
+        tracker_params = cfg_tracker.model_dump()
+        tracker_params[
+            "is_wandb"
+        ] = False  # wandb doesn't allow multiple concurrent runs, handle this separately
+        self.parent_tracker = LuxonisTrackerPL(
+            rank=rank,
+            mlflow_tracking_uri=self.cfg.ENVIRON.MLFLOW_TRACKING_URI,
+            is_sweep=False,
+            **tracker_params,
+        )
+        if self.parent_tracker.is_mlflow:
+            # Experiment needs to be interacted with to create an actual MLFlow run
+            self.parent_tracker.experiment["mlflow"].active_run()
+
     def tune(self) -> None:
         """Runs Optuna tuning of hyperparameters."""
+        logger.info("Starting tuning...")
 
         pruner = (
             optuna.pruners.MedianPruner()
@@ -57,7 +78,7 @@ def tune(self) -> None:
             storage=storage,
             direction="minimize",
             pruner=pruner,
-            load_if_exists=True,
+            load_if_exists=self.tune_cfg.continue_existing_study,
         )
 
         study.optimize(
@@ -66,25 +87,44 @@ def tune(self) -> None:
             timeout=self.tune_cfg.timeout,
         )
 
+        best_study_params = study.best_params
+        logger.info(f"Best study parameters: {best_study_params}")
+
+        self.parent_tracker.log_hyperparams(best_study_params)
+
+        if self.cfg.tracker.is_wandb:
+            # If wandb is used, init the parent tracker separately at the end
+            wandb_parent_tracker = LuxonisTrackerPL(
+                project_name=self.cfg.tracker.project_name,
+                project_id=self.cfg.tracker.project_id,
+                run_name=self.parent_tracker.run_name,
+                save_directory=self.cfg.tracker.save_directory,
+                is_wandb=True,
+                wandb_entity=self.cfg.tracker.wandb_entity,
+                rank=rank_zero_only.rank,
+            )
+            wandb_parent_tracker.log_hyperparams(best_study_params)
+
     def _objective(self, trial: optuna.trial.Trial) -> float:
         """Objective function used to optimize Optuna study."""
         rank = rank_zero_only.rank
         cfg_tracker = self.cfg.tracker
         tracker_params = cfg_tracker.model_dump()
-        tracker = LuxonisTrackerPL(
+        child_tracker = LuxonisTrackerPL(
             rank=rank,
             mlflow_tracking_uri=self.cfg.ENVIRON.MLFLOW_TRACKING_URI,
             is_sweep=True,
             **tracker_params,
         )
-        run_save_dir = osp.join(cfg_tracker.save_directory, tracker.run_name)
+
+        run_save_dir = osp.join(cfg_tracker.save_directory, child_tracker.run_name)
 
         curr_params = self._get_trial_params(trial)
         curr_params["model.predefined_model"] = None
         Config.clear_instance()
         cfg = Config.get_config(self.cfg.model_dump(), curr_params)
 
-        tracker.log_hyperparams(curr_params)
+        child_tracker.log_hyperparams(curr_params)
 
         cfg.save_data(osp.join(run_save_dir, "config.yaml"))
 
@@ -95,14 +135,11 @@ def _objective(self, trial: optuna.trial.Trial) -> float:
             input_shape=self.loaders["train"].input_shape,
         )
         lightning_module._core = self
-        pruner_callback = PyTorchLightningPruningCallback(
-            trial, monitor="val_loss/loss"
-        )
         callbacks: list[pl.Callback] = (
             [LuxonisProgressBar()] if self.cfg.use_rich_text else []
         )
+        pruner_callback = PyTorchLightningPruningCallback(trial, monitor="val/loss")
         callbacks.append(pruner_callback)
-
         deterministic = False
         if self.cfg.trainer.seed:
             pl.seed_everything(cfg.trainer.seed, workers=True)
@@ -112,7 +149,7 @@ def _objective(self, trial: optuna.trial.Trial) -> float:
             accelerator=cfg.trainer.accelerator,
             devices=cfg.trainer.devices,
             strategy=cfg.trainer.strategy,
-            logger=tracker,  # type: ignore
+            logger=child_tracker,  # type: ignore
             max_epochs=cfg.trainer.epochs,
             accumulate_grad_batches=cfg.trainer.accumulate_grad_batches,
             check_val_every_n_epoch=cfg.trainer.validation_interval,
@@ -122,12 +159,18 @@ def _objective(self, trial: optuna.trial.Trial) -> float:
             deterministic=deterministic,
         )
 
-        pl_trainer.fit(
-            lightning_module,  # type: ignore
-            self.pytorch_loaders["train"],
-            self.pytorch_loaders["val"],
-        )
-        pruner_callback.check_pruned()
+        try:
+            pl_trainer.fit(
+                lightning_module,  # type: ignore
+                self.pytorch_loaders["train"],
+                self.pytorch_loaders["val"],
+            )
+
+            pruner_callback.check_pruned()
+
+        except optuna.TrialPruned as e:
+            # Pruning is signalled by raising an exception
+            logger.info(e)
 
         if "val/loss" not in pl_trainer.callback_metrics:
             raise ValueError(
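The try/except around `pl_trainer.fit` above reflects how Optuna's Lightning integration works: `PyTorchLightningPruningCallback` raises `optuna.TrialPruned` from inside training when the pruner decides to stop a trial early. A standalone sketch of the pattern, where `model`, `train_loader`, and `val_loader` are assumed inputs rather than objects from this repository:

```python
import lightning.pytorch as pl
import optuna
from optuna_integration import PyTorchLightningPruningCallback


def objective(trial, model, train_loader, val_loader) -> float:
    # The callback reports "val/loss" to Optuna after each validation epoch
    # and raises optuna.TrialPruned when the pruner decides to stop the trial.
    pruner_callback = PyTorchLightningPruningCallback(trial, monitor="val/loss")
    trainer = pl.Trainer(max_epochs=10, callbacks=[pruner_callback])
    try:
        trainer.fit(model, train_loader, val_loader)
        # Re-raises TrialPruned for trials pruned on a non-zero rank under DDP.
        pruner_callback.check_pruned()
    except optuna.TrialPruned as e:
        # Pruning is signalled via an exception; log it and fall through so
        # the last reported metric is still returned to the study.
        print(e)
    return trainer.callback_metrics["val/loss"].item()
```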
diff --git a/luxonis_train/utils/config.py b/luxonis_train/utils/config.py
index 96d132ab..3379f59f 100644
--- a/luxonis_train/utils/config.py
+++ b/luxonis_train/utils/config.py
@@ -286,6 +286,7 @@ class StorageConfig(CustomBaseModel):
 
 class TunerConfig(CustomBaseModel):
     study_name: str = "test-study"
+    continue_existing_study: bool = True
     use_pruner: bool = True
     n_trials: int | None = 15
     timeout: int | None = None
diff --git a/luxonis_train/utils/tracker.py b/luxonis_train/utils/tracker.py
index 13c77cb2..df157b3b 100644
--- a/luxonis_train/utils/tracker.py
+++ b/luxonis_train/utils/tracker.py
@@ -1,8 +1,23 @@
 from lightning.pytorch.loggers.logger import Logger
+from lightning.pytorch.utilities import rank_zero_only  # type: ignore
 from luxonis_ml.tracker import LuxonisTracker
 
 
 class LuxonisTrackerPL(LuxonisTracker, Logger):
     """Implementation of LuxonisTracker that is compatible with PytorchLightning."""
 
-    ...
+    @rank_zero_only
+    def finalize(self, status: str = "success") -> None:
+        """Finalizes the current run."""
+        if self.is_tensorboard:
+            self.experiment["tensorboard"].flush()
+            self.experiment["tensorboard"].close()
+        if self.is_mlflow:
+            # Lightning passes "success", "failed", or "finished"; map anything
+            # that is not a success onto FAILED so the status is always bound.
+            if status in ("success", "finished"):
+                mlflow_status = "FINISHED"
+            else:
+                mlflow_status = "FAILED"
+            self.experiment["mlflow"].end_run(mlflow_status)
+        if self.is_wandb:
+            # wandb expects an exit code: 0 for success, non-zero for failure
+            wandb_status = 0 if status == "success" else 1
+            self.experiment["wandb"].finish(wandb_status)
diff --git a/requirements.txt b/requirements.txt
index 6dc87275..42834ae6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,8 +5,8 @@ luxonis-ml[all]@git+https://github.com/luxonis/luxonis-ml.git@dev
 onnx>=1.12.0
 onnxruntime>=1.13.1
 onnxsim>=0.4.10
-optuna>=3.2.0
-optuna_integration>=3.6.0
+optuna>=3.6.0
+optuna-integration>=3.6.0
 parameterized>=0.9.0
 psycopg2-binary>=2.9.1
 pycocotools>=2.0.7
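The new `continue_existing_study` flag is passed straight through to `optuna.create_study(load_if_exists=...)` in `tune()`. A small sketch of the behaviour it toggles, with the SQLite URL as an assumed placeholder:

```python
import optuna

storage = "sqlite:///study.db"  # assumed storage location, for illustration only

# With load_if_exists=True (continue_existing_study: True), an existing
# study named "test-study" is resumed and new trials are appended to it.
study = optuna.create_study(
    study_name="test-study",
    storage=storage,
    direction="minimize",
    load_if_exists=True,
)

# With load_if_exists=False, the same call raises
# optuna.exceptions.DuplicatedStudyError if "test-study" already exists.
```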