Merge branch 'dev' into feature/seed

kozlov721 authored May 13, 2024
2 parents 5884c44 + d1d71f0 commit c5b77be

Showing 40 changed files with 1,185 additions and 263 deletions.
37 changes: 19 additions & 18 deletions configs/README.md
@@ -142,24 +142,25 @@ To store and load the data we use LuxonisDataset and LuxonisLoader. For specific

Here you can change everything related to actual training of the model.

| Key                     | Type                                           | Default value | Description                                                                                                                                     |
| ----------------------- | ---------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| seed                    | int                                            | None          | Seed for reproducibility                                                                                                                        |
| batch_size              | int                                            | 32            | Batch size used for training                                                                                                                    |
| accumulate_grad_batches | int                                            | 1             | Number of batches for gradient accumulation                                                                                                     |
| use_weighted_sampler    | bool                                           | False         | Whether to use `WeightedRandomSampler` for training; only works with classification tasks                                                      |
| epochs                  | int                                            | 100           | Number of training epochs                                                                                                                       |
| num_workers             | int                                            | 2             | Number of workers for data loading                                                                                                              |
| train_metrics_interval  | int                                            | -1            | Frequency of computing metrics on train data; -1 to disable                                                                                    |
| validation_interval     | int                                            | 1             | Frequency of computing metrics on validation data                                                                                              |
| num_log_images          | int                                            | 4             | Maximum number of images to visualize and log                                                                                                  |
| skip_last_batch         | bool                                           | True          | Whether to skip the last batch while training                                                                                                  |
| accelerator             | Literal\["auto", "cpu", "gpu"\]                | "auto"        | Which accelerator to use for training                                                                                                          |
| devices                 | int \| list\[int\] \| str                      | "auto"        | Either the number of devices to use (int), a list of specific devices, or "auto" for automatic configuration based on the selected accelerator |
| matmul_precision        | Literal\["medium", "high", "highest"\] \| None | None          | Internal precision of float32 matrix multiplications                                                                                           |
| strategy                | Literal\["auto", "ddp"\]                       | "auto"        | Which strategy to use for training                                                                                                             |
| num_sanity_val_steps    | int                                            | 2             | Number of sanity validation steps performed before training                                                                                    |
| profiler                | Literal\["simple", "advanced"\] \| None        | None          | PL profiler for GPU/CPU/RAM utilization analysis                                                                                               |
| verbose                 | bool                                           | True          | Whether to print all intermediate results to the console                                                                                       |
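
As a concrete illustration, below is a minimal sketch of overriding a few of these keys programmatically. It assumes the config section is named `trainer` and that `opts` accepts alternating dot-notation keys and values, mirroring the `opts` argument of the CLI commands; the exact override format may differ.

```python
from luxonis_train.core import Trainer

# Hypothetical overrides; assumes `opts` takes alternating
# dot-notation keys and values (format not confirmed by this diff).
overrides = [
    "trainer.seed", "42",
    "trainer.batch_size", "16",
    "trainer.matmul_precision", "high",
]

Trainer("configs/model.yaml", overrides).train()  # placeholder config path
```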

### Preprocessing

24 changes: 22 additions & 2 deletions luxonis_train/__main__.py
@@ -45,11 +45,17 @@ def __str__(self):


@app.command()
def train(config: ConfigType = None, opts: OptsType = None):
def train(
config: ConfigType = None,
resume: Annotated[
Optional[str], typer.Option(help="Resume training from this checkpoint.")
] = None,
opts: OptsType = None,
):
"""Start training."""
from luxonis_train.core import Trainer

Trainer(str(config), opts).train()
Trainer(str(config), opts, resume=resume).train()
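
Resuming is now exposed both on the CLI and when driving the `Trainer` directly. A short sketch of the programmatic form (the config and checkpoint paths are placeholders):

```python
from luxonis_train.core import Trainer

# `resume` is forwarded to the Trainer exactly as the CLI command does above.
Trainer("configs/model.yaml", None, resume="output/run/best.ckpt").train()
```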


@app.command()
@@ -203,6 +209,20 @@ def inspect(
exit()


@app.command()
def archive(
executable: Annotated[
Optional[Path], typer.Option(help="Path to the model file.", show_default=False)
],
config: ConfigType = None,
opts: OptsType = None,
):
"""Generate NN archive."""
from luxonis_train.core import Archiver

Archiver(str(config), opts).archive(executable)


def version_callback(value: bool):
if value:
typer.echo(f"LuxonisTrain Version: {version(__package__)}")
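
The new `archive` command wraps the `Archiver` core class. For reference, a hedged sketch of the equivalent programmatic call (paths are placeholders):

```python
from luxonis_train.core import Archiver

# Placeholder paths; `archive()` is assumed to take the path to the
# exported model file, mirroring the `executable` option of the CLI command.
Archiver("configs/model.yaml", opts=None).archive("output/model.onnx")
```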
2 changes: 2 additions & 0 deletions luxonis_train/callbacks/__init__.py
@@ -8,6 +8,7 @@

from luxonis_train.utils.registry import CALLBACKS

from .archive_on_train_end import ArchiveOnTrainEnd
from .export_on_train_end import ExportOnTrainEnd
from .luxonis_progress_bar import LuxonisProgressBar
from .metadata_logger import MetadataLogger
@@ -23,6 +24,7 @@


__all__ = [
"ArchiveOnTrainEnd",
"ExportOnTrainEnd",
"LuxonisProgressBar",
"MetadataLogger",
72 changes: 72 additions & 0 deletions luxonis_train/callbacks/archive_on_train_end.py
@@ -0,0 +1,72 @@
import logging
import os
from pathlib import Path
from typing import cast

import lightning.pytorch as pl

from luxonis_train.utils.config import Config
from luxonis_train.utils.registry import CALLBACKS
from luxonis_train.utils.tracker import LuxonisTrackerPL


@CALLBACKS.register_module()
class ArchiveOnTrainEnd(pl.Callback):
def __init__(self, upload_to_mlflow: bool = False):
"""Callback that performs archiving of onnx or exported model at the end of
training/export. TODO: description.
@type upload_to_mlflow: bool
@param upload_to_mlflow: If set to True, overrides the upload url in Archiver
with currently active MLFlow run (if present).
"""
super().__init__()
self.upload_to_mlflow = upload_to_mlflow

def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
"""Archives the model on train end.
@type trainer: L{pl.Trainer}
@param trainer: Pytorch Lightning trainer.
@type pl_module: L{pl.LightningModule}
@param pl_module: Pytorch Lightning module.
@raises RuntimeError: If no best model path is found.
"""
from luxonis_train.core.archiver import Archiver

model_checkpoint_callbacks = [
c
for c in trainer.callbacks # type: ignore
if isinstance(c, pl.callbacks.ModelCheckpoint) # type: ignore
]

# NOTE: assume that first checkpoint callback is based on val loss
best_model_path = model_checkpoint_callbacks[0].best_model_path
if not best_model_path:
raise RuntimeError(
"No best model path found. "
"Please make sure that ModelCheckpoint callback is present "
"and at least one validation epoch has been performed."
)
cfg: Config = pl_module.cfg
cfg.model.weights = best_model_path
if self.upload_to_mlflow:
if cfg.tracker.is_mlflow:
tracker = cast(LuxonisTrackerPL, trainer.logger)
new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
cfg.archiver.upload_url = new_upload_url
else:
logging.getLogger(__name__).warning(
"`upload_to_mlflow` is set to True, "
"but there is no MLFlow active run, skipping."
)

onnx_path = str(Path(best_model_path).parent.with_suffix(".onnx"))
if not os.path.exists(onnx_path):
raise FileNotFoundError(
"Model executable not found. Make sure to run exporter callback before archiver callback"
)

archiver = Archiver(cfg=cfg)

archiver.archive(onnx_path)
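
Because `ArchiveOnTrainEnd` expects the `.onnx` file produced by the exporter, the export callback has to run first. Below is a rough sketch of wiring both into a Lightning trainer; the monitored metric name is a placeholder, and in the project these callbacks are presumably enabled through the config and the `CALLBACKS` registry rather than constructed by hand:

```python
import lightning.pytorch as pl

from luxonis_train.callbacks import ArchiveOnTrainEnd, ExportOnTrainEnd

# Hypothetical wiring: both are plain Lightning callbacks, so they can sit
# next to a ModelCheckpoint instance. Order matters: the exporter must
# produce the .onnx before the archiver tries to consume it.
trainer = pl.Trainer(
    max_epochs=100,
    callbacks=[
        pl.callbacks.ModelCheckpoint(monitor="val/loss"),  # placeholder metric name
        ExportOnTrainEnd(),
        ArchiveOnTrainEnd(upload_to_mlflow=False),
    ],
)
```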
16 changes: 10 additions & 6 deletions luxonis_train/callbacks/export_on_train_end.py
@@ -8,6 +8,8 @@
from luxonis_train.utils.registry import CALLBACKS
from luxonis_train.utils.tracker import LuxonisTrackerPL

logger = logging.getLogger(__name__)


@CALLBACKS.register_module()
class ExportOnTrainEnd(pl.Callback):
@@ -41,11 +43,13 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
# NOTE: assume that first checkpoint callback is based on val loss
best_model_path = model_checkpoint_callbacks[0].best_model_path
if not best_model_path:
raise RuntimeError(
"No best model path found. "
"Please make sure that ModelCheckpoint callback is present "
"and at least one validation epoch has been performed."
logger.error(
"No model checkpoint found. "
"Make sure that `ModelCheckpoint` callback is present "
"and at least one validation epoch has been performed. "
"Skipping model export."
)
return
cfg: Config = pl_module.cfg
cfg.model.weights = best_model_path
if self.upload_to_mlflow:
@@ -54,9 +58,9 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
cfg.exporter.upload_url = new_upload_url
else:
logging.getLogger(__name__).warning(
logger.error(
"`upload_to_mlflow` is set to True, "
"but there is no MLFlow active run, skipping."
"but there is no MLFlow active run, skipping."
)
exporter = Exporter(cfg=cfg)
onnx_path = str(Path(best_model_path).parent.with_suffix(".onnx"))
2 changes: 1 addition & 1 deletion luxonis_train/callbacks/luxonis_progress_bar.py
@@ -28,7 +28,7 @@ def get_metrics(
) -> dict[str, int | str | float | dict[str, float]]:
# NOTE: there might be a cleaner way of doing this
items = super().get_metrics(trainer, pl_module)
if trainer.training:
if trainer.training and pl_module.training_step_outputs:
items["Loss"] = pl_module.training_step_outputs[-1]["loss"].item()
return items

47 changes: 26 additions & 21 deletions luxonis_train/callbacks/upload_checkpoint.py
@@ -1,5 +1,6 @@
import logging
import os
from pathlib import Path
from typing import Any

import lightning.pytorch as pl
@@ -25,37 +26,41 @@ def __init__(self, upload_directory: str):
)
self.logger = logging.getLogger(__name__)
self.last_logged_epoch = None
self.last_best_checkpoint = None
self.last_best_checkpoints = set()

def on_save_checkpoint(
self,
trainer: pl.Trainer,
pl_module: pl.LightningModule,
_: pl.LightningModule,
checkpoint: dict[str, Any],
) -> None:
# Log only once per epoch in case there are multiple ModelCheckpoint callbacks
if not self.last_logged_epoch == trainer.current_epoch:
model_checkpoint_callbacks = [
c
checkpoint_paths = [
c.best_model_path
for c in trainer.callbacks # type: ignore
if isinstance(c, pl.callbacks.ModelCheckpoint) # type: ignore
and c.best_model_path
]
# NOTE: assume that first checkpoint callback is based on val loss
curr_best_checkpoint = model_checkpoint_callbacks[0].best_model_path

if self.last_best_checkpoint != curr_best_checkpoint:
self.logger.info(f"Started checkpoint upload to {self.fs.full_path}...")
temp_filename = "curr_best_val_loss.ckpt"
torch.save(checkpoint, temp_filename)
self.fs.put_file(
local_path=temp_filename,
remote_path=temp_filename,
mlflow_instance=trainer.logger.experiment.get( # type: ignore
"mlflow", None
),
)
os.remove(temp_filename)
self.logger.info("Checkpoint upload finished")
self.last_best_checkpoint = curr_best_checkpoint
for curr_best_checkpoint in checkpoint_paths:
if curr_best_checkpoint not in self.last_best_checkpoints:
self.logger.info(
f"Started checkpoint upload to {self.fs.full_path}..."
)
temp_filename = (
Path(curr_best_checkpoint).parent.with_suffix(".ckpt").name
)
torch.save(checkpoint, temp_filename)

self.fs.put_file(
local_path=temp_filename,
remote_path=temp_filename,
mlflow_instance=trainer.logger.experiment.get( # type: ignore
"mlflow", None
),
)
os.remove(temp_filename)
self.logger.info("Checkpoint upload finished")
self.last_best_checkpoints.add(curr_best_checkpoint)

self.last_logged_epoch = trainer.current_epoch
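
With this change the callback remembers every best-checkpoint path it has already uploaded, so each `ModelCheckpoint` callback can trigger its own upload. A rough sketch of attaching it (the class name `UploadCheckpoint`, the metric names, and the remote directory are assumptions):

```python
import lightning.pytorch as pl

from luxonis_train.callbacks import UploadCheckpoint  # assumed export name

# Hypothetical usage: upload each new best checkpoint to a remote directory.
trainer = pl.Trainer(
    callbacks=[
        pl.callbacks.ModelCheckpoint(monitor="val/loss"),      # placeholder metric name
        pl.callbacks.ModelCheckpoint(monitor="val/accuracy"),  # second tracked metric
        UploadCheckpoint(upload_directory="gs://my-bucket/checkpoints"),  # placeholder URL
    ],
)
```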
3 changes: 2 additions & 1 deletion luxonis_train/core/__init__.py
@@ -1,6 +1,7 @@
from .archiver import Archiver
from .exporter import Exporter
from .inferer import Inferer
from .trainer import Trainer
from .tuner import Tuner

__all__ = ["Exporter", "Trainer", "Tuner", "Inferer"]
__all__ = ["Exporter", "Trainer", "Tuner", "Inferer", "Archiver"]
