Merge branch 'dev' into fix/tuner
klemen1999 authored May 13, 2024
2 parents e66d325 + d1d71f0 commit 02e6e99
Showing 51 changed files with 1,374 additions and 568 deletions.
35 changes: 18 additions & 17 deletions configs/README.md
@@ -142,23 +142,24 @@ To store and load the data we use LuxonisDataset and LuxonisLoader. For specific

Here you can change everything related to actual training of the model.

| Key | Type | Default value | Description |
| ----------------------- | --------------------------------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| batch_size | int | 32 | batch size used for training |
| accumulate_grad_batches | int | 1 | number of batches for gradient accumulation |
| use_weighted_sampler | bool | False | bool if use WeightedRandomSampler for training, only works with classification tasks |
| epochs | int | 100 | number of training epochs |
| num_workers | int | 2 | number of workers for data loading |
| train_metrics_interval | int | -1 | frequency of computing metrics on train data, -1 if don't perform |
| validation_interval | int | 1 | frequency of computing metrics on validation data |
| num_log_images | int | 4 | maximum number of images to visualize and log |
| skip_last_batch | bool | True | whether to skip last batch while training |
| accelerator | Literal\["auto", "cpu", "gpu"\] | "auto" | What accelerator to use for training. |
| devices | int \| list\[int\] \| str | "auto" | Either specify how many devices to use (int), list specific devices, or use "auto" for automatic configuration based on the selected accelerator |
| strategy | Literal\["auto", "ddp"\] | "auto" | What strategy to use for training. |
| num_sanity_val_steps | int | 2 | Number of sanity validation steps performed before training. |
| profiler | Literal\["simple", "advanced"\] \| None | None | PL profiler for GPU/CPU/RAM utilization analysis |
| verbose | bool | True | Print all intermediate results to console. |
| Key | Type | Default value | Description |
| ----------------------- | ---------------------------------------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| batch_size | int | 32 | batch size used for training |
| accumulate_grad_batches | int | 1 | number of batches for gradient accumulation |
| use_weighted_sampler    | bool                                           | False         | whether to use WeightedRandomSampler for training; only works with classification tasks |
| epochs | int | 100 | number of training epochs |
| num_workers | int | 2 | number of workers for data loading |
| train_metrics_interval  | int                                            | -1            | frequency of computing metrics on train data; -1 to disable |
| validation_interval | int | 1 | frequency of computing metrics on validation data |
| num_log_images | int | 4 | maximum number of images to visualize and log |
| skip_last_batch | bool | True | whether to skip last batch while training |
| accelerator | Literal\["auto", "cpu", "gpu"\] | "auto" | What accelerator to use for training. |
| devices | int \| list\[int\] \| str | "auto" | Either specify how many devices to use (int), list specific devices, or use "auto" for automatic configuration based on the selected accelerator |
| matmul_precision | Literal\["medium", "high", "highest"\] \| None | None | Sets the internal precision of float32 matrix multiplications. |
| strategy | Literal\["auto", "ddp"\] | "auto" | What strategy to use for training. |
| num_sanity_val_steps | int | 2 | Number of sanity validation steps performed before training. |
| profiler | Literal\["simple", "advanced"\] \| None | None | PL profiler for GPU/CPU/RAM utilization analysis |
| verbose | bool | True | Print all intermediate results to console. |
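
For example, a minimal `trainer` section exercising a few of these keys might look like the sketch below (values are illustrative, not recommended defaults):

```yaml
trainer:
  accelerator: auto            # or "cpu" / "gpu"
  devices: auto
  matmul_precision: high       # lower float32 matmul precision for faster training on supported GPUs
  batch_size: 64
  accumulate_grad_batches: 2
  epochs: 150
  num_workers: 4
  train_metrics_interval: -1   # -1 disables metric computation on the train set
  validation_interval: 5
  num_log_images: 4
```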

### Preprocessing

1 change: 0 additions & 1 deletion configs/coco_model.yaml
@@ -117,7 +117,6 @@ trainer:
validation_interval: 10
num_log_images: 8
skip_last_batch: True
main_head_index: 0
log_sub_losses: True
save_top_k: 3

58 changes: 58 additions & 0 deletions configs/resnet_model.yaml
@@ -0,0 +1,58 @@

model:
  name: resnet50_classification
  nodes:
    - name: ResNet
      params:
        variant: "50"
        download_weights: True

    - name: ClassificationHead
      inputs:
        - ResNet

  losses:
    - name: CrossEntropyLoss
      attached_to: ClassificationHead

  metrics:
    - name: Accuracy
      is_main_metric: true
      attached_to: ClassificationHead

  visualizers:
    - name: ClassificationVisualizer
      attached_to: ClassificationHead
      params:
        font_scale: 0.5
        color: [255, 0, 0]
        thickness: 2
        include_plot: True

dataset:
  name: cifar10_test

trainer:
  batch_size: 4
  epochs: &epochs 200
  num_workers: 4
  validation_interval: 10
  num_log_images: 8

  preprocessing:
    train_image_size: [&height 224, &width 224]
    keep_aspect_ratio: False
    normalize:
      active: True

  callbacks:
    - name: ExportOnTrainEnd
    - name: TestOnTrainEnd

  optimizer:
    name: SGD
    params:
      lr: 0.02

  scheduler:
    name: ConstantLR
41 changes: 33 additions & 8 deletions luxonis_train/__main__.py
@@ -45,11 +45,17 @@ def __str__(self):


@app.command()
def train(config: ConfigType = None, opts: OptsType = None):
def train(
    config: ConfigType = None,
    resume: Annotated[
        Optional[str], typer.Option(help="Resume training from this checkpoint.")
    ] = None,
    opts: OptsType = None,
):
    """Start training."""
    from luxonis_train.core import Trainer

    Trainer(str(config), opts).train()
    Trainer(str(config), opts, resume=resume).train()


@app.command()
@@ -200,6 +206,20 @@ def inspect(
    exit()


@app.command()
def archive(
    executable: Annotated[
        Optional[Path], typer.Option(help="Path to the model file.", show_default=False)
    ],
    config: ConfigType = None,
    opts: OptsType = None,
):
    """Generate NN archive."""
    from luxonis_train.core import Archiver

    Archiver(str(config), opts).archive(executable)


def version_callback(value: bool):
    if value:
        typer.echo(f"LuxonisTrain Version: {version(__package__)}")
@@ -214,13 +234,18 @@ def common(
"--version", callback=version_callback, help="Show version and exit."
),
] = False,
source: Annotated[
Optional[Path],
typer.Option(
help="Path to a python file with custom components. "
"Will be sourced before running the command.",
metavar="FILE",
),
] = None,
):
...


def main():
app()
if source:
exec(source.read_text())


if __name__ == "__main__":
main()
app()
@@ -47,7 +47,7 @@ def forward(
    ) -> tuple[Tensor, Tensor]:
        for visualizer in self.visualizers:
            match visualizer.run(label_canvas, prediction_canvas, outputs, labels):
                case Tensor(data=prediction_viz):
                case Tensor() as prediction_viz:
                    prediction_canvas = prediction_viz
                case (Tensor(data=label_viz), Tensor(data=prediction_viz)):
                    label_canvas = label_viz
2 changes: 1 addition & 1 deletion luxonis_train/attached_modules/visualizers/utils.py
@@ -405,7 +405,7 @@ def resize_to_match(
        return fst_resized, snd_resized

    match visualization:
        case Tensor(data=viz):
        case Tensor() as viz:
            return viz
        case (Tensor(data=viz_labels), Tensor(data=viz_predictions)):
            viz_labels, viz_predictions = resize_to_match(viz_labels, viz_predictions)
9 changes: 9 additions & 0 deletions luxonis_train/callbacks/README.md
@@ -9,6 +9,7 @@ List of all supported callbacks.
- [LuxonisProgressBar](#luxonisprogressbar)
- [MetadataLogger](#metadatalogger)
- [TestOnTrainEnd](#testontrainend)
- [UploadCheckpoint](#uploadcheckpoint)

## PytorchLightning Callbacks

@@ -51,3 +52,11 @@ Metadata include all defined hyperparameters together with git hashes of `luxoni
## TestOnTrainEnd

Callback to perform a test run at the end of the training.

## UploadCheckpoint

Callback that uploads the current best checkpoint (based on validation loss) to the specified cloud directory after every validation epoch.

| Key | Type | Default value | Description |
| ---------------- | ---- | ------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| upload_directory | str  | /             | Path to the cloud directory where checkpoints should be uploaded. To use the current MLFlow run, set it to `mlflow://`. |
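
A hypothetical snippet enabling this callback from a training config (assuming callbacks live under the `trainer` section as in the main config README; the `upload_directory` value is only an example) could look like:

```yaml
trainer:
  callbacks:
    - name: UploadCheckpoint
      params:
        upload_directory: mlflow://   # or e.g. s3://my-bucket/checkpoints
```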
6 changes: 4 additions & 2 deletions luxonis_train/callbacks/__init__.py
@@ -8,12 +8,13 @@

from luxonis_train.utils.registry import CALLBACKS

from .archive_on_train_end import ArchiveOnTrainEnd
from .export_on_train_end import ExportOnTrainEnd
from .luxonis_progress_bar import LuxonisProgressBar
from .metadata_logger import MetadataLogger
from .module_freezer import ModuleFreezer
from .test_on_train_end import TestOnTrainEnd
from .upload_checkpoint_on_train_end import UploadCheckpointOnTrainEnd
from .upload_checkpoint import UploadCheckpoint

CALLBACKS.register_module(module=EarlyStopping)
CALLBACKS.register_module(module=LearningRateMonitor)
@@ -23,10 +24,11 @@


__all__ = [
"ArchiveOnTrainEnd",
"ExportOnTrainEnd",
"LuxonisProgressBar",
"MetadataLogger",
"ModuleFreezer",
"TestOnTrainEnd",
"UploadCheckpointOnTrainEnd",
"UploadCheckpoint",
]
72 changes: 72 additions & 0 deletions luxonis_train/callbacks/archive_on_train_end.py
@@ -0,0 +1,72 @@
import logging
import os
from pathlib import Path
from typing import cast

import lightning.pytorch as pl

from luxonis_train.utils.config import Config
from luxonis_train.utils.registry import CALLBACKS
from luxonis_train.utils.tracker import LuxonisTrackerPL


@CALLBACKS.register_module()
class ArchiveOnTrainEnd(pl.Callback):
    def __init__(self, upload_to_mlflow: bool = False):
        """Callback that performs archiving of onnx or exported model at the end of
        training/export. TODO: description.
        @type upload_to_mlflow: bool
        @param upload_to_mlflow: If set to True, overrides the upload url in Archiver
            with currently active MLFlow run (if present).
        """
        super().__init__()
        self.upload_to_mlflow = upload_to_mlflow

    def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
        """Archives the model on train end.
        @type trainer: L{pl.Trainer}
        @param trainer: Pytorch Lightning trainer.
        @type pl_module: L{pl.LightningModule}
        @param pl_module: Pytorch Lightning module.
        @raises RuntimeError: If no best model path is found.
        """
        from luxonis_train.core.archiver import Archiver

        model_checkpoint_callbacks = [
            c
            for c in trainer.callbacks  # type: ignore
            if isinstance(c, pl.callbacks.ModelCheckpoint)  # type: ignore
        ]

        # NOTE: assume that first checkpoint callback is based on val loss
        best_model_path = model_checkpoint_callbacks[0].best_model_path
        if not best_model_path:
            raise RuntimeError(
                "No best model path found. "
                "Please make sure that ModelCheckpoint callback is present "
                "and at least one validation epoch has been performed."
            )
        cfg: Config = pl_module.cfg
        cfg.model.weights = best_model_path
        if self.upload_to_mlflow:
            if cfg.tracker.is_mlflow:
                tracker = cast(LuxonisTrackerPL, trainer.logger)
                new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
                cfg.archiver.upload_url = new_upload_url
            else:
                logging.getLogger(__name__).warning(
                    "`upload_to_mlflow` is set to True, "
                    "but there is no MLFlow active run, skipping."
                )

        onnx_path = str(Path(best_model_path).parent.with_suffix(".onnx"))
        if not os.path.exists(onnx_path):
            raise FileNotFoundError(
                "Model executable not found. Make sure to run exporter callback before archiver callback"
            )

        archiver = Archiver(cfg=cfg)

        archiver.archive(onnx_path)
20 changes: 12 additions & 8 deletions luxonis_train/callbacks/export_on_train_end.py
@@ -8,6 +8,8 @@
from luxonis_train.utils.registry import CALLBACKS
from luxonis_train.utils.tracker import LuxonisTrackerPL

logger = logging.getLogger(__name__)


@CALLBACKS.register_module()
class ExportOnTrainEnd(pl.Callback):
@@ -41,22 +43,24 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> No
        # NOTE: assume that first checkpoint callback is based on val loss
        best_model_path = model_checkpoint_callbacks[0].best_model_path
        if not best_model_path:
            raise RuntimeError(
                "No best model path found. "
                "Please make sure that ModelCheckpoint callback is present "
                "and at least one validation epoch has been performed."
            logger.error(
                "No model checkpoint found. "
                "Make sure that `ModelCheckpoint` callback is present "
                "and at least one validation epoch has been performed. "
                "Skipping model export."
            )
            return
        cfg: Config = pl_module.cfg
        cfg.model.weights = best_model_path
        if self.upload_to_mlflow:
            if cfg.tracker.is_mlflow:
                tracker = cast(LuxonisTrackerPL, trainer.logger)
                new_upload_directory = f"mlflow://{tracker.project_id}/{tracker.run_id}"
                cfg.exporter.upload_directory = new_upload_directory
                new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
                cfg.exporter.upload_url = new_upload_url
            else:
                logging.getLogger(__name__).warning(
                logger.error(
                    "`upload_to_mlflow` is set to True, "
                    "but there is no MLFlow active run, skipping."
                    "but there is no MLFlow active run, skipping."
                )
        exporter = Exporter(cfg=cfg)
        onnx_path = str(Path(best_model_path).parent.with_suffix(".onnx"))
2 changes: 1 addition & 1 deletion luxonis_train/callbacks/luxonis_progress_bar.py
@@ -28,7 +28,7 @@ def get_metrics(
    ) -> dict[str, int | str | float | dict[str, float]]:
        # NOTE: there might be a cleaner way of doing this
        items = super().get_metrics(trainer, pl_module)
        if trainer.training:
        if trainer.training and pl_module.training_step_outputs:
            items["Loss"] = pl_module.training_step_outputs[-1]["loss"].item()
        return items
