Merge branch 'dev' into fix/tuner
klemen1999 authored May 13, 2024
2 parents e66d325 + d1d71f0 commit 02e6e99
Showing 51 changed files with 1,374 additions and 568 deletions.
35 changes: 18 additions & 17 deletions configs/README.md
@@ -142,23 +142,24 @@ To store and load the data we use LuxonisDataset and LuxonisLoader. For specific

Here you can change everything related to actual training of the model.

| Key | Type | Default value | Description |
| ----------------------- | --------------------------------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| batch_size | int | 32 | batch size used for training |
| accumulate_grad_batches | int | 1 | number of batches for gradient accumulation |
| use_weighted_sampler | bool | False | bool if use WeightedRandomSampler for training, only works with classification tasks |
| epochs | int | 100 | number of training epochs |
| num_workers | int | 2 | number of workers for data loading |
| train_metrics_interval | int | -1 | frequency of computing metrics on train data, -1 if don't perform |
| validation_interval | int | 1 | frequency of computing metrics on validation data |
| num_log_images | int | 4 | maximum number of images to visualize and log |
| skip_last_batch | bool | True | whether to skip last batch while training |
| accelerator | Literal\["auto", "cpu", "gpu"\] | "auto" | What accelerator to use for training. |
| devices | int \| list\[int\] \| str | "auto" | Either specify how many devices to use (int), list specific devices, or use "auto" for automatic configuration based on the selected accelerator |
| strategy | Literal\["auto", "ddp"\] | "auto" | What strategy to use for training. |
| num_sanity_val_steps | int | 2 | Number of sanity validation steps performed before training. |
| profiler | Literal\["simple", "advanced"\] \| None | None | PL profiler for GPU/CPU/RAM utilization analysis |
| verbose | bool | True | Print all intermediate results to console. |
| Key | Type | Default value | Description |
| ----------------------- | ---------------------------------------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| batch_size | int | 32 | batch size used for training |
| accumulate_grad_batches | int | 1 | number of batches for gradient accumulation |
| use_weighted_sampler    | bool                                           | False         | whether to use WeightedRandomSampler for training; only works with classification tasks |
| epochs | int | 100 | number of training epochs |
| num_workers | int | 2 | number of workers for data loading |
| train_metrics_interval  | int                                            | -1            | frequency of computing metrics on train data; -1 to disable |
| validation_interval | int | 1 | frequency of computing metrics on validation data |
| num_log_images | int | 4 | maximum number of images to visualize and log |
| skip_last_batch | bool | True | whether to skip last batch while training |
| accelerator | Literal\["auto", "cpu", "gpu"\] | "auto" | What accelerator to use for training. |
| devices | int \| list\[int\] \| str | "auto" | Either specify how many devices to use (int), list specific devices, or use "auto" for automatic configuration based on the selected accelerator |
| matmul_precision | Literal\["medium", "high", "highest"\] \| None | None | Sets the internal precision of float32 matrix multiplications. |
| strategy | Literal\["auto", "ddp"\] | "auto" | What strategy to use for training. |
| num_sanity_val_steps | int | 2 | Number of sanity validation steps performed before training. |
| profiler | Literal\["simple", "advanced"\] \| None | None | PL profiler for GPU/CPU/RAM utilization analysis |
| verbose | bool | True | Print all intermediate results to console. |
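
For example, a minimal `trainer` section exercising a few of these keys might look like the sketch below (values are illustrative, not recommended defaults):

```yaml
trainer:
  accelerator: auto            # or "cpu" / "gpu"
  devices: auto
  matmul_precision: high       # lower float32 matmul precision for faster training on supported GPUs
  batch_size: 64
  accumulate_grad_batches: 2
  epochs: 150
  num_workers: 4
  train_metrics_interval: -1   # -1 disables metric computation on the train set
  validation_interval: 5
  num_log_images: 4
```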

### Preprocessing

1 change: 0 additions & 1 deletion configs/coco_model.yaml
@@ -117,7 +117,6 @@ trainer:
validation_interval: 10
num_log_images: 8
skip_last_batch: True
main_head_index: 0
log_sub_losses: True
save_top_k: 3

58 changes: 58 additions & 0 deletions configs/resnet_model.yaml
@@ -0,0 +1,58 @@

model:
  name: resnet50_classification
  nodes:
    - name: ResNet
      params:
        variant: "50"
        download_weights: True

    - name: ClassificationHead
      inputs:
        - ResNet

  losses:
    - name: CrossEntropyLoss
      attached_to: ClassificationHead

  metrics:
    - name: Accuracy
      is_main_metric: true
      attached_to: ClassificationHead

  visualizers:
    - name: ClassificationVisualizer
      attached_to: ClassificationHead
      params:
        font_scale: 0.5
        color: [255, 0, 0]
        thickness: 2
        include_plot: True

dataset:
  name: cifar10_test

trainer:
  batch_size: 4
  epochs: &epochs 200
  num_workers: 4
  validation_interval: 10
  num_log_images: 8

  preprocessing:
    train_image_size: [&height 224, &width 224]
    keep_aspect_ratio: False
    normalize:
      active: True

  callbacks:
    - name: ExportOnTrainEnd
    - name: TestOnTrainEnd

  optimizer:
    name: SGD
    params:
      lr: 0.02

  scheduler:
    name: ConstantLR
41 changes: 33 additions & 8 deletions luxonis_train/__main__.py
@@ -45,11 +45,17 @@ def __str__(self):


@app.command()
def train(config: ConfigType = None, opts: OptsType = None):
def train(
    config: ConfigType = None,
    resume: Annotated[
        Optional[str], typer.Option(help="Resume training from this checkpoint.")
    ] = None,
    opts: OptsType = None,
):
    """Start training."""
    from luxonis_train.core import Trainer

    Trainer(str(config), opts).train()
    Trainer(str(config), opts, resume=resume).train()


@app.command()
@@ -200,6 +206,20 @@ def inspect(
    exit()


@app.command()
def archive(
    executable: Annotated[
        Optional[Path], typer.Option(help="Path to the model file.", show_default=False)
    ],
    config: ConfigType = None,
    opts: OptsType = None,
):
    """Generate NN archive."""
    from luxonis_train.core import Archiver

    Archiver(str(config), opts).archive(executable)


def version_callback(value: bool):
    if value:
        typer.echo(f"LuxonisTrain Version: {version(__package__)}")
@@ -214,13 +234,18 @@ def common(
"--version", callback=version_callback, help="Show version and exit."
),
] = False,
source: Annotated[
Optional[Path],
typer.Option(
help="Path to a python file with custom components. "
"Will be sourced before running the command.",
metavar="FILE",
),
] = None,
):
...


def main():
app()
if source:
exec(source.read_text())


if __name__ == "__main__":
main()
app()
@@ -47,7 +47,7 @@ def forward(
    ) -> tuple[Tensor, Tensor]:
        for visualizer in self.visualizers:
            match visualizer.run(label_canvas, prediction_canvas, outputs, labels):
                case Tensor(data=prediction_viz):
                case Tensor() as prediction_viz:
                    prediction_canvas = prediction_viz
                case (Tensor(data=label_viz), Tensor(data=prediction_viz)):
                    label_canvas = label_viz
2 changes: 1 addition & 1 deletion luxonis_train/attached_modules/visualizers/utils.py
@@ -405,7 +405,7 @@ def resize_to_match(
        return fst_resized, snd_resized

    match visualization:
        case Tensor(data=viz):
        case Tensor() as viz:
            return viz
        case (Tensor(data=viz_labels), Tensor(data=viz_predictions)):
            viz_labels, viz_predictions = resize_to_match(viz_labels, viz_predictions)
9 changes: 9 additions & 0 deletions luxonis_train/callbacks/README.md
@@ -9,6 +9,7 @@ List of all supported callbacks.
- [LuxonisProgressBar](#luxonisprogressbar)
- [MetadataLogger](#metadatalogger)
- [TestOnTrainEnd](#testontrainend)
- [UploadCheckpoint](#uploadcheckpoint)

## PytorchLightning Callbacks

@@ -51,3 +52,11 @@ Metadata include all defined hyperparameters together with git hashes of `luxoni
## TestOnTrainEnd

Callback to perform a test run at the end of the training.

## UploadCheckpoint

Callback that uploads the current best checkpoint (based on validation loss) to the specified cloud directory after every validation epoch.

| Key | Type | Default value | Description |
| ---------------- | ---- | ------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| upload_directory | str  | /             | Path to the cloud directory where checkpoints should be uploaded. To use the current MLFlow run, set it to `mlflow://`. |
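
A hypothetical snippet enabling this callback from a training config (assuming callbacks live under the `trainer` section as in the main config README; the `upload_directory` value is only an example) could look like:

```yaml
trainer:
  callbacks:
    - name: UploadCheckpoint
      params:
        upload_directory: mlflow://   # or e.g. s3://my-bucket/checkpoints
```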
6 changes: 4 additions & 2 deletions luxonis_train/callbacks/__init__.py
@@ -8,12 +8,13 @@

from luxonis_train.utils.registry import CALLBACKS

from .archive_on_train_end import ArchiveOnTrainEnd
from .export_on_train_end import ExportOnTrainEnd
from .luxonis_progress_bar import LuxonisProgressBar
from .metadata_logger import MetadataLogger
from .module_freezer import ModuleFreezer
from .test_on_train_end import TestOnTrainEnd
from .upload_checkpoint_on_train_end import UploadCheckpointOnTrainEnd
from .upload_checkpoint import UploadCheckpoint

CALLBACKS.register_module(module=EarlyStopping)
CALLBACKS.register_module(module=LearningRateMonitor)
@@ -23,10 +24,11 @@


__all__ = [
"ArchiveOnTrainEnd",
"ExportOnTrainEnd",
"LuxonisProgressBar",
"MetadataLogger",
"ModuleFreezer",
"TestOnTrainEnd",
"UploadCheckpointOnTrainEnd",
"UploadCheckpoint",
]
72 changes: 72 additions & 0 deletions luxonis_train/callbacks/archive_on_train_end.py
@@ -0,0 +1,72 @@
import logging
import os
from pathlib import Path
from typing import cast

import lightning.pytorch as pl

from luxonis_train.utils.config import Config
from luxonis_train.utils.registry import CALLBACKS
from luxonis_train.utils.tracker import LuxonisTrackerPL


@CALLBACKS.register_module()
class ArchiveOnTrainEnd(pl.Callback):
    def __init__(self, upload_to_mlflow: bool = False):
        """Callback that performs archiving of onnx or exported model at the end of
        training/export. TODO: description.
        @type upload_to_mlflow: bool
        @param upload_to_mlflow: If set to True, overrides the upload url in Archiver
            with currently active MLFlow run (if present).
        """
        super().__init__()
        self.upload_to_mlflow = upload_to_mlflow

    def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
        """Archives the model on train end.
        @type trainer: L{pl.Trainer}
        @param trainer: Pytorch Lightning trainer.
        @type pl_module: L{pl.LightningModule}
        @param pl_module: Pytorch Lightning module.
        @raises RuntimeError: If no best model path is found.
        """
        from luxonis_train.core.archiver import Archiver

        model_checkpoint_callbacks = [
            c
            for c in trainer.callbacks  # type: ignore
            if isinstance(c, pl.callbacks.ModelCheckpoint)  # type: ignore
        ]

        # NOTE: assume that first checkpoint callback is based on val loss
        best_model_path = model_checkpoint_callbacks[0].best_model_path
        if not best_model_path:
            raise RuntimeError(
                "No best model path found. "
                "Please make sure that ModelCheckpoint callback is present "
                "and at least one validation epoch has been performed."
            )
        cfg: Config = pl_module.cfg
        cfg.model.weights = best_model_path
        if self.upload_to_mlflow:
            if cfg.tracker.is_mlflow:
                tracker = cast(LuxonisTrackerPL, trainer.logger)
                new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
                cfg.archiver.upload_url = new_upload_url
            else:
                logging.getLogger(__name__).warning(
                    "`upload_to_mlflow` is set to True, "
                    "but there is no MLFlow active run, skipping."
                )

        onnx_path = str(Path(best_model_path).parent.with_suffix(".onnx"))
        if not os.path.exists(onnx_path):
            raise FileNotFoundError(
                "Model executable not found. Make sure to run exporter callback before archiver callback"
            )

        archiver = Archiver(cfg=cfg)

        archiver.archive(onnx_path)
20 changes: 12 additions & 8 deletions luxonis_train/callbacks/export_on_train_end.py
@@ -8,6 +8,8 @@
from luxonis_train.utils.registry import CALLBACKS
from luxonis_train.utils.tracker import LuxonisTrackerPL

logger = logging.getLogger(__name__)


@CALLBACKS.register_module()
class ExportOnTrainEnd(pl.Callback):
@@ -41,22 +43,24 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> No
        # NOTE: assume that first checkpoint callback is based on val loss
        best_model_path = model_checkpoint_callbacks[0].best_model_path
        if not best_model_path:
            raise RuntimeError(
                "No best model path found. "
                "Please make sure that ModelCheckpoint callback is present "
                "and at least one validation epoch has been performed."
            logger.error(
                "No model checkpoint found. "
                "Make sure that `ModelCheckpoint` callback is present "
                "and at least one validation epoch has been performed. "
                "Skipping model export."
            )
            return
        cfg: Config = pl_module.cfg
        cfg.model.weights = best_model_path
        if self.upload_to_mlflow:
            if cfg.tracker.is_mlflow:
                tracker = cast(LuxonisTrackerPL, trainer.logger)
                new_upload_directory = f"mlflow://{tracker.project_id}/{tracker.run_id}"
                cfg.exporter.upload_directory = new_upload_directory
                new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
                cfg.exporter.upload_url = new_upload_url
            else:
                logging.getLogger(__name__).warning(
                logger.error(
                    "`upload_to_mlflow` is set to True, "
                    "but there is no MLFlow active run, skipping."
                    "but there is no MLFlow active run, skipping."
                )
        exporter = Exporter(cfg=cfg)
        onnx_path = str(Path(best_model_path).parent.with_suffix(".onnx"))
2 changes: 1 addition & 1 deletion luxonis_train/callbacks/luxonis_progress_bar.py
@@ -28,7 +28,7 @@ def get_metrics(
    ) -> dict[str, int | str | float | dict[str, float]]:
        # NOTE: there might be a cleaner way of doing this
        items = super().get_metrics(trainer, pl_module)
        if trainer.training:
        if trainer.training and pl_module.training_step_outputs:
            items["Loss"] = pl_module.training_step_outputs[-1]["loss"].item()
        return items
