Merge branch 'dev' into feature/seed

kozlov721 authored May 13, 2024
2 parents 5884c44 + d1d71f0 commit c5b77be

Showing 40 changed files with 1,185 additions and 263 deletions.
37 changes: 19 additions & 18 deletions configs/README.md
@@ -142,24 +142,25 @@ To store and load the data we use LuxonisDataset and LuxonisLoader. For specific

Here you can change everything related to actual training of the model.

| Key                     | Type                                           | Default value | Description                                                                                                                                     |
| ----------------------- | ---------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| seed                    | int                                            | None          | Seed for reproducibility                                                                                                                        |
| batch_size              | int                                            | 32            | Batch size used for training                                                                                                                    |
| accumulate_grad_batches | int                                            | 1             | Number of batches for gradient accumulation                                                                                                     |
| use_weighted_sampler    | bool                                           | False         | Whether to use `WeightedRandomSampler` for training; only works with classification tasks                                                      |
| epochs                  | int                                            | 100           | Number of training epochs                                                                                                                       |
| num_workers             | int                                            | 2             | Number of workers for data loading                                                                                                              |
| train_metrics_interval  | int                                            | -1            | Frequency of computing metrics on train data; -1 to disable                                                                                    |
| validation_interval     | int                                            | 1             | Frequency of computing metrics on validation data                                                                                              |
| num_log_images          | int                                            | 4             | Maximum number of images to visualize and log                                                                                                  |
| skip_last_batch         | bool                                           | True          | Whether to skip the last batch while training                                                                                                  |
| accelerator             | Literal\["auto", "cpu", "gpu"\]                | "auto"        | Which accelerator to use for training                                                                                                          |
| devices                 | int \| list\[int\] \| str                      | "auto"        | Either the number of devices to use (int), a list of specific devices, or "auto" for automatic configuration based on the selected accelerator |
| matmul_precision        | Literal\["medium", "high", "highest"\] \| None | None          | Internal precision of float32 matrix multiplications                                                                                           |
| strategy                | Literal\["auto", "ddp"\]                       | "auto"        | Which strategy to use for training                                                                                                             |
| num_sanity_val_steps    | int                                            | 2             | Number of sanity validation steps performed before training                                                                                    |
| profiler                | Literal\["simple", "advanced"\] \| None        | None          | PL profiler for GPU/CPU/RAM utilization analysis                                                                                               |
| verbose                 | bool                                           | True          | Whether to print all intermediate results to the console                                                                                       |
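
As a concrete illustration, below is a minimal sketch of overriding a few of these keys programmatically. It assumes the config section is named `trainer` and that `opts` accepts alternating dot-notation keys and values, mirroring the `opts` argument of the CLI commands; the exact override format may differ.

```python
from luxonis_train.core import Trainer

# Hypothetical overrides; assumes `opts` takes alternating
# dot-notation keys and values (format not confirmed by this diff).
overrides = [
    "trainer.seed", "42",
    "trainer.batch_size", "16",
    "trainer.matmul_precision", "high",
]

Trainer("configs/model.yaml", overrides).train()  # placeholder config path
```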

### Preprocessing

24 changes: 22 additions & 2 deletions luxonis_train/__main__.py
@@ -45,11 +45,17 @@ def __str__(self):


@app.command()
def train(config: ConfigType = None, opts: OptsType = None):
def train(
config: ConfigType = None,
resume: Annotated[
Optional[str], typer.Option(help="Resume training from this checkpoint.")
] = None,
opts: OptsType = None,
):
"""Start training."""
from luxonis_train.core import Trainer

Trainer(str(config), opts).train()
Trainer(str(config), opts, resume=resume).train()
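
Resuming is now exposed both on the CLI and when driving the `Trainer` directly. A short sketch of the programmatic form (the config and checkpoint paths are placeholders):

```python
from luxonis_train.core import Trainer

# `resume` is forwarded to the Trainer exactly as the CLI command does above.
Trainer("configs/model.yaml", None, resume="output/run/best.ckpt").train()
```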


@app.command()
@@ -203,6 +209,20 @@ def inspect(
exit()


@app.command()
def archive(
executable: Annotated[
Optional[Path], typer.Option(help="Path to the model file.", show_default=False)
],
config: ConfigType = None,
opts: OptsType = None,
):
"""Generate NN archive."""
from luxonis_train.core import Archiver

Archiver(str(config), opts).archive(executable)


def version_callback(value: bool):
if value:
typer.echo(f"LuxonisTrain Version: {version(__package__)}")
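
The new `archive` command wraps the `Archiver` core class. For reference, a hedged sketch of the equivalent programmatic call (paths are placeholders):

```python
from luxonis_train.core import Archiver

# Placeholder paths; `archive()` is assumed to take the path to the
# exported model file, mirroring the `executable` option of the CLI command.
Archiver("configs/model.yaml", opts=None).archive("output/model.onnx")
```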
2 changes: 2 additions & 0 deletions luxonis_train/callbacks/__init__.py
@@ -8,6 +8,7 @@

from luxonis_train.utils.registry import CALLBACKS

from .archive_on_train_end import ArchiveOnTrainEnd
from .export_on_train_end import ExportOnTrainEnd
from .luxonis_progress_bar import LuxonisProgressBar
from .metadata_logger import MetadataLogger
@@ -23,6 +24,7 @@


__all__ = [
"ArchiveOnTrainEnd",
"ExportOnTrainEnd",
"LuxonisProgressBar",
"MetadataLogger",
72 changes: 72 additions & 0 deletions luxonis_train/callbacks/archive_on_train_end.py
@@ -0,0 +1,72 @@
import logging
import os
from pathlib import Path
from typing import cast

import lightning.pytorch as pl

from luxonis_train.utils.config import Config
from luxonis_train.utils.registry import CALLBACKS
from luxonis_train.utils.tracker import LuxonisTrackerPL


@CALLBACKS.register_module()
class ArchiveOnTrainEnd(pl.Callback):
def __init__(self, upload_to_mlflow: bool = False):
"""Callback that performs archiving of onnx or exported model at the end of
training/export. TODO: description.
@type upload_to_mlflow: bool
@param upload_to_mlflow: If set to True, overrides the upload url in Archiver
with currently active MLFlow run (if present).
"""
super().__init__()
self.upload_to_mlflow = upload_to_mlflow

def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
"""Archives the model on train end.
@type trainer: L{pl.Trainer}
@param trainer: Pytorch Lightning trainer.
@type pl_module: L{pl.LightningModule}
@param pl_module: Pytorch Lightning module.
@raises RuntimeError: If no best model path is found.
"""
from luxonis_train.core.archiver import Archiver

model_checkpoint_callbacks = [
c
for c in trainer.callbacks # type: ignore
if isinstance(c, pl.callbacks.ModelCheckpoint) # type: ignore
]

# NOTE: assume that first checkpoint callback is based on val loss
best_model_path = model_checkpoint_callbacks[0].best_model_path
if not best_model_path:
raise RuntimeError(
"No best model path found. "
"Please make sure that ModelCheckpoint callback is present "
"and at least one validation epoch has been performed."
)
cfg: Config = pl_module.cfg
cfg.model.weights = best_model_path
if self.upload_to_mlflow:
if cfg.tracker.is_mlflow:
tracker = cast(LuxonisTrackerPL, trainer.logger)
new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
cfg.archiver.upload_url = new_upload_url
else:
logging.getLogger(__name__).warning(
"`upload_to_mlflow` is set to True, "
"but there is no MLFlow active run, skipping."
)

onnx_path = str(Path(best_model_path).parent.with_suffix(".onnx"))
if not os.path.exists(onnx_path):
raise FileNotFoundError(
"Model executable not found. Make sure to run exporter callback before archiver callback"
)

archiver = Archiver(cfg=cfg)

archiver.archive(onnx_path)
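
Because `ArchiveOnTrainEnd` expects the `.onnx` file produced by the exporter, the export callback has to run first. Below is a rough sketch of wiring both into a Lightning trainer; the monitored metric name is a placeholder, and in the project these callbacks are presumably enabled through the config and the `CALLBACKS` registry rather than constructed by hand:

```python
import lightning.pytorch as pl

from luxonis_train.callbacks import ArchiveOnTrainEnd, ExportOnTrainEnd

# Hypothetical wiring: both are plain Lightning callbacks, so they can sit
# next to a ModelCheckpoint instance. Order matters: the exporter must
# produce the .onnx before the archiver tries to consume it.
trainer = pl.Trainer(
    max_epochs=100,
    callbacks=[
        pl.callbacks.ModelCheckpoint(monitor="val/loss"),  # placeholder metric name
        ExportOnTrainEnd(),
        ArchiveOnTrainEnd(upload_to_mlflow=False),
    ],
)
```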
16 changes: 10 additions & 6 deletions luxonis_train/callbacks/export_on_train_end.py
@@ -8,6 +8,8 @@
from luxonis_train.utils.registry import CALLBACKS
from luxonis_train.utils.tracker import LuxonisTrackerPL

logger = logging.getLogger(__name__)


@CALLBACKS.register_module()
class ExportOnTrainEnd(pl.Callback):
@@ -41,11 +43,13 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
# NOTE: assume that first checkpoint callback is based on val loss
best_model_path = model_checkpoint_callbacks[0].best_model_path
if not best_model_path:
raise RuntimeError(
"No best model path found. "
"Please make sure that ModelCheckpoint callback is present "
"and at least one validation epoch has been performed."
logger.error(
"No model checkpoint found. "
"Make sure that `ModelCheckpoint` callback is present "
"and at least one validation epoch has been performed. "
"Skipping model export."
)
return
cfg: Config = pl_module.cfg
cfg.model.weights = best_model_path
if self.upload_to_mlflow:
@@ -54,9 +58,9 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
cfg.exporter.upload_url = new_upload_url
else:
logging.getLogger(__name__).warning(
logger.error(
"`upload_to_mlflow` is set to True, "
"but there is no MLFlow active run, skipping."
"but there is no MLFlow active run, skipping."
)
exporter = Exporter(cfg=cfg)
onnx_path = str(Path(best_model_path).parent.with_suffix(".onnx"))
2 changes: 1 addition & 1 deletion luxonis_train/callbacks/luxonis_progress_bar.py
@@ -28,7 +28,7 @@ def get_metrics(
) -> dict[str, int | str | float | dict[str, float]]:
# NOTE: there might be a cleaner way of doing this
items = super().get_metrics(trainer, pl_module)
if trainer.training:
if trainer.training and pl_module.training_step_outputs:
items["Loss"] = pl_module.training_step_outputs[-1]["loss"].item()
return items

47 changes: 26 additions & 21 deletions luxonis_train/callbacks/upload_checkpoint.py
@@ -1,5 +1,6 @@
import logging
import os
from pathlib import Path
from typing import Any

import lightning.pytorch as pl
@@ -25,37 +26,41 @@ def __init__(self, upload_directory: str):
)
self.logger = logging.getLogger(__name__)
self.last_logged_epoch = None
self.last_best_checkpoint = None
self.last_best_checkpoints = set()

def on_save_checkpoint(
self,
trainer: pl.Trainer,
pl_module: pl.LightningModule,
_: pl.LightningModule,
checkpoint: dict[str, Any],
) -> None:
# Log only once per epoch in case there are multiple ModelCheckpoint callbacks
if not self.last_logged_epoch == trainer.current_epoch:
model_checkpoint_callbacks = [
c
checkpoint_paths = [
c.best_model_path
for c in trainer.callbacks # type: ignore
if isinstance(c, pl.callbacks.ModelCheckpoint) # type: ignore
and c.best_model_path
]
# NOTE: assume that first checkpoint callback is based on val loss
curr_best_checkpoint = model_checkpoint_callbacks[0].best_model_path

if self.last_best_checkpoint != curr_best_checkpoint:
self.logger.info(f"Started checkpoint upload to {self.fs.full_path}...")
temp_filename = "curr_best_val_loss.ckpt"
torch.save(checkpoint, temp_filename)
self.fs.put_file(
local_path=temp_filename,
remote_path=temp_filename,
mlflow_instance=trainer.logger.experiment.get( # type: ignore
"mlflow", None
),
)
os.remove(temp_filename)
self.logger.info("Checkpoint upload finished")
self.last_best_checkpoint = curr_best_checkpoint
for curr_best_checkpoint in checkpoint_paths:
if curr_best_checkpoint not in self.last_best_checkpoints:
self.logger.info(
f"Started checkpoint upload to {self.fs.full_path}..."
)
temp_filename = (
Path(curr_best_checkpoint).parent.with_suffix(".ckpt").name
)
torch.save(checkpoint, temp_filename)

self.fs.put_file(
local_path=temp_filename,
remote_path=temp_filename,
mlflow_instance=trainer.logger.experiment.get( # type: ignore
"mlflow", None
),
)
os.remove(temp_filename)
self.logger.info("Checkpoint upload finished")
self.last_best_checkpoints.add(curr_best_checkpoint)

self.last_logged_epoch = trainer.current_epoch
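
With this change the callback remembers every best-checkpoint path it has already uploaded, so each `ModelCheckpoint` callback can trigger its own upload. A rough sketch of attaching it (the class name `UploadCheckpoint`, the metric names, and the remote directory are assumptions):

```python
import lightning.pytorch as pl

from luxonis_train.callbacks import UploadCheckpoint  # assumed export name

# Hypothetical usage: upload each new best checkpoint to a remote directory.
trainer = pl.Trainer(
    callbacks=[
        pl.callbacks.ModelCheckpoint(monitor="val/loss"),      # placeholder metric name
        pl.callbacks.ModelCheckpoint(monitor="val/accuracy"),  # second tracked metric
        UploadCheckpoint(upload_directory="gs://my-bucket/checkpoints"),  # placeholder URL
    ],
)
```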
3 changes: 2 additions & 1 deletion luxonis_train/core/__init__.py
@@ -1,6 +1,7 @@
from .archiver import Archiver
from .exporter import Exporter
from .inferer import Inferer
from .trainer import Trainer
from .tuner import Tuner

__all__ = ["Exporter", "Trainer", "Tuner", "Inferer"]
__all__ = ["Exporter", "Trainer", "Tuner", "Inferer", "Archiver"]
