Skip to content

Commit

Permalink
MLFlow Upload Fix (#10)
Browse files Browse the repository at this point in the history
* fixed incorrect class property call

* fixed exporter uploading

* uploadCheckpoint uploads on every checkpoint epoch

* fix temp files names

* updated callback readme

* pre-commit run
  • Loading branch information
klemen1999 authored and kozlov721 committed Oct 9, 2024
1 parent afade1f commit 2c654a5
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 47 deletions.
9 changes: 9 additions & 0 deletions luxonis_train/callbacks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ List of all supported callbacks.
- [LuxonisProgressBar](#luxonisprogressbar)
- [MetadataLogger](#metadatalogger)
- [TestOnTrainEnd](#testontrainend)
- [UploadCheckpoint](#uploadcheckpoint)

## PytorchLightning Callbacks

Expand Down Expand Up @@ -51,3 +52,11 @@ Metadata include all defined hyperparameters together with git hashes of `luxoni
## TestOnTrainEnd

Callback to perform a test run at the end of the training.

## UploadCheckpoint

Callback that uploads the currently best checkpoint (based on validation loss) to the specified cloud directory after every validation epoch.

| Key | Type | Default value | Description |
| ---------------- | ---- | ------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| upload_directory | str | / | Path to the cloud directory where checkpoints should be uploaded. If you want to use the current MLFlow run, set it to `mlflow://`. |
4 changes: 2 additions & 2 deletions luxonis_train/callbacks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from .metadata_logger import MetadataLogger
from .module_freezer import ModuleFreezer
from .test_on_train_end import TestOnTrainEnd
from .upload_checkpoint_on_train_end import UploadCheckpointOnTrainEnd
from .upload_checkpoint import UploadCheckpoint

CALLBACKS.register_module(module=EarlyStopping)
CALLBACKS.register_module(module=LearningRateMonitor)
Expand All @@ -28,5 +28,5 @@
"MetadataLogger",
"ModuleFreezer",
"TestOnTrainEnd",
"UploadCheckpointOnTrainEnd",
"UploadCheckpoint",
]
4 changes: 2 additions & 2 deletions luxonis_train/callbacks/export_on_train_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> No
if self.upload_to_mlflow:
if cfg.tracker.is_mlflow:
tracker = cast(LuxonisTrackerPL, trainer.logger)
new_upload_directory = f"mlflow://{tracker.project_id}/{tracker.run_id}"
cfg.exporter.upload_directory = new_upload_directory
new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
cfg.exporter.upload_url = new_upload_url
else:
logging.getLogger(__name__).warning(
"`upload_to_mlflow` is set to True, "
Expand Down
61 changes: 61 additions & 0 deletions luxonis_train/callbacks/upload_checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import logging
import os
import tempfile
from typing import Any

import lightning.pytorch as pl
import torch
from luxonis_ml.utils.filesystem import LuxonisFileSystem

from luxonis_train.utils.registry import CALLBACKS


@CALLBACKS.register_module()
class UploadCheckpoint(pl.Callback):
    """Callback that uploads the current best checkpoint (based on the
    validation loss) to a remote directory whenever it changes."""

    def __init__(self, upload_directory: str):
        """Constructs `UploadCheckpoint`.

        @type upload_directory: str
        @param upload_directory: Path used as upload directory. Set to
            `mlflow://` to upload to the currently active MLFlow run.
        """
        super().__init__()
        self.fs = LuxonisFileSystem(
            upload_directory, allow_active_mlflow_run=True, allow_local=False
        )
        self.logger = logging.getLogger(__name__)
        # Track the last epoch we handled and the last checkpoint path we
        # uploaded, so we upload at most once per epoch and only when the
        # best checkpoint actually changed.
        self.last_logged_epoch = None
        self.last_best_checkpoint = None

    def on_save_checkpoint(
        self,
        trainer: pl.Trainer,
        pl_module: pl.LightningModule,
        checkpoint: dict[str, Any],
    ) -> None:
        """Uploads the checkpoint if the best model changed since the
        last upload.

        Runs at most once per epoch even when multiple `ModelCheckpoint`
        callbacks trigger saves in the same epoch.

        @type trainer: pl.Trainer
        @param trainer: Trainer whose callbacks are inspected for the
            best checkpoint path.
        @type pl_module: pl.LightningModule
        @param pl_module: Unused; required by the callback interface.
        @type checkpoint: dict[str, Any]
        @param checkpoint: Checkpoint state dict to serialize and upload.
        """
        # Log only once per epoch in case there are multiple
        # ModelCheckpoint callbacks.
        if self.last_logged_epoch == trainer.current_epoch:
            return

        model_checkpoint_callbacks = [
            c
            for c in trainer.callbacks  # type: ignore
            if isinstance(c, pl.callbacks.ModelCheckpoint)  # type: ignore
        ]
        if not model_checkpoint_callbacks:
            # No ModelCheckpoint configured -> nothing to upload; the
            # original code would raise IndexError here.
            self.last_logged_epoch = trainer.current_epoch
            return

        # NOTE: assume that first checkpoint callback is based on val loss
        curr_best_checkpoint = model_checkpoint_callbacks[0].best_model_path

        if self.last_best_checkpoint != curr_best_checkpoint:
            self.logger.info(f"Started checkpoint upload to {self.fs.full_path}...")
            # Serialize into a unique temporary directory instead of a
            # fixed name in the CWD: avoids clashes between parallel runs
            # and guarantees cleanup even if the upload raises.
            with tempfile.TemporaryDirectory() as tmp_dir:
                local_path = os.path.join(tmp_dir, "curr_best_val_loss.ckpt")
                torch.save(checkpoint, local_path)
                self.fs.put_file(
                    local_path=local_path,
                    remote_path="curr_best_val_loss.ckpt",
                    mlflow_instance=trainer.logger.experiment.get(  # type: ignore
                        "mlflow", None
                    ),
                )
            self.logger.info("Checkpoint upload finished")
            self.last_best_checkpoint = curr_best_checkpoint

        self.last_logged_epoch = trainer.current_epoch
41 changes: 0 additions & 41 deletions luxonis_train/callbacks/upload_checkpoint_on_train_end.py

This file was deleted.

6 changes: 4 additions & 2 deletions luxonis_train/core/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def _upload(self, files_to_upload: list[str]):
remote_path=self.cfg.exporter.export_model_name + suffix,
)

with tempfile.TemporaryFile() as f:
with tempfile.NamedTemporaryFile(prefix="config", suffix=".yaml") as f:
self.cfg.save_data(f.name)
fs.put_file(local_path=f.name, remote_path="config.yaml")

Expand All @@ -209,7 +209,9 @@ def _upload(self, files_to_upload: list[str]):
)
modelconverter_config = self._get_modelconverter_config(onnx_path)

with tempfile.TemporaryFile() as f:
with tempfile.NamedTemporaryFile(
prefix="config_export", suffix=".yaml", mode="w+"
) as f:
yaml.dump(modelconverter_config, f, default_flow_style=False)
fs.put_file(local_path=f.name, remote_path="config_export.yaml")

Expand Down

0 comments on commit 2c654a5

Please sign in to comment.