From 75ffa480bfe43c02f572dc62ebb2b4af79dc000b Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Wed, 8 May 2024 01:55:26 +0200
Subject: [PATCH 1/2] option to set torch matmul precision for tensor cores

---
 luxonis_train/core/trainer.py | 4 ++++
 luxonis_train/utils/config.py | 1 +
 2 files changed, 5 insertions(+)

diff --git a/luxonis_train/core/trainer.py b/luxonis_train/core/trainer.py
index 8326ce48..fc634544 100644
--- a/luxonis_train/core/trainer.py
+++ b/luxonis_train/core/trainer.py
@@ -4,6 +4,7 @@
 from logging import getLogger
 from typing import Any, Literal
 
+import torch
 from lightning.pytorch.utilities import rank_zero_only  # type: ignore
 from luxonis_ml.utils import LuxonisFileSystem
 
@@ -39,6 +40,9 @@ def __init__(
         """
         super().__init__(cfg, opts)
 
+        if self.cfg.trainer.matmul_precision is not None:
+            torch.set_float32_matmul_precision(self.cfg.trainer.matmul_precision)
+
         if resume is not None:
             self.resume = str(LuxonisFileSystem.download(resume, self.run_save_dir))
         else:
diff --git a/luxonis_train/utils/config.py b/luxonis_train/utils/config.py
index 45dde192..e94c591e 100644
--- a/luxonis_train/utils/config.py
+++ b/luxonis_train/utils/config.py
@@ -203,6 +203,7 @@ class TrainerConfig(CustomBaseModel):
     strategy: Literal["auto", "ddp"] = "auto"
     num_sanity_val_steps: int = 2
     profiler: Literal["simple", "advanced"] | None = None
+    matmul_precision: Literal["medium", "high", "highest"] | None = None
     verbose: bool = True
 
     batch_size: int = 32
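As a point of reference, a minimal standalone sketch of what the new `trainer.matmul_precision` option toggles (plain PyTorch, not LuxonisTrain code; the tensor shapes below are made up for illustration):

```python
import torch

# "highest" (PyTorch's default) keeps full float32 precision.
# "high" and "medium" allow GPUs with tensor cores to route float32
# matmuls through faster TF32/bfloat16 paths at reduced precision.
torch.set_float32_matmul_precision("high")
print(torch.get_float32_matmul_precision())  # -> "high"

# Every float32 matmul from this point on may take the faster path.
a = torch.randn(1024, 1024)
b = torch.randn(1024, 1024)
c = a @ b
```

Because the config field defaults to `None`, the call above is skipped entirely unless the user opts in, leaving PyTorch's global default untouched.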
From 9aa42d0e260272a333b09172bdd6790e418a39d8 Mon Sep 17 00:00:00 2001
From: Martin Kozlovsky
Date: Wed, 8 May 2024 02:02:57 +0200
Subject: [PATCH 2/2] updated readme

---
 configs/README.md | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/configs/README.md b/configs/README.md
index 27e2fb6e..c1f4889b 100644
--- a/configs/README.md
+++ b/configs/README.md
@@ -142,23 +142,24 @@ To store and load the data we use LuxonisDataset and LuxonisLoader. For specific
 
 Here you can change everything related to actual training of the model.
 
-| Key                     | Type                                    | Default value | Description                                                                                                                                      |
-| ----------------------- | --------------------------------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
-| batch_size              | int                                     | 32            | batch size used for training                                                                                                                     |
-| accumulate_grad_batches | int                                     | 1             | number of batches for gradient accumulation                                                                                                      |
-| use_weighted_sampler    | bool                                    | False         | bool if use WeightedRandomSampler for training, only works with classification tasks                                                            |
-| epochs                  | int                                     | 100           | number of training epochs                                                                                                                        |
-| num_workers             | int                                     | 2             | number of workers for data loading                                                                                                               |
-| train_metrics_interval  | int                                     | -1            | frequency of computing metrics on train data, -1 if don't perform                                                                                |
-| validation_interval     | int                                     | 1             | frequency of computing metrics on validation data                                                                                                |
-| num_log_images          | int                                     | 4             | maximum number of images to visualize and log                                                                                                    |
-| skip_last_batch         | bool                                    | True          | whether to skip last batch while training                                                                                                        |
-| accelerator             | Literal\["auto", "cpu", "gpu"\]         | "auto"        | What accelerator to use for training.                                                                                                            |
-| devices                 | int \| list\[int\] \| str               | "auto"        | Either specify how many devices to use (int), list specific devices, or use "auto" for automatic configuration based on the selected accelerator |
-| strategy                | Literal\["auto", "ddp"\]                | "auto"        | What strategy to use for training.                                                                                                               |
-| num_sanity_val_steps    | int                                     | 2             | Number of sanity validation steps performed before training.                                                                                     |
-| profiler                | Literal\["simple", "advanced"\] \| None | None          | PL profiler for GPU/CPU/RAM utilization analysis                                                                                                 |
-| verbose                 | bool                                    | True          | Print all intermediate results to console.                                                                                                      |
+| Key                     | Type                                           | Default value | Description                                                                                                                                      |
+| ----------------------- | ---------------------------------------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
+| batch_size              | int                                            | 32            | batch size used for training                                                                                                                     |
+| accumulate_grad_batches | int                                            | 1             | number of batches for gradient accumulation                                                                                                      |
+| use_weighted_sampler    | bool                                           | False         | bool if use WeightedRandomSampler for training, only works with classification tasks                                                            |
+| epochs                  | int                                            | 100           | number of training epochs                                                                                                                        |
+| num_workers             | int                                            | 2             | number of workers for data loading                                                                                                               |
+| train_metrics_interval  | int                                            | -1            | frequency of computing metrics on train data, -1 if don't perform                                                                                |
+| validation_interval     | int                                            | 1             | frequency of computing metrics on validation data                                                                                                |
+| num_log_images          | int                                            | 4             | maximum number of images to visualize and log                                                                                                    |
+| skip_last_batch         | bool                                           | True          | whether to skip last batch while training                                                                                                        |
+| accelerator             | Literal\["auto", "cpu", "gpu"\]                | "auto"        | What accelerator to use for training.                                                                                                            |
+| devices                 | int \| list\[int\] \| str                      | "auto"        | Either specify how many devices to use (int), list specific devices, or use "auto" for automatic configuration based on the selected accelerator |
+| matmul_precision        | Literal\["medium", "high", "highest"\] \| None | None          | Sets the internal precision of float32 matrix multiplications.                                                                                   |
+| strategy                | Literal\["auto", "ddp"\]                       | "auto"        | What strategy to use for training.                                                                                                               |
+| num_sanity_val_steps    | int                                            | 2             | Number of sanity validation steps performed before training.                                                                                     |
+| profiler                | Literal\["simple", "advanced"\] \| None        | None          | PL profiler for GPU/CPU/RAM utilization analysis                                                                                                 |
+| verbose                 | bool                                           | True          | Print all intermediate results to console.                                                                                                       |
 
 ### Preprocessing
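To illustrate how the new field behaves at config-parsing time, a standalone pydantic sketch (the real `TrainerConfig` in `luxonis_train/utils/config.py` extends `CustomBaseModel` and carries many more fields; this stripped-down class is for illustration only):

```python
from typing import Literal

from pydantic import BaseModel, ValidationError


class TrainerConfig(BaseModel):
    # Mirrors the field added in luxonis_train/utils/config.py.
    matmul_precision: Literal["medium", "high", "highest"] | None = None


TrainerConfig(matmul_precision="high")  # accepted
TrainerConfig()  # accepted; None leaves the torch default untouched

try:
    TrainerConfig(matmul_precision="low")  # not an allowed literal
except ValidationError as err:
    print(err)
```

Anything outside `medium`/`high`/`highest` is rejected when the config is parsed, so a typo in the YAML fails fast instead of silently falling back to the default precision.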