From 2f470daa048e3c7102d79f557e5d2909a5d913ad Mon Sep 17 00:00:00 2001
From: Sebastian Hoffmann
Date: Mon, 6 Jan 2025 13:39:38 +0100
Subject: [PATCH] feat: detailed logs of used cuda devices, closes #32

---
Notes (commentary only; text in this section is not part of the commit
message and is ignored when the patch is applied):

* callbacks.py: adds CudaCallback. Its pre_run() builds a dict of NVML
  properties for pipe.device, collects the per-process dicts with
  all_gather_object(), logs one line per collected entry and, when
  checkpointing is enabled and is_root() holds, dumps them to
  cuda_devices.json as {"devices": {"rank_<i>": ...}}.
* callbacks.py: CbPriority gains CUDA = -160; GIT moves to -150 and
  METRIC_REDUCTION to -100. GitDiffCallback now writes git_diff.txt only
  when is_root() holds.
* pipeline.py: enable_checkpointing() registers CudaCallback when
  self.device.type == 'cuda'.
* logging.py: general_diagnostics() drops the `nvidia-smi -L` subprocess
  block, relabels the torch CUDA version line and moves the dmlcloud
  version line below the driver version lookup.
* NOTE(review): nvmlDeviceGetMemoryInfo() is called twice with identical
  arguments; one call could serve both 'total_memory' and
  'reserved_memory'.
* NOTE(review): torch.cuda._get_pynvml_handler is underscore-prefixed,
  presumably a private torch API; confirm it is safe to depend on.
* NOTE(review): the keys 'power_managment_limit' and
  'power_managment_default_limit' are spelled "managment"; confirm this
  is intended before consumers rely on the JSON keys.
* NOTE(review): line breaks and indentation of this patch were restored
  from a whitespace-collapsed copy. Runs of spaces inside string
  literals may have been lost; verify against the original commit
  before applying.

 dmlcloud/core/callbacks.py | 60 +++++++++++++++++++++++++++++++++++---
 dmlcloud/core/pipeline.py  |  4 +++
 dmlcloud/util/logging.py   | 12 ++------
 3 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/dmlcloud/core/callbacks.py b/dmlcloud/core/callbacks.py
index 0bcf899..e764877 100644
--- a/dmlcloud/core/callbacks.py
+++ b/dmlcloud/core/callbacks.py
@@ -1,4 +1,5 @@
 import csv
+import json
 import os
 import sys
 from datetime import datetime, timedelta
@@ -6,6 +7,7 @@
 from pathlib import Path
 from typing import Callable, Optional, TYPE_CHECKING, Union
 
+import pynvml
 import torch
 from omegaconf import OmegaConf
 from progress_table import ProgressTable
@@ -16,6 +18,7 @@
 from . import logging as dml_logging
 from .distributed import all_gather_object, is_root
 
+
 if TYPE_CHECKING:
     from .pipeline import Pipeline
     from .stage import Stage
@@ -33,6 +36,7 @@
     'CsvCallback',
     'WandbCallback',
     'TensorboardCallback',
+    'CudaCallback',
 ]
 
 
@@ -91,8 +95,9 @@ class CbPriority(IntEnum):
     CHECKPOINT = -190
     STAGE_TIMER = -180
     DIAGNOSTICS = -170
-    GIT = -160
-    METRIC_REDUCTION = -150
+    CUDA = -160
+    GIT = -150
+    METRIC_REDUCTION = -100
     OBJECT_METHODS = 0
 
 
@@ -482,13 +487,60 @@ class GitDiffCallback(Callback):
     def pre_run(self, pipe):
         diff = git_diff()
 
-        if pipe.checkpointing_enabled:
+        if pipe.checkpointing_enabled and is_root():
             self._save(pipe.checkpoint_dir.path / 'git_diff.txt', diff)
 
         msg = '* GIT-DIFF:\n'
-        msg += '\n'.join('\t' + line for line in diff.splitlines())
+        msg += '\n'.join(' ' + line for line in diff.splitlines())
         dml_logging.info(msg)
 
     def _save(self, path, diff):
         with open(path, 'w') as f:
             f.write(diff)
+
+
+class CudaCallback(Callback):
+    """
+    Logs various properties pertaining to CUDA devices.
+    """
+
+    def pre_run(self, pipe):
+        handle = torch.cuda._get_pynvml_handler(pipe.device)
+
+        info = {
+            'name': pynvml.nvmlDeviceGetName(handle),
+            'uuid': pynvml.nvmlDeviceGetUUID(handle),
+            'serial': pynvml.nvmlDeviceGetSerial(handle),
+            'torch_device': str(pipe.device),
+            'minor_number': pynvml.nvmlDeviceGetMinorNumber(handle),
+            'architecture': pynvml.nvmlDeviceGetArchitecture(handle),
+            'brand': pynvml.nvmlDeviceGetBrand(handle),
+            'vbios_version': pynvml.nvmlDeviceGetVbiosVersion(handle),
+            'driver_version': pynvml.nvmlSystemGetDriverVersion(),
+            'cuda_driver_version': pynvml.nvmlSystemGetCudaDriverVersion_v2(),
+            'nvml_version': pynvml.nvmlSystemGetNVMLVersion(),
+            'total_memory': pynvml.nvmlDeviceGetMemoryInfo(handle, pynvml.nvmlMemory_v2).total,
+            'reserved_memory': pynvml.nvmlDeviceGetMemoryInfo(handle, pynvml.nvmlMemory_v2).reserved,
+            'num_gpu_cores': pynvml.nvmlDeviceGetNumGpuCores(handle),
+            'power_managment_limit': pynvml.nvmlDeviceGetPowerManagementLimit(handle),
+            'power_managment_default_limit': pynvml.nvmlDeviceGetPowerManagementDefaultLimit(handle),
+            'cuda_compute_capability': pynvml.nvmlDeviceGetCudaComputeCapability(handle),
+        }
+        all_devices = all_gather_object(info)
+
+        msg = '* CUDA-DEVICES:\n'
+        info_strings = [
+            f'{info["torch_device"]} -> /dev/nvidia{info["minor_number"]} -> {info["name"]} (UUID: {info["uuid"]}) (VRAM: {info["total_memory"] / 1000 ** 2:.0f} MB)'
+            for info in all_devices
+        ]
+        msg += '\n'.join(f' - [{i}] {info_str}' for i, info_str in enumerate(info_strings))
+        dml_logging.info(msg)
+
+        if pipe.checkpointing_enabled and is_root():
+            self._save(pipe.checkpoint_dir.path / 'cuda_devices.json', all_devices)
+
+    def _save(self, path, all_devices):
+        with open(path, 'w') as f:
+            devices = {f'rank_{i}': device for i, device in enumerate(all_devices)}
+            obj = {'devices': devices}
+            json.dump(obj, f, indent=4)
diff --git a/dmlcloud/core/pipeline.py b/dmlcloud/core/pipeline.py
index 98a9158..c5db070 100644
--- a/dmlcloud/core/pipeline.py
+++ b/dmlcloud/core/pipeline.py
@@ -14,6 +14,7 @@
     CbPriority,
     CheckpointCallback,
     CsvCallback,
+    CudaCallback,
     DiagnosticsCallback,
     GitDiffCallback,
     TensorboardCallback,
@@ -178,6 +179,9 @@ def enable_checkpointing(
         self.add_callback(CsvCallback(self.checkpoint_dir.path, append_stage_name=True), CbPriority.CSV)
         self.add_callback(TensorboardCallback(self.checkpoint_dir.path), CbPriority.TENSORBOARD)
 
+        if self.device.type == 'cuda':
+            self.add_callback(CudaCallback(), CbPriority.CUDA)
+
     def enable_wandb(
         self,
         project: str | None = None,
diff --git a/dmlcloud/util/logging.py b/dmlcloud/util/logging.py
index 48b1897..33c0cf4 100644
--- a/dmlcloud/util/logging.py
+++ b/dmlcloud/util/logging.py
@@ -1,6 +1,5 @@
 import io
 import os
-import subprocess
 import sys
 from datetime import datetime
 from pathlib import Path
@@ -114,21 +113,16 @@ def general_diagnostics() -> str:
     msg += f' - backend: {dist.get_backend()}\n'
     msg += f' - cuda: {torch.cuda.is_available()}\n'
 
-    if torch.cuda.is_available():
-        msg += '* GPUs (root):\n'
-        nvsmi = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout.decode()
-        for line in nvsmi.splitlines():
-            msg += f' - {line}\n'
-
     msg += '* VERSIONS:\n'
     msg += f' - python: {sys.version}\n'
-    msg += f' - dmlcloud: {dmlcloud.__version__}\n'
-    msg += f' - cuda: {torch.version.cuda}\n'
+    msg += f' - cuda (torch): {torch.version.cuda}\n'
     try:
         msg += ' - ' + Path('/proc/driver/nvidia/version').read_text().splitlines()[0] + '\n'
     except (FileNotFoundError, IndexError):
         pass
 
+    msg += f' - dmlcloud: {dmlcloud.__version__}\n'
+
     for module_name in ML_MODULES:
         if is_imported(module_name):
             msg += f' - {module_name}: {try_get_version(module_name)}\n'