From 55a5480988b98ebaf05428da0c6079326039cd8d Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Tue, 12 Apr 2022 15:24:12 +0200 Subject: [PATCH 01/16] Reworking distributed support (WIP). --- avalanche/benchmarks/classic/cmnist.py | 8 +- avalanche/benchmarks/utils/data_loader.py | 268 +++++++--- avalanche/distributed/__init__.py | 5 + avalanche/distributed/distributed_batch.py | 106 ++++ avalanche/distributed/distributed_commons.py | 22 + avalanche/distributed/distributed_helper.py | 509 +++++++++++++++++++ avalanche/distributed/distributed_model.py | 171 +++++++ avalanche/distributed/distributed_tensor.py | 67 +++ avalanche/distributed/distributed_value.py | 295 +++++++++++ avalanche/logging/base_logger.py | 27 + avalanche/training/plugins/evaluation.py | 40 +- tests/distributed/__init__.py | 0 tests/distributed/test_distributed_batch.py | 79 +++ tests/distributed/test_distributed_model.py | 80 +++ tests/distributed/test_distributed_tensor.py | 88 ++++ tests/run_dist_tests.py | 68 +++ 16 files changed, 1771 insertions(+), 62 deletions(-) create mode 100644 avalanche/distributed/__init__.py create mode 100644 avalanche/distributed/distributed_batch.py create mode 100644 avalanche/distributed/distributed_commons.py create mode 100644 avalanche/distributed/distributed_helper.py create mode 100644 avalanche/distributed/distributed_model.py create mode 100644 avalanche/distributed/distributed_tensor.py create mode 100644 avalanche/distributed/distributed_value.py create mode 100644 tests/distributed/__init__.py create mode 100644 tests/distributed/test_distributed_batch.py create mode 100644 tests/distributed/test_distributed_model.py create mode 100644 tests/distributed/test_distributed_tensor.py create mode 100644 tests/run_dist_tests.py diff --git a/avalanche/benchmarks/classic/cmnist.py b/avalanche/benchmarks/classic/cmnist.py index 491f0985c..e5a4cc1a9 100644 --- a/avalanche/benchmarks/classic/cmnist.py +++ b/avalanche/benchmarks/classic/cmnist.py @@ -29,6 +29,7 @@ ) from avalanche.benchmarks.datasets import default_dataset_location from avalanche.benchmarks.utils import AvalancheDataset +from avalanche.distributed import DistributedHelper _default_mnist_train_transform = Compose( [ToTensor(), Normalize((0.1307,), (0.3081,))] @@ -394,9 +395,12 @@ def _get_mnist_dataset(dataset_root): if dataset_root is None: dataset_root = default_dataset_location("mnist") - train_set = MNIST(root=dataset_root, train=True, download=True) + with DistributedHelper.main_process_first(): + train_set = MNIST(root=dataset_root, + train=True, download=True) - test_set = MNIST(root=dataset_root, train=False, download=True) + test_set = MNIST(root=dataset_root, + train=False, download=True) return train_set, test_set diff --git a/avalanche/benchmarks/utils/data_loader.py b/avalanche/benchmarks/utils/data_loader.py index e1e5336c7..b5f276a2d 100644 --- a/avalanche/benchmarks/utils/data_loader.py +++ b/avalanche/benchmarks/utils/data_loader.py @@ -17,13 +17,14 @@ import itertools from collections import defaultdict from itertools import chain -from typing import Dict, Sequence +from typing import Dict, Sequence, Union import torch -from torch.utils.data import RandomSampler +from torch.utils.data import RandomSampler, DistributedSampler from torch.utils.data.dataloader import DataLoader from avalanche.benchmarks.utils import AvalancheDataset +from avalanche.distributed import DistributedHelper def _default_collate_mbatches_fn(mbatches): @@ -98,7 +99,7 @@ def __init__( each task separately. 
See pytorch :class:`DataLoader`. """ self.data = data - self.dataloaders: Dict[int, DataLoader] = {} + self.dataloaders: Dict[int, DataLoader] = dict() self.oversample_small_tasks = oversample_small_tasks self.collate_mbatches = collate_mbatches @@ -135,6 +136,7 @@ def __init__( oversample_small_groups: bool = False, collate_mbatches=_default_collate_mbatches_fn, batch_size: int = 32, + distributed_sampling: bool = True, **kwargs ): """Data loader that balances data from multiple datasets. @@ -159,9 +161,11 @@ def __init__( each group separately. See pytorch :class:`DataLoader`. """ self.datasets = datasets - self.dataloaders = [] + self.batch_sizes = [] self.oversample_small_groups = oversample_small_groups self.collate_mbatches = collate_mbatches + self.distributed_sampling = distributed_sampling + self.loader_kwargs = kwargs # check if batch_size is larger than or equal to the number of datasets assert batch_size >= len(datasets) @@ -170,47 +174,75 @@ def __init__( ds_batch_size = batch_size // len(datasets) remaining = batch_size % len(datasets) - for data in self.datasets: + for _ in self.datasets: bs = ds_batch_size if remaining > 0: bs += 1 remaining -= 1 - self.dataloaders.append(DataLoader(data, batch_size=bs, **kwargs)) - self.max_len = max([len(d) for d in self.dataloaders]) + self.batch_sizes.append(bs) + + loaders_for_len_estimation = [ + _make_data_loader( + dataset, + distributed_sampling, + kwargs, + mb_size, + force_no_workers=True)[0] + for dataset, mb_size in zip(self.datasets, self.batch_sizes)] + + self.max_len = max([len(d) for d in loaders_for_len_estimation]) def __iter__(self): + dataloaders = [] + samplers = [] + for dataset, mb_size in zip(self.datasets, self.batch_sizes): + data_l, data_l_sampler = _make_data_loader( + dataset, + self.distributed_sampling, + self.loader_kwargs, + mb_size) + + dataloaders.append(data_l) + samplers.append(data_l_sampler) + iter_dataloaders = [] - for dl in self.dataloaders: + for dl in dataloaders: iter_dataloaders.append(iter(dl)) - max_num_mbatches = max([len(d) for d in iter_dataloaders]) + max_num_mbatches = max([len(d) for d in dataloaders]) for it in range(max_num_mbatches): mb_curr = [] - is_removed_dataloader = False + removed_dataloaders_idxs = [] # copy() is necessary because we may remove keys from the # dictionary. This would break the generator. - for tid, t_loader in enumerate(iter_dataloaders): + for tid, (t_loader, t_loader_sampler) in \ + enumerate(zip(iter_dataloaders, samplers)): try: batch = next(t_loader) except StopIteration: # StopIteration is thrown if dataset ends. if self.oversample_small_groups: # reinitialize data loader - iter_dataloaders[tid] = iter(self.dataloaders[tid]) + if isinstance(t_loader_sampler, DistributedSampler): + # Manage shuffling in DistributedSampler + t_loader_sampler.set_epoch(t_loader_sampler.epoch+1) + + iter_dataloaders[tid] = iter(dataloaders[tid]) batch = next(iter_dataloaders[tid]) else: # We iteratated over all the data from this group # and we don't need the iterator anymore. 
iter_dataloaders[tid] = None - is_removed_dataloader = True + samplers[tid] = None + removed_dataloaders_idxs.append(tid) continue mb_curr.append(batch) yield self.collate_mbatches(mb_curr) # clear empty data-loaders - if is_removed_dataloader: - while None in iter_dataloaders: - iter_dataloaders.remove(None) + for tid in reversed(removed_dataloaders_idxs): + del iter_dataloaders[tid] + del samplers[tid] def __len__(self): return self.max_len @@ -224,6 +256,7 @@ def __init__( self, datasets: Sequence[AvalancheDataset], collate_mbatches=_default_collate_mbatches_fn, + distributed_sampling: bool = True, **kwargs ): """Data loader that balances data from multiple datasets emitting an @@ -245,8 +278,20 @@ def __init__( self.collate_mbatches = collate_mbatches for data in self.datasets: + if DistributedHelper.is_distributed and distributed_sampling: + seed = torch.randint( + 0, + 2 ** 32 - 1 - DistributedHelper.world_size, + (1,), + dtype=torch.int64) + seed += DistributedHelper.rank + generator = torch.Generator() + generator.manual_seed(int(seed)) + else: + generator = None # Default infinite_sampler = RandomSampler( - data, replacement=True, num_samples=10 ** 10 + data, replacement=True, num_samples=10 ** 10, + generator=generator ) dl = DataLoader(data, sampler=infinite_sampler, **kwargs) self.dataloaders.append(dl) @@ -277,11 +322,12 @@ def __init__(self, data: AvalancheDataset, memory: AvalancheDataset = None, batch_size: int = 32, batch_size_mem: int = 32, task_balanced_dataloader: bool = False, + distributed_sampling: bool = True, **kwargs): """ Custom data loader for rehearsal strategies. - The iterates in parallel two datasets, the current `data` and the - rehearsal `memory`, which are used to create mini-batches by + This dataloader iterates in parallel two datasets, the current `data` + and the rehearsal `memory`, which are used to create mini-batches by concatenating their data together. Mini-batches from both of them are balanced using the task label (i.e. each mini-batch contains a balanced number of examples from all the tasks in the `data` and `memory`). @@ -307,13 +353,15 @@ def __init__(self, data: AvalancheDataset, memory: AvalancheDataset = None, :param kwargs: data loader arguments used to instantiate the loader for each task separately. See pytorch :class:`DataLoader`. """ - self.data = data self.memory = memory - self.loader_data: Sequence[DataLoader] = {} - self.loader_memory: Sequence[DataLoader] = {} self.oversample_small_tasks = oversample_small_tasks + self.task_balanced_dataloader = task_balanced_dataloader self.collate_mbatches = collate_mbatches + self.data_batch_sizes: Union[int, Dict[int, int]] = dict() + self.memory_batch_sizes: Union[int, Dict[int, int]] = dict() + self.distributed_sampling = distributed_sampling + self.loader_kwargs = kwargs num_keys = len(self.memory.task_set) if task_balanced_dataloader: @@ -322,9 +370,8 @@ def __init__(self, data: AvalancheDataset, memory: AvalancheDataset = None, "to the number of tasks in the memory " \ "and current data." 
- # Create dataloader for data items - self.loader_data, _ = self._create_dataloaders( - data, batch_size, 0, False, **kwargs) + self.data_batch_sizes, _ = self._get_batch_sizes( + data, batch_size, 0, False) # Create dataloader for memory items if task_balanced_dataloader: @@ -334,37 +381,88 @@ def __init__(self, data: AvalancheDataset, memory: AvalancheDataset = None, single_group_batch_size = batch_size_mem remaining_example = 0 - self.loader_memory, remaining_example = self._create_dataloaders( + self.memory_batch_sizes, _ = self._get_batch_sizes( memory, single_group_batch_size, remaining_example, - task_balanced_dataloader, **kwargs) + task_balanced_dataloader) + + loaders_for_len_estimation = [] + + if isinstance(self.data_batch_sizes, int): + loaders_for_len_estimation.append(_make_data_loader( + data, distributed_sampling, kwargs, self.data_batch_sizes, + force_no_workers=True + )[0]) + else: + # Task balanced + for task_id in data.task_set: + dataset = data.task_set[task_id] + mb_sz = self.data_batch_sizes[task_id] + + loaders_for_len_estimation.append(_make_data_loader( + dataset, distributed_sampling, kwargs, mb_sz, + force_no_workers=True + )[0]) + + if isinstance(self.memory_batch_sizes, int): + loaders_for_len_estimation.append(_make_data_loader( + memory, distributed_sampling, kwargs, self.memory_batch_sizes, + force_no_workers=True + )[0]) + else: + for task_id in memory.task_set: + dataset = memory.task_set[task_id] + mb_sz = self.memory_batch_sizes[task_id] + + loaders_for_len_estimation.append(_make_data_loader( + dataset, distributed_sampling, kwargs, mb_sz, + force_no_workers=True + )[0]) - self.max_len = max([len(d) for d in chain( - self.loader_data.values(), self.loader_memory.values())] - ) + self.max_len = max([len(d) for d in loaders_for_len_estimation]) def __iter__(self): + loader_data, sampler_data = self._create_loaders_and_samplers( + self.data, self.data_batch_sizes) + + loader_memory, sampler_memory = self._create_loaders_and_samplers( + self.memory, self.memory_batch_sizes) + iter_data_dataloaders = {} iter_buffer_dataloaders = {} - for t in self.loader_data.keys(): - iter_data_dataloaders[t] = iter(self.loader_data[t]) - for t in self.loader_memory.keys(): - iter_buffer_dataloaders[t] = iter(self.loader_memory[t]) - - max_len = max([len(d) for d in iter_data_dataloaders.values()]) + for t in loader_data.keys(): + iter_data_dataloaders[t] = iter(loader_data[t]) + for t in loader_memory.keys(): + iter_buffer_dataloaders[t] = iter(loader_memory[t]) + + max_len = max( + [ + len(d) + for d in chain( + loader_data.values(), + loader_memory.values(), + ) + ] + ) try: for it in range(max_len): mb_curr = [] self._get_mini_batch_from_data_dict( - self.data, iter_data_dataloaders, - self.loader_data, False, - mb_curr) + iter_data_dataloaders, + sampler_data, + loader_data, + self.oversample_small_tasks, + mb_curr, + ) self._get_mini_batch_from_data_dict( - self.memory, iter_buffer_dataloaders, - self.loader_memory, self.oversample_small_tasks, - mb_curr) + iter_buffer_dataloaders, + sampler_memory, + loader_memory, + self.oversample_small_tasks, + mb_curr, + ) yield self.collate_mbatches(mb_curr) except StopIteration: @@ -373,13 +471,19 @@ def __iter__(self): def __len__(self): return self.max_len - def _get_mini_batch_from_data_dict(self, data, iter_dataloaders, - loaders_dict, oversample_small_tasks, - mb_curr): + def _get_mini_batch_from_data_dict( + self, + iter_dataloaders, + iter_samplers, + loaders_dict, + oversample_small_tasks, + mb_curr, + ): # list() is 
necessary because we may remove keys from the # dictionary. This would break the generator. for t in list(iter_dataloaders.keys()): t_loader = iter_dataloaders[t] + t_sampler = iter_samplers[t] try: tbatch = next(t_loader) except StopIteration: @@ -387,36 +491,84 @@ def _get_mini_batch_from_data_dict(self, data, iter_dataloaders, # reinitialize data loader if oversample_small_tasks: # reinitialize data loader + if isinstance(t_sampler, DistributedSampler): + # Manage shuffling in DistributedSampler + t_sampler.set_epoch(t_sampler.epoch + 1) + iter_dataloaders[t] = iter(loaders_dict[t]) tbatch = next(iter_dataloaders[t]) else: del iter_dataloaders[t] + del iter_samplers[t] continue mb_curr.append(tbatch) - def _create_dataloaders(self, data_dict, single_exp_batch_size, - remaining_example, task_balanced_dataloader, - **kwargs): - loaders_dict: Dict[int, DataLoader] = {} + def _create_loaders_and_samplers(self, data, batch_sizes): + loaders = dict() + samplers = dict() + + if isinstance(batch_sizes, int): + loader, sampler = _make_data_loader( + data, self.distributed_sampling, self.loader_kwargs, + batch_sizes, + ) + loaders[0] = loader + samplers[0] = sampler + else: + for task_id in data.task_set: + dataset = data.task_set[task_id] + mb_sz = batch_sizes[task_id] + + loader, sampler = _make_data_loader( + dataset, self.distributed_sampling, + self.loader_kwargs, mb_sz) + + loaders[task_id] = loader + samplers[task_id] = sampler + return loaders, samplers + + @staticmethod + def _get_batch_sizes(data_dict, single_exp_batch_size, remaining_example, + task_balanced_dataloader): + batch_sizes = dict() if task_balanced_dataloader: for task_id in data_dict.task_set: - data = data_dict.task_set[task_id] current_batch_size = single_exp_batch_size if remaining_example > 0: current_batch_size += 1 remaining_example -= 1 - loaders_dict[task_id] = DataLoader( - data, batch_size=current_batch_size, **kwargs) + batch_sizes[task_id] = current_batch_size else: - loaders_dict[0] = DataLoader( - data_dict, batch_size=single_exp_batch_size, **kwargs) - - return loaders_dict, remaining_example + # Current data is loaded without task balancing + batch_sizes = single_exp_batch_size + return batch_sizes, remaining_example + + +def _make_data_loader( + dataset, distributed_sampling, data_loader_args, + batch_size, force_no_workers=False): + data_loader_args = data_loader_args.copy() + if force_no_workers: + data_loader_args['num_workers'] = 0 + + if DistributedHelper.is_distributed and distributed_sampling: + sampler = DistributedSampler( + dataset, + shuffle=data_loader_args.pop('shuffle', False), + drop_last=data_loader_args.pop('drop_last', False) + ) + data_loader = DataLoader( + dataset, sampler=sampler, batch_size=batch_size, + **data_loader_args) + else: + sampler = None + data_loader = DataLoader( + dataset, batch_size=batch_size, **data_loader_args) + + return data_loader, sampler __all__ = [ - "detection_collate_fn", - "detection_collate_mbatches_fn", "TaskBalancedDataLoader", "GroupBalancedDataLoader", "ReplayDataLoader", diff --git a/avalanche/distributed/__init__.py b/avalanche/distributed/__init__.py new file mode 100644 index 000000000..af11a110e --- /dev/null +++ b/avalanche/distributed/__init__.py @@ -0,0 +1,5 @@ +from .distributed_helper import * +from .distributed_value import * +from .distributed_batch import * +from .distributed_model import * +from .distributed_commons import * diff --git a/avalanche/distributed/distributed_batch.py b/avalanche/distributed/distributed_batch.py new file mode 
100644 index 000000000..0f9aef88c --- /dev/null +++ b/avalanche/distributed/distributed_batch.py @@ -0,0 +1,106 @@ +from abc import abstractmethod, ABC +from typing import TypeVar, List, Optional + +import torch +from torch import Tensor + +from avalanche.distributed import DistributedHelper +from avalanche.distributed.distributed_value import SwitchableDistributedValue + +TupleT = TypeVar('TupleT', bound='Tuple') +OptTupleT = Optional[TupleT] +LocalT = TypeVar('LocalT') +DistributedT = TypeVar('DistributedT') + + +class DistributedObject(SwitchableDistributedValue[LocalT, DistributedT], ABC): + """ + An intermediate abstract class in charge of synchronizing objects. + + The merge procedure must be implemented in child classes. + """ + def _synchronize_distributed_value(self) -> DistributedT: + objects = self._synchronize_objects() + return self._merge_objects(objects) + + def _synchronize_objects(self) -> List[LocalT]: + return DistributedHelper.gather_all_objects( + self._local_value + ) + + @abstractmethod + def _merge_objects(self, objects: List[LocalT]) -> DistributedT: + pass + + +class DistributedBatch(DistributedObject[LocalT, LocalT], ABC): + """ + An intermediate abstract class in charge of synchronizing data batches. + + This class can handle batches as either tuples of elements (as usual) or + even single values. + + The merge procedure of single elements must be implemented in child classes. + + NOTE: In the future, this class may be replaced with a version in which only + the accessed tuple elements are synchronized, instead of the whole batch. + The current design, in which child classes only have to implement + `_merge_single_values`, allows for this change to happen without affecting + child classes. + """ + + def __init__(self, name: str, initial_local_value: LocalT): + super(DistributedBatch, self).__init__( + name, initial_local_value + ) + self._value_is_tuple = False + + def _synchronize_distributed_value(self) -> LocalT: + if self._local_value is None: + return None + else: + return super()._synchronize_distributed_value() + + def _set_local_value(self, new_local_value): + self._value_is_tuple = isinstance(new_local_value, (tuple, list)) + super(DistributedBatch, self)._set_local_value(new_local_value) + + def _merge_objects(self, objects: List[LocalT]) -> LocalT: + if self._value_is_tuple: + return self._merge_tuples(objects) + else: + return self._merge_single_values(objects) + + def _merge_tuples(self, tuples: List[LocalT]): + merged_elements = [] + n_elements = len(self._local_value) + for element_idx in range(n_elements): + to_merge_elements = [] + for tp in tuples: + to_merge_elements.append(tp[element_idx]) + + merged_elements.append( + self._merge_single_values(to_merge_elements) + ) + + return tuple(merged_elements) + + @abstractmethod + def _merge_single_values(self, values: List): + pass + + +class ClassificationBatch(DistributedBatch[LocalT]): + """ + An implementation of :class:`DistributedBatch` that assumes that all values + are Tensors. 
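+
+    An illustrative usage sketch (hedged: it assumes the default process
+    group has already been initialized via ``DistributedHelper``;
+    ``local_x`` and ``local_y`` are hypothetical per-process tensors)::
+
+        mb = ClassificationBatch('mb', None)
+        # Each process assigns its own (inputs, targets) mini-batch.
+        mb.value = (local_x, local_y)
+        # Reading `value` gathers the tuples from all processes and
+        # concatenates each tuple element along dim 0.
+        global_x, global_y = mb.value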
+ """ + def _merge_single_values(self, values: List[Tensor]): + return torch.cat(values) + + +__all__ = [ + 'DistributedObject', + 'DistributedBatch', + 'ClassificationBatch' +] diff --git a/avalanche/distributed/distributed_commons.py b/avalanche/distributed/distributed_commons.py new file mode 100644 index 000000000..9844adc4e --- /dev/null +++ b/avalanche/distributed/distributed_commons.py @@ -0,0 +1,22 @@ +import torch + +from avalanche.distributed.distributed_tensor import DistributedMeanTensor + + +class DistributedLoss(DistributedMeanTensor): + """ + A distributed value in charge of obtaining the mean loss. + + The mean loss is computed as the mean of losses from all processes, without + weighting using the mini batch sizes in each process. + + This is current mostly an alias for :class:`DistributedMeanTensor`. However, + in the future this class may be extended to add loss-specific features. + """ + def __init__(self, name: str = 'loss'): + super(DistributedLoss, self).__init__(name, torch.zeros((1,))) + + +__all__ = [ + 'DistributedLoss' +] diff --git a/avalanche/distributed/distributed_helper.py b/avalanche/distributed/distributed_helper.py new file mode 100644 index 000000000..b33f5657b --- /dev/null +++ b/avalanche/distributed/distributed_helper.py @@ -0,0 +1,509 @@ +import os +import random +import warnings +from collections import OrderedDict +from typing import Optional, List, Tuple + +import numpy as np +import torch +from torch import Tensor +from torch.distributed import init_process_group +from torch.nn.modules import Module +from torch.nn.parallel import DistributedDataParallel +from typing_extensions import Literal + +from avalanche.benchmarks import GenericCLScenario + + +class _Singleton(type): + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(_Singleton, cls).__call__( + *args, **kwargs) + return cls._instances[cls] + + +class _RollingSeedContext(object): + """ + Implement seed alignment by storing random number generators state. + + Doesn't require a distributed communication (even broadcast), which makes + this the best choices when wrapping sections that (may) both: + - behave differently depending on the rank + - change the global state of random number generators + """ + def __init__(self): + self.generators_state = None + + def save_generators_state(self): + self.generators_state = dict() + for gen_name, gen_def in DistributedHelper.random_generators.items(): + self.generators_state[gen_name] = gen_def['save_state']() + + def load_generators_state(self): + for gen_name, gen_def in DistributedHelper.random_generators.items(): + gen_def['load_state'](self.generators_state[gen_name]) + + def step_random_generators(self): + for gen_name, gen_def in DistributedHelper.random_generators.items(): + gen_def['step']() + + def __enter__(self): + self.save_generators_state() + + def __exit__(self, *_): + self.load_generators_state() + self.step_random_generators() + + +class _BroadcastSeedContext(object): + """ + Implement seed alignment by broadcasting a new seed from the main process. + + This is usually slower than using :class:`_RollingSeedContext`. + """ + def __init__(self): + pass + + def __enter__(self): + pass + + def __exit__(self, *_): + DistributedHelper.align_seeds() + + +class _MainProcessFirstContext(object): + """ + A context in which the main process must enter and exit the section before + other processes. + + For instance, can be used to wrap the dataset download procedure. 
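+
+    A minimal usage sketch, mirroring the pattern used in
+    ``_get_mnist_dataset`` (see the `cmnist.py` hunk above);
+    ``dataset_root`` is a hypothetical path::
+
+        with DistributedHelper.main_process_first():
+            # The main process enters this section first (e.g. to download
+            # the dataset); the other processes wait on a barrier and then
+            # run the same code using the already-downloaded files.
+            train_set = MNIST(root=dataset_root, train=True, download=True)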
+ """ + + def __init__( + self, + seed_alignment: Literal["rolling", "broadcast"] = 'rolling', + final_barrier: bool = False): + if seed_alignment == 'rolling': + self._seed_aligner = _RollingSeedContext() + else: + self._seed_aligner = _BroadcastSeedContext() + + self._final_barrier = final_barrier + + def __enter__(self): + self._seed_aligner.__enter__() + + if not DistributedHelper.is_main_process: + # Wait for the main process + DistributedHelper.barrier() + + def __exit__(self, exc_type, exc_val, exc_tb): + if DistributedHelper.is_main_process: + # Let other process enter the section + DistributedHelper.barrier() + + self._seed_aligner.__exit__() + if self._final_barrier: + DistributedHelper.barrier() + + +class _DistributedHelperCls(object): + __metaclass__ = _Singleton + + def __init__(self): + self.use_cuda = True + + self.random_generators = OrderedDict() + + self.register_random_generator('torch', { + 'seed': torch.random.manual_seed, + 'save_state': torch.random.get_rng_state, + 'load_state': torch.random.set_rng_state, + 'step': lambda: torch.rand(1) + }) + + self.register_random_generator('numpy', { + 'seed': np.random.seed, + 'save_state': np.random.get_state, + 'load_state': np.random.set_state, + 'step': lambda: np.random.rand(1) + }) + + self.register_random_generator('random', { + 'seed': random.seed, + 'save_state': random.getstate, + 'load_state': random.setstate, + 'step': random.random + }) + + def init_distributed(self, random_seed, backend=None, use_cuda=True): + if self.is_distributed: + raise RuntimeError('Distributed API already initialized') + + if backend is None: + if use_cuda: + backend = 'nccl' + else: + backend = 'gloo' + + if backend == 'nccl' and not use_cuda: + warnings.warn( + 'Bad configuration: using NCCL, but you set use_cuda=False!') + + if os.environ.get('LOCAL_RANK', None) is None: + warnings.warn( + 'Torch distributed could not be initialized ' + '(missing environment configuration)') + else: + init_process_group(backend=backend) + + self.set_random_seeds(random_seed) + self.use_cuda = use_cuda + + if use_cuda or backend == 'nccl': + # https://github.com/pytorch/pytorch/issues/6351 + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + return True + + def get_device_id(self): + if self.is_distributed: + device_id = self.rank + else: + device_id = 0 + + if self.use_cuda and torch.cuda.is_available(): + return device_id + + return -1 + + def make_device(self): + if self.is_distributed: + device_id = self.rank + else: + device_id = 0 + + if self.use_cuda and torch.cuda.is_available() and device_id >= 0: + torch.cuda.set_device(device_id) + ref_device = torch.device(f'cuda:{device_id}') + else: + ref_device = torch.device('cpu') + return ref_device + + def wrap_model(self, model: Module) -> Module: + if self.is_distributed: + if self.forced_cuda_comm or self.use_cuda: + # forced_cuda_comm is True if using NCCL; use_cuda may be true + # even when not using NCCL. + # User already warned if using NCCL with use_cuda==False. 
+ # device_ids must be a single device id + # (an int, a device object or a str) + # If not set, output_device defaults to device_ids[0] + return DistributedDataParallel( + model, device_ids=[self.make_device()]) + else: + return DistributedDataParallel(model) + else: + return model + + def unwrap_model(self, model: Module) -> Module: + if isinstance(model, DistributedDataParallel): + return model.module + + return model + + def register_random_generator(self, name: str, rng_def: dict): + if 'save_state' not in rng_def or \ + 'load_state' not in rng_def or 'step' not in rng_def: + raise ValueError('Invalid random number generator definition') + + self.random_generators[name] = rng_def + + def set_random_seeds(self, random_seed): + for gen_name, gen_dict in self.random_generators.items(): + gen_dict['seed'](random_seed) + + def align_seeds(self): + if not self.is_distributed: + return + + if self.is_main_process: + reference_seed = torch.randint(0, 2**32-1, (1,), dtype=torch.int64) + else: + reference_seed = torch.empty((1,), dtype=torch.int64) + + self.broadcast(reference_seed) + seed = int(reference_seed) + self.set_random_seeds(seed) + + def main_process_first(self): + return _MainProcessFirstContext() + + def barrier(self): + if self.is_distributed: + torch.distributed.barrier() + + def broadcast(self, tensor: Tensor, src=0): + if not self.is_distributed: + return tensor + + tensor_distrib, orig_data = self._prepare_for_distributed_comm(tensor) + torch.distributed.broadcast(tensor_distrib, src=src) + tensor = self._revert_to_original_device(tensor_distrib, orig_data) + + return tensor + + def cat_all(self, tensor: Tensor): + if not self.is_distributed: + return tensor + + gathered_tensors = self.gather_all( + tensor, different_shape0=True, different_shape1_n=False) + for i, t in enumerate(gathered_tensors): + if len(t.shape) == 0: + # Tensor with 0-length shape + gathered_tensors[i] = torch.reshape(t, (1,)) + + return torch.cat(gathered_tensors) + + def gather_all( + self, + tensor: Tensor, + out_tensors: Optional[List[Tensor]] = None, + different_shape0: bool = None, + different_shape1_n: bool = None): + if not self.is_distributed: + return [tensor] + + if different_shape0 is None or different_shape1_n is None: + warnings.warn('different_shape0 and different_shape1_n not set. 
' + 'This may lead to inefficiencies.') + + if different_shape0 is None: + different_shape0 = True + + if different_shape1_n is None: + different_shape1_n = True + + # Based on: + # https://discuss.pytorch.org/t/how-to-concatenate-different-size-tensors-from-distributed-processes/44819/4 + + if out_tensors is None: + all_tensors_shape = None + if different_shape1_n: + # TODO: needs unit test (especially for 0-shaped tensors) + # Tensor differ by whole shape (not very common case) + tensor_size = torch.zeros(10, dtype=torch.int64) + for i in range(len(tensor.shape)): + tensor_size[i] = tensor.shape[i] + + elif different_shape0: + # Tensors differ by shape[0] (most common case) + if len(tensor.shape) > 0: + # Usual case + tensor_size = torch.tensor([tensor.shape[0]], + dtype=torch.int64) + else: + # Some tensors, especially loss tensors, have 0-length shape + tensor_size = torch.tensor([0], dtype=torch.int64) + else: + # TODO: needs unit test (especially for 0-shaped tensors) + # Same size for all tensors + tensor_size = torch.tensor(tensor.shape, dtype=torch.int64) + all_tensors_shape = \ + [tensor_size for _ in range(self.world_size)] + + if all_tensors_shape is None: + all_tensors_shape = [ + self._prepare_for_distributed_comm( + torch.zeros_like(tensor_size))[0] + for _ in range(self.world_size)] + tensor_size, _ = self._prepare_for_distributed_comm(tensor_size) + + torch.distributed.all_gather(all_tensors_shape, tensor_size) + + all_tensors_shape = [t.cpu() for t in all_tensors_shape] + + if different_shape1_n: + # TODO: needs unit test (especially for 0-shaped tensors) + # Trim shape + for i, t in enumerate(all_tensors_shape): + for x in range(len(t)): + if t[x] == 0: + if x == 0: + # Tensor with 0-length shape + all_tensors_shape[i] = t[:x+1] + else: + all_tensors_shape[i] = t[:x] + + break + + elif different_shape0: + if len(tensor.shape[1:]) == 0: + # To manage tensors with 0-length shape + pass + else: + all_tensors_shape = \ + [torch.cat( + [t, + torch.as_tensor(tensor.shape[1:], + dtype=torch.int64)]) + for t in all_tensors_shape] + + all_tensors_shape = \ + [t_shape.tolist() for t_shape in all_tensors_shape] + dtype = tensor.dtype + + out_tensors = [] + for t_shape in all_tensors_shape: + if t_shape[0] == 0 and len(t_shape) == 1: + # Tensor with 0-length shape + out_tensors.append(torch.zeros(tuple(), dtype=dtype)) + else: + out_tensors.append(torch.zeros(*t_shape, dtype=dtype)) + + orig_device = tensor.device + tensor, _ = self._prepare_for_distributed_comm(tensor) + out_tensors = [self._prepare_for_distributed_comm(t)[0] + for t in out_tensors] + torch.distributed.all_gather(out_tensors, tensor) + out_tensors = [t.to(orig_device) for t in out_tensors] + return out_tensors + + def gather_all_objects(self, obj): + out_list = [None for _ in range(self.world_size)] + torch.distributed.all_gather_object(out_list, obj) + return out_list + + def check_equal_tensors(self, tensor: Tensor): + if not DistributedHelper.is_distributed: + return + + all_tensors = self.gather_all( + tensor, + different_shape0=True, + different_shape1_n=True) + + tensors_hashes = [hash_tensor(t) for t in all_tensors] + + if len(set(tensors_hashes)) != 1: + # Equal tensors + raise ValueError('Different tensors. 
Got hashes: {}'.format( + tensors_hashes)) + + def check_equal_objects(self, obj): + if not DistributedHelper.is_distributed: + return + + output = [None for _ in range(self.world_size)] + torch.distributed.all_gather_object(output, obj) + + for i, o in enumerate(output): + if obj != o: + raise ValueError( + 'Different object ranks this={}, remote={}. ' + 'Got this={}, remote={}'.format( + self.rank, i, obj, o)) + + def _prepare_for_distributed_comm(self, tensor: Tensor): + original_device = tensor.device + copy_back = self.forced_cuda_comm and not tensor.is_cuda + if self.forced_cuda_comm: + tensor_distributed = tensor.cuda() + else: + tensor_distributed = tensor + + return tensor_distributed, (original_device, copy_back, tensor) + + def _revert_to_original_device(self, tensor_distributed, orig_data): + original_device, copy_back, tensor = orig_data + if copy_back: + if tensor is None: + tensor = tensor_distributed.to(original_device) + else: + tensor[:] = tensor_distributed + + return tensor + + @property + def rank(self) -> int: + if torch.distributed.is_initialized(): + return torch.distributed.get_rank() + return 0 + + @property + def world_size(self) -> int: + if torch.distributed.is_initialized(): + return torch.distributed.get_world_size() + return 1 + + @property + def is_distributed(self) -> bool: + return torch.distributed.is_initialized() + + @property + def is_main_process(self) -> bool: + return self.rank == 0 + + @property + def backend(self) -> str: + return torch.distributed.get_backend() + + @property + def forced_cuda_comm(self) -> bool: + return self.backend == 'nccl' + + +def hash_benchmark(benchmark: GenericCLScenario) -> str: + import hashlib + import io + + hash_engine = hashlib.sha256() + for stream_name, stream in benchmark.streams.items(): + hash_engine.update(stream_name.encode()) + for experience in stream: + exp_dataset = experience.dataset + dataset_content = exp_dataset[:] + for tuple_elem in dataset_content: + # https://stackoverflow.com/a/63880190 + buff = io.BytesIO() + torch.save(tuple_elem, buff) + buff.seek(0) + hash_engine.update(buff.read()) + return hash_engine.hexdigest() + + +def hash_minibatch(minibatch: Tuple[Tensor]) -> str: + import hashlib + import io + + hash_engine = hashlib.sha256() + for tuple_elem in minibatch: + buff = io.BytesIO() + torch.save(tuple_elem, buff) + buff.seek(0) + hash_engine.update(buff.read()) + return hash_engine.hexdigest() + + +def hash_tensor(tensor: Tensor) -> str: + import hashlib + import io + + hash_engine = hashlib.sha256() + buff = io.BytesIO() + torch.save(tensor, buff) + buff.seek(0) + hash_engine.update(buff.read()) + return hash_engine.hexdigest() + + +DistributedHelper = _DistributedHelperCls() + +__all__ = [ + 'DistributedHelper', + '_DistributedHelperCls' +] diff --git a/avalanche/distributed/distributed_model.py b/avalanche/distributed/distributed_model.py new file mode 100644 index 000000000..8a367f550 --- /dev/null +++ b/avalanche/distributed/distributed_model.py @@ -0,0 +1,171 @@ +################################################################################ +# Copyright (c) 2021 ContinualAI. # +# Copyrights licensed under the MIT License. # +# See the accompanying LICENSE file for terms. 
# +# # +# Date: 1/12/2021 # +# Author(s): Lorenzo Pellegrini # +# E-mail: contact@continualai.org # +# Website: avalanche.continualai.org # +################################################################################ +from typing import Optional, Union, Tuple + +from torch.nn import Module +from torch.nn.parallel import DistributedDataParallel +from typing_extensions import Type + +from avalanche.distributed import OptionalDistributedValue +from avalanche.distributed.distributed_value import DistributedT + + +class DistributedModel(OptionalDistributedValue[Optional[Module]]): + """ + Contains the model used in the :class:`BaseTemplate` strategy template. + + Instances of this class can also carry the distributed (that is, wrapped + in a PyTorch `DistributedDataParallel`) version of a local model. If no + distributed model is set, then the model returned by the + `distributed_model` field will be the local one. + + By setting the `distributed_model` field, the model stored in the + `local_model` field will be discarded (from that moment, retrieving the + `local_model` will be the same as obtaining the `distributed_model.module` + field). Setting the `local_model` will discard the current + `distributed_model`. + + Beware that the setter of this class behaves a bit differently + from superclasses. When setting the `value`, the class of the new value + us checked against a list of distributed model classes (by default, + only :class:`DistributedDataParallel` is considered). If the model + is an instance of these classes, then the distributed value is set + instead of the local value. + """ + + def __init__( + self, + initial_model: Module = None, + distributed_model_class: Union[Type, Tuple[Type]] = + DistributedDataParallel): + """ + Creates a `ModelInstance`. + + :param initial_model: The initial model to use. Defaults to None. + :param distributed_model_class: The type(s) of the distributed model. + Defaults to `DistributedDataParallel`. + """ + super().__init__('model', initial_local_value=initial_model) + self.distributed_model_class = distributed_model_class + + @OptionalDistributedValue.value.setter + def value(self, new_value: Module): + """ + Sets the local or distributed model, depending on if the model is a + subclass of DistributedDataParallel. + + This will discard the current distributed value. + """ + + if isinstance(new_value, self.distributed_model_class): + self.distributed_value = new_value + else: + self.local_value = new_value + + @OptionalDistributedValue.local_value.getter + def local_value(self) -> Module: + if self._distributed_value is not None: + return self._distributed_value.module + return self._local_value + + @OptionalDistributedValue.distributed_value.setter + def distributed_value(self, new_distributed_value: Module): + if new_distributed_value is None: + self.reset_distributed_value() + else: + self._distributed_value = new_distributed_value + self._distributed_value_set = True + + # Prevent alignment and memory issues. + # The local model will be retrieved from the distributed model. + self._local_value = None + + def reset_distributed_value(self): + if self._distributed_value_set: + if self._distributed_value is not None: + # Unwrap the DistributedDataParallel to obtain the local value. + self._local_value = self._distributed_value.module + self._distributed_value = None + self._distributed_value_set = False + + def reset_distributed_model(self): + """ + Discards the distributed model. + + If the distributed model was not set, nothing happens. 
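+
+        A sketch of the intended flow (illustrative; it mirrors
+        `tests/distributed/test_distributed_model.py` below and assumes an
+        initialized process group)::
+
+            dt = DistributedModel()
+            dt.model = SimpleMLP()                        # local model
+            dt.model = DistributedDataParallel(dt.model)  # wrapped model
+            dt.reset_distributed_model()
+            # dt.model and dt.local_model now return the unwrapped module.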
+ """ + return self.reset_distributed_value() + + def _synchronize_distributed_value(self) -> DistributedT: + raise RuntimeError( + 'The distributed model needs to be wrapped and set by using the ' + f'following class(es): {self.distributed_model_class}') + + # BEGIN ALIASES for "(local|distributed)value" + @property + def model(self): + """ + The current model. + """ + return self.value + + @model.setter + def model(self, new_model: Module): + """ + Sets the current model. + """ + self.value = new_model + + @property + def local_model(self) -> Module: + """ + The current (local) model. + + If a `distributed_model` was set, then the value of the + `distributed_model.module` field will be returned. + """ + return self.local_value + + @local_model.setter + def local_model(self, new_local_value): + """ + Sets the local model. + + This will discard the current distributed model. + """ + self.local_value = new_local_value + + @property + def distributed_model(self): + """ + The current (distributed) model. + + If not set (not running a distributed training, or if the wrapped + model has not been created yet), this is the same as `local_model`. + """ + return self.distributed_value + + @distributed_model.setter + def distributed_model(self, new_distributed_value): + """ + Sets the model wrapped by PyTorch `DistributedDataParallel`. + + Setting this field will release the reference to the current local + model. In that case, the `local_model` field will return + `distributed_model.module` instead. + """ + self.distributed_value = new_distributed_value + # END ALIASES for "(local|distributed)value" + + +__all__ = [ + 'DistributedModel' +] diff --git a/avalanche/distributed/distributed_tensor.py b/avalanche/distributed/distributed_tensor.py new file mode 100644 index 000000000..689a467a1 --- /dev/null +++ b/avalanche/distributed/distributed_tensor.py @@ -0,0 +1,67 @@ +from abc import ABC, abstractmethod +from typing import List + +import torch +from torch import Tensor + +from avalanche.distributed import DistributedHelper +from avalanche.distributed.distributed_value import SwitchableDistributedValue + + +class DistributedTensor(SwitchableDistributedValue[Tensor, Tensor], ABC): + """ + A distributed Tensor wrapper. + + This abstract class is in charge of synchronizing Tensors across processes. + + Child classes must override `_merge_tensors` to define how those tensors + should be merged. + """ + def _synchronize_distributed_value(self) -> Tensor: + return self._merge_tensors( + DistributedHelper.gather_all(self.local_value)) + + @abstractmethod + def _merge_tensors(self, tensors: List[Tensor]) -> Tensor: + """ + Merge all tensors into one. + + :param tensors: The list of tensors obtained from all processes, in the + order defined by the rank. + :return: The merged tensor. + """ + pass + + +class ConcatDistributedTensor(DistributedTensor): + """ + A distributed tensor obtained by concatenating tensors from all processes + (in the order defined by the rank). + + This also correctly manages tensors with 0-length shapes (like losses). + """ + def _merge_tensors(self, tensors: List[Tensor]) -> Tensor: + # Manage tensors without shape (0-length shape) + for i, t in enumerate(tensors): + if len(t.shape) == 0: + # Tensor with 0-length shape + tensors[i] = torch.reshape(t, (1,)) + + return torch.cat(tensors) + + +class DistributedMeanTensor(ConcatDistributedTensor): + """ + A distributed 1-item tensor obtained by computing the mean of tensors + from all processes. 
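+
+    For instance (illustrative; it mirrors
+    `tests/distributed/test_distributed_tensor.py` below)::
+
+        dt = DistributedMeanTensor('dt', torch.zeros((1,)))
+        # Each rank contributes the value (rank + 1)...
+        dt.value = torch.full((1,), DistributedHelper.rank + 1.0)
+        # ...so the synchronized value is the mean over all ranks,
+        # that is (world_size + 1) / 2.
+        mean_value = dt.value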
+ """ + def _merge_tensors(self, tensors: List[Tensor]) -> Tensor: + concat_tensor = super()._merge_tensors(tensors) + return torch.mean(concat_tensor) + + +__all__ = [ + 'DistributedTensor', + 'ConcatDistributedTensor', + 'DistributedMeanTensor' +] diff --git a/avalanche/distributed/distributed_value.py b/avalanche/distributed/distributed_value.py new file mode 100644 index 000000000..b12546d20 --- /dev/null +++ b/avalanche/distributed/distributed_value.py @@ -0,0 +1,295 @@ +from contextlib import contextmanager +from typing import TypeVar, Generic, Optional, Union, Generator, List, \ + Tuple +from abc import ABC, abstractmethod + + +LocalT = TypeVar('LocalT') +DistributedT = TypeVar('DistributedT') +SwitchableT = TypeVar('SwitchableT', bound='SwitchableDistributedValue') + + +class DistributedValue(Generic[LocalT, DistributedT], ABC): + """ + Class used to generically implement values that may need + a lazy synchronization when running a distributed training. + + When not running a distributed training, this class will act as a + no-op wrapper. + + This class considers setting the 'value' and 'local_value' as the + same operation (setting the local value). However, retrieving 'value' will + trigger the synchronization procedure. + + This class exposes methods that can be customized to define how different + values should be gathered (and merged) from all processes. For instance, + loss values should be averaged together, minibatch outputs should be + concatenated, etcetera. + + Beware that the purpose of this class is to only manage the + local and distributed values. When implementing the subclass, please do not + transform the value and/or type of the local and global values. This + would make it difficult to understand what is going on. + + Also, consider having the same type for the local and distributed value. + That is, if the local value is a Tensor, the distributed value should be + a Tensor as well, not a List[Tensor]. This is because local and distributed + values will be transparently used by users without considering the possibly + distributed nature of the value. + + Feel free to implement, in subclasses, properties with more readable names. + For instance 'mb_output', 'local_mb_output', 'loss', 'local_loss', ... + instead of the default 'value' and 'local_value' already implemented by + this class. + """ + + def __init__(self, name: str, initial_local_value: LocalT): + """ + Creates an instance of a distributed value. + + :param name: The name of the value. Also used when obtaining a string + representation. + :param initial_local_value: The initial local value. + """ + + self.name: str = name + self._local_value: LocalT = initial_local_value + self._distributed_value: Optional[DistributedT] = None + self._distributed_value_set: bool = False + + @property + def value(self) -> DistributedT: + """ + The current value. + + When running a distributed training, this will be the value obtained + by gathering and merging values coming from all processes. + """ + + return self._get_distributed_value() + + @value.setter + def value(self, new_value: LocalT): + """ + Sets the (local) value. + + This will discard the current distributed value. + """ + self._set_local_value(new_value) + + @property + def local_value(self) -> LocalT: + """ + The current (local) value. + + Even when running a distributed training, this property will always + contain the local value only. 
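+
+        For instance (illustrative; ``loss`` stands for an instance of a
+        concrete subclass such as :class:`DistributedLoss` and
+        ``local_loss`` is a hypothetical per-process tensor)::
+
+            loss.value = local_loss   # sets the local value only
+            loss.local_value          # -> local_loss, no communication
+            loss.value                # -> value gathered and merged from all
+                                      #    processes (lazy synchronization)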
+ """ + return self._local_value + + @local_value.setter + def local_value(self, new_value: LocalT): + """ + Sets the (local) value. + + This will discard the current distributed value. + """ + self._set_local_value(new_value) + + def _set_local_value(self, new_local_value: LocalT): + self._local_value = new_local_value + self._distributed_value = None + self._distributed_value_set = False + + def _get_distributed_value(self) -> DistributedT: + if not self._distributed_value_set: + self._distributed_value = self._synchronize_distributed_value() + self._distributed_value_set = True + + return self._distributed_value + + @abstractmethod + def _synchronize_distributed_value(self) -> DistributedT: + pass + + def __str__(self): + base_str = f'DistributedObject_{self.name} = {self.local_value}' + if self._distributed_value_set: + return base_str + \ + f' (distributed value = {self.value})' + else: + return base_str + \ + f' (distributed value not synchronized yet)' + + +class SettableDistributedValue(DistributedValue[LocalT, DistributedT], ABC): + """ + A version of :class:`DistributedValue` in which the distributed value can be + set (and reset) externally instead of being synchronized. + + If this class should only allow for distributed values to be set + externally (that is, synchronization should be disabled), please + override `_synchronize_distributed_value` to raise an appropriate error. + In that case, this means this class is mainly used as a switch between a + local and a distributed value based on whether the distributed value has + been set or not. + """ + + def __init__(self, name: str, initial_local_value: LocalT): + super(SettableDistributedValue, self).__init__( + name, initial_local_value + ) + + @property + def distributed_value(self) -> DistributedT: + """ + The current value. + + When running a distributed training, this will be the value obtained + by gathering and merging values coming from all processes. + """ + return self._get_distributed_value() + + @distributed_value.setter + def distributed_value(self, new_distributed_value: DistributedT): + """ + Set the distributed value. + """ + self._distributed_value = new_distributed_value + self._distributed_value_set = True + + def reset_distributed_value(self): + """ + Discards the distributed value (if set). + + If the distributed value was not set, nothing happens. + """ + self._distributed_value = None + self._distributed_value_set = False + + def __str__(self): + base_str = super(SettableDistributedValue, self).__str__() + return f'(Settable){base_str}' + + +class SwitchableDistributedValue(SettableDistributedValue[LocalT, DistributedT], + ABC): + """ + A version of :class:`SettableDistributedValue` in which the behaviour of + the `value` property can be switched so that it returns the local value + instead of the distributed one. The setter behaviour can be customized as + well. + + Useful for situations in which one has to force components interacting with + this value to use the local value.Properties whose name feature an explicit + `local` or `distributed` part are not affected. + """ + + def __init__(self, name: str, initial_local_value: LocalT): + """ + Creates an instance of a distributed value. + + :param name: The name of the value. Also used when obtaining a string + representation. + :param initial_local_value: The initial local value. 
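+
+        An illustrative sketch of the switching behaviour (``switchable``
+        stands for an instance of a concrete subclass)::
+
+            synced = switchable.value          # default: synchronized value
+            with switchable.use_local_value():
+                local_only = switchable.value  # local value, no sync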
+ """ + super().__init__(name, initial_local_value) + + self._behaviour_stack: List[Tuple[bool, bool]] = list() + """ + If greater than 0, the `value` property will return the local value. + """ + + @contextmanager + def use_local_value(self: SwitchableT, getter=True, setter=True) -> \ + Generator[SwitchableT, None, None]: + """ + A context manager used to set the behaviour of the value property. + + Please note that in a plain code section (not wrapped by this + context manager), the default behaviour is that the getter returns the + distributed value while the setter sets the local value. + + :param getter: If True, the local value will be returned by the getter. + Defaults to True, which means that the getter behaviour will be + changed. + :param setter: If True, the local value will be set by the setter. + Defaults to True, which means that the setter will behave as usual. + :return: This object (self). + """ + self._behaviour_stack.append((getter, setter)) + try: + yield self + finally: + self._behaviour_stack.pop() + + @SettableDistributedValue.value.getter + def value(self) -> Union[LocalT, DistributedT]: + if self._use_local_getter(): + return self.local_value + else: + return self.distributed_value + + @SettableDistributedValue.value.setter + def value(self, new_value): + if self._use_local_setter(): + self.local_value = new_value + else: + self.distributed_value = new_value + + def _use_local_getter(self): + if len(self._behaviour_stack) == 0: + return False + + return self._behaviour_stack[-1][0] + + def _use_local_setter(self): + if len(self._behaviour_stack) == 0: + return True + + return self._behaviour_stack[-1][1] + + def __str__(self): + base_str = super(SettableDistributedValue, self).__str__() + + current_get_behaviour = 'local' if self._use_local_getter() \ + else 'distributed' + current_set_behaviour = 'local' if self._use_local_setter() \ + else 'distributed' + + return f'(fget={current_get_behaviour},' \ + f'fset={current_set_behaviour}){base_str}' + + +class OptionalDistributedValue(SwitchableDistributedValue[LocalT, LocalT], ABC): + """ + A version of :class:`SettableDistributedValue` in which the + 'value' property returns the local value if no distributed value has + been set yet (without attempting a synchronization). Accessing the + 'distributed_value' property will still force a synchronization. + + Beware that, when using this class, the generic types for the local and + distributed values is enforced to be the same. + + This class is mainly used for managing models wrapped using + `DistributedDataParallel`. 
+ """ + + def __init__(self, name, initial_local_value): + super().__init__(name, initial_local_value) + + def _get_distributed_value(self) -> DistributedT: + if not self._distributed_value_set: + return self._local_value + + return self._distributed_value + + +__all__ = [ + 'DistributedValue', + 'SettableDistributedValue', + 'SwitchableDistributedValue', + 'OptionalDistributedValue', + 'LocalT', + 'DistributedT' +] diff --git a/avalanche/logging/base_logger.py b/avalanche/logging/base_logger.py index 9e03daa87..51e020a7e 100644 --- a/avalanche/logging/base_logger.py +++ b/avalanche/logging/base_logger.py @@ -2,6 +2,9 @@ from typing import TYPE_CHECKING, List +from avalanche.distributed import DistributedHelper + + if TYPE_CHECKING: from avalanche.evaluation.metric_results import MetricValue from avalanche.training.templates.supervised import SupervisedTemplate @@ -28,6 +31,30 @@ class BaseLogger(ABC): def __init__(self): super().__init__() + if not DistributedHelper.is_main_process: + raise RuntimeError( + 'You are creating a logger in a non-main process during a ' + 'distributed training session. ' + 'Jump to this error for an example on how to fix this.') + + # You have to create the loggers in the main process only. Otherwise, + # metrics will end up duplicated in your log files and consistency + # errors may arise, too. When creating the EvaluationPlugin in a + # non-main process, just pass loggers=None. + # + # Recommended way: + # if not DistributedHelper.is_main_process + # # Define the loggers + # loggers = [...] + # else: + # loggers = None + # + # # Instantiate the evaluation plugin + # eval_plugin = EvaluationPlugin(metricA, metricB, ..., loggers=loggers) + # + # # Instantiate the strategy + # strategy = MyStrategy(..., evaluator=eval_plugin) + def log_single_metric(self, name, value, x_plot): """Log a metric value. diff --git a/avalanche/training/plugins/evaluation.py b/avalanche/training/plugins/evaluation.py index a617b4e91..dd78aeb6c 100644 --- a/avalanche/training/plugins/evaluation.py +++ b/avalanche/training/plugins/evaluation.py @@ -3,6 +3,7 @@ from collections import defaultdict from typing import Union, Sequence, TYPE_CHECKING +from avalanche.distributed import DistributedHelper from avalanche.evaluation.metric_results import MetricValue from avalanche.evaluation.metrics import accuracy_metrics, loss_metrics from avalanche.logging import InteractiveLogger @@ -92,7 +93,7 @@ def __init__( self.loggers: Sequence["BaseLogger"] = loggers - if len(self.loggers) == 0: + if len(self.loggers) == 0 and DistributedHelper.is_main_process: warnings.warn("No loggers specified, metrics will not be logged") if self.collect_all: @@ -238,10 +239,45 @@ def before_eval(self, strategy: "SupervisedTemplate", **kwargs): warnings.warn(msgw) +class LazyDefaultLoggersList(Sequence["BaseLogger"]): + """ + Used to prevent the creation of loggers on a non-main process when + running distributed training jobs. + + Beware that the content of this sequence (and thus the behavior of + `__len__` and `__getitem__`) varies depending on the value of + `DistributedHelper.is_main_process`. This means that objects of this class + should be used only by modules able to handle this behavior, which is not + standard for Sequences. 
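+
+    For example (illustrative)::
+
+        loggers = LazyDefaultLoggersList()
+        len(loggers)   # 1 on the main process, 0 on the other processes
+        loggers[0]     # lazily creates the default InteractiveLogger on the
+                       # main process; raises IndexError on the others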
+ """ + + def __init__(self): + self._default_loggers = None + + def __len__(self): + if DistributedHelper.is_main_process: + return 1 + else: + return 0 + + def __getitem__(self, item): + self._instantiate_loggers() + return self._default_loggers[item] + + def _instantiate_loggers(self): + if self._default_loggers is not None: + return + + if DistributedHelper.is_main_process: + self._default_loggers = [InteractiveLogger()] + else: + self._default_loggers = [] + + default_evaluator = EvaluationPlugin( accuracy_metrics(minibatch=False, epoch=True, experience=True, stream=True), loss_metrics(minibatch=False, epoch=True, experience=True, stream=True), - loggers=[InteractiveLogger()], + loggers=LazyDefaultLoggersList(), suppress_warnings=True, ) diff --git a/tests/distributed/__init__.py b/tests/distributed/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/distributed/test_distributed_batch.py b/tests/distributed/test_distributed_batch.py new file mode 100644 index 000000000..9a00be6be --- /dev/null +++ b/tests/distributed/test_distributed_batch.py @@ -0,0 +1,79 @@ +import contextlib +import os +import unittest +from typing import Tuple, Optional + +import torch +from torch import Tensor + +from avalanche.distributed import DistributedHelper, ClassificationBatch + + +@contextlib.contextmanager +def manage_output(): + if os.environ['LOCAL_RANK'] != 0: + with contextlib.redirect_stderr(None): + with contextlib.redirect_stdout(None): + yield + else: + yield + + +class DistributedBatchesTests(unittest.TestCase): + + def setUp(self) -> None: + DistributedHelper.init_distributed(1234, use_cuda=False) + + @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + 'Distributed tests ignored') + def test_classification_batch(self): + dt: ClassificationBatch[Optional[Tuple[Tensor, Tensor]]] = \ + ClassificationBatch('mb', None) + + self.assertEqual(None, dt.local_value) + self.assertEqual(None, dt.value) + + batch = (torch.ones((8, 1, 28, 28)), + torch.full( + (8,), fill_value=DistributedHelper.rank, dtype=torch.long)) + + dt.value = batch + + distrib_val = dt.value + + self.assertEqual(2, len(distrib_val)) + self.assertSequenceEqual((8*DistributedHelper.world_size, 1, 28, 28), + distrib_val[0].shape) + for rank in range(DistributedHelper.world_size): + expect = torch.full((8,), + rank, + dtype=torch.long) + self.assertTrue(torch.equal(expect, + distrib_val[1][8*rank:8*(rank+1)])) + + @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + 'Distributed tests ignored') + def test_unsupervised_classification_batch(self): + dt: ClassificationBatch[Optional[Tuple[Tensor, Tensor]]] = \ + ClassificationBatch('mb', None) + + self.assertEqual(None, dt.local_value) + self.assertEqual(None, dt.value) + + batch = torch.ones((8, 1, 28, 28)) + + dt.value = batch + + distrib_val = dt.value + + self.assertIsInstance(distrib_val, Tensor) + self.assertSequenceEqual((8*DistributedHelper.world_size, 1, 28, 28), + distrib_val.shape) + + +if __name__ == "__main__": + with manage_output(): + verbosity = 1 + if DistributedHelper.rank > 0: + verbosity = 0 + unittest.main(verbosity=verbosity) diff --git a/tests/distributed/test_distributed_model.py b/tests/distributed/test_distributed_model.py new file mode 100644 index 000000000..95c0ac1cf --- /dev/null +++ b/tests/distributed/test_distributed_model.py @@ -0,0 +1,80 @@ +import contextlib +import os +import unittest + +from torch.nn.parallel import DistributedDataParallel + +from avalanche.distributed import 
DistributedHelper, DistributedModel +from avalanche.models import SimpleMLP + + +@contextlib.contextmanager +def manage_output(): + if os.environ['LOCAL_RANK'] != 0: + with contextlib.redirect_stderr(None): + with contextlib.redirect_stdout(None): + yield + else: + yield + + +class DistributedModelTests(unittest.TestCase): + + def setUp(self) -> None: + DistributedHelper.init_distributed(1234, use_cuda=False) + + @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + 'Distributed tests ignored') + def test_distributed_model(self): + dt: DistributedModel = DistributedModel() + model = SimpleMLP() + self.assertIsNone(dt.local_value) + self.assertIsNone(dt.value) + self.assertIsNone(dt.distributed_value) + + dt.model = model + + self.assertEqual(model, dt.local_value) + self.assertEqual(model, dt.value) + self.assertEqual(model, dt.distributed_value) + + wrapped = DistributedDataParallel(model) + + dt.model = wrapped + + self.assertEqual(model, dt.local_value) + self.assertNotIsInstance(dt.local_value, DistributedDataParallel) + + self.assertIsInstance(dt.value, DistributedDataParallel) + self.assertEqual(wrapped, dt.value) + self.assertEqual(wrapped, dt.distributed_value) + + dt.reset_distributed_value() + + self.assertEqual(model, dt.local_value) + self.assertEqual(model, dt.value) + self.assertEqual(model, dt.distributed_value) + + self.assertNotIsInstance(dt.value, DistributedDataParallel) + + dt.reset_distributed_value() + self.assertIsNotNone(dt.local_value) + + dt.value = wrapped + dt.distributed_model = None + + self.assertIsNotNone(dt.local_value) + + dt.value = None + + self.assertIsNone(dt.local_value) + self.assertIsNone(dt.distributed_value) + self.assertIsNone(dt.value) + + +if __name__ == "__main__": + with manage_output(): + verbosity = 1 + if DistributedHelper.rank > 0: + verbosity = 0 + unittest.main(verbosity=verbosity) diff --git a/tests/distributed/test_distributed_tensor.py b/tests/distributed/test_distributed_tensor.py new file mode 100644 index 000000000..3add6c554 --- /dev/null +++ b/tests/distributed/test_distributed_tensor.py @@ -0,0 +1,88 @@ +import contextlib +import os +import unittest + +import torch + +from avalanche.distributed import DistributedHelper +from avalanche.distributed.distributed_tensor import \ + DistributedMeanTensor + + +@contextlib.contextmanager +def manage_output(): + if os.environ['LOCAL_RANK'] != 0: + with contextlib.redirect_stderr(None): + with contextlib.redirect_stdout(None): + yield + else: + yield + + +class DistributedTensorTests(unittest.TestCase): + + def setUp(self) -> None: + DistributedHelper.init_distributed(1234, use_cuda=False) + + @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + 'Distributed tests ignored') + def test_one_element_tensor(self): + dt = DistributedMeanTensor('dt', torch.zeros((1,), dtype=torch.float32)) + + self.assertEqual(0.0, dt.local_value.float()) + self.assertEqual(0.0, dt.value.float()) + + i = DistributedHelper.rank + 1 + + dt.value = torch.full((1,), fill_value=i, + dtype=torch.float32) + + n = DistributedHelper.world_size + expected = n * (n + 1) / 2 + + self.assertEqual(i, float(dt.local_value)) + self.assertEqual(expected / n, float(dt.value)) + + @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + 'Distributed tests ignored') + def test_one_element_tensor_random(self): + dt = DistributedMeanTensor('dt', torch.zeros((1,), dtype=torch.float32)) + + rnd_value = torch.randint(0, 100000, (10,), dtype=torch.float32) + dt.value = rnd_value + + expected = 
torch.mean(rnd_value) + + self.assertTrue(torch.allclose(expected, torch.mean(dt.local_value))) + self.assertTrue(torch.allclose(expected, dt.value)) + + @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + 'Distributed tests ignored') + def test_unshaped_tensor(self): + dt = DistributedMeanTensor('dt', + torch.as_tensor(5, dtype=torch.float32)) + + self.assertEqual(5.0, dt.local_value.float()) + self.assertEqual(5.0, dt.value.float()) + self.assertEqual(0, len(dt.local_value.shape)) + self.assertEqual(0, len(dt.value.shape)) + + i = DistributedHelper.rank + 1 + + dt.value = torch.as_tensor(i, dtype=torch.float32) + + n = DistributedHelper.world_size + expected = n * (n + 1) / 2 + + self.assertEqual(i, float(dt.local_value)) + self.assertEqual(expected / n, float(dt.value)) + self.assertEqual(0, len(dt.local_value.shape)) + self.assertEqual(0, len(dt.value.shape)) + + +if __name__ == "__main__": + with manage_output(): + verbosity = 1 + if DistributedHelper.rank > 0: + verbosity = 0 + unittest.main(verbosity=verbosity) diff --git a/tests/run_dist_tests.py b/tests/run_dist_tests.py new file mode 100644 index 000000000..90d15fad0 --- /dev/null +++ b/tests/run_dist_tests.py @@ -0,0 +1,68 @@ +import signal +import sys +import unittest +from subprocess import Popen +from typing import Union, Set +from unittest import TestSuite, TestCase + + +def get_distributed_test_cases(suite: Union[TestCase, TestSuite]) -> Set[str]: + found_cases = set() + if isinstance(suite, TestSuite): + for x in suite: + found_cases.update(get_distributed_test_cases(x)) + + if isinstance(suite, TestCase): + case_id = suite.id() + + if case_id.startswith('distributed.') or \ + case_id.startswith('tests.distributed.'): + found_cases.add(case_id) + + if '_FailedTest' in case_id: + raise RuntimeError( + f'Errors encountered while listing test cases: {case_id}') + + return found_cases + + +def run_distributed_suites(): + cases_names = get_distributed_test_cases( + unittest.defaultTestLoader.discover('.')) # Don't change the path! + cases_names = list(sorted(cases_names)) + print('Running', len(cases_names), 'tests') + p = None + success = True + exited = False + + for case_name in cases_names: + if exited: + print('Exiting due to keyboard interrupt') + break + print('Running test:', case_name, flush=True) + try: + p = Popen( + ['python', '-m', 'torch.distributed.run', '--nnodes=1', + '--nproc_per_node=4', '-m', 'unittest', case_name], + stdout=sys.stdout, stderr=sys.stderr) + p.communicate() + except KeyboardInterrupt: + success = False + exited = True + p.send_signal(signal.SIGINT) + finally: + exit_code = p.wait() + print('Test completed with code', exit_code) + success = success and exit_code == 0 + p = None + + if success: + print('Tests completed successfully') + exit(0) + else: + print('Tests terminated with errors') + exit(1) + + +if __name__ == '__main__': + run_distributed_suites() From b0ce2e32fa6e174c93673fa65be5b7c656936deb Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Thu, 21 Apr 2022 17:51:26 +0200 Subject: [PATCH 02/16] Working strategy composition and example (naive, replay, scheduler). 
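
As a rough sketch (illustrative only, not a normative API description) of how
the pieces introduced so far compose from user code: each process initializes
the DistributedHelper once, obtains its own device, and then builds a standard
strategy; the SGD template wraps the model in DistributedDataParallel before
each experience and gathers mini-batches, outputs and loss across processes
when a global view is requested. Benchmark, model and hyper-parameters below
are placeholders; the full, tuned version (which also adapts the dataloaders
so that the number of iterations is the same for any number of processes) is
the examples/distributed_training.py script added by this commit.

# Illustrative sketch. Assumes the script is launched with one process per
# device, e.g.:
#   torchrun --standalone --nnodes=1 --nproc_per_node=<N> my_script.py
from torch.nn import CrossEntropyLoss
from torch.optim import SGD

from avalanche.benchmarks import SplitMNIST
from avalanche.distributed import DistributedHelper
from avalanche.models import SimpleMLP
from avalanche.training import Naive


def main():
    # One call per process: sets up the default process group and the seeds.
    DistributedHelper.init_distributed(random_seed=4321, use_cuda=True)
    device = DistributedHelper.make_device()  # cuda:<local_rank> or cpu

    benchmark = SplitMNIST(5)
    model = SimpleMLP(num_classes=benchmark.n_classes)
    optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9)

    # No distributed-specific wiring is needed here: the template takes
    # care of the DistributedDataParallel wrapping and of the cross-process
    # synchronization of mini-batches and loss.
    strategy = Naive(
        model, optimizer, CrossEntropyLoss(),
        train_mb_size=32, train_epochs=1, eval_mb_size=32, device=device)

    for experience in benchmark.train_stream:
        strategy.train(experience)
        strategy.eval(benchmark.test_stream)


if __name__ == '__main__':
    main()
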
--- .../benchmarks/utils/collate_functions.py | 69 ++++++ avalanche/benchmarks/utils/data_loader.py | 55 ++--- avalanche/distributed/distributed_batch.py | 66 +++-- avalanche/distributed/distributed_commons.py | 6 + avalanche/distributed/distributed_helper.py | 48 +++- avalanche/distributed/distributed_model.py | 16 +- avalanche/distributed/distributed_value.py | 10 +- avalanche/distributed/strategies/__init__.py | 3 + .../strategies/distributed_loss_strategy.py | 48 ++++ .../strategies/distributed_mbatch_strategy.py | 150 ++++++++++++ .../strategies/distributed_model_strategy.py | 45 ++++ avalanche/training/supervised/deep_slda.py | 25 +- .../training/supervised/joint_training.py | 2 +- avalanche/training/supervised/lamaml.py | 2 +- .../supervised/naive_object_detection.py | 30 +-- .../training/supervised/strategy_wrappers.py | 9 +- avalanche/training/templates/base.py | 5 +- avalanche/training/templates/base_sgd.py | 49 +++- .../training/templates/online_supervised.py | 2 +- avalanche/training/templates/supervised.py | 41 +++- examples/distributed_training.py | 231 ++++++++++++++++++ examples/run_distributed_training_example.sh | 19 ++ 22 files changed, 820 insertions(+), 111 deletions(-) create mode 100644 avalanche/benchmarks/utils/collate_functions.py create mode 100644 avalanche/distributed/strategies/__init__.py create mode 100644 avalanche/distributed/strategies/distributed_loss_strategy.py create mode 100644 avalanche/distributed/strategies/distributed_mbatch_strategy.py create mode 100644 avalanche/distributed/strategies/distributed_model_strategy.py create mode 100644 examples/distributed_training.py create mode 100755 examples/run_distributed_training_example.sh diff --git a/avalanche/benchmarks/utils/collate_functions.py b/avalanche/benchmarks/utils/collate_functions.py new file mode 100644 index 000000000..cc4345d91 --- /dev/null +++ b/avalanche/benchmarks/utils/collate_functions.py @@ -0,0 +1,69 @@ +################################################################################ +# Copyright (c) 2021 ContinualAI. # +# Copyrights licensed under the MIT License. # +# See the accompanying LICENSE file for terms. # +# # +# Date: 21-04-2022 # +# Author(s): Antonio Carta, Lorenzo Pellegrini # +# E-mail: contact@continualai.org # +# Website: avalanche.continualai.org # +################################################################################ + +import itertools +from collections import defaultdict + +import torch + + +def classification_collate_mbatches_fn(mbatches): + """Combines multiple mini-batches together. + + Concatenates each tensor in the mini-batches along dimension 0 (usually + this is the batch size). + + :param mbatches: sequence of mini-batches. + :return: a single mini-batch + """ + batch = [] + for i in range(len(mbatches[0])): + t = classification_single_values_collate_fn( + [el[i] for el in mbatches], i) + batch.append(t) + return batch + + +def classification_single_values_collate_fn(values_list, index): + return torch.cat(values_list, dim=0) + + +def detection_collate_fn(batch): + """ + Collate function used when loading detection datasets using a DataLoader. + """ + return tuple(zip(*batch)) + + +def detection_collate_mbatches_fn(mbatches): + """ + Collate function used when loading detection datasets using a DataLoader. 
+ """ + lists_dict = defaultdict(list) + for mb in mbatches: + for mb_elem_idx, mb_elem in enumerate(mb): + lists_dict[mb_elem_idx].append(mb_elem) + + lists = [] + for mb_elem_idx in range(max(lists_dict.keys()) + 1): + lists.append(list(itertools.chain.from_iterable( + lists_dict[mb_elem_idx] + ))) + + return lists + + +__all__ = [ + 'classification_collate_mbatches_fn', + 'classification_single_values_collate_fn', + 'detection_collate_fn', + 'detection_collate_mbatches_fn' +] diff --git a/avalanche/benchmarks/utils/data_loader.py b/avalanche/benchmarks/utils/data_loader.py index b5f276a2d..b1aa09d88 100644 --- a/avalanche/benchmarks/utils/data_loader.py +++ b/avalanche/benchmarks/utils/data_loader.py @@ -14,58 +14,31 @@ support for balanced dataloading between different tasks or balancing between the current data and the replay memory. """ -import itertools -from collections import defaultdict +import math from itertools import chain -from typing import Dict, Sequence, Union +from typing import Dict, Sequence, Union, Iterator, Optional import torch -from torch.utils.data import RandomSampler, DistributedSampler +from torch.utils.data import RandomSampler, DistributedSampler, Dataset from torch.utils.data.dataloader import DataLoader +from torch.utils.data.sampler import T_co from avalanche.benchmarks.utils import AvalancheDataset -from avalanche.distributed import DistributedHelper - - -def _default_collate_mbatches_fn(mbatches): - """Combines multiple mini-batches together. - - Concatenates each tensor in the mini-batches along dimension 0 (usually this - is the batch size). +from avalanche.benchmarks.utils.collate_functions import \ + classification_collate_mbatches_fn - :param mbatches: sequence of mini-batches. - :return: a single mini-batch - """ - batch = [] - for i in range(len(mbatches[0])): - t = torch.cat([el[i] for el in mbatches], dim=0) - batch.append(t) - return batch - - -def detection_collate_fn(batch): - """ - Collate function used when loading detection datasets using a DataLoader. - """ - return tuple(zip(*batch)) +from avalanche.benchmarks.utils.collate_functions import detection_collate_fn \ + as _detection_collate_fn +from avalanche.benchmarks.utils.collate_functions import \ + detection_collate_mbatches_fn as _detection_collate_mbatches_fn +from avalanche.distributed import DistributedHelper -def detection_collate_mbatches_fn(mbatches): - """ - Collate function used when loading detection datasets using a DataLoader. 
- """ - lists_dict = defaultdict(list) - for mb in mbatches: - for mb_elem_idx, mb_elem in enumerate(mb): - lists_dict[mb_elem_idx].append(mb_elem) +_default_collate_mbatches_fn = classification_collate_mbatches_fn - lists = [] - for mb_elem_idx in range(max(lists_dict.keys()) + 1): - lists.append(list(itertools.chain.from_iterable( - lists_dict[mb_elem_idx] - ))) +detection_collate_fn = _detection_collate_fn - return lists +detection_collate_mbatches_fn = _detection_collate_mbatches_fn class TaskBalancedDataLoader: diff --git a/avalanche/distributed/distributed_batch.py b/avalanche/distributed/distributed_batch.py index 0f9aef88c..e84e9bddb 100644 --- a/avalanche/distributed/distributed_batch.py +++ b/avalanche/distributed/distributed_batch.py @@ -1,5 +1,5 @@ from abc import abstractmethod, ABC -from typing import TypeVar, List, Optional +from typing import TypeVar, List, Optional, Callable, Any import torch from torch import Tensor @@ -40,19 +40,18 @@ class DistributedBatch(DistributedObject[LocalT, LocalT], ABC): This class can handle batches as either tuples of elements (as usual) or even single values. - The merge procedure of single elements must be implemented in child classes. + The merge procedure of tuples and single elements must be implemented in + child classes. By default, the tuples will be merged value by value. NOTE: In the future, this class may be replaced with a version in which only the accessed tuple elements are synchronized, instead of the whole batch. - The current design, in which child classes only have to implement + The current design, in which child classes have to implement `_merge_single_values`, allows for this change to happen without affecting child classes. """ def __init__(self, name: str, initial_local_value: LocalT): - super(DistributedBatch, self).__init__( - name, initial_local_value - ) + super().__init__(name, initial_local_value) self._value_is_tuple = False def _synchronize_distributed_value(self) -> LocalT: @@ -63,13 +62,13 @@ def _synchronize_distributed_value(self) -> LocalT: def _set_local_value(self, new_local_value): self._value_is_tuple = isinstance(new_local_value, (tuple, list)) - super(DistributedBatch, self)._set_local_value(new_local_value) + super()._set_local_value(new_local_value) def _merge_objects(self, objects: List[LocalT]) -> LocalT: if self._value_is_tuple: return self._merge_tuples(objects) else: - return self._merge_single_values(objects) + return self._merge_single_values(objects, 0) def _merge_tuples(self, tuples: List[LocalT]): merged_elements = [] @@ -80,27 +79,62 @@ def _merge_tuples(self, tuples: List[LocalT]): to_merge_elements.append(tp[element_idx]) merged_elements.append( - self._merge_single_values(to_merge_elements) + self._merge_single_values(to_merge_elements, element_idx) ) return tuple(merged_elements) @abstractmethod - def _merge_single_values(self, values: List): + def _merge_single_values(self, values: List, value_index: int): pass -class ClassificationBatch(DistributedBatch[LocalT]): +class CollateDistributedBatch(DistributedBatch[LocalT]): + """ + An implementation of :class:`DistributedBatch` in which the + `_merge_tuples` mechanism is given as a callable function. 
+ """ + + def __init__(self, name: str, initial_local_value: LocalT, + tuples_collate_fn: Optional[Callable[[List], LocalT]], + single_values_collate_fn: Callable[[Any, int], Any]): + super().__init__(name, initial_local_value) + self.tuples_collate_fn = tuples_collate_fn + self.single_values_collate_fn = single_values_collate_fn + + def _merge_tuples(self, tuples: List[LocalT]): + if self.tuples_collate_fn is None: + return super()._merge_tuples(tuples) + + return self.tuples_collate_fn(tuples) + + def _merge_single_values(self, values: List, value_index: int): + # if DistributedHelper.is_main_process: + # print('MERGING VALUES:') + # for elem in values: + # if isinstance(elem, Tensor): + # print(elem.device) + # print(elem.shape) + # else: + # print(type(elem)) + + return self.single_values_collate_fn(values, value_index) + + +def make_classification_distributed_batch(name: str) -> \ + CollateDistributedBatch[Optional[Tensor]]: """ - An implementation of :class:`DistributedBatch` that assumes that all values - are Tensors. + Return a :class:`CollateDistributedBatch` that assumes that all values + are Tensors. Values are obtained by concatenating these tensors. """ - def _merge_single_values(self, values: List[Tensor]): - return torch.cat(values) + return CollateDistributedBatch( + name, None, None, lambda x, y: torch.cat(x) + ) __all__ = [ 'DistributedObject', 'DistributedBatch', - 'ClassificationBatch' + 'CollateDistributedBatch', + 'make_classification_distributed_batch' ] diff --git a/avalanche/distributed/distributed_commons.py b/avalanche/distributed/distributed_commons.py index 9844adc4e..2ebc39b71 100644 --- a/avalanche/distributed/distributed_commons.py +++ b/avalanche/distributed/distributed_commons.py @@ -1,5 +1,6 @@ import torch +from avalanche.distributed import DistributedHelper from avalanche.distributed.distributed_tensor import DistributedMeanTensor @@ -16,6 +17,11 @@ class DistributedLoss(DistributedMeanTensor): def __init__(self, name: str = 'loss'): super(DistributedLoss, self).__init__(name, torch.zeros((1,))) + def _merge_tensors(self, tensors): + # with DistributedHelper.main_process_first(): + # print('Rank', DistributedHelper.rank, 'losses=', tensors, flush=True) + return super(DistributedLoss, self)._merge_tensors(tensors) + __all__ = [ 'DistributedLoss' diff --git a/avalanche/distributed/distributed_helper.py b/avalanche/distributed/distributed_helper.py index b33f5657b..e380fcfab 100644 --- a/avalanche/distributed/distributed_helper.py +++ b/avalanche/distributed/distributed_helper.py @@ -2,6 +2,7 @@ import random import warnings from collections import OrderedDict +from io import BytesIO from typing import Optional, List, Tuple import numpy as np @@ -14,6 +15,8 @@ from avalanche.benchmarks import GenericCLScenario +import pickle + class _Singleton(type): _instances = {} @@ -167,6 +170,8 @@ def init_distributed(self, random_seed, backend=None, use_cuda=True): # https://github.com/pytorch/pytorch/issues/6351 torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False + + self.make_device() # Force-init the default CUDA device (if any) return True def get_device_id(self): @@ -187,8 +192,8 @@ def make_device(self): device_id = 0 if self.use_cuda and torch.cuda.is_available() and device_id >= 0: - torch.cuda.set_device(device_id) ref_device = torch.device(f'cuda:{device_id}') + torch.cuda.set_device(ref_device) else: ref_device = torch.device('cpu') return ref_device @@ -501,8 +506,49 @@ def hash_tensor(tensor: Tensor) -> str: return 
hash_engine.hexdigest() +def hash_model(model: Module) -> str: + import hashlib + import io + + hash_engine = hashlib.sha256() + for name, param in model.named_parameters(): + hash_engine.update(name.encode()) + buff = io.BytesIO() + torch.save(param, buff) + buff.seek(0) + hash_engine.update(buff.read()) + return hash_engine.hexdigest() + + + DistributedHelper = _DistributedHelperCls() + +def fix(): + return lambda b: torch.load(BytesIO(b), + map_location=DistributedHelper.make_device()) + + +class MappedUnpickler(pickle.Unpickler): + # Based on: + # https://github.com/pytorch/pytorch/issues/16797#issuecomment-777059657 + + # In turn based on: + # https://github.com/pytorch/pytorch/issues/16797#issuecomment-633423219 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def find_class(self, module, name): + if module == 'torch.storage' and name == '_load_from_bytes': + return fix() + else: + return super().find_class(module, name) + + +torch.distributed.distributed_c10d._unpickler = MappedUnpickler + + __all__ = [ 'DistributedHelper', '_DistributedHelperCls' diff --git a/avalanche/distributed/distributed_model.py b/avalanche/distributed/distributed_model.py index 8a367f550..e304a3542 100644 --- a/avalanche/distributed/distributed_model.py +++ b/avalanche/distributed/distributed_model.py @@ -15,7 +15,8 @@ from typing_extensions import Type from avalanche.distributed import OptionalDistributedValue -from avalanche.distributed.distributed_value import DistributedT +from avalanche.distributed.distributed_value import DistributedT, \ + DistributedValue, SettableDistributedValue, SwitchableDistributedValue class DistributedModel(OptionalDistributedValue[Optional[Module]]): @@ -43,20 +44,23 @@ class DistributedModel(OptionalDistributedValue[Optional[Module]]): def __init__( self, + *, + name: str = 'model', initial_model: Module = None, distributed_model_class: Union[Type, Tuple[Type]] = - DistributedDataParallel): + DistributedDataParallel,): """ Creates a `ModelInstance`. + :param name: The name of this value. Defaults to 'model'. :param initial_model: The initial model to use. Defaults to None. :param distributed_model_class: The type(s) of the distributed model. Defaults to `DistributedDataParallel`. 
""" - super().__init__('model', initial_local_value=initial_model) + super().__init__(name, initial_local_value=initial_model) self.distributed_model_class = distributed_model_class - @OptionalDistributedValue.value.setter + @SwitchableDistributedValue.value.setter def value(self, new_value: Module): """ Sets the local or distributed model, depending on if the model is a @@ -70,13 +74,13 @@ def value(self, new_value: Module): else: self.local_value = new_value - @OptionalDistributedValue.local_value.getter + @DistributedValue.local_value.getter def local_value(self) -> Module: if self._distributed_value is not None: return self._distributed_value.module return self._local_value - @OptionalDistributedValue.distributed_value.setter + @SettableDistributedValue.distributed_value.setter def distributed_value(self, new_distributed_value: Module): if new_distributed_value is None: self.reset_distributed_value() diff --git a/avalanche/distributed/distributed_value.py b/avalanche/distributed/distributed_value.py index b12546d20..ab231fe2e 100644 --- a/avalanche/distributed/distributed_value.py +++ b/avalanche/distributed/distributed_value.py @@ -3,6 +3,7 @@ Tuple from abc import ABC, abstractmethod +from avalanche.distributed import DistributedHelper LocalT = TypeVar('LocalT') DistributedT = TypeVar('DistributedT') @@ -51,7 +52,6 @@ def __init__(self, name: str, initial_local_value: LocalT): representation. :param initial_local_value: The initial local value. """ - self.name: str = name self._local_value: LocalT = initial_local_value self._distributed_value: Optional[DistributedT] = None @@ -65,7 +65,6 @@ def value(self) -> DistributedT: When running a distributed training, this will be the value obtained by gathering and merging values coming from all processes. 
""" - return self._get_distributed_value() @value.setter @@ -102,6 +101,9 @@ def _set_local_value(self, new_local_value: LocalT): self._distributed_value_set = False def _get_distributed_value(self) -> DistributedT: + if not DistributedHelper.is_distributed: + return self._local_value + if not self._distributed_value_set: self._distributed_value = self._synchronize_distributed_value() self._distributed_value_set = True @@ -223,14 +225,14 @@ def use_local_value(self: SwitchableT, getter=True, setter=True) -> \ finally: self._behaviour_stack.pop() - @SettableDistributedValue.value.getter + @property def value(self) -> Union[LocalT, DistributedT]: if self._use_local_getter(): return self.local_value else: return self.distributed_value - @SettableDistributedValue.value.setter + @value.setter def value(self, new_value): if self._use_local_setter(): self.local_value = new_value diff --git a/avalanche/distributed/strategies/__init__.py b/avalanche/distributed/strategies/__init__.py new file mode 100644 index 000000000..8ce5532e3 --- /dev/null +++ b/avalanche/distributed/strategies/__init__.py @@ -0,0 +1,3 @@ +from .distributed_model_strategy import * +from .distributed_mbatch_strategy import * +from .distributed_loss_strategy import * diff --git a/avalanche/distributed/strategies/distributed_loss_strategy.py b/avalanche/distributed/strategies/distributed_loss_strategy.py new file mode 100644 index 000000000..824dff3f1 --- /dev/null +++ b/avalanche/distributed/strategies/distributed_loss_strategy.py @@ -0,0 +1,48 @@ +from torch import Tensor + +from avalanche.distributed import DistributedLoss + + +class DistributedLossStrategySupport: + + def __init__(self): + super().__init__() + self._loss = DistributedLoss() + + @property + def loss(self) -> Tensor: + """ The loss tensor. """ + return self._loss.value + + @loss.setter + def loss(self, value): + """ Sets the loss. """ + self._loss.value = value + + @property + def local_loss(self): + return self._loss.local_value + + @local_loss.setter + def local_loss(self, value): + self._loss.local_value = value + + @property + def distributed_loss(self): + return self._loss.distributed_value + + @distributed_loss.setter + def distributed_loss(self, value): + self._loss.distributed_value = value + + def reset_distributed_loss(self): + """ Resets the distributed value of the loss. 
""" + self._loss.reset_distributed_value() + + def use_local_loss(self, *args, **kwargs): + return self._loss.use_local_value(*args, **kwargs) + + +__all__ = [ + 'DistributedLossStrategySupport' +] diff --git a/avalanche/distributed/strategies/distributed_mbatch_strategy.py b/avalanche/distributed/strategies/distributed_mbatch_strategy.py new file mode 100644 index 000000000..8fdd18699 --- /dev/null +++ b/avalanche/distributed/strategies/distributed_mbatch_strategy.py @@ -0,0 +1,150 @@ +from typing import Callable, List, Any + +import torch + +from avalanche.benchmarks.utils.collate_functions import \ + classification_collate_mbatches_fn, classification_single_values_collate_fn +from avalanche.distributed import CollateDistributedBatch + + +class DistributedMiniBatchStrategySupport: + + def __init__(self): + super().__init__() + self._mbatch = CollateDistributedBatch( + 'mbatch', + None, + classification_collate_mbatches_fn, + classification_single_values_collate_fn + ) + + self._mb_output = CollateDistributedBatch( + 'mb_output', + None, + classification_collate_mbatches_fn, + classification_single_values_collate_fn + ) + + # --- START INPUT MINIBATCH PROPERTY --- + @property + def mbatch(self): + """ Current mini-batch. """ + return self._mbatch.value + + @mbatch.setter + def mbatch(self, value): + """ Sets the current mini-batch. """ + self._mbatch.value = value + + @property + def local_mbatch(self): + """ The current local mini-batch. """ + return self._mbatch.local_value + + @local_mbatch.setter + def local_mbatch(self, value): + """ Sets the current local mini-batch. """ + self._mbatch.local_value = value + + @property + def distributed_mbatch(self): + """ The current distributed mini-batch. """ + return self._mbatch.distributed_value + + @distributed_mbatch.setter + def distributed_mbatch(self, value): + """ Sets the current distributed mini-batch. """ + self._mbatch.distributed_value = value + + def reset_distributed_mbatch(self): + """ Resets the distributed value of the mini-batch. """ + self._mbatch.reset_distributed_value() + # --- END INPUT MINIBATCH PROPERTY --- + + # --- START OUTPUT MINIBATCH PROPERTY --- + @property + def mb_output(self): + """ Model's output computed on the current mini-batch. """ + return self._mb_output.value + + @mb_output.setter + def mb_output(self, value): + """ Sets the model's output computed on the current mini-batch. """ + self._mb_output.value = value + + @property + def local_mb_output(self): + """ The current local output. """ + return self._mb_output.local_value + + @local_mb_output.setter + def local_mb_output(self, value): + """ Sets the current local output. """ + self._mb_output.local_value = value + + @property + def distributed_mb_output(self): + """ The current distributed output. """ + return self._mb_output.local_value + + @distributed_mb_output.setter + def distributed_mb_output(self, value): + """ Sets the current distributed output. """ + self._mb_output.distributed_value = value + + def reset_distributed_mb_output(self): + """ Resets the distributed value of the output. 
""" + self._mb_output.reset_distributed_value() + # --- END OUTPUT MINIBATCH PROPERTY --- + + # --- START COLLATE FUNCTIONS (INPUT MB) --- + @property + def input_batch_collate_fn(self): + return self._mbatch.tuples_collate_fn + + @input_batch_collate_fn.setter + def input_batch_collate_fn(self, batch_collate_fn: Callable[[List], Any]): + self._mbatch.tuples_collate_fn = batch_collate_fn + + @property + def input_batch_single_values_collate_fn(self): + return self._mbatch.single_values_collate_fn + + @input_batch_single_values_collate_fn.setter + def input_batch_single_values_collate_fn( + self, single_values_collate_fn: Callable[[List], Any]): + self._mbatch.single_values_collate_fn = single_values_collate_fn + + # --- END COLLATE FUNCTIONS (INPUT MB) --- + + # --- START COLLATE FUNCTIONS (OUTPUT MB) --- + @property + def output_batch_collate_fn(self): + return self._mb_output.tuples_collate_fn + + @output_batch_collate_fn.setter + def output_batch_collate_fn(self, batch_collate_fn: Callable[[List], Any]): + self._mb_output.tuples_collate_fn = batch_collate_fn + + @property + def output_batch_single_values_collate_fn(self): + return self._mb_output.single_values_collate_fn + + @output_batch_single_values_collate_fn.setter + def output_batch_single_values_collate_fn( + self, single_values_collate_fn: Callable[[List], Any]): + self._mb_output.single_values_collate_fn = single_values_collate_fn + # --- END COLLATE FUNCTIONS (OUTPUT MB) --- + + # --- START LOCAL CONTEXT MANAGERS --- + def use_local_input_batch(self, *args, **kwargs): + return self._mbatch.use_local_value(*args, **kwargs) + + def use_local_output_batch(self, *args, **kwargs): + return self._mb_output.use_local_value(*args, **kwargs) + # --- END LOCAL CONTEXT MANAGERS --- + + +__all__ = [ + 'DistributedMiniBatchStrategySupport' +] diff --git a/avalanche/distributed/strategies/distributed_model_strategy.py b/avalanche/distributed/strategies/distributed_model_strategy.py new file mode 100644 index 000000000..c97559caa --- /dev/null +++ b/avalanche/distributed/strategies/distributed_model_strategy.py @@ -0,0 +1,45 @@ +from torch.nn import Module + +from avalanche.distributed import DistributedModel + + +class DistributedModelStrategySupport: + + def __init__(self): + super().__init__() + self._model = DistributedModel() + + @property + def model(self) -> Module: + """ PyTorch model. """ + # This will return the local model if training locally + return self._model.value + + @model.setter + def model(self, value): + """ Sets the PyTorch model. 
""" + self._model.value = value + + @property + def local_model(self): + return self._model.local_model + + @local_model.setter + def local_model(self, value): + self._model.local_model = value + + @property + def distributed_model(self): + return self._model.distributed_model + + @distributed_model.setter + def distributed_model(self, value): + self._model.distributed_model = value + + def use_local_model(self, *args, **kwargs): + return self._model.use_local_value(*args, **kwargs) + + +__all__ = [ + 'DistributedModelStrategySupport' +] diff --git a/avalanche/training/supervised/deep_slda.py b/avalanche/training/supervised/deep_slda.py index d120869ea..eb0ce0fb2 100644 --- a/avalanche/training/supervised/deep_slda.py +++ b/avalanche/training/supervised/deep_slda.py @@ -101,16 +101,17 @@ def __init__( def forward(self, return_features=False): """Compute the model's output given the current mini-batch.""" - self.model.eval() - if isinstance(self.model, MultiTaskModule): - feat = self.model(self.mb_x, self.mb_task_id) - else: # no task labels - feat = self.model(self.mb_x) - out = self.predict(feat) - if return_features: - return out, feat - else: - return out + with self.use_local_input_batch(): + self.model.eval() + if isinstance(self.model, MultiTaskModule): + feat = self.model(self.mb_x, self.mb_task_id) + else: # no task labels + feat = self.model(self.mb_x) + out = self.predict(feat) + if return_features: + return out, feat + else: + return out def training_epoch(self, **kwargs): """ @@ -119,7 +120,7 @@ def training_epoch(self, **kwargs): :return: """ for _, self.mbatch in enumerate(self.dataloader): - self._unpack_minibatch() + self.unpack_minibatch() self._before_training_iteration(**kwargs) self.loss = 0 @@ -131,7 +132,7 @@ def training_epoch(self, **kwargs): self._after_forward(**kwargs) # Loss & Backward - self.loss += self.criterion() + self.loss = self.criterion() # Optimization step self._before_update(**kwargs) diff --git a/avalanche/training/supervised/joint_training.py b/avalanche/training/supervised/joint_training.py index 3e164d86c..5ce8a4c7a 100644 --- a/avalanche/training/supervised/joint_training.py +++ b/avalanche/training/supervised/joint_training.py @@ -154,7 +154,7 @@ def train_dataset_adaptation(self, **kwargs): self.adapted_dataset = cat_data self.adapted_dataset = self.adapted_dataset.train() - def model_adaptation(self, model=None): + def _model_adaptation(self, model=None): """Adapts strategy's model for all experiences.""" if model is None: model = self.model diff --git a/avalanche/training/supervised/lamaml.py b/avalanche/training/supervised/lamaml.py index 96f0c9cb5..19e25e07a 100644 --- a/avalanche/training/supervised/lamaml.py +++ b/avalanche/training/supervised/lamaml.py @@ -140,7 +140,7 @@ def training_epoch(self, **kwargs): if self._stop_training: break - self._unpack_minibatch() + self.unpack_minibatch() self._before_training_iteration(**kwargs) self.loss = 0 diff --git a/avalanche/training/supervised/naive_object_detection.py b/avalanche/training/supervised/naive_object_detection.py index 72b3c8fc8..bd87873e2 100644 --- a/avalanche/training/supervised/naive_object_detection.py +++ b/avalanche/training/supervised/naive_object_detection.py @@ -183,19 +183,21 @@ def criterion(self): Beware that the loss can only be obtained for the training phase as no loss dictionary is returned when evaluating. """ - if self.is_training: - return sum( - loss for loss in self.detection_loss_dict.values()) - else: - # eval does not compute the loss directly. 
- # Metrics will use self.mb_output and self.detection_predictions - # to compute AP, AR, ... - self.detection_predictions = \ - {target["image_id"].item(): output - for target, output in zip(self.mb_y, self.mb_output)} - return torch.zeros((1,)) - - def forward(self): + with self.local_mb_output(): + with self.local_mbatch(): + if self.is_training: + return sum( + loss for loss in self.detection_loss_dict.values()) + else: + # eval does not compute the loss directly. + # Metrics will use self.mb_output and + # self.detection_predictions to compute AP, AR, ... + self.detection_predictions = \ + {target["image_id"].item(): output + for target, output in zip(self.mb_y, self.mb_output)} + return torch.zeros((1,)) + + def _forward(self): """ Compute the model's output given the current mini-batch. @@ -221,7 +223,7 @@ def _unpack_minibatch(self): self.mbatch[0] = images self.mbatch[1] = targets - def backward(self): + def _backward(self): if self.scaler is not None: self.scaler.scale(self.loss).backward() else: diff --git a/avalanche/training/supervised/strategy_wrappers.py b/avalanche/training/supervised/strategy_wrappers.py index 73a977430..03c1dba64 100644 --- a/avalanche/training/supervised/strategy_wrappers.py +++ b/avalanche/training/supervised/strategy_wrappers.py @@ -449,11 +449,12 @@ def __init__( :param **base_kwargs: any additional :class:`~avalanche.training.BaseTemplate` constructor arguments. """ + self._vae_criterion = criterion super().__init__( model, optimizer, - criterion, + self._vae_criterion_adapter, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -464,10 +465,10 @@ def __init__( **base_kwargs ) - def criterion(self): - """Adapt input to criterion as needed to compute reconstruction loss + def _vae_criterion_adapter(self, *ignored): + """Adapt input to criterion as needed to compute reconstruction loss and KL divergence. See default criterion VAELoss.""" - return self._criterion(self.mb_x, self.mb_output) + return self._vae_criterion(self.mb_x, self.mb_output) class GSS_greedy(SupervisedTemplate): diff --git a/avalanche/training/templates/base.py b/avalanche/training/templates/base.py index 06f8ddf89..3cf9834b1 100644 --- a/avalanche/training/templates/base.py +++ b/avalanche/training/templates/base.py @@ -6,10 +6,11 @@ from avalanche.benchmarks import Experience from avalanche.core import BasePlugin +from avalanche.distributed.strategies import DistributedModelStrategySupport from avalanche.training.utils import trigger_plugins -class BaseTemplate: +class BaseTemplate(DistributedModelStrategySupport): """Base class for continual learning skeletons. **Training loop** @@ -37,6 +38,8 @@ def __init__( ): """Init.""" + super(BaseTemplate, self).__init__() + self.model: Module = model """ PyTorch model. 
""" diff --git a/avalanche/training/templates/base_sgd.py b/avalanche/training/templates/base_sgd.py index 604c9d70c..127c9199e 100644 --- a/avalanche/training/templates/base_sgd.py +++ b/avalanche/training/templates/base_sgd.py @@ -3,8 +3,12 @@ import torch from torch.nn import Module from torch.optim import Optimizer +from typing_extensions import final from avalanche.benchmarks import Experience +from avalanche.distributed import DistributedHelper +from avalanche.distributed.strategies import \ + DistributedMiniBatchStrategySupport, DistributedLossStrategySupport from avalanche.training.plugins import SupervisedPlugin, EvaluationPlugin from avalanche.training.plugins.clock import Clock from avalanche.training.plugins.evaluation import default_evaluator @@ -18,7 +22,8 @@ from avalanche.training.templates.supervised import SupervisedTemplate -class BaseSGDTemplate(BaseTemplate): +class BaseSGDTemplate(BaseTemplate, DistributedMiniBatchStrategySupport, + DistributedLossStrategySupport): """Base class for continual learning skeletons. **Training loop** @@ -148,6 +153,7 @@ def _before_training_exp(self, **kwargs): self.make_train_dataloader(**kwargs) # Model Adaptation (e.g. freeze/add new units) self.model = self.model_adaptation() + self.model = self.wrap_distributed_model(self.model) self.make_optimizer() super()._before_training_exp(**kwargs) @@ -179,11 +185,18 @@ def _before_eval_exp(self, **kwargs): self.make_eval_dataloader(**kwargs) # Model Adaptation (e.g. freeze/add new units) self.model = self.model_adaptation() + self.model = self.wrap_distributed_model(self.model) super()._before_eval_exp(**kwargs) def _eval_exp(self, **kwargs): self.eval_epoch(**kwargs) + def wrap_distributed_model(self, model): + """ + Prepare a model for distributed training/eval. + """ + return DistributedHelper.wrap_model(model) + def make_train_dataloader(self, **kwargs): """Assign dataloader to self.dataloader.""" raise NotImplementedError() @@ -222,11 +235,10 @@ def training_epoch(self, **kwargs): if self._stop_training: break - self._unpack_minibatch() + self.unpack_minibatch() self._before_training_iteration(**kwargs) self.optimizer.zero_grad() - self.loss = 0 # Forward self._before_forward(**kwargs) @@ -234,7 +246,7 @@ def training_epoch(self, **kwargs): self._after_forward(**kwargs) # Loss & Backward - self.loss += self.criterion() + self.loss = self.criterion() self._before_backward(**kwargs) self.backward() @@ -247,8 +259,20 @@ def training_epoch(self, **kwargs): self._after_training_iteration(**kwargs) + @final def backward(self): - """Run the backward pass.""" + """ + Run the backward pass. + + This method should not be overridden by child classes. + Consider overriding :meth:`_backward` instead. + """ + with self.use_local_loss(): + self._backward() + self.reset_distributed_loss() + + def _backward(self): + """ Implementation of the backward pass. """ self.loss.backward() def optimizer_step(self): @@ -258,7 +282,7 @@ def optimizer_step(self): def eval_epoch(self, **kwargs): """Evaluation loop over the current `self.dataloader`.""" for self.mbatch in self.dataloader: - self._unpack_minibatch() + self.unpack_minibatch() self._before_eval_iteration(**kwargs) self._before_eval_forward(**kwargs) @@ -268,8 +292,21 @@ def eval_epoch(self, **kwargs): self._after_eval_iteration(**kwargs) + @final + def unpack_minibatch(self): + """ + Move minibatch elements to device. + + This method should not be overridden by child classes. + Consider overriding :meth:`_unpack_minibatch` instead. 
+ """ + with self.use_local_input_batch(): + self._unpack_minibatch() + self.reset_distributed_mbatch() + def _unpack_minibatch(self): """Move to device""" + for i in range(len(self.mbatch)): self.mbatch[i] = self.mbatch[i].to(self.device) diff --git a/avalanche/training/templates/online_supervised.py b/avalanche/training/templates/online_supervised.py index 1f072cd32..d28d2ebd0 100644 --- a/avalanche/training/templates/online_supervised.py +++ b/avalanche/training/templates/online_supervised.py @@ -194,7 +194,7 @@ def _train_exp( self._after_training_epoch(**kwargs) self._after_training_exp(**kwargs) - def model_adaptation(self, model=None): + def _model_adaptation(self, model=None): """Adapts the model to the data from the current (full) experience. diff --git a/avalanche/training/templates/supervised.py b/avalanche/training/templates/supervised.py index c43e1bf56..e4ad7ddce 100644 --- a/avalanche/training/templates/supervised.py +++ b/avalanche/training/templates/supervised.py @@ -4,9 +4,12 @@ import torch from torch.nn import Module, CrossEntropyLoss from torch.optim import Optimizer -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, DistributedSampler +from typing_extensions import final from avalanche.benchmarks.utils.data_loader import TaskBalancedDataLoader +from avalanche.distributed import DistributedHelper +from avalanche.distributed.distributed_helper import hash_tensor from avalanche.models import avalanche_forward from avalanche.models.dynamic_optimizers import reset_optimizer from avalanche.models.utils import avalanche_model_adaptation @@ -143,9 +146,12 @@ def mb_task_id(self): assert len(self.mbatch) >= 3 return self.mbatch[-1] + @final def criterion(self): """Loss function.""" - return self._criterion(self.mb_output, self.mb_y) + with self.use_local_output_batch(): + with self.use_local_input_batch(): + return self._criterion(self.mb_output, self.mb_y) def _before_training_exp(self, **kwargs): """Setup to train on a single experience.""" @@ -214,6 +220,7 @@ def make_train_dataloader( batch_size=self.train_mb_size, shuffle=shuffle, pin_memory=pin_memory, + drop_last=True, **other_dataloader_args ) @@ -235,23 +242,51 @@ def make_eval_dataloader( if parse_version(torch.__version__) >= parse_version('1.7.0'): other_dataloader_args['persistent_workers'] = persistent_workers + sampler = None + if DistributedHelper.is_distributed: + sampler = DistributedSampler( + self.adapted_dataset, shuffle=False, drop_last=False) + self.dataloader = DataLoader( self.adapted_dataset, num_workers=num_workers, batch_size=self.eval_mb_size, pin_memory=pin_memory, + sampler=sampler, + shuffle=False, + drop_last=False, **other_dataloader_args ) + @final def forward(self): - """Compute the model's output given the current mini-batch.""" + """ + Compute the model's output given the current mini-batch. + + This method should not be overridden by child classes. + Consider overriding :meth:`_forward` instead. + """ + with self.use_local_input_batch(): + return self._forward() + + def _forward(self): + """Implementation of the forward pass.""" + # print('mbx hash:', hash_tensor(self.distributed_mbatch[0])) return avalanche_forward(self.model, self.mb_x, self.mb_task_id) + @final def model_adaptation(self, model=None): """Adapts the model to the current data. Calls the :class:`~avalanche.models.DynamicModule`s adaptation. + + This method should not be overridden by child classes. + Consider overriding :meth:`_model_adaptation` instead. 
""" + with self.use_local_model(): + return self._model_adaptation(model=model) + + def _model_adaptation(self, model=None): if model is None: model = self.model avalanche_model_adaptation(model, self.experience.dataset) diff --git a/examples/distributed_training.py b/examples/distributed_training.py new file mode 100644 index 000000000..d0c6d8495 --- /dev/null +++ b/examples/distributed_training.py @@ -0,0 +1,231 @@ +################################################################################ +# Copyright (c) 2021 ContinualAI. # +# Copyrights licensed under the MIT License. # +# See the accompanying LICENSE file for terms. # +# # +# Date: 28-12-2021 # +# Author(s): Lorenzo Pellegrini # +# E-mail: contact@continualai.org # +# Website: avalanche.continualai.org # +################################################################################ + +""" +This is a simple example on how to enable distributed training in Avalanche. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import sys +import time + +import torch +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.optim.lr_scheduler import ReduceLROnPlateau +from torch.utils.data import DistributedSampler, DataLoader +from torchvision import transforms +from torchvision.transforms import ToTensor, RandomCrop + +from avalanche.benchmarks import SplitMNIST +from avalanche.benchmarks.utils import AvalancheSubset +from avalanche.benchmarks.utils.data_loader import TaskBalancedDataLoader +from avalanche.distributed import DistributedHelper +from avalanche.distributed.distributed_helper import hash_benchmark, hash_model +from avalanche.evaluation.metrics import accuracy_metrics, loss_metrics +from avalanche.logging import TensorboardLogger +from avalanche.models import SimpleMLP +from avalanche.training import Naive, ClassBalancedBuffer +from avalanche.training.plugins import EvaluationPlugin, ReplayPlugin, \ + LRSchedulerPlugin + + +OVERALL_MB_SIZE = 192 + + +class AdaptedNaive(Naive): + + def make_train_dataloader( + self, num_workers=0, shuffle=True, pin_memory=True, + persistent_workers=False, **kwargs + ): + dataset_len = len(self.adapted_dataset) + while (dataset_len % OVERALL_MB_SIZE) > 0: + # Note: when using OVERALL_MB_SIZE == 192, + # means that with N_GPUS = 1, 2, 3, 4, 6, 8 (any factor of 192) + # you will get the same number of iterations + # (due to how DistributedSampler works, which is slightly different + # from the default sampler) + dataset_len -= 1 + + other_dataloader_args = {} + other_dataloader_args['persistent_workers'] = persistent_workers + + self.dataloader = TaskBalancedDataLoader( + AvalancheSubset( + self.adapted_dataset, indices=list(range(dataset_len))), + oversample_small_groups=True, + num_workers=num_workers, + batch_size=self.train_mb_size, + shuffle=shuffle, + pin_memory=pin_memory, + drop_last=True, + **other_dataloader_args + ) + + def make_eval_dataloader( + self, num_workers=0, pin_memory=True, persistent_workers=False, + **kwargs): + dataset_len = len(self.adapted_dataset) + while (dataset_len % OVERALL_MB_SIZE) > 0: + # Note: when using OVERALL_MB_SIZE == 192, + # means that with N_GPUS = 1, 2, 3, 4, 6, 8 (any factor of 192) + # you will get the same number of iterations + # (due to how DistributedSampler works, which is slightly different + # from the default sampler) + dataset_len -= 1 + + other_dataloader_args = {} + other_dataloader_args['persistent_workers'] = 
persistent_workers + + d_set = AvalancheSubset( + self.adapted_dataset, indices=list(range(dataset_len))) + sampler = None + if DistributedHelper.is_distributed: + sampler = DistributedSampler( + d_set, shuffle=False, drop_last=False) + + self.dataloader = DataLoader( + d_set, + num_workers=num_workers, + batch_size=self.eval_mb_size, + pin_memory=pin_memory, + sampler=sampler, + shuffle=False, + drop_last=False, + **other_dataloader_args + ) + + +def main(args): + DistributedHelper.init_distributed(random_seed=4321, use_cuda=args.use_cuda) + rank = DistributedHelper.rank + world_size = DistributedHelper.world_size + device = DistributedHelper.make_device() + print(f'Current process rank: {rank}/{world_size}, ' + f'will use device: {device}') + + if not DistributedHelper.is_main_process: + sys.stdout = open(os.devnull, 'w') + sys.stderr = open(os.devnull, 'w') + + # --- TRANSFORMATIONS + train_transform = transforms.Compose([ + # RandomCrop(28, padding=4), + ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + test_transform = transforms.Compose([ + ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + # --------- + + # --- SCENARIO CREATION + scenario = SplitMNIST( + 5, + train_transform=train_transform, + eval_transform=test_transform) + # --------- + + # MODEL CREATION + model = SimpleMLP(num_classes=scenario.n_classes) + + optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9) + + # CREATE THE STRATEGY INSTANCE (NAIVE) + + loggers = [] + if DistributedHelper.is_main_process: + distr_str = 'single_process' + approach_str = 'naive' + sched_str = 'unsched' + cuda_str = 'cpu' + + if DistributedHelper.is_distributed: + distr_str = 'distributed' + + if args.use_replay: + approach_str = 'replay' + + if args.use_scheduler: + sched_str = 'plateau' + + if args.use_cuda: + cuda_str = 'cuda' + + loggers.append(TensorboardLogger( + tb_log_dir=f'./tb_data/{distr_str}_{approach_str}_{sched_str}_' + f'{cuda_str}{args.exp_postfix}')) + + my_evaluator = EvaluationPlugin( + accuracy_metrics(epoch=True, experience=True, stream=True), + loss_metrics(epoch=True, experience=True, stream=True), + loggers=loggers, + suppress_warnings=True + ) + + # Adapt the minibatch size + mb_size = OVERALL_MB_SIZE // DistributedHelper.world_size + + plugins = [] + if args.use_replay: + class_balanced_policy = ClassBalancedBuffer(1500) + plugins.append(ReplayPlugin( + 1500, + storage_policy=class_balanced_policy)) + + if args.use_scheduler: + plugins.append( + LRSchedulerPlugin( + ReduceLROnPlateau(optimizer), step_granularity='iteration', + metric='train_loss' + ) + ) + + cl_strategy = AdaptedNaive( + model, optimizer, + CrossEntropyLoss(), train_mb_size=mb_size, train_epochs=4, + eval_mb_size=mb_size, plugins=plugins, + device=device, evaluator=my_evaluator) + + start_time = time.time() + + # TRAINING LOOP + print('Starting experiment...') + results = [] + for experience in scenario.train_stream: + print("Start of experience: ", experience.current_experience) + print("Current Classes: ", experience.classes_in_this_experience) + + cl_strategy.train(experience, num_workers=4) + + print('Training completed') + + print('Computing accuracy on the whole test set') + results.append(cl_strategy.eval(scenario.test_stream, num_workers=4)) + + print('Training+eval took', time.time() - start_time) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--use_cuda', action='store_true') + parser.add_argument('--use_replay', action='store_true') + 
parser.add_argument('--use_scheduler', action='store_true') + parser.add_argument('--exp_postfix', default='') + main(parser.parse_args()) diff --git a/examples/run_distributed_training_example.sh b/examples/run_distributed_training_example.sh new file mode 100755 index 000000000..4a0f2d0d6 --- /dev/null +++ b/examples/run_distributed_training_example.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +echo "This will run single-GPU and multi-GPU training for naive and replay" +echo "Run me from the avalanche repo root as 'bash examples/run_distributed_training_example.sh'" +eval "$(conda shell.bash hook)" +conda activate avalanche-dev-env +set -euo pipefail +ngpus=$(nvidia-smi -L | wc -l) +export PYTHONPATH="${PYTHONPATH-}:${PWD}" +CUDA_VISIBLE_DEVICES=-1 torchrun --standalone --nnodes=1 --nproc_per_node=$ngpus examples/distributed_training.py +CUDA_VISIBLE_DEVICES=-1 python examples/distributed_training.py +CUDA_VISIBLE_DEVICES=-1 torchrun --standalone --nnodes=1 --nproc_per_node=$ngpus examples/distributed_training.py --use_replay +CUDA_VISIBLE_DEVICES=-1 python examples/distributed_training.py --use_replay +CUDA_VISIBLE_DEVICES=-1 torchrun --standalone --nnodes=1 --nproc_per_node=$ngpus examples/distributed_training.py --use_replay --use_scheduler +CUDA_VISIBLE_DEVICES=-1 python examples/distributed_training.py --use_replay --use_scheduler + +#torchrun --standalone --nnodes=1 --nproc_per_node=$ngpus examples/distributed_training.py --use_cuda +#python examples/distributed_training.py --use_cuda +#torchrun --standalone --nnodes=1 --nproc_per_node=$ngpus examples/distributed_training.py --use_cuda --use_replay +#python examples/distributed_training.py --use_cuda --use_replay From 976e5c5403b893f7980559aab64c1ad86fbe83b0 Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Fri, 22 Apr 2022 12:12:41 +0200 Subject: [PATCH 03/16] Fixed pep8 issues. 
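
For clarity, the mini-batch size and dataset-trimming convention used by
examples/distributed_training.py above can be written out explicitly. The
snippet below only illustrates that arithmetic: it reuses the example's
OVERALL_MB_SIZE = 192 constant, while the 12000-sample figure is just a
stand-in for the size of one SplitMNIST experience.

OVERALL_MB_SIZE = 192  # global mini-batch size, kept fixed across runs


def per_rank_batch_size(world_size: int) -> int:
    # Each process consumes 1/world_size of the global mini-batch, so the
    # effective batch size is unchanged for any world size that divides 192
    # (1, 2, 3, 4, 6, 8 processes, ...).
    assert OVERALL_MB_SIZE % world_size == 0
    return OVERALL_MB_SIZE // world_size


def trimmed_length(dataset_len: int) -> int:
    # Drop the tail of the dataset so that every configuration performs the
    # same number of iterations per epoch (DistributedSampler handles the
    # tail differently from the default sampler, padding or splitting it
    # depending on the world size).
    return dataset_len - (dataset_len % OVERALL_MB_SIZE)


# Example: a 12000-sample experience on 4 processes.
#   per-rank batch size: 192 // 4 = 48
#   trimmed length:      12000 - (12000 % 192) = 11904
#   iterations/epoch:    11904 / 192 = 62, independent of the GPU count.
print(per_rank_batch_size(4), trimmed_length(12000))
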
--- avalanche/distributed/distributed_commons.py | 3 --- avalanche/distributed/distributed_helper.py | 1 - examples/distributed_training.py | 2 -- 3 files changed, 6 deletions(-) diff --git a/avalanche/distributed/distributed_commons.py b/avalanche/distributed/distributed_commons.py index 2ebc39b71..ce5e15ae6 100644 --- a/avalanche/distributed/distributed_commons.py +++ b/avalanche/distributed/distributed_commons.py @@ -1,6 +1,5 @@ import torch -from avalanche.distributed import DistributedHelper from avalanche.distributed.distributed_tensor import DistributedMeanTensor @@ -18,8 +17,6 @@ def __init__(self, name: str = 'loss'): super(DistributedLoss, self).__init__(name, torch.zeros((1,))) def _merge_tensors(self, tensors): - # with DistributedHelper.main_process_first(): - # print('Rank', DistributedHelper.rank, 'losses=', tensors, flush=True) return super(DistributedLoss, self)._merge_tensors(tensors) diff --git a/avalanche/distributed/distributed_helper.py b/avalanche/distributed/distributed_helper.py index e380fcfab..bb66e7e74 100644 --- a/avalanche/distributed/distributed_helper.py +++ b/avalanche/distributed/distributed_helper.py @@ -520,7 +520,6 @@ def hash_model(model: Module) -> str: return hash_engine.hexdigest() - DistributedHelper = _DistributedHelperCls() diff --git a/examples/distributed_training.py b/examples/distributed_training.py index d0c6d8495..a38fce945 100644 --- a/examples/distributed_training.py +++ b/examples/distributed_training.py @@ -34,7 +34,6 @@ from avalanche.benchmarks.utils import AvalancheSubset from avalanche.benchmarks.utils.data_loader import TaskBalancedDataLoader from avalanche.distributed import DistributedHelper -from avalanche.distributed.distributed_helper import hash_benchmark, hash_model from avalanche.evaluation.metrics import accuracy_metrics, loss_metrics from avalanche.logging import TensorboardLogger from avalanche.models import SimpleMLP @@ -221,7 +220,6 @@ def main(args): print('Training+eval took', time.time() - start_time) - if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--use_cuda', action='store_true') From efb7f8626a5dc3a91db6ddb2e71d9cf4dca65452 Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Fri, 22 Apr 2022 12:19:48 +0200 Subject: [PATCH 04/16] Fixed typing error. Removed debug code. 
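
For reference, the reduction semantics that DistributedLoss inherits from
DistributedMeanTensor (and that the distributed tests in this series
exercise) can be summarized with the short snippet below. It is illustrative
only, not part of the patch, and assumes one process per rank with
DistributedHelper.init_distributed already called.

import torch

from avalanche.distributed import DistributedHelper, DistributedLoss

loss = DistributedLoss()  # the name defaults to 'loss'
rank = DistributedHelper.rank
world_size = DistributedHelper.world_size

# Each process stores its own scalar loss...
loss.value = torch.full((1,), float(rank + 1))

# ...while reading .value gathers the per-rank values and averages them:
# with N processes the result is (1 + 2 + ... + N) / N.
expected = (world_size * (world_size + 1) / 2) / world_size
assert float(loss.value) == expected

# The un-synchronized, per-process value remains accessible.
assert float(loss.local_value) == rank + 1

In this way metrics evaluated on the main process observe the global
(averaged) loss, while each worker still back-propagates only its local one
(the backward pass runs under use_local_loss()).
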
--- avalanche/distributed/distributed_batch.py | 13 +------------ tests/distributed/test_distributed_batch.py | 10 ++++------ 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/avalanche/distributed/distributed_batch.py b/avalanche/distributed/distributed_batch.py index e84e9bddb..206fa286a 100644 --- a/avalanche/distributed/distributed_batch.py +++ b/avalanche/distributed/distributed_batch.py @@ -1,5 +1,5 @@ from abc import abstractmethod, ABC -from typing import TypeVar, List, Optional, Callable, Any +from typing import TypeVar, List, Optional, Callable, Any, Tuple import torch from torch import Tensor @@ -7,8 +7,6 @@ from avalanche.distributed import DistributedHelper from avalanche.distributed.distributed_value import SwitchableDistributedValue -TupleT = TypeVar('TupleT', bound='Tuple') -OptTupleT = Optional[TupleT] LocalT = TypeVar('LocalT') DistributedT = TypeVar('DistributedT') @@ -109,15 +107,6 @@ def _merge_tuples(self, tuples: List[LocalT]): return self.tuples_collate_fn(tuples) def _merge_single_values(self, values: List, value_index: int): - # if DistributedHelper.is_main_process: - # print('MERGING VALUES:') - # for elem in values: - # if isinstance(elem, Tensor): - # print(elem.device) - # print(elem.shape) - # else: - # print(type(elem)) - return self.single_values_collate_fn(values, value_index) diff --git a/tests/distributed/test_distributed_batch.py b/tests/distributed/test_distributed_batch.py index 9a00be6be..881c76b44 100644 --- a/tests/distributed/test_distributed_batch.py +++ b/tests/distributed/test_distributed_batch.py @@ -1,12 +1,12 @@ import contextlib import os import unittest -from typing import Tuple, Optional import torch from torch import Tensor -from avalanche.distributed import DistributedHelper, ClassificationBatch +from avalanche.distributed import DistributedHelper, \ + make_classification_distributed_batch @contextlib.contextmanager @@ -27,8 +27,7 @@ def setUp(self) -> None: @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, 'Distributed tests ignored') def test_classification_batch(self): - dt: ClassificationBatch[Optional[Tuple[Tensor, Tensor]]] = \ - ClassificationBatch('mb', None) + dt = make_classification_distributed_batch('mb') self.assertEqual(None, dt.local_value) self.assertEqual(None, dt.value) @@ -54,8 +53,7 @@ def test_classification_batch(self): @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, 'Distributed tests ignored') def test_unsupervised_classification_batch(self): - dt: ClassificationBatch[Optional[Tuple[Tensor, Tensor]]] = \ - ClassificationBatch('mb', None) + dt = make_classification_distributed_batch('mb') self.assertEqual(None, dt.local_value) self.assertEqual(None, dt.value) From 3017aeb7aa6c26bb504d17dc772fcfd21843f90a Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Fri, 22 Apr 2022 14:53:20 +0200 Subject: [PATCH 05/16] Removed debug prints. 
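
The make_classification_distributed_batch factory used by the updated tests
can be summarized with a small usage sketch. Again, this is illustrative and
not part of the patch; it assumes an initialized process group and that every
rank contributes a mini-batch of identical shape.

import torch

from avalanche.distributed import DistributedHelper, \
    make_classification_distributed_batch

mb = make_classification_distributed_batch('mb')

local_x = torch.ones((8, 1, 28, 28))
local_y = torch.full((8,), DistributedHelper.rank, dtype=torch.long)
mb.value = (local_x, local_y)  # assigning sets the *local* mini-batch

# Reading .value gathers the per-rank batches and concatenates them
# element by element along the batch dimension.
merged_x, merged_y = mb.value
assert merged_x.shape[0] == 8 * DistributedHelper.world_size
assert merged_y.shape[0] == 8 * DistributedHelper.world_size

# The per-rank batch is still reachable without any communication.
local_x_again, _ = mb.local_value
assert local_x_again.shape[0] == 8

Only reads of .value trigger communication: purely local code paths (such as
the forward and backward passes) keep using the local values, which is the
rationale behind the use_local_* context managers adopted by the strategy
templates.
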
--- avalanche/training/templates/base_sgd.py | 2 -- avalanche/training/templates/supervised.py | 1 - 2 files changed, 3 deletions(-) diff --git a/avalanche/training/templates/base_sgd.py b/avalanche/training/templates/base_sgd.py index 22a8da77d..2b1b23cf7 100644 --- a/avalanche/training/templates/base_sgd.py +++ b/avalanche/training/templates/base_sgd.py @@ -248,9 +248,7 @@ def training_epoch(self, **kwargs): :param kwargs: :return: """ - print('Pre-mbatch') for self.mbatch in self.dataloader: - print('mbatch', self.mbatch) if self._stop_training: break diff --git a/avalanche/training/templates/supervised.py b/avalanche/training/templates/supervised.py index 2a87e9665..755fefdb9 100644 --- a/avalanche/training/templates/supervised.py +++ b/avalanche/training/templates/supervised.py @@ -276,7 +276,6 @@ def forward(self): def _forward(self): """Implementation of the forward pass.""" - # print('mbx hash:', hash_tensor(self.distributed_mbatch[0])) return avalanche_forward(self.model, self.mb_x, self.mb_task_id) @final From f8882d77252511fe99bddec0345e340d1f40917d Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Fri, 29 Apr 2022 14:56:54 +0200 Subject: [PATCH 06/16] Implemented lazy creation of the default logger. --- avalanche/training/plugins/evaluation.py | 49 +++++++----------------- 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/avalanche/training/plugins/evaluation.py b/avalanche/training/plugins/evaluation.py index ff6cb54bc..55ac8434a 100644 --- a/avalanche/training/plugins/evaluation.py +++ b/avalanche/training/plugins/evaluation.py @@ -2,6 +2,7 @@ from copy import copy from collections import defaultdict from typing import Union, Sequence, TYPE_CHECKING +from typing_extensions import Literal from avalanche.distributed import DistributedHelper from avalanche.evaluation.metric_results import MetricValue @@ -32,7 +33,9 @@ class EvaluationPlugin: def __init__( self, *metrics: Union["PluginMetric", Sequence["PluginMetric"]], - loggers: Union["BaseLogger", Sequence["BaseLogger"]] = None, + loggers: Union["BaseLogger", + Sequence["BaseLogger"], + Literal['default']] = 'default', collect_all=True, benchmark=None, strict_checks=False, @@ -67,7 +70,9 @@ def __init__( flat_metrics_list.append(metric) self.metrics = flat_metrics_list - if loggers is None: + if loggers == 'default': + loggers = make_default_loggers() + elif loggers is None: loggers = [] elif not isinstance(loggers, Sequence): loggers = [loggers] @@ -229,45 +234,17 @@ def before_eval(self, strategy: "SupervisedTemplate", **kwargs): warnings.warn(msgw) -class LazyDefaultLoggersList(Sequence["BaseLogger"]): - """ - Used to prevent the creation of loggers on a non-main process when - running distributed training jobs. - - Beware that the content of this sequence (and thus the behavior of - `__len__` and `__getitem__`) varies depending on the value of - `DistributedHelper.is_main_process`. This means that objects of this class - should be used only by modules able to handle this behavior, which is not - standard for Sequences. 
- """ - - def __init__(self): - self._default_loggers = None - - def __len__(self): - if DistributedHelper.is_main_process: - return 1 - else: - return 0 - - def __getitem__(self, item): - self._instantiate_loggers() - return self._default_loggers[item] - - def _instantiate_loggers(self): - if self._default_loggers is not None: - return - - if DistributedHelper.is_main_process: - self._default_loggers = [InteractiveLogger()] - else: - self._default_loggers = [] +def make_default_loggers(): + if DistributedHelper.is_main_process: + return [InteractiveLogger()] + else: + return [] default_evaluator = EvaluationPlugin( accuracy_metrics(minibatch=False, epoch=True, experience=True, stream=True), loss_metrics(minibatch=False, epoch=True, experience=True, stream=True), - loggers=LazyDefaultLoggersList(), + loggers='default', suppress_warnings=True, ) From 8571b91cf59ee29753cac42365073b6c226e52e1 Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Fri, 29 Apr 2022 14:57:44 +0200 Subject: [PATCH 07/16] [Distributed] Simplified internal API and example. Added in-code guide. --- avalanche/distributed/distributed_batch.py | 12 +- avalanche/distributed/distributed_commons.py | 4 +- avalanche/distributed/distributed_helper.py | 2 +- avalanche/distributed/distributed_model.py | 2 +- avalanche/distributed/distributed_tensor.py | 14 +- avalanche/distributed/distributed_value.py | 12 +- examples/distributed_training.py | 132 +++++-------------- examples/run_distributed_training_example.sh | 112 ++++++++++++++-- 8 files changed, 157 insertions(+), 133 deletions(-) diff --git a/avalanche/distributed/distributed_batch.py b/avalanche/distributed/distributed_batch.py index 206fa286a..f5d0a281b 100644 --- a/avalanche/distributed/distributed_batch.py +++ b/avalanche/distributed/distributed_batch.py @@ -1,5 +1,5 @@ from abc import abstractmethod, ABC -from typing import TypeVar, List, Optional, Callable, Any, Tuple +from typing import TypeVar, List, Optional, Callable, Any import torch from torch import Tensor @@ -17,7 +17,7 @@ class DistributedObject(SwitchableDistributedValue[LocalT, DistributedT], ABC): The merge procedure must be implemented in child classes. 
""" - def _synchronize_distributed_value(self) -> DistributedT: + def _synchronize(self) -> DistributedT: objects = self._synchronize_objects() return self._merge_objects(objects) @@ -52,15 +52,15 @@ def __init__(self, name: str, initial_local_value: LocalT): super().__init__(name, initial_local_value) self._value_is_tuple = False - def _synchronize_distributed_value(self) -> LocalT: + def _synchronize(self) -> LocalT: if self._local_value is None: return None else: - return super()._synchronize_distributed_value() + return super()._synchronize() - def _set_local_value(self, new_local_value): + def _set_local(self, new_local_value): self._value_is_tuple = isinstance(new_local_value, (tuple, list)) - super()._set_local_value(new_local_value) + super()._set_local(new_local_value) def _merge_objects(self, objects: List[LocalT]) -> LocalT: if self._value_is_tuple: diff --git a/avalanche/distributed/distributed_commons.py b/avalanche/distributed/distributed_commons.py index ce5e15ae6..7a43654b1 100644 --- a/avalanche/distributed/distributed_commons.py +++ b/avalanche/distributed/distributed_commons.py @@ -16,8 +16,8 @@ class DistributedLoss(DistributedMeanTensor): def __init__(self, name: str = 'loss'): super(DistributedLoss, self).__init__(name, torch.zeros((1,))) - def _merge_tensors(self, tensors): - return super(DistributedLoss, self)._merge_tensors(tensors) + def _merge(self, tensors): + return super(DistributedLoss, self)._merge(tensors) __all__ = [ diff --git a/avalanche/distributed/distributed_helper.py b/avalanche/distributed/distributed_helper.py index bb66e7e74..52b4578b9 100644 --- a/avalanche/distributed/distributed_helper.py +++ b/avalanche/distributed/distributed_helper.py @@ -191,7 +191,7 @@ def make_device(self): else: device_id = 0 - if self.use_cuda and torch.cuda.is_available() and device_id >= 0: + if self.use_cuda and device_id >= 0 and torch.cuda.is_available(): ref_device = torch.device(f'cuda:{device_id}') torch.cuda.set_device(ref_device) else: diff --git a/avalanche/distributed/distributed_model.py b/avalanche/distributed/distributed_model.py index e304a3542..56afeb683 100644 --- a/avalanche/distributed/distributed_model.py +++ b/avalanche/distributed/distributed_model.py @@ -108,7 +108,7 @@ def reset_distributed_model(self): """ return self.reset_distributed_value() - def _synchronize_distributed_value(self) -> DistributedT: + def _synchronize(self) -> DistributedT: raise RuntimeError( 'The distributed model needs to be wrapped and set by using the ' f'following class(es): {self.distributed_model_class}') diff --git a/avalanche/distributed/distributed_tensor.py b/avalanche/distributed/distributed_tensor.py index 689a467a1..cfb3d2fec 100644 --- a/avalanche/distributed/distributed_tensor.py +++ b/avalanche/distributed/distributed_tensor.py @@ -14,15 +14,15 @@ class DistributedTensor(SwitchableDistributedValue[Tensor, Tensor], ABC): This abstract class is in charge of synchronizing Tensors across processes. - Child classes must override `_merge_tensors` to define how those tensors + Child classes must override `_merge` to define how those tensors should be merged. """ - def _synchronize_distributed_value(self) -> Tensor: - return self._merge_tensors( + def _synchronize(self) -> Tensor: + return self._merge( DistributedHelper.gather_all(self.local_value)) @abstractmethod - def _merge_tensors(self, tensors: List[Tensor]) -> Tensor: + def _merge(self, tensors: List[Tensor]) -> Tensor: """ Merge all tensors into one. 
@@ -40,7 +40,7 @@ class ConcatDistributedTensor(DistributedTensor): This also correctly manages tensors with 0-length shapes (like losses). """ - def _merge_tensors(self, tensors: List[Tensor]) -> Tensor: + def _merge(self, tensors: List[Tensor]) -> Tensor: # Manage tensors without shape (0-length shape) for i, t in enumerate(tensors): if len(t.shape) == 0: @@ -55,8 +55,8 @@ class DistributedMeanTensor(ConcatDistributedTensor): A distributed 1-item tensor obtained by computing the mean of tensors from all processes. """ - def _merge_tensors(self, tensors: List[Tensor]) -> Tensor: - concat_tensor = super()._merge_tensors(tensors) + def _merge(self, tensors: List[Tensor]) -> Tensor: + concat_tensor = super()._merge(tensors) return torch.mean(concat_tensor) diff --git a/avalanche/distributed/distributed_value.py b/avalanche/distributed/distributed_value.py index ab231fe2e..8d4e869cf 100644 --- a/avalanche/distributed/distributed_value.py +++ b/avalanche/distributed/distributed_value.py @@ -74,7 +74,7 @@ def value(self, new_value: LocalT): This will discard the current distributed value. """ - self._set_local_value(new_value) + self._set_local(new_value) @property def local_value(self) -> LocalT: @@ -93,9 +93,9 @@ def local_value(self, new_value: LocalT): This will discard the current distributed value. """ - self._set_local_value(new_value) + self._set_local(new_value) - def _set_local_value(self, new_local_value: LocalT): + def _set_local(self, new_local_value: LocalT): self._local_value = new_local_value self._distributed_value = None self._distributed_value_set = False @@ -105,13 +105,13 @@ def _get_distributed_value(self) -> DistributedT: return self._local_value if not self._distributed_value_set: - self._distributed_value = self._synchronize_distributed_value() + self._distributed_value = self._synchronize() self._distributed_value_set = True return self._distributed_value @abstractmethod - def _synchronize_distributed_value(self) -> DistributedT: + def _synchronize(self) -> DistributedT: pass def __str__(self): @@ -131,7 +131,7 @@ class SettableDistributedValue(DistributedValue[LocalT, DistributedT], ABC): If this class should only allow for distributed values to be set externally (that is, synchronization should be disabled), please - override `_synchronize_distributed_value` to raise an appropriate error. + override `_synchronize` to raise an appropriate error. In that case, this means this class is mainly used as a switch between a local and a distributed value based on whether the distributed value has been set or not. diff --git a/examples/distributed_training.py b/examples/distributed_training.py index a38fce945..2dada397b 100644 --- a/examples/distributed_training.py +++ b/examples/distributed_training.py @@ -13,26 +13,19 @@ This is a simple example on how to enable distributed training in Avalanche. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function import argparse import os import sys import time -import torch from torch.nn import CrossEntropyLoss from torch.optim import SGD from torch.optim.lr_scheduler import ReduceLROnPlateau -from torch.utils.data import DistributedSampler, DataLoader from torchvision import transforms from torchvision.transforms import ToTensor, RandomCrop from avalanche.benchmarks import SplitMNIST -from avalanche.benchmarks.utils import AvalancheSubset -from avalanche.benchmarks.utils.data_loader import TaskBalancedDataLoader from avalanche.distributed import DistributedHelper from avalanche.evaluation.metrics import accuracy_metrics, loss_metrics from avalanche.logging import TensorboardLogger @@ -41,75 +34,36 @@ from avalanche.training.plugins import EvaluationPlugin, ReplayPlugin, \ LRSchedulerPlugin - OVERALL_MB_SIZE = 192 -class AdaptedNaive(Naive): - - def make_train_dataloader( - self, num_workers=0, shuffle=True, pin_memory=True, - persistent_workers=False, **kwargs - ): - dataset_len = len(self.adapted_dataset) - while (dataset_len % OVERALL_MB_SIZE) > 0: - # Note: when using OVERALL_MB_SIZE == 192, - # means that with N_GPUS = 1, 2, 3, 4, 6, 8 (any factor of 192) - # you will get the same number of iterations - # (due to how DistributedSampler works, which is slightly different - # from the default sampler) - dataset_len -= 1 - - other_dataloader_args = {} - other_dataloader_args['persistent_workers'] = persistent_workers - - self.dataloader = TaskBalancedDataLoader( - AvalancheSubset( - self.adapted_dataset, indices=list(range(dataset_len))), - oversample_small_groups=True, - num_workers=num_workers, - batch_size=self.train_mb_size, - shuffle=shuffle, - pin_memory=pin_memory, - drop_last=True, - **other_dataloader_args - ) - - def make_eval_dataloader( - self, num_workers=0, pin_memory=True, persistent_workers=False, - **kwargs): - dataset_len = len(self.adapted_dataset) - while (dataset_len % OVERALL_MB_SIZE) > 0: - # Note: when using OVERALL_MB_SIZE == 192, - # means that with N_GPUS = 1, 2, 3, 4, 6, 8 (any factor of 192) - # you will get the same number of iterations - # (due to how DistributedSampler works, which is slightly different - # from the default sampler) - dataset_len -= 1 - - other_dataloader_args = {} - other_dataloader_args['persistent_workers'] = persistent_workers - - d_set = AvalancheSubset( - self.adapted_dataset, indices=list(range(dataset_len))) - sampler = None - if DistributedHelper.is_distributed: - sampler = DistributedSampler( - d_set, shuffle=False, drop_last=False) - - self.dataloader = DataLoader( - d_set, - num_workers=num_workers, - batch_size=self.eval_mb_size, - pin_memory=pin_memory, - sampler=sampler, - shuffle=False, - drop_last=False, - **other_dataloader_args - ) - - def main(args): + # >> Notes on enabling distributed training support in Avalanche << + # + # There are only a few changes to be made when enabling distributed + # training in Avalanche. These are all shown in this example. To recap: + # + # 1. Wrap the main code in a function. Call that function from + # within a "if __name__ == '__main__':" section. + # 2. Add a call to `init_distributed` at the beginning of the main function. + # Obtain the device object using `make_device`. + # 3. (Optional, recommended) Suppress the output for non-main processes. + # 4. 
(If needed) Avalanche classic benchmarks already have proper ways + # to ensure that dataset files are not downloaded and written + # concurrently. If you need to dynamically download a custom dataset or + # create other working files, do it in the main process only (the one + # with rank 0). + # 5. Loggers cannot be created in non-main processes. Make sure you create + # them in the main process only. Metrics should be instantiated as usual. + # 6. IMPORTANT! Scale your minibatch size by the number of processes used. + # + # Notice that these changes do not impact your ability to run the same + # script in the classic single-process fashion. + # + # You can check how to run this script in a distributed way by looking at + # the `run_distributed_training_example.sh` script in the `examples` folder. + print('Starting experiment', args.exp_name) + DistributedHelper.init_distributed(random_seed=4321, use_cuda=args.use_cuda) rank = DistributedHelper.rank world_size = DistributedHelper.world_size @@ -118,12 +72,14 @@ def main(args): f'will use device: {device}') if not DistributedHelper.is_main_process: + # Suppress the output of non-main processes + # This prevents the output from being duplicated in the console sys.stdout = open(os.devnull, 'w') sys.stderr = open(os.devnull, 'w') # --- TRANSFORMATIONS train_transform = transforms.Compose([ - # RandomCrop(28, padding=4), + RandomCrop(28, padding=4), ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ]) @@ -135,7 +91,7 @@ def main(args): # --- SCENARIO CREATION scenario = SplitMNIST( - 5, + n_experiences=5, train_transform=train_transform, eval_transform=test_transform) # --------- @@ -146,30 +102,14 @@ def main(args): optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9) # CREATE THE STRATEGY INSTANCE (NAIVE) - loggers = [] if DistributedHelper.is_main_process: - distr_str = 'single_process' - approach_str = 'naive' - sched_str = 'unsched' - cuda_str = 'cpu' - - if DistributedHelper.is_distributed: - distr_str = 'distributed' - - if args.use_replay: - approach_str = 'replay' - - if args.use_scheduler: - sched_str = 'plateau' - - if args.use_cuda: - cuda_str = 'cuda' - + # Loggers should be created in the main process only loggers.append(TensorboardLogger( - tb_log_dir=f'./tb_data/{distr_str}_{approach_str}_{sched_str}_' - f'{cuda_str}{args.exp_postfix}')) + tb_log_dir=f'./logs/{args.exp_name}')) + # Metrics should be created as usual, with no differences between main and + # non-main processes. 
my_evaluator = EvaluationPlugin( accuracy_metrics(epoch=True, experience=True, stream=True), loss_metrics(epoch=True, experience=True, stream=True), @@ -195,7 +135,7 @@ def main(args): ) ) - cl_strategy = AdaptedNaive( + cl_strategy = Naive( model, optimizer, CrossEntropyLoss(), train_mb_size=mb_size, train_epochs=4, eval_mb_size=mb_size, plugins=plugins, @@ -225,5 +165,5 @@ def main(args): parser.add_argument('--use_cuda', action='store_true') parser.add_argument('--use_replay', action='store_true') parser.add_argument('--use_scheduler', action='store_true') - parser.add_argument('--exp_postfix', default='') + parser.add_argument('--exp_name', default='dist_exp') main(parser.parse_args()) diff --git a/examples/run_distributed_training_example.sh b/examples/run_distributed_training_example.sh index 4a0f2d0d6..5d514b685 100755 --- a/examples/run_distributed_training_example.sh +++ b/examples/run_distributed_training_example.sh @@ -1,19 +1,103 @@ #!/usr/bin/env bash -echo "This will run single-GPU and multi-GPU training for naive and replay" -echo "Run me from the avalanche repo root as 'bash examples/run_distributed_training_example.sh'" eval "$(conda shell.bash hook)" conda activate avalanche-dev-env set -euo pipefail -ngpus=$(nvidia-smi -L | wc -l) + +CPU_PARALLELISM=4 +GPU_PARALLELISM=0 + +usage() { + echo "This will run single-process and multi-process training for naive, replay, and replay+scheduler setups." + echo "Used to check for differences between local and distributed training." + echo "" + echo "Run me from the avalanche repo root as 'bash examples/run_distributed_training_example.sh'" + echo + echo "Syntax: examples/run_distributed_training_example [-h] [-c CPU_PARALLELISM] [-g GPU_PARALLELISM]" + echo "" + echo "Options:" + echo "-h Print this Help." + echo "-c Set the CPU parallelism for distributed experiments. Defaults to 4." + echo " Set this value to 0 to skip CPU experiments." + echo "-g Set the GPU parallelism for distributed experiments. Defaults to 0 (skip GPU experiments)." + echo " Set this value to -1 to auto-detect how many GPUs are in the system." +} + +exit_abnormal() { + usage + exit 1 +} + +while getopts ":c:g:" options; do + case "${options}" in + c) + CPU_PARALLELISM=${OPTARG} + ;; + g) + GPU_PARALLELISM=${OPTARG} + ;; + h) + usage + exit 0 + ;; + :) + echo "Error: -${OPTARG} requires an argument!" + echo "" + exit_abnormal + ;; + *) + exit_abnormal + ;; + esac +done + +if [[ "$GPU_PARALLELISM" == "-1" ]]; then + GPU_PARALLELISM=$(nvidia-smi -L | wc -l) + echo "Auto-detected $GPU_PARALLELISM GPUs." 
+fi + export PYTHONPATH="${PYTHONPATH-}:${PWD}" -CUDA_VISIBLE_DEVICES=-1 torchrun --standalone --nnodes=1 --nproc_per_node=$ngpus examples/distributed_training.py -CUDA_VISIBLE_DEVICES=-1 python examples/distributed_training.py -CUDA_VISIBLE_DEVICES=-1 torchrun --standalone --nnodes=1 --nproc_per_node=$ngpus examples/distributed_training.py --use_replay -CUDA_VISIBLE_DEVICES=-1 python examples/distributed_training.py --use_replay -CUDA_VISIBLE_DEVICES=-1 torchrun --standalone --nnodes=1 --nproc_per_node=$ngpus examples/distributed_training.py --use_replay --use_scheduler -CUDA_VISIBLE_DEVICES=-1 python examples/distributed_training.py --use_replay --use_scheduler - -#torchrun --standalone --nnodes=1 --nproc_per_node=$ngpus examples/distributed_training.py --use_cuda -#python examples/distributed_training.py --use_cuda -#torchrun --standalone --nnodes=1 --nproc_per_node=$ngpus examples/distributed_training.py --use_cuda --use_replay -#python examples/distributed_training.py --use_cuda --use_replay + +if [[ "$CPU_PARALLELISM" == "0" ]]; then + echo "Skipping CPU experiments." +else + # Naive experiments + torchrun --standalone --nnodes=1 --nproc_per_node=$CPU_PARALLELISM examples/distributed_training.py \ + --exp_name "distributed_naive_unsched_cpu" + python examples/distributed_training.py \ + --exp_name "single_process_naive_unsched_cpu" + + # Replay experiments + torchrun --standalone --nnodes=1 --nproc_per_node=$CPU_PARALLELISM examples/distributed_training.py \ + --use_replay --exp_name "distributed_replay_unsched_cpu" + python examples/distributed_training.py \ + --use_replay --exp_name "single_process_replay_unsched_cpu" + + # Replay + LR scheduler experiments + torchrun --standalone --nnodes=1 --nproc_per_node=$CPU_PARALLELISM examples/distributed_training.py \ + --use_replay --use_scheduler --exp_name "distributed_replay_scheduler_cpu" + python examples/distributed_training.py \ + --use_replay --use_scheduler --exp_name "single_process_replay_scheduler_cpu" +fi + +if [[ "$GPU_PARALLELISM" == "0" ]]; then + echo "Skipping GPU experiments." + exit 0 +fi + +# Naive experiments (GPU) +torchrun --standalone --nnodes=1 --nproc_per_node=$GPU_PARALLELISM examples/distributed_training.py \ + --exp_name "distributed_naive_unsched_gpu" --use_cuda +python examples/distributed_training.py \ + --exp_name "single_process_naive_unsched_gpu" --use_cuda + +# Replay experiments (GPU) +torchrun --standalone --nnodes=1 --nproc_per_node=$GPU_PARALLELISM examples/distributed_training.py \ + --exp_name "distributed_replay_unsched_gpu" --use_cuda --use_replay +python examples/distributed_training.py \ + --exp_name "single_process_replay_unsched_gpu" --use_cuda --use_replay + +# Replay + LR scheduler experiments (GPU) +torchrun --standalone --nnodes=1 --nproc_per_node=$GPU_PARALLELISM examples/distributed_training.py \ + --exp_name "distributed_replay_scheduler_gpu" --use_cuda --use_replay --use_scheduler +python examples/distributed_training.py \ + --exp_name "single_process_replay_scheduler_gpu" --use_cuda --use_replay --use_scheduler \ No newline at end of file From b752568df06a2557e225d4fb17e3a3553837b209 Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Fri, 29 Apr 2022 16:11:05 +0200 Subject: [PATCH 08/16] Added support for general use_local in strategies. 
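
The per-field use_local_* context managers are now registered on a shared
DistributedStrategySupport base class: each mixin appends its own manager
(use_local_model, use_local_loss, use_local_input_batch,
use_local_output_batch) to _use_local_contexts, and use_local() enters all of
them through a single ExitStack. A usage sketch based on the new test, with
illustrative tensor values (the gathered shape applies when running under
torch.distributed):

    import torch
    from avalanche.distributed.strategies import \
        DistributedMiniBatchStrategySupport

    support = DistributedMiniBatchStrategySupport()
    support.mbatch = torch.zeros((5, 10))

    gathered = support.mbatch      # (world_size * 5, 10) when distributed
    with support.use_local():
        local = support.mbatch     # always the process-local (5, 10) tensor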
--- avalanche/distributed/strategies/__init__.py | 1 + .../strategies/distributed_loss_strategy.py | 4 +- .../strategies/distributed_mbatch_strategy.py | 7 +- .../strategies/distributed_model_strategy.py | 4 +- .../distributed_strategy_support.py | 48 +++++++++++ .../test_distributed_strategy_support.py | 84 +++++++++++++++++++ tests/run_dist_tests.py | 3 + 7 files changed, 147 insertions(+), 4 deletions(-) create mode 100644 avalanche/distributed/strategies/distributed_strategy_support.py create mode 100644 tests/distributed/test_distributed_strategy_support.py diff --git a/avalanche/distributed/strategies/__init__.py b/avalanche/distributed/strategies/__init__.py index 8ce5532e3..9205b85d7 100644 --- a/avalanche/distributed/strategies/__init__.py +++ b/avalanche/distributed/strategies/__init__.py @@ -1,3 +1,4 @@ +from .distributed_strategy_support import * from .distributed_model_strategy import * from .distributed_mbatch_strategy import * from .distributed_loss_strategy import * diff --git a/avalanche/distributed/strategies/distributed_loss_strategy.py b/avalanche/distributed/strategies/distributed_loss_strategy.py index 824dff3f1..61a9bfd68 100644 --- a/avalanche/distributed/strategies/distributed_loss_strategy.py +++ b/avalanche/distributed/strategies/distributed_loss_strategy.py @@ -1,13 +1,15 @@ from torch import Tensor from avalanche.distributed import DistributedLoss +from avalanche.distributed.strategies import DistributedStrategySupport -class DistributedLossStrategySupport: +class DistributedLossStrategySupport(DistributedStrategySupport): def __init__(self): super().__init__() self._loss = DistributedLoss() + self._use_local_contexts.append(self.use_local_loss) @property def loss(self) -> Tensor: diff --git a/avalanche/distributed/strategies/distributed_mbatch_strategy.py b/avalanche/distributed/strategies/distributed_mbatch_strategy.py index 8fdd18699..f374807c2 100644 --- a/avalanche/distributed/strategies/distributed_mbatch_strategy.py +++ b/avalanche/distributed/strategies/distributed_mbatch_strategy.py @@ -1,13 +1,13 @@ from typing import Callable, List, Any -import torch from avalanche.benchmarks.utils.collate_functions import \ classification_collate_mbatches_fn, classification_single_values_collate_fn from avalanche.distributed import CollateDistributedBatch +from avalanche.distributed.strategies import DistributedStrategySupport -class DistributedMiniBatchStrategySupport: +class DistributedMiniBatchStrategySupport(DistributedStrategySupport): def __init__(self): super().__init__() @@ -25,6 +25,9 @@ def __init__(self): classification_single_values_collate_fn ) + self._use_local_contexts.append(self.use_local_input_batch) + self._use_local_contexts.append(self.use_local_output_batch) + # --- START INPUT MINIBATCH PROPERTY --- @property def mbatch(self): diff --git a/avalanche/distributed/strategies/distributed_model_strategy.py b/avalanche/distributed/strategies/distributed_model_strategy.py index c97559caa..6a31244db 100644 --- a/avalanche/distributed/strategies/distributed_model_strategy.py +++ b/avalanche/distributed/strategies/distributed_model_strategy.py @@ -1,13 +1,15 @@ from torch.nn import Module from avalanche.distributed import DistributedModel +from avalanche.distributed.strategies import DistributedStrategySupport -class DistributedModelStrategySupport: +class DistributedModelStrategySupport(DistributedStrategySupport): def __init__(self): super().__init__() self._model = DistributedModel() + self._use_local_contexts.append(self.use_local_model) @property 
def model(self) -> Module: diff --git a/avalanche/distributed/strategies/distributed_strategy_support.py b/avalanche/distributed/strategies/distributed_strategy_support.py new file mode 100644 index 000000000..b67501b2c --- /dev/null +++ b/avalanche/distributed/strategies/distributed_strategy_support.py @@ -0,0 +1,48 @@ +from contextlib import contextmanager, ExitStack + + +class DistributedStrategySupport: + + def __init__(self): + """ + Implements the basic elements needed to support distributed training + in Avalanche strategies. + """ + super().__init__() + self._use_local_contexts = [] + """ + A list of context manager factories to be used in `use_local`. + """ + + @contextmanager + def use_local(self, *args, **kwargs): + """ + A context manager used to change the behavior of some property getters. + + When running code in this context, the property getter implementation + of some distributed-critical fields will return the local value instead + of the distributed (synchronized) one. + + Examples of distributed-critical fields are `model`, `mbatch`, + `mb_output`, `loss`. + + Beware that this is method will modify the behavior of getters of ALL + such properties. This may not be desirable. Use the field-specific + `use_local_*` context managers to control the behavior of these + fields in a finer way. + + :param args: Passed to all field-specific `use_local_*` context + managers. + :param kwargs: Passed to all field-specific `use_local_*` context + managers. + :return: The context manager to be used through the `with` syntax. + """ + with ExitStack() as stack: + for lcm in self._use_local_contexts: + stack.enter_context(lcm(*args, **kwargs)) + yield + + +__all__ = [ + 'DistributedStrategySupport' +] diff --git a/tests/distributed/test_distributed_strategy_support.py b/tests/distributed/test_distributed_strategy_support.py new file mode 100644 index 000000000..686555a12 --- /dev/null +++ b/tests/distributed/test_distributed_strategy_support.py @@ -0,0 +1,84 @@ +import contextlib +import os +import time +import unittest + +import torch + +from avalanche.distributed import DistributedHelper +from avalanche.distributed.strategies import DistributedMiniBatchStrategySupport + + +@contextlib.contextmanager +def manage_output(): + if os.environ['LOCAL_RANK'] != 0: + with contextlib.redirect_stderr(None): + with contextlib.redirect_stdout(None): + yield + else: + yield + + +class DistributedStrategySupportTests(unittest.TestCase): + + def setUp(self) -> None: + DistributedHelper.init_distributed(1234, use_cuda=False) + + @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + 'Distributed tests ignored') + def test_use_local_works(self): + uut = DistributedMiniBatchStrategySupport() + uut.mbatch = torch.full((5, 10), DistributedHelper.rank, + dtype=torch.float32) + uut.mb_output = torch.full((5, 10), DistributedHelper.rank, + dtype=torch.float32) + + # Test without use_local + got_mbatch = uut.mbatch + got_mb_output = uut.mb_output + + expected_shape = (DistributedHelper.world_size * 5, 10) + + self.assertSequenceEqual(expected_shape, got_mbatch.shape) + self.assertSequenceEqual(expected_shape, got_mb_output.shape) + + for row_idx in range(expected_shape[0]): + from_rank = row_idx // 5 + self.assertTrue(torch.equal( + torch.full((10,), from_rank, dtype=torch.float32), + got_mbatch[row_idx])) + self.assertTrue(torch.equal( + torch.full((10,), from_rank, dtype=torch.float32), + got_mb_output[row_idx])) + + # Test with use_local + uut.mbatch = torch.full((5, 10), 
DistributedHelper.rank, + dtype=torch.float32) + uut.mb_output = torch.full((5, 10), DistributedHelper.rank, + dtype=torch.float32) + + with uut.use_local(): + got_mbatch = uut.mbatch + got_mb_output = uut.mb_output + + expected_shape = (5, 10) + + self.assertSequenceEqual(expected_shape, got_mbatch.shape) + self.assertSequenceEqual(expected_shape, got_mb_output.shape) + + for row_idx in range(expected_shape[0]): + from_rank = DistributedHelper.rank + self.assertTrue(torch.equal( + torch.full((10,), from_rank, dtype=torch.float32), + got_mbatch[row_idx])) + self.assertTrue(torch.equal( + torch.full((10,), from_rank, dtype=torch.float32), + got_mb_output[row_idx])) + + +if __name__ == "__main__": + with manage_output(): + verbosity = 1 + if DistributedHelper.rank > 0: + verbosity = 0 + unittest.main(verbosity=verbosity) diff --git a/tests/run_dist_tests.py b/tests/run_dist_tests.py index 90d15fad0..c17718fa9 100644 --- a/tests/run_dist_tests.py +++ b/tests/run_dist_tests.py @@ -1,3 +1,4 @@ +import os import signal import sys import unittest @@ -5,6 +6,8 @@ from typing import Union, Set from unittest import TestSuite, TestCase +os.environ['DISTRIBUTED_TESTS'] = '1' + def get_distributed_test_cases(suite: Union[TestCase, TestSuite]) -> Set[str]: found_cases = set() From d1b9d28fb9e81eec1b0342d8071fd4f04a62d3a7 Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Tue, 19 Jul 2022 18:13:59 +0200 Subject: [PATCH 09/16] Add type hints to _make_data_loader. Fix distributed training example. --- avalanche/benchmarks/utils/data_loader.py | 11 +++++++---- examples/distributed_training.py | 3 +-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/avalanche/benchmarks/utils/data_loader.py b/avalanche/benchmarks/utils/data_loader.py index d299bd507..7fcd0cb57 100644 --- a/avalanche/benchmarks/utils/data_loader.py +++ b/avalanche/benchmarks/utils/data_loader.py @@ -15,10 +15,10 @@ between the current data and the replay memory. """ from itertools import chain -from typing import Dict, Sequence, Union +from typing import Dict, Sequence, Union, Any import torch -from torch.utils.data import RandomSampler, DistributedSampler +from torch.utils.data import RandomSampler, DistributedSampler, Dataset from torch.utils.data.dataloader import DataLoader from avalanche.benchmarks.utils import AvalancheDataset @@ -527,8 +527,11 @@ def _get_batch_sizes(data_dict, single_exp_batch_size, remaining_example, def _make_data_loader( - dataset, distributed_sampling, data_loader_args, - batch_size, force_no_workers=False): + dataset: Dataset, + distributed_sampling: bool, + data_loader_args: Dict[str, Any], + batch_size: int, + force_no_workers=False): data_loader_args = data_loader_args.copy() collate_from_data_or_kwargs(dataset, data_loader_args) diff --git a/examples/distributed_training.py b/examples/distributed_training.py index 2dada397b..06c15cecd 100644 --- a/examples/distributed_training.py +++ b/examples/distributed_training.py @@ -113,8 +113,7 @@ def main(args): my_evaluator = EvaluationPlugin( accuracy_metrics(epoch=True, experience=True, stream=True), loss_metrics(epoch=True, experience=True, stream=True), - loggers=loggers, - suppress_warnings=True + loggers=loggers ) # Adapt the minibatch size From 88f75a97225fe53d84e944f6018af8d3fd5ffa27 Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Tue, 22 Nov 2022 16:06:01 +0000 Subject: [PATCH 10/16] Integrated distributed training with RNGManager, new collate system. Additional tests. Detection WIP. 
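
On the collate side, the helpers are now organized around a Collate ABC
(avalanche/benchmarks/utils/collate_functions.py) with four hooks:
collate_fn (examples -> batch), collate_single_value_fn (one feature of
several examples), collate_batches_fn (batches -> batch) and
collate_single_value_batches_fn (one feature of several batches). A
hypothetical subclass for plain (x, y) tensor examples, written only to
illustrate the interface and not shipped with this patch:

    import torch
    from avalanche.benchmarks.utils.collate_functions import Collate

    class TensorPairCollate(Collate):
        # Illustrative collate for (x, y) tensor pairs.

        def collate_fn(self, batch):
            xs, ys = zip(*batch)
            return torch.stack(xs), torch.stack(ys)

        def collate_single_value_fn(self, feature_batch, feature_idx):
            return torch.stack(feature_batch)

        def collate_batches_fn(self, batches):
            xs, ys = zip(*batches)
            return torch.cat(xs), torch.cat(ys)

        def collate_single_value_batches_fn(self, feature_batches,
                                            feature_idx):
            return torch.cat(feature_batches)

    collate = TensorPairCollate()
    x, y = collate.collate_fn(
        [(torch.rand(1, 28, 28), torch.tensor(i)) for i in range(4)])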
--- .github/workflows/environment-update.yml | 4 +- .github/workflows/unit-test.yml | 2 + .../scenarios/classification_scenario.py | 14 +- .../scenarios/detection_scenario.py | 516 ++++++++++- .../scenarios/lazy_dataset_sequence.py | 7 +- avalanche/benchmarks/utils/__init__.py | 1 + .../utils/classification_dataset.py | 4 + .../benchmarks/utils/collate_functions.py | 166 +++- avalanche/benchmarks/utils/data.py | 4 +- avalanche/benchmarks/utils/data_attribute.py | 51 +- .../benchmarks/utils/detection_dataset.py | 851 ++++++++++++++++++ avalanche/benchmarks/utils/flat_data.py | 27 +- avalanche/distributed/distributed_batch.py | 84 +- .../distributed_consistency_verification.py | 74 ++ avalanche/distributed/distributed_helper.py | 139 +-- .../strategies/distributed_mbatch_strategy.py | 24 +- .../distributed_strategy_support.py | 2 +- avalanche/training/determinism/rng_manager.py | 1 - avalanche/training/supervised/ar1.py | 2 +- .../supervised/naive_object_detection.py | 8 +- avalanche/training/templates/base_sgd.py | 66 +- .../observation_type/batch_observation.py | 23 +- .../observation_type/online_observation.py | 34 +- .../problem_type/supervised_problem.py | 20 +- .../templates/update_type/meta_update.py | 2 +- .../templates/update_type/sgd_update.py | 5 +- avalanche/training/utils.py | 1 + examples/detection.py | 32 +- examples/detection_examples_utils.py | 15 +- tests/distributed/test_distributed_batch.py | 37 +- tests/distributed/test_distributed_helper.py | 89 ++ tests/run_dist_tests.py | 11 +- tests/training/test_supervised_regression.py | 4 +- 33 files changed, 2011 insertions(+), 309 deletions(-) create mode 100644 avalanche/benchmarks/utils/detection_dataset.py create mode 100644 avalanche/distributed/distributed_consistency_verification.py create mode 100644 tests/distributed/test_distributed_helper.py diff --git a/.github/workflows/environment-update.yml b/.github/workflows/environment-update.yml index 1b926ee23..bad34100d 100644 --- a/.github/workflows/environment-update.yml +++ b/.github/workflows/environment-update.yml @@ -56,7 +56,9 @@ jobs: id: unittest shell: bash -l -c "conda run -n avalanche-env --no-capture-output bash {0}" run: | - python -m unittest discover tests + python -m unittest discover tests && + bash ./tests/checkpointing/test_checkpointing.sh && + python ./tests/run_dist_tests.py - name: checkout avalanche-docker repo if: always() uses: actions/checkout@v3 diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 7eee2e3ff..a2baa3717 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -55,6 +55,8 @@ jobs: python -m unittest discover tests && echo "Running checkpointing tests..." && bash ./tests/checkpointing/test_checkpointing.sh && + echo "Running distributed training tests..." 
&& + python ./tests/run_dist_tests.py && echo "While running unit tests, the following datasets were downloaded:" && ls ~/.avalanche/data diff --git a/avalanche/benchmarks/scenarios/classification_scenario.py b/avalanche/benchmarks/scenarios/classification_scenario.py index c99aaca44..3e4617871 100644 --- a/avalanche/benchmarks/scenarios/classification_scenario.py +++ b/avalanche/benchmarks/scenarios/classification_scenario.py @@ -1,5 +1,6 @@ import copy import re +import warnings from abc import ABC from typing import ( Generic, @@ -18,10 +19,8 @@ Mapping, ) -from typing_extensions import Protocol - -import warnings from torch.utils.data.dataset import Dataset +from typing_extensions import Protocol from avalanche.benchmarks.scenarios.generic_definitions import ( TCLExperience, @@ -32,10 +31,7 @@ from avalanche.benchmarks.scenarios.lazy_dataset_sequence import ( LazyDatasetSequence, ) -from avalanche.benchmarks.utils import make_classification_dataset -from avalanche.benchmarks.utils.classification_dataset import ( - ClassificationDataset, -) +from avalanche.benchmarks.utils import make_classification_dataset, AvalancheDataset from avalanche.benchmarks.utils.dataset_utils import manage_advanced_indexing TGenericCLClassificationScenario = TypeVar( @@ -494,7 +490,7 @@ def _check_and_adapt_user_stream_def( # exp_data[0] must contain the generator stream_length = exp_data[1] is_lazy = True - elif isinstance(exp_data, ClassificationDataset): + elif isinstance(exp_data, AvalancheDataset): # Single element exp_data = [exp_data] is_lazy = False @@ -506,7 +502,7 @@ def _check_and_adapt_user_stream_def( if not is_lazy: for i, dataset in enumerate(exp_data): - if not isinstance(dataset, ClassificationDataset): + if not isinstance(dataset, AvalancheDataset): raise ValueError( "All experience datasets must be subclasses of" " AvalancheDataset" diff --git a/avalanche/benchmarks/scenarios/detection_scenario.py b/avalanche/benchmarks/scenarios/detection_scenario.py index db78d9c76..b90f51a61 100644 --- a/avalanche/benchmarks/scenarios/detection_scenario.py +++ b/avalanche/benchmarks/scenarios/detection_scenario.py @@ -8,26 +8,34 @@ # E-mail: contact@continualai.org # # Website: avalanche.continualai.org # ################################################################################ - -from typing import TypeVar, List, Callable +import copy +import warnings +from abc import abstractmethod, ABC +from typing import TypeVar, List, Callable, Protocol, runtime_checkable, \ + Union, Iterable, Generic, Sequence, Optional, Mapping, Set from avalanche.benchmarks import ( - GenericClassificationExperience, - ClassificationExperience, TCLScenario, TCLStream, GenericCLScenario, - TStreamsUserDict, - ClassificationStream, -) -from avalanche.benchmarks.utils import make_classification_dataset + TStreamsUserDict, TCLExperience, ) +from avalanche.benchmarks.scenarios.classification_scenario import \ + _get_slice_ids +from avalanche.benchmarks.utils.dataset_utils import manage_advanced_indexing +from avalanche.benchmarks.utils.detection_dataset import DetectionDataset -TDetectionExperience = TypeVar( - "TDetectionExperience", bound=GenericClassificationExperience +TGenericCLDetectionScenario = TypeVar( + "TGenericCLDetectionScenario", bound="DetectionCLScenario" +) +TGenericDetectionExperience = TypeVar( + "TGenericDetectionExperience", bound="GenericDetectionExperience" +) +TGenericScenarioStream = TypeVar( + "TGenericScenarioStream", bound="DetectionStream" ) -class 
DetectionCLScenario(GenericCLScenario[TDetectionExperience]): +class DetectionCLScenario(GenericCLScenario[TCLExperience]): """ Base implementation of a Continual Learning object detection benchmark. @@ -43,7 +51,7 @@ def __init__( n_classes: int = None, complete_test_set_only: bool = False, experience_factory: Callable[ - ["ClassificationStream", int], TDetectionExperience + ["DetectionStream", int], TCLExperience ] = None, ): """ @@ -66,7 +74,7 @@ def __init__( """ if experience_factory is None: - experience_factory = DetectionExperience + experience_factory = GenericDetectionExperience super(DetectionCLScenario, self).__init__( stream_definitions=stream_definitions, @@ -79,50 +87,417 @@ def __init__( The number of classes in the scenario. """ + @GenericCLScenario.classes_in_experience.getter + def classes_in_experience( + self, + ) -> Mapping[str, Sequence[Optional[Set[int]]]]: + """ + A dictionary mapping each stream (by name) to a list. + + Each element of the list is a set describing the classes included in + that experience (identified by its index). + + In previous releases this field contained the list of sets for the + training stream (that is, there was no way to obtain the list for other + streams). That behavior is deprecated and support for that usage way + will be removed in the future. + """ + + return _LazyStreamClassesInDetectionExps(self) + + +class _LazyStreamClassesInDetectionExps(Mapping[str, Sequence[Optional[Set[int]]]]): + def __init__(self, benchmark: GenericCLScenario): + self._benchmark = benchmark + self._default_lcie = _LazyClassesInDetectionExps(benchmark, stream="train") + + def __len__(self): + return len(self._benchmark.stream_definitions) + + def __getitem__(self, stream_name_or_exp_id): + if isinstance(stream_name_or_exp_id, str): + return _LazyClassesInDetectionExps( + self._benchmark, stream=stream_name_or_exp_id + ) + + warnings.warn( + "Using classes_in_experience[exp_id] is deprecated. 
" + "Consider using classes_in_experience[stream_name][exp_id]" + "instead.", + stacklevel=2, + ) + return self._default_lcie[stream_name_or_exp_id] + + def __iter__(self): + yield from self._benchmark.stream_definitions.keys() + + +class _LazyClassesInDetectionExps(Sequence[Optional[Set[int]]]): + def __init__(self, benchmark: GenericCLScenario, stream: str = "train"): + self._benchmark = benchmark + self._stream = stream + + def __len__(self): + return len(self._benchmark.streams[self._stream]) + + def __getitem__(self, exp_id) -> Set[int]: + return manage_advanced_indexing( + exp_id, + self._get_single_exp_classes, + len(self), + _LazyClassesInDetectionExps._slice_collate, + ) -class DetectionExperience(ClassificationExperience[TCLScenario, TCLStream]): + def __str__(self): + return ( + "[" + ", ".join([str(self[idx]) for idx in range(len(self))]) + "]" + ) + + def _get_single_exp_classes(self, exp_id): + b = self._benchmark.stream_definitions[self._stream] + if not b.is_lazy and exp_id not in b.exps_data.targets_field_sequence: + raise IndexError + targets = b.exps_data.targets_field_sequence[exp_id] + if targets is None: + return None + + classes_in_exp = set() + for target in targets: + for label in target['labels']: + classes_in_exp.add(int(label)) + return classes_in_exp + + @staticmethod + def _slice_collate(*classes_in_exps: Optional[Set[int]]): + if any(x is None for x in classes_in_exps): + return None + + return [list(x) for x in classes_in_exps] + + +class DetectionScenarioStream(Protocol[TCLScenario, TCLExperience]): """ - Definition of a learning experience based on a :class:`DetectionScenario` - instance. + A scenario stream describes a sequence of incremental experiences. + Experiences are described as :class:`IExperience` instances. They contain a + set of patterns which has become available at a particular time instant + along with any optional, scenario specific, metadata. - This experience implementation uses the generic experience-patterns - assignment defined in the :class:`DetectionScenario` instance. Instances of - this class are usually obtained from an object detection benchmark stream. + Most scenario expose two different streams: the training stream and the test + stream. + """ + + name: str + """ + The name of the stream. + """ + + benchmark: TCLScenario + """ + A reference to the scenario this stream belongs to. + """ + + @property + def scenario(self) -> TCLScenario: + """This property is DEPRECATED, use self.benchmark instead.""" + warnings.warn( + "Using self.scenario is deprecated ScenarioStream. " + "Consider using self.benchmark instead.", + stacklevel=2, + ) + return self.benchmark + + def __getitem__( + self: TCLStream, experience_idx: Union[int, slice, Iterable[int]] + ) -> Union[TCLExperience, TCLStream]: + """ + Gets an experience given its experience index (or a stream slice given + the experience order). + + :param experience_idx: An int describing the experience index or an + iterable/slice object describing a slice of this stream. + :return: The Experience instance associated to the given experience + index or a sliced stream instance. + """ + ... + + def __len__(self) -> int: + """ + Used to get the length of this stream (the amount of experiences). + + :return: The amount of experiences in this stream. + """ + ... 
+ + +class DetectionStream( + Generic[TCLExperience, TGenericCLDetectionScenario], + DetectionScenarioStream[ + TGenericCLDetectionScenario, TCLExperience + ], + Sequence[TCLExperience], +): + def __init__( + self: TGenericScenarioStream, + name: str, + benchmark: TGenericCLDetectionScenario, + *, + slice_ids: List[int] = None, + ): + super(DetectionStream, self).__init__() + self.slice_ids: Optional[List[int]] = slice_ids + """ + Describes which experiences are contained in the current stream slice. + Can be None, which means that this object is the original stream. """ + + self.name: str = name + """ + The name of the stream (for instance: "train", "test", "valid", ...). + """ + + self.benchmark = benchmark + """ + A reference to the benchmark. + """ + + def __len__(self) -> int: + """ + Gets the number of experiences this stream it's made of. + + :return: The number of experiences in this stream. + """ + if self.slice_ids is None: + return len(self.benchmark.stream_definitions[self.name].exps_data) + else: + return len(self.slice_ids) + + def __getitem__( + self, exp_idx: Union[int, slice, Iterable[int]] + ) -> Union[TCLExperience, TCLStream]: + """ + Gets an experience given its experience index (or a stream slice given + the experience order). + + :param exp_idx: An int describing the experience index or an + iterable/slice object describing a slice of this stream. + + :return: The experience instance associated to the given experience + index or a sliced stream instance. + """ + if isinstance(exp_idx, int): + if exp_idx < len(self): + if self.slice_ids is None: + return self.benchmark.experience_factory(self, exp_idx) + else: + return self.benchmark.experience_factory( + self, self.slice_ids[exp_idx] + ) + raise IndexError( + "Experience index out of bounds" + str(int(exp_idx)) + ) + else: + return self._create_slice(exp_idx) + + def _create_slice( + self: TGenericScenarioStream, + exps_slice: Union[int, slice, Iterable[int]], + ) -> TCLStream: + """ + Creates a sliced version of this stream. + + In its base version, a shallow copy of this stream is created and + then its ``slice_ids`` field is adapted. + + :param exps_slice: The slice to use. + :return: A sliced version of this stream. + """ + stream_copy = copy.copy(self) + slice_exps = _get_slice_ids(exps_slice, len(self)) + + if self.slice_ids is None: + stream_copy.slice_ids = slice_exps + else: + stream_copy.slice_ids = [self.slice_ids[x] for x in slice_exps] + return stream_copy + + def drop_previous_experiences(self, to_exp: int) -> None: + """ + Drop the reference to experiences up to a certain experience ID + (inclusive). + + This means that any reference to experiences with ID [0, from_exp] will + be released. By dropping the reference to previous experiences, the + memory associated with them can be freed, especially the one occupied by + the dataset. However, if external references to the experience or the + dataset still exist, dropping previous experiences at the stream level + will have little to no impact on the memory usage. + + To make sure that the underlying dataset can be freed, make sure that: + - No reference to previous datasets or experiences are kept in you code; + - The replay implementation doesn't keep a reference to previous + datasets (in which case, is better to store a copy of the raw + tensors instead); + - The benchmark is being generated using a lazy initializer. + + By dropping previous experiences, those experiences will no longer be + available in the stream. 
Trying to access them will result in an + exception. + + :param to_exp: The ID of the last exp to drop (inclusive). Can be a + negative number, in which case this method doesn't have any effect. + Can be greater or equal to the stream length, in which case all + currently loaded experiences will be dropped. + :return: None + """ + self.benchmark.stream_definitions[ + self.name + ].exps_data.drop_previous_experiences(to_exp) + + +@runtime_checkable +class DetectionExperience(Protocol[TCLScenario, TCLStream]): + """Definition of a detection experience. + + A classification detection contains a set of patterns + which has become available at a particular time instant. The content and + size of an Experience is defined by the specific benchmark that creates the + IExperience instance. + + Experiences of Single Incremental Task (a.k.a. task-free) scenarios are + usually called "batches" while in Multi Task scenarios an Experience is + usually associated to a "task". Finally, in a Multi Incremental Task + scenario the Experience may be composed by patterns from different tasks. + """ + + origin_stream: TCLStream + """ + A reference to the original stream from which this experience was obtained. + """ + + benchmark: TCLScenario + """ + A reference to the benchmark. + """ + + current_experience: int + """ + This is an incremental, 0-indexed, value used to keep track of the position + of current experience in the original stream. + + Beware that this value only describes the experience position in the + original stream and may be unrelated to the order in which the strategy will + encounter experiences. + """ + + dataset: DetectionDataset + """ + The dataset containing the patterns available in this experience. + """ + + @property + @abstractmethod + def task_labels(self) -> List[int]: + """ + This list will contain the unique task labels of the patterns contained + in this experience. In the most common scenarios this will be a list + with a single value. Note: for scenarios that don't produce task labels, + a placeholder task label value like 0 is usually set to each pattern + (see the description of the originating scenario for details). + """ + ... + + @property + @abstractmethod + def task_label(self) -> int: + """ + The task label. This value will never have value "None". However, + for scenarios that don't produce task labels a placeholder value like 0 + is usually set. Beware that this field is meant as a shortcut to obtain + a unique task label: it assumes that only patterns labeled with a + single task label are present. If this experience contains patterns from + multiple tasks, accessing this property will result in an exception. + """ + ... + + @property + def scenario(self) -> TCLScenario: + """This property is DEPRECATED, use self.benchmark instead.""" + warnings.warn( + "Using self.scenario is deprecated in Experience. " + "Consider using self.benchmark instead.", + stacklevel=2, + ) + return self.benchmark + + +class AbstractDetectionExperience( + DetectionExperience[TGenericCLDetectionScenario, TCLStream], ABC +): + """ + Definition of a learning experience. A learning experience contains a set of + patterns which has become available at a particular time instant. The + content and size of an Experience is defined by the specific benchmark that + creates the experience. + + For instance, an experience of a New Classes scenario will contain all + patterns belonging to a subset of classes of the original training set. 
An + experience of a New Instance scenario will contain patterns from previously + seen classes. """ def __init__( - self: TDetectionExperience, + self, origin_stream: TCLStream, current_experience: int, + classes_in_this_exp: Sequence[int], + previous_classes: Sequence[int], + classes_seen_so_far: Sequence[int], + future_classes: Optional[Sequence[int]], ): """ - Creates an instance of an experience given the stream from this - experience was taken and the current experience ID. + Creates an instance of the abstract experience given the benchmark + stream, the current experience ID and data about the classes timeline. :param origin_stream: The stream from which this experience was obtained. :param current_experience: The current experience ID, as an integer. + :param classes_in_this_exp: The list of classes in this experience. + :param previous_classes: The list of classes in previous experiences. + :param classes_seen_so_far: List of classes of current and previous + experiences. + :param future_classes: The list of classes of next experiences. """ + self.origin_stream: TCLStream = origin_stream + + # benchmark keeps a reference to the base benchmark self.benchmark: TCLScenario = origin_stream.benchmark + + # current_experience is usually an incremental, 0-indexed, value used to + # keep track of the current batch/task. self.current_experience: int = current_experience - self.dataset: make_classification_dataset = ( - origin_stream.benchmark.stream_definitions[ - origin_stream.name - ].exps_data[current_experience] - ) + self.classes_in_this_experience: Sequence[int] = classes_in_this_exp + """ The list of classes in this experience """ - def _get_stream_def(self): - return self.benchmark.stream_definitions[self.origin_stream.name] + self.previous_classes: Sequence[int] = previous_classes + """ The list of classes in previous experiences """ - @property - def task_labels(self) -> List[int]: - stream_def = self._get_stream_def() - return list(stream_def.exps_task_labels[self.current_experience]) + self.classes_seen_so_far: Sequence[int] = classes_seen_so_far + """ List of classes of current and previous experiences """ + + self.future_classes: Optional[Sequence[int]] = future_classes + """ The list of classes of next experiences """ @property def task_label(self) -> int: + """ + The task label. This value will never have value "None". However, + for scenarios that don't produce task labels a placeholder value like 0 + is usually set. Beware that this field is meant as a shortcut to obtain + a unique task label: it assumes that only patterns labeled with a + single task label are present. If this experience contains patterns from + multiple tasks, accessing this property will result in an exception. + """ if len(self.task_labels) != 1: raise ValueError( "The task_label property can only be accessed " @@ -132,4 +507,77 @@ def task_label(self) -> int: return self.task_labels[0] -__all__ = ["TDetectionExperience", "DetectionCLScenario", "DetectionExperience"] +class GenericDetectionExperience( + AbstractDetectionExperience[ + TGenericCLDetectionScenario, + DetectionStream[ + TGenericDetectionExperience, TGenericCLDetectionScenario + ], + ] +): + """ + Definition of a learning experience based on a :class:`GenericCLScenario` + instance. + + This experience implementation uses the generic experience-patterns + assignment defined in the :class:`GenericCLScenario` instance. Instances of + this class are usually obtained from a benchmark stream. 
+ """ + + def __init__( + self: TGenericDetectionExperience, + origin_stream: DetectionStream[ + TGenericDetectionExperience, TGenericCLDetectionScenario + ], + current_experience: int, + ): + """ + Creates an instance of a generic experience given the stream from this + experience was taken and the current experience ID. + + :param origin_stream: The stream from which this experience was + obtained. + :param current_experience: The current experience ID, as an integer. + """ + self.dataset: DetectionDataset = ( + origin_stream.benchmark.stream_definitions[ + origin_stream.name + ].exps_data[current_experience] + ) + + ( + classes_in_this_exp, + previous_classes, + classes_seen_so_far, + future_classes, + ) = origin_stream.benchmark.get_classes_timeline( + current_experience, stream=origin_stream.name + ) + + super().__init__( + origin_stream, + current_experience, + classes_in_this_exp, + previous_classes, + classes_seen_so_far, + future_classes, + ) + + def _get_stream_def(self): + return self.benchmark.stream_definitions[self.origin_stream.name] + + @property + def task_labels(self) -> List[int]: + stream_def = self._get_stream_def() + return list(stream_def.exps_task_labels[self.current_experience]) + + +__all__ = [ + 'TGenericCLDetectionScenario', + 'TGenericDetectionExperience', + 'TGenericScenarioStream', + 'DetectionCLScenario', + 'DetectionStream', + 'AbstractDetectionExperience', + 'GenericDetectionExperience' +] diff --git a/avalanche/benchmarks/scenarios/lazy_dataset_sequence.py b/avalanche/benchmarks/scenarios/lazy_dataset_sequence.py index 6460199c7..044dfd9af 100644 --- a/avalanche/benchmarks/scenarios/lazy_dataset_sequence.py +++ b/avalanche/benchmarks/scenarios/lazy_dataset_sequence.py @@ -12,10 +12,7 @@ from collections import defaultdict from typing import Sequence, Iterable, Dict, Optional, Iterator -from avalanche.benchmarks.utils import make_classification_dataset -from avalanche.benchmarks.utils.classification_dataset import ( - ClassificationDataset, -) +from avalanche.benchmarks.utils import make_classification_dataset, AvalancheDataset class LazyDatasetSequence(Sequence[make_classification_dataset]): @@ -212,7 +209,7 @@ def load_all_experiences(self, to_exp: int = None) -> None: f"while generating experience {exp_id}." 
) - if not isinstance(generated_exp, ClassificationDataset): + if not isinstance(generated_exp, AvalancheDataset): raise ValueError( "All experience datasets must be subclasses of" " AvalancheDataset" diff --git a/avalanche/benchmarks/utils/__init__.py b/avalanche/benchmarks/utils/__init__.py index 773520ac8..c3e29b407 100644 --- a/avalanche/benchmarks/utils/__init__.py +++ b/avalanche/benchmarks/utils/__init__.py @@ -1,5 +1,6 @@ from .transforms import * from .classification_dataset import * +from .detection_dataset import * from .datasets_from_filelists import * from .torchvision_wrapper import * from .data import * diff --git a/avalanche/benchmarks/utils/classification_dataset.py b/avalanche/benchmarks/utils/classification_dataset.py index 20c4c3f4b..46eb068de 100644 --- a/avalanche/benchmarks/utils/classification_dataset.py +++ b/avalanche/benchmarks/utils/classification_dataset.py @@ -23,6 +23,7 @@ from torch.utils.data import Dataset from torch.utils.data.dataset import Subset, ConcatDataset, TensorDataset +from .collate_functions import ClassificationCollate from .data import make_avalanche_dataset, AvalancheDataset from .transform_groups import TransformGroups, DefaultTransformGroups from .data_attribute import DataAttribute @@ -216,6 +217,9 @@ def make_classification_dataset( if len(das) == 0: das = None + if collate_fn is None: + collate_fn = getattr(dataset, 'collate_fn', ClassificationCollate()) + data = ClassificationDataset( [dataset], data_attributes=das, diff --git a/avalanche/benchmarks/utils/collate_functions.py b/avalanche/benchmarks/utils/collate_functions.py index 342504691..e5ff22e52 100644 --- a/avalanche/benchmarks/utils/collate_functions.py +++ b/avalanche/benchmarks/utils/collate_functions.py @@ -10,9 +10,17 @@ ################################################################################ import itertools +from abc import ABC, abstractmethod from collections import defaultdict +from typing import List, TypeVar, Generic, Sequence, Tuple, Dict import torch +from torch import Tensor +from torch.utils.data import default_collate + +BatchT = TypeVar("BatchT") +ExampleT = TypeVar("ExampleT") +FeatureT = TypeVar("FeatureT") def classification_collate_mbatches_fn(mbatches): @@ -26,17 +34,22 @@ def classification_collate_mbatches_fn(mbatches): """ batch = [] for i in range(len(mbatches[0])): - t = classification_single_values_collate_fn( + t = classification_single_values_collate_mbatches_fn( [el[i] for el in mbatches], i ) batch.append(t) return batch -def classification_single_values_collate_fn(values_list, index): +def classification_single_values_collate_mbatches_fn(values_list, index): """ Collate function used to merge the single elements (x or y or t, - etcetera) of a minibatch of data from a classification dataset. + etcetera) of multiple minibatches of data from a classification dataset. + + Beware that this function expects a list of already batched values, + which means that it accepts a list of [mb_size, X, Y, Z, ...] tensors. + This is different from :func:`classification_single_values_collate_fn`, + which expects a flat list of tensors [X, Y, Z, ...] to be collated. This function assumes that all values are tensors of the same shape (excluding the first dimension). 
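The rename above separates two different collate responsibilities. The following torch-only sketch mirrors what the two docstrings describe (it does not call the Avalanche helpers, and the shapes are invented for illustration): per-sample values are stacked, while already batched values are concatenated along the batch dimension.

import torch

# Per-sample feature tensors, e.g. three 1x28x28 images before batching.
samples = [torch.zeros(1, 28, 28) for _ in range(3)]

# classification_single_values_collate_fn: flat [X, Y, Z] values -> stack,
# which adds the batch dimension.
stacked = torch.stack(samples)           # shape: [3, 1, 28, 28]

# classification_single_values_collate_mbatches_fn: already batched
# [mb_size, X, Y, Z] values -> concatenate along the batch dimension.
mb_a = torch.zeros(3, 1, 28, 28)
mb_b = torch.zeros(5, 1, 28, 28)
merged = torch.cat([mb_a, mb_b], dim=0)  # shape: [8, 1, 28, 28]

print(stacked.shape, merged.shape)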
@@ -49,6 +62,26 @@ def classification_single_values_collate_fn(values_list, index): return torch.cat(values_list, dim=0) +def classification_single_values_collate_fn(values_list, index): + """ + Collate function used to merge the single elements (x or y or t, + etcetera) of a minibatch of data from a classification dataset. + + This function expects a flat list of tensors [X, Y, Z, ...] to be collated. + For a version of the functions that can collate pre-batched values + [mb_size, X, Y, Z, ...], refer to + :func:`classification_single_values_collate_mbatches_fn`. + + This function assumes that all values are tensors of the same shape. + + :param values_list: The list of values to merge. + :param index: The index of the element. 0 for x values, 1 for y values, + etcetera. In this implementation, this parameter is ignored. + :return: The merged values. + """ + return torch.stack(values_list) + + def detection_collate_fn(batch): """ Collate function used when loading detection datasets using a DataLoader. @@ -83,9 +116,134 @@ def detection_collate_mbatches_fn(mbatches): return lists +class Collate(ABC, Generic[ExampleT, BatchT]): + + @abstractmethod + def collate_fn(self, batch: Sequence[ExampleT]) -> BatchT: + """ + + Merge multiple examples to create a batch. + + This function expects a list of elements as obtained from + the dataset. + + PyTorch official documentation described the default_collate_fn as: + "Function that takes in a batch of data and puts the elements within the batch + into a tensor with an additional outer dimension - batch size." + + :param batch: The list of examples. + :return: The batch. + """ + pass + + @abstractmethod + def collate_single_value_fn( + self, + feature_batch: Sequence[FeatureT], + feature_idx: int) -> Sequence[FeatureT]: + """ + Merge a specific feature to create a single-feature batch. + + This function expects a list of features. + + :param feature_batch: The list of features to be batched. + :param feature_idx: The index of the feature being batched. + This may be useful to customize how features are merged. + + :return: The batched features. + """ + pass + + @abstractmethod + def collate_batches_fn(self, batches: Sequence[BatchT]) -> BatchT: + """ + Merge multiple batches. + + This function expects a list of pre-collated batches + (as collated through :meth:`collate_fn`.) + + :param batches: A list of batches to be merged together. + :return: A batch made by collating the input batches. + """ + pass + + @abstractmethod + def collate_single_value_batches_fn( + self, + feature_batches: Sequence[Sequence[FeatureT]], + feature_idx: int) -> FeatureT: + """ + Merge a specific feature of examples contained in multiple batches. + + This function expects a list of pre-batched features. + + :param feature_batches: A list of batched features to be merged together. + :param feature_idx: The index of the feature being batched. + This may be useful to customize how features are merged. + :return: A batch of featured made by collating the input batched featured. + """ + pass + + def __call__(self, batch: List[ExampleT]) -> BatchT: + """ + Merges multiple examples to create a batch. + + In practice, this will call :meth:`collate_fn`. 
+ """ + return self.collate_fn(batch) + + +class ClassificationCollate(Collate[Tuple[Tensor, ...], Tuple[Tensor, ...]]): + + def collate_fn(self, batch): + return default_collate(batch) + + def collate_single_value_fn(self, feature_batch: Sequence[Tensor], feature_idx): + return torch.stack(feature_batch) + + def collate_batches_fn(self, batches): + batch = [] + for i in range(len(batches[0])): + t = self.collate_single_value_batches_fn( + [el[i] for el in batches], i + ) + batch.append(t) + return batch + + def collate_single_value_batches_fn( + self, + feature_batch: Sequence[Tensor], + feature_idx) -> Tensor: + return torch.cat(feature_batch, dim=0) + + +class DetectionCollate(Collate[Tuple[Tensor, Dict, int], Tuple[Tuple[Tensor], Tuple[Dict], Tuple[int]]]): + + def collate_fn(self, batch): + return detection_collate_fn(batch) + + def collate_single_value_fn(self, feature_batch, feature_idx): + return tuple(feature_batch) + + def collate_batches_fn(self, batches): + return detection_collate_mbatches_fn(batches) + + def collate_single_value_batches_fn( + self, + feature_batch: Sequence[Sequence[FeatureT]], + feature_idx) -> Sequence[FeatureT]: + flattened_features = [] + for batch in feature_batch: + flattened_features.extend(batch) + return tuple(flattened_features) + + __all__ = [ "classification_collate_mbatches_fn", - "classification_single_values_collate_fn", + "classification_single_values_collate_mbatches_fn", "detection_collate_fn", "detection_collate_mbatches_fn", + "Collate", + "ClassificationCollate", + "DetectionCollate" ] diff --git a/avalanche/benchmarks/utils/data.py b/avalanche/benchmarks/utils/data.py index 533d79eed..3801a09eb 100644 --- a/avalanche/benchmarks/utils/data.py +++ b/avalanche/benchmarks/utils/data.py @@ -34,7 +34,7 @@ TAvalancheDataset = TypeVar("TAvalancheDataset", bound="AvalancheDataset") -class AvalancheDataset(FlatData): +class AvalancheDataset(FlatData[T_co]): """Avalanche Dataset. Avlanche dataset are pytorch-compatible Datasets with some additional @@ -255,7 +255,7 @@ def _getitem_recursive_call(self, idx, group_name): element = self._transform_groups(element, group_name=group_name) return element - def __getitem__(self, idx) -> Union[T_co, Sequence[T_co]]: + def __getitem__(self, idx) -> T_co: elem = self._getitem_recursive_call( idx, self._transform_groups.current_group ) diff --git a/avalanche/benchmarks/utils/data_attribute.py b/avalanche/benchmarks/utils/data_attribute.py index 0505160ae..9cc50a5a1 100644 --- a/avalanche/benchmarks/utils/data_attribute.py +++ b/avalanche/benchmarks/utils/data_attribute.py @@ -15,6 +15,7 @@ concatenation and subsampling operations and are automatically managed by AvalancheDatasets. """ +from typing import TypeVar, Generic, Sequence, Set, Dict, Optional import torch @@ -22,7 +23,10 @@ from .flat_data import ConstantSequence, FlatData -class DataAttribute: +DataT = TypeVar("DataT") + + +class DataAttribute(Generic[DataT]): """Data attributes manage sample-wise information such as task or class labels. @@ -32,7 +36,7 @@ class labels. Data attributes can be efficiently concatenated and subsampled. """ - def __init__(self, data: IDataset, name: str = None, use_in_getitem=False): + def __init__(self, data: IDataset[DataT], name: str = None, use_in_getitem: bool = False): """Data Attribute. :param data: a sequence of values, one for each sample. 
@@ -42,16 +46,16 @@ def __init__(self, data: IDataset, name: str = None, use_in_getitem=False): :param use_in_getitem: If True, `AvalancheDataset` will add the value at the end of each sample. """ - self.name = name - self.use_in_getitem = use_in_getitem + self.name: str = name + self.use_in_getitem: bool = use_in_getitem - self._data = self._normalize_sequence(data) + self._data: FlatData = self._normalize_sequence(data) - self._uniques = None # set() - self._val_to_idx = None # dict() - self._count = None # dict() + self._uniques: Optional[Set[DataT]] = None + self._val_to_idx: Optional[Dict[DataT, Sequence[int]]] = None + self._count: Optional[Dict[DataT, int]] = None - def __getitem__(self, item): + def __getitem__(self, item) -> DataT: return self.data[item] def __len__(self): @@ -64,26 +68,18 @@ def __str__(self): return str(self.data[:]) @property - def data(self): + def data(self) -> FlatData[DataT]: return self._data @property - def uniques(self): + def uniques(self) -> Set[DataT]: """Set of unique values in the attribute.""" if self._uniques is None: - self._uniques = set() - # init. uniques with fast paths for special cases - if isinstance(self.data, ConstantSequence): - self.uniques.add(self.data[0]) - elif isinstance(self.data, DataAttribute): - self.uniques.update(self.data.uniques) - else: - for el in self.data: - self.uniques.add(el) + self._uniques = set(self.data) return self._uniques @property - def count(self): + def count(self) -> Dict[DataT, int]: """Dictionary of value -> count.""" if self._count is None: self._count = {} @@ -94,7 +90,7 @@ def count(self): return self._count @property - def val_to_idx(self): + def val_to_idx(self) -> Dict[DataT, Sequence[int]]: """Dictionary mapping unique values to indices.""" if self._val_to_idx is None: # init. val-to-idx @@ -108,7 +104,7 @@ def val_to_idx(self): self._val_to_idx[x].append(i) return self._val_to_idx - def subset(self, indices): + def subset(self, indices) -> "DataAttribute[DataT]": """Subset operation. Return a new `DataAttribute` by keeping only the elements in `indices`. @@ -122,14 +118,14 @@ def subset(self, indices): use_in_getitem=self.use_in_getitem, ) - def concat(self, other: "DataAttribute"): + def concat(self, other: "DataAttribute[DataT]") -> "DataAttribute[DataT]": """Concatenation operation. :param other: the other `DataAttribute` :return: the new concatenated `DataAttribute` """ assert self.name == other.name, ( - "Cannot concatenate DataAttributes" + "with different names." + "Cannot concatenate DataAttributes with different names." ) return DataAttribute( self.data.concat(other.data), @@ -155,4 +151,7 @@ def __init__(self, task_labels): super().__init__(task_labels, "task_labels", use_in_getitem=True) -__all__ = ["DataAttribute", "TaskLabels"] +__all__ = [ + "DataAttribute", + "TaskLabels" +] diff --git a/avalanche/benchmarks/utils/detection_dataset.py b/avalanche/benchmarks/utils/detection_dataset.py new file mode 100644 index 000000000..b7045ed9e --- /dev/null +++ b/avalanche/benchmarks/utils/detection_dataset.py @@ -0,0 +1,851 @@ +################################################################################ +# Copyright (c) 2021 ContinualAI. # +# Copyrights licensed under the MIT License. # +# See the accompanying LICENSE file for terms. 
# +# # +# Date: 12-05-2020 # +# Author(s): Lorenzo Pellegrini, Antonio Carta # +# E-mail: contact@continualai.org # +# Website: avalanche.continualai.org # +################################################################################ + +""" +This module contains the implementation of the ``DetectionDataset``, +which is the dataset used for supervised continual learning benchmarks. +DetectionDatasets are ``AvalancheDatasets`` that manage targets and task +labels automatically. Concatenation and subsampling operations are optimized +to be used frequently, as is common in replay strategies. +""" +import warnings +from collections import defaultdict, deque +from functools import partial +from typing import ( + List, + Any, + Sequence, + Union, + Optional, + TypeVar, + Callable, + Dict, + Tuple, + Mapping, ) + +import torch +from torch import Tensor +from torch.utils.data import Dataset +from torch.utils.data.dataset import Subset, ConcatDataset +from typing_extensions import Protocol + +from .collate_functions import DetectionCollate +from .data import AvalancheDataset +from .data_attribute import DataAttribute +from .dataset_definitions import ( + IDatasetWithTargets, ) +from .dataset_utils import ( + SubSequence, + find_list_from_index, +) +from .flat_data import ConstantSequence +from .transform_groups import TransformGroups, DefaultTransformGroups + +T_co = TypeVar("T_co", covariant=True) +TAvalancheDataset = TypeVar("TAvalancheDataset", bound="AvalancheDataset") +TTargetType = Dict[str, Tensor] + + +# Info: https://mypy.readthedocs.io/en/stable/protocols.html#callback-protocols +class XComposedTransformDef(Protocol): + def __call__(self, *input_values: Any) -> Any: + pass + + +class XTransformDef(Protocol): + def __call__(self, input_value: Any) -> Any: + pass + + +class YTransformDef(Protocol): + def __call__(self, input_value: Any) -> Any: + pass + + +XTransform = Optional[Union[XTransformDef, XComposedTransformDef]] +YTransform = Optional[YTransformDef] +TransformGroupDef = Union[None, XTransform, Tuple[XTransform, YTransform]] + + +SupportedDetectionDataset = Union[ + IDatasetWithTargets, + Subset, + ConcatDataset, +] + +DetectionExampleT = Tuple[Tensor, TTargetType, int] # Image (tensor), target dict, task label + + +class DetectionDataset(AvalancheDataset, IDatasetWithTargets[DetectionExampleT, TTargetType]): + def __init__(self, *args, **kwargs): + # Here defined only to provide type hinting + self.targets_task_labels: DataAttribute[int] = DataAttribute( + [], + name='targets_task_labels', + use_in_getitem=True + ) + self.targets: DataAttribute[Dict[str, Tensor]] = DataAttribute( + [], + name='targets', + use_in_getitem=False + ) + + del self.targets_task_labels + del self.targets + + super().__init__(*args, **kwargs) + + assert hasattr(self, 'targets_task_labels') + assert hasattr(self, 'targets') + + def subset(self, indices): + data = super().subset(indices) + return data.with_transforms(self._transform_groups.current_group) + + def concat(self, other): + data = super().concat(other) + return data.with_transforms(self._transform_groups.current_group) + + @property + def task_pattern_indices(self): + """A dictionary mapping task ids to their sample indices.""" + return self.targets_task_labels.val_to_idx + + @property + def task_set(self): + """Returns the dataset's ``TaskSet``, which is a mapping .""" + return DetectionTaskSet(self) + + +def make_detection_dataset( + dataset: SupportedDetectionDataset, + *, + transform: XTransform = None, + target_transform: YTransform = 
None, + transform_groups: Dict[str, TransformGroupDef] = None, + initial_transform_group: str = None, + task_labels: Union[int, Sequence[int]] = None, + targets: Sequence[TTargetType] = None, + collate_fn: Callable[[List], Any] = None +): + """Avalanche Detection Dataset. + + Supervised continual learning benchmarks in Avalanche return instances of + this dataset, but it can also be used in a completely standalone manner. + + This dataset applies input/target transformations, it supports + slicing and advanced indexing and it also contains useful fields as + `targets`, which contains the pattern dictionaries, and `targets_task_labels`, + which contains the pattern task labels. The `task_set` field can be used to + obtain a the subset of patterns labeled with a given task label. + + This dataset can also be used to apply several advanced operations involving + transformations. For instance, it allows the user to add and replace + transformations, freeze them so that they can't be changed, etc. + + This dataset also allows the user to keep distinct transformations groups. + Simply put, a transformation group is a pair of transform+target_transform + (exactly as in torchvision datasets). This dataset natively supports keeping + two transformation groups: the first, 'train', contains transformations + applied to training patterns. Those transformations usually involve some + kind of data augmentation. The second one is 'eval', that will contain + transformations applied to test patterns. Having both groups can be + useful when, for instance, in need to test on the training data (as this + process usually involves removing data augmentation operations). Switching + between transformations can be easily achieved by using the + :func:`train` and :func:`eval` methods. + + Moreover, arbitrary transformation groups can be added and used. For more + info see the constructor and the :func:`with_transforms` method. + + This dataset will try to inherit the task labels from the input + dataset. If none are available and none are given via the `task_labels` + parameter, each pattern will be assigned a default task label 0. + + Creates a ``AvalancheDataset`` instance. + + :param dataset: The dataset to decorate. Beware that + AvalancheDataset will not overwrite transformations already + applied by this dataset. + :param transform: A function/transform that takes the X value of a + pattern from the original dataset and returns a transformed version. + :param target_transform: A function/transform that takes in the target + and transforms it. + :param transform_groups: A dictionary containing the transform groups. + Transform groups are used to quickly switch between training and + eval (test) transformations. This becomes useful when in need to + test on the training dataset as test transformations usually don't + contain random augmentations. ``AvalancheDataset`` natively supports + the 'train' and 'eval' groups by calling the ``train()`` and + ``eval()`` methods. When using custom groups one can use the + ``with_transforms(group_name)`` method instead. Defaults to None, + which means that the current transforms will be used to + handle both 'train' and 'eval' groups (just like in standard + ``torchvision`` datasets). + :param initial_transform_group: The name of the initial transform group + to be used. Defaults to None, which means that the current group of + the input dataset will be used (if an AvalancheDataset). If the + input dataset is not an AvalancheDataset, then 'train' will be + used. 
+ :param task_labels: The task label of each instance. Must be a sequence + of ints, one for each instance in the dataset. Alternatively can be + a single int value, in which case that value will be used as the + task label for all the instances. Defaults to None, which means that + the dataset will try to obtain the task labels from the original + dataset. If no task labels could be found, a default task label + 0 will be applied to all instances. + :param targets: The dictionary of detection boxes of each pattern. + Defaults to None, which means that the targets will be retrieved from + the dataset (if possible). + :param collate_fn: The function to use when slicing to merge single + patterns. This function is the function used in the data loading + process, too. If None, the constructor will check if a + `collate_fn` field exists in the dataset. If no such field exists, + the default collate function for detection will be used. + """ + transform_gs = _init_transform_groups( + transform_groups, + transform, + target_transform, + initial_transform_group, + dataset, + ) + targets = _init_targets(dataset, targets) + task_labels = _init_task_labels(dataset, task_labels) + + das = [] + if targets is not None: + das.append(targets) + if task_labels is not None: + das.append(task_labels) + if len(das) == 0: + das = None + + if collate_fn is None: + collate_fn = getattr(dataset, 'collate_fn', DetectionCollate()) + + data = DetectionDataset( + [dataset], + data_attributes=das, + transform_groups=transform_gs, + collate_fn=collate_fn, + ) + if initial_transform_group is not None: + return data.with_transforms(initial_transform_group) + else: + return data + + +def _init_transform_groups( + transform_groups, + transform, + target_transform, + initial_transform_group, + dataset, +): + if transform_groups is not None and ( + transform is not None or target_transform is not None + ): + raise ValueError( + "transform_groups can't be used with transform" + "and target_transform values" + ) + + if transform_groups is not None: + _check_groups_dict_format(transform_groups) + + if initial_transform_group is None: + # Detect from the input dataset. If not an AvalancheDataset then + # use 'train' as the initial transform group + if ( + isinstance(dataset, DetectionDataset) + and dataset._transform_groups is not None + ): + initial_transform_group = dataset._transform_groups.current_group + else: + initial_transform_group = "train" + + if transform_groups is None: + if target_transform is None and transform is None: + tgs = None + else: + tgs = TransformGroups( + { + "train": (transform, target_transform), + "eval": (transform, target_transform), + }, + current_group=initial_transform_group, + ) + else: + tgs = TransformGroups( + transform_groups, current_group=initial_transform_group + ) + return tgs + + +def _check_groups_dict_format(groups_dict): + # The original groups_dict must be convertible to native Python dict + groups_dict = dict(groups_dict) + + # Check if the format of the groups is correct + for map_key in groups_dict: + if not isinstance(map_key, str): + raise ValueError( + "Every group must be identified by a string." + 'Wrong key was: "' + str(map_key) + '"' + ) + + if "test" in groups_dict: + warnings.warn( + 'A transformation group named "test" has been found. Beware ' + "that by default AvalancheDataset supports test transformations" + ' through the "eval" group. Consider using that one!' 
+ ) + + +def _init_targets(dataset, targets, check_shape=True): + if targets is not None: + # User defined targets always take precedence + if len(targets) != len(dataset) and check_shape: + raise ValueError( + "Invalid amount of target labels. It must be equal to the " + "number of patterns in the dataset. Got {}, expected " + "{}!".format(len(targets), len(dataset)) + ) + return DataAttribute(targets, "targets") + + if isinstance(dataset, DetectionDataset): + return None # targets are initialized automatically + else: + targets = _traverse_supported_dataset(dataset, _select_targets) + + if targets is None: + return None + return DataAttribute(targets, "targets") + + +def _init_task_labels(dataset, task_labels, check_shape=True): + """A task label for each pattern in the dataset.""" + if task_labels is not None: + # task_labels has priority over the dataset fields + if isinstance(task_labels, int): + task_labels = ConstantSequence(task_labels, len(dataset)) + elif len(task_labels) != len(dataset) and check_shape: + raise ValueError( + "Invalid amount of task labels. It must be equal to the " + "number of patterns in the dataset. Got {}, expected " + "{}!".format(len(task_labels), len(dataset)) + ) + tls = SubSequence(task_labels, converter=int) + else: + if isinstance(dataset, DetectionDataset): + tls = None + else: + task_labels = _traverse_supported_dataset( + dataset, _select_task_labels + ) + tls = SubSequence(task_labels, converter=int) + + if tls is None: + return None + return DataAttribute(tls, "targets_task_labels", use_in_getitem=True) + + +def _detection_class_mapping_transform(class_mapping, example_target_dict): + example_target_dict = dict(example_target_dict) + + # example_target_dict["labels"] is a tensor containing one label + # for each bounding box in the image. We need to remap each of them + example_target_labels = example_target_dict["labels"] + example_mapped_labels = [class_mapping[int(el)] for el in example_target_labels] + + if isinstance(example_target_labels, Tensor): + example_mapped_labels = torch.as_tensor(example_mapped_labels) + + example_target_dict["labels"] = example_mapped_labels + + return example_target_dict + + +def detection_subset( + dataset: SupportedDetectionDataset, + indices: Sequence[int] = None, + *, + class_mapping: Sequence[int] = None, + transform: Callable[[Any], Any] = None, + target_transform: Callable[[int], int] = None, + transform_groups: Dict[str, Tuple[XTransform, YTransform]] = None, + initial_transform_group: str = None, + task_labels: Union[int, Sequence[int]] = None, + targets: Sequence[TTargetType] = None, + collate_fn: Callable[[List], Any] = None +): + """Creates an ``AvalancheSubset`` instance. + + For simple subset operations you should use the method + `dataset.subset(indices)`. + Use this constructor only if you need to redefine transformation or + class/task labels. + + A Dataset that behaves like a PyTorch :class:`torch.utils.data.Subset`. + This Dataset also supports transformations, slicing, advanced indexing, + the targets field, class mapping and all the other goodies listed in + :class:`AvalancheDataset`. + + :param dataset: The whole dataset. + :param indices: Indices in the whole set selected for subset. Can + be None, which means that the whole dataset will be returned. + :param class_mapping: A list that, for each possible class label value, + contains its corresponding remapped value. Can be None. 
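The `class_mapping` remapping performed by `_detection_class_mapping_transform` can be shown in isolation. The torch-only sketch below replicates the logic of that helper rather than calling it; the mapping and target dictionary are invented.

import torch

# Hypothetical mapping: original label i -> class_mapping[i].
class_mapping = [0, 5, 6, 7]

target = {
    "boxes": torch.tensor([[0.0, 0.0, 10.0, 10.0],
                           [5.0, 5.0, 20.0, 20.0]]),
    "labels": torch.tensor([1, 3]),
}

# Each per-box label is looked up in class_mapping; the tensor type of the
# "labels" field is preserved, as done by the helper above.
remapped = dict(target)
remapped["labels"] = torch.as_tensor(
    [class_mapping[int(lbl)] for lbl in target["labels"]])
print(remapped["labels"])   # tensor([5, 7])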
+ :param transform: A function/transform that takes the X value of a + pattern from the original dataset and returns a transformed version. + :param target_transform: A function/transform that takes in the target + and transforms it. + :param transform_groups: A dictionary containing the transform groups. + Transform groups are used to quickly switch between training and + eval (test) transformations. This becomes useful when in need to + test on the training dataset as test transformations usually don't + contain random augmentations. ``AvalancheDataset`` natively supports + the 'train' and 'eval' groups by calling the ``train()`` and + ``eval()`` methods. When using custom groups one can use the + ``with_transforms(group_name)`` method instead. Defaults to None, + which means that the current transforms will be used to + handle both 'train' and 'eval' groups (just like in standard + ``torchvision`` datasets). + :param initial_transform_group: The name of the initial transform group + to be used. Defaults to None, which means that the current group of + the input dataset will be used (if an AvalancheDataset). If the + input dataset is not an AvalancheDataset, then 'train' will be + used. + :param task_labels: The task label for each instance. Must be a sequence + of ints, one for each instance in the dataset. This can either be a + list of task labels for the original dataset or the list of task + labels for the instances of the subset (an automatic detection will + be made). In the unfortunate case in which the original dataset and + the subset contain the same amount of instances, then this parameter + is considered to contain the task labels of the subset. + Alternatively can be a single int value, in which case + that value will be used as the task label for all the instances. + Defaults to None, which means that the dataset will try to + obtain the task labels from the original dataset. If no task labels + could be found, a default task label 0 will be applied to all + instances. + :param targets: The target dictionary of each pattern. Defaults to None, + which means that the targets will be retrieved from the dataset (if + possible). This can either be a list of target dictionaries for the + original dataset or the list of target dictionaries for the instances + of the subset (an automatic detection will be made). In the + unfortunate case in which the original dataset and the subset contain + the same amount of instances, then this parameter is considered to + contain the target dictionaries of the subset. + :param collate_fn: The function to use when slicing to merge single + patterns. This function is the function used in the data loading + process, too. If None, the constructor will check if a + `collate_fn` field exists in the dataset. 
If no such field exists, + the default collate function for detection will be used + """ + if isinstance(dataset, DetectionDataset): + if ( + class_mapping is None + and transform is None + and target_transform is None + and transform_groups is None + and initial_transform_group is None + and task_labels is None + and targets is None + and collate_fn is None + ): + return dataset.subset(indices) + + targets = _init_targets(dataset, targets, check_shape=False) + task_labels = _init_task_labels(dataset, task_labels, check_shape=False) + transform_gs = _init_transform_groups( + transform_groups, + transform, + target_transform, + initial_transform_group, + dataset, + ) + + if initial_transform_group is not None and isinstance( + dataset, AvalancheDataset + ): + dataset = dataset.with_transforms(initial_transform_group) + + if class_mapping is not None: # update targets + + if targets is None: + targets = dataset.targets + + tgs = [_detection_class_mapping_transform(class_mapping, example_target_dict) + for example_target_dict in targets] + + targets = DataAttribute(tgs, "targets") + + if class_mapping is not None: + mapping_fn = partial(_detection_class_mapping_transform, class_mapping) + frozen_transform_groups = DefaultTransformGroups( + (None, mapping_fn) + ) + else: + frozen_transform_groups = None + + das = [] + if targets is not None: + das.append(targets) + if task_labels is not None: + das.append(task_labels) + if len(das) == 0: + das = None + + if collate_fn is None: + collate_fn = DetectionCollate() + + return DetectionDataset( + [dataset], + indices=indices, + data_attributes=das, + transform_groups=transform_gs, + frozen_transform_groups=frozen_transform_groups, + collate_fn=collate_fn, + ) + + +def concat_detection_datasets( + datasets: List[SupportedDetectionDataset], + *, + transform: Callable[[Any], Any] = None, + target_transform: Callable[[int], int] = None, + transform_groups: Dict[str, Tuple[XTransform, YTransform]] = None, + initial_transform_group: str = None, + task_labels: Union[int, Sequence[int], Sequence[Sequence[int]]] = None, + targets: Union[ + Sequence[TTargetType], Sequence[Sequence[TTargetType]] + ] = None, + collate_fn: Callable[[List], Any] = None +): + """Creates a ``AvalancheConcatDataset`` instance. + + For simple subset operations you should use the method + `dataset.concat(other)` or + `concat_datasets` from `avalanche.benchmarks.utils.utils`. + Use this constructor only if you need to redefine transformation or + class/task labels. + + A Dataset that behaves like a PyTorch + :class:`torch.utils.data.ConcatDataset`. However, this Dataset also supports + transformations, slicing, advanced indexing and the targets field and all + the other goodies listed in :class:`AvalancheDataset`. + + This dataset guarantees that the operations involving the transformations + and transformations groups are consistent across the concatenated dataset + (if they are subclasses of :class:`AvalancheDataset`). + + :param datasets: A collection of datasets. + :param transform: A function/transform that takes the X value of a + pattern from the original dataset and returns a transformed version. + :param target_transform: A function/transform that takes in the target + and transforms it. + :param transform_groups: A dictionary containing the transform groups. + Transform groups are used to quickly switch between training and + eval (test) transformations. 
This becomes useful when in need to + test on the training dataset as test transformations usually don't + contain random augmentations. ``AvalancheDataset`` natively supports + the 'train' and 'eval' groups by calling the ``train()`` and + ``eval()`` methods. When using custom groups one can use the + ``with_transforms(group_name)`` method instead. Defaults to None, + which means that the current transforms will be used to + handle both 'train' and 'eval' groups (just like in standard + ``torchvision`` datasets). + :param initial_transform_group: The name of the initial transform group + to be used. Defaults to None, which means that if all + AvalancheDatasets in the input datasets list agree on a common + group (the "current group" is the same for all datasets), then that + group will be used as the initial one. If the list of input datasets + does not contain an AvalancheDataset or if the AvalancheDatasets + do not agree on a common group, then 'train' will be used. + :param targets: The label of each pattern. Can either be a sequence of + labels or, alternatively, a sequence containing sequences of labels + (one for each dataset to be concatenated). Defaults to None, which + means that the targets will be retrieved from the datasets (if + possible). + :param task_labels: The task labels for each pattern. Must be a sequence + of ints, one for each pattern in the dataset. Alternatively, task + labels can be expressed as a sequence containing sequences of ints + (one for each dataset to be concatenated) or even a single int, + in which case that value will be used as the task label for all + instances. Defaults to None, which means that the dataset will try + to obtain the task labels from the original datasets. If no task + labels could be found for a dataset, a default task label 0 will + be applied to all patterns of that dataset. + :param collate_fn: The function to use when slicing to merge single + patterns. This function is the function used in the data loading + process, too. If None, the constructor will check if a `collate_fn` + field exists in the first dataset. If no such field exists, the + default collate function for detection will be used. + Beware that the chosen collate function will be applied to all + the concatenated datasets even if a different collate is defined + in different datasets. 
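A self-contained usage sketch for `make_detection_dataset` and `concat_detection_datasets`. It assumes this patch is installed (both helpers are re-exported from `avalanche.benchmarks.utils`); the toy images, boxes and labels are invented. With every optional argument left to `None`, the function body that follows takes the fast `d_a.concat(d_b)` path instead of re-wrapping the datasets.

import torch
from avalanche.benchmarks.utils import (
    make_detection_dataset,
    concat_detection_datasets,
)

def toy_detection_set(label):
    # One image with a single dummy bounding box (illustration only).
    image = torch.rand(3, 32, 32)
    target = {"boxes": torch.tensor([[0.0, 0.0, 8.0, 8.0]]),
              "labels": torch.tensor([label])}
    return make_detection_dataset(
        [(image, target)],
        targets=[target],
        task_labels=0,
        transform_groups={"train": (None, None), "eval": (None, None)},
    )

d_a, d_b = toy_detection_set(1), toy_detection_set(2)
combined = concat_detection_datasets([d_a, d_b])
print(len(combined))   # 2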
+ """ + dds = [] + for dd in datasets: + if not isinstance(dd, AvalancheDataset): + dd = make_detection_dataset( + dd, + transform=transform, + target_transform=target_transform, + transform_groups=transform_groups, + initial_transform_group=initial_transform_group, + task_labels=task_labels, + targets=targets, + collate_fn=collate_fn, + ) + dds.append(dd) + if ( + transform is None + and target_transform is None + and transform_groups is None + and initial_transform_group is None + and task_labels is None + and targets is None + and collate_fn is None + and len(datasets) > 0 + ): + d0 = datasets[0] + if isinstance(d0, DetectionDataset): + for d1 in datasets[1:]: + d0 = d0.concat(d1) + return d0 + + das = [] + if len(dds) > 0: + ####################################### + # TRANSFORMATION GROUPS + ####################################### + transform_groups = _init_transform_groups( + transform_groups, + transform, + target_transform, + initial_transform_group, + dds[0], + ) + + if initial_transform_group is None: + uniform_group = None + for d_set in datasets: + if isinstance(d_set, AvalancheDataset): + if uniform_group is None: + uniform_group = d_set._transform_groups.current_group + else: + if ( + uniform_group + != d_set._transform_groups.current_group + ): + uniform_group = None + break + + if uniform_group is None: + initial_transform_group = "train" + else: + initial_transform_group = uniform_group + + ####################################### + # DATA ATTRIBUTES + ####################################### + + totlen = sum([len(d) for d in datasets]) + if ( + task_labels is not None + ): # User defined targets always take precedence + if isinstance(task_labels, int): + task_labels = ConstantSequence(task_labels, totlen) + elif len(task_labels) != totlen: + raise ValueError( + "Invalid amount of target labels. It must be equal to the " + "number of patterns in the dataset. Got {}, expected " + "{}!".format(len(task_labels), totlen) + ) + das.append( + DataAttribute( + task_labels, "targets_task_labels", use_in_getitem=True + ) + ) + + if targets is not None: # User defined targets always take precedence + if len(targets) != totlen: + raise ValueError( + "Invalid amount of target dictionaries. It must be " + "equal to the number of patterns in the dataset. 
" + "Got {}, expected {}!".format(len(targets), totlen) + ) + das.append(DataAttribute(targets, "targets")) + if len(das) == 0: + das = None + data = DetectionDataset( + dds, transform_groups=transform_groups, data_attributes=das + ) + return data.with_transforms(initial_transform_group) + + +def _select_targets(dataset, indices): + if hasattr(dataset, "targets"): + # Standard supported dataset + found_targets = dataset.targets + else: + raise ValueError( + "Unsupported dataset: must have a valid targets field" + ) + + if indices is not None: + found_targets = SubSequence(found_targets, indices=indices) + + return found_targets + + +def _select_task_labels(dataset, indices): + found_task_labels = None + if hasattr(dataset, "targets_task_labels"): + found_task_labels = dataset.targets_task_labels + + if found_task_labels is None: + if isinstance(dataset, (Subset, ConcatDataset)): + return None # Continue traversing + + if found_task_labels is None: + if indices is None: + return ConstantSequence(0, len(dataset)) + return ConstantSequence(0, len(indices)) + + if indices is not None: + found_task_labels = SubSequence(found_task_labels, indices=indices) + + return found_task_labels + + +def _traverse_supported_dataset( + dataset, values_selector: Callable[[Dataset, List[int]], List], indices=None +) -> List: + initial_error = None + try: + result = values_selector(dataset, indices) + if result is not None: + return result + except BaseException as e: + initial_error = e + + if isinstance(dataset, Subset): + if indices is None: + indices = range(len(dataset)) + indices = [dataset.indices[x] for x in indices] + return list( + _traverse_supported_dataset( + dataset.dataset, values_selector, indices + ) + ) + + if isinstance(dataset, ConcatDataset): + result = [] + if indices is None: + for c_dataset in dataset.datasets: + result += list( + _traverse_supported_dataset( + c_dataset, values_selector, indices + ) + ) + return result + + datasets_to_indexes = defaultdict(list) + indexes_to_dataset = [] + datasets_len = [] + recursion_result = [] + + all_size = 0 + for c_dataset in dataset.datasets: + len_dataset = len(c_dataset) + datasets_len.append(len_dataset) + all_size += len_dataset + + for subset_idx in indices: + dataset_idx, pattern_idx = find_list_from_index( + subset_idx, datasets_len, all_size + ) + datasets_to_indexes[dataset_idx].append(pattern_idx) + indexes_to_dataset.append(dataset_idx) + + for dataset_idx, c_dataset in enumerate(dataset.datasets): + recursion_result.append( + deque( + _traverse_supported_dataset( + c_dataset, + values_selector, + datasets_to_indexes[dataset_idx], + ) + ) + ) + + result = [] + for idx in range(len(indices)): + dataset_idx = indexes_to_dataset[idx] + result.append(recursion_result[dataset_idx].popleft()) + + return result + + if initial_error is not None: + raise initial_error + + raise ValueError("Error: can't find the needed data in the given dataset") + + +class DetectionTaskSet(Mapping): + """A lazy mapping for task dataset>. + + Given a `DetectionDataset`, this class provides an + iterator that splits the data into task subsets, returning tuples + ``. + + Usage: + + .. code-block:: python + + tset = DetectionTaskSet(data) + for tid, tdata in tset: + print(f"task {tid} has {len(tdata)} examples.") + + """ + + def __init__(self, data: DetectionDataset): + """Constructor. 
+ + :param data: original data + """ + super().__init__() + self.data = data + + def __iter__(self): + return iter(self.data.targets_task_labels.uniques) + + def __getitem__(self, task_label): + tl_idx = self.data.targets_task_labels.val_to_idx[task_label] + return detection_subset(self.data, tl_idx) + + def __len__(self): + return len(self.data.targets_task_labels.uniques) + + +__all__ = [ + "SupportedDetectionDataset", + "DetectionDataset", + "make_detection_dataset", + "detection_subset", + "concat_detection_datasets", + "DetectionTaskSet", +] diff --git a/avalanche/benchmarks/utils/flat_data.py b/avalanche/benchmarks/utils/flat_data.py index 2efb8030f..46c20037b 100644 --- a/avalanche/benchmarks/utils/flat_data.py +++ b/avalanche/benchmarks/utils/flat_data.py @@ -12,14 +12,17 @@ Datasets with optimized concat/subset operations. """ import bisect -from typing import List +from typing import List, TypeVar, Optional from torch.utils.data import ConcatDataset from avalanche.benchmarks.utils.dataset_definitions import IDataset +FlatDataImplT = TypeVar('FlatDataImplT', bound='FlatData') +DataT = TypeVar("DataT") -class FlatData(IDataset): + +class FlatData(IDataset[DataT]): """FlatData is a dataset optimized for efficient repeated concatenation and subset operations. @@ -42,9 +45,9 @@ class FlatData(IDataset): def __init__( self, - datasets: List[IDataset], + datasets: List[IDataset[DataT]], indices: List[int] = None, - can_flatten=True, + can_flatten: bool = True, ): """Constructor @@ -69,13 +72,13 @@ def _get_indices(self): else: return list(range(len(self))) - def subset(self, indices: List[int]) -> "FlatData": + def subset(self: FlatDataImplT, indices: Optional[List[int]]) -> FlatDataImplT: """Subsampling operation. :param indices: indices of the new samples :return: """ - if self._can_flatten: + if self._can_flatten and indices is not None: if self._indices is None: new_indices = indices else: @@ -84,7 +87,7 @@ def subset(self, indices: List[int]) -> "FlatData": return self.__class__(datasets=self._datasets, indices=new_indices) return self.__class__(datasets=[self], indices=indices) - def concat(self, other: "FlatData") -> "FlatData": + def concat(self: FlatDataImplT, other: "FlatData") -> FlatDataImplT: """Concatenation operation. :param other: other dataset. 
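A small sketch of the index-composition behavior that `subset` (above) and `concat` implement, using a plain Python list as the wrapped dataset. It assumes `FlatData` is importable from `avalanche.benchmarks.utils.flat_data`, as in this patch.

from avalanche.benchmarks.utils.flat_data import FlatData

base = FlatData([list(range(10))])

# subset() composes indices instead of nesting wrappers: taking positions
# [0, 2, 4] and then [1, 2] resolves to the flat indices [2, 4].
sub = base.subset([0, 2, 4]).subset([1, 2])
print(sub[0], sub[1])          # 2 4

# concat() merges flattenable FlatData objects instead of building a tree
# of nested concatenations.
both = sub.concat(base.subset([9]))
print(len(both), both[2])      # 3 9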
@@ -172,7 +175,7 @@ def _get_idx(self, idx): idx = idx - self._cumulative_sizes[dataset_idx - 1] return dataset_idx, int(idx) - def __getitem__(self, idx): + def __getitem__(self, idx) -> DataT: dataset_idx, idx = self._get_idx(idx) return self._datasets[dataset_idx][idx] @@ -183,10 +186,10 @@ def __len__(self): return len(self._indices) return self._cumulative_sizes[-1] - def __add__(self, other: "FlatData") -> "FlatData": + def __add__(self, other: FlatDataImplT) -> FlatDataImplT: return self.concat(other) - def __radd__(self, other: "FlatData") -> "FlatData": + def __radd__(self, other: FlatDataImplT) -> FlatDataImplT: return other.concat(self) @@ -240,7 +243,7 @@ def __str__(self): ) -def _flatten_dataset_list(datasets: List[FlatData]): +def _flatten_dataset_list(datasets: List[IDataset[DataT]]) -> List[IDataset[DataT]]: """Flatten dataset tree if possible.""" # Concat -> Concat branch # Flattens by borrowing the list of concatenated datasets @@ -259,7 +262,7 @@ def _flatten_dataset_list(datasets: List[FlatData]): flattened_list.append(dataset) # merge consecutive Subsets if compatible - new_data_list = [] + new_data_list: List[IDataset[DataT]] = [] for dataset in flattened_list: if ( isinstance(dataset, FlatData) diff --git a/avalanche/distributed/distributed_batch.py b/avalanche/distributed/distributed_batch.py index f5d0a281b..f33bf3de3 100644 --- a/avalanche/distributed/distributed_batch.py +++ b/avalanche/distributed/distributed_batch.py @@ -1,5 +1,5 @@ from abc import abstractmethod, ABC -from typing import TypeVar, List, Optional, Callable, Any +from typing import TypeVar, List, Optional, Callable, Any, Iterable import torch from torch import Tensor @@ -31,6 +31,10 @@ def _merge_objects(self, objects: List[LocalT]) -> DistributedT: pass +class OnlyTupleSynchronizationSupported(BaseException): + pass + + class DistributedBatch(DistributedObject[LocalT, LocalT], ABC): """ An intermediate abstract class in charge of synchronizing data batches. @@ -63,24 +67,33 @@ def _set_local(self, new_local_value): super()._set_local(new_local_value) def _merge_objects(self, objects: List[LocalT]) -> LocalT: - if self._value_is_tuple: - return self._merge_tuples(objects) - else: - return self._merge_single_values(objects, 0) + if not self._value_is_tuple: + try: + return self._merge_single_values(objects, 0) + except OnlyTupleSynchronizationSupported: + pass + + return self._merge_tuples(objects) def _merge_tuples(self, tuples: List[LocalT]): - merged_elements = [] - n_elements = len(self._local_value) - for element_idx in range(n_elements): - to_merge_elements = [] - for tp in tuples: - to_merge_elements.append(tp[element_idx]) + try: + merged_elements = [] + # Note: _local_value is usually a tuple (mb_x, mb_y, ...) 
+ # which means that n_elements is usually == 2 or 3 + + n_elements = len(self._local_value) + for element_idx in range(n_elements): + to_merge_elements = [] + for tp in tuples: + to_merge_elements.append(tp[element_idx]) - merged_elements.append( - self._merge_single_values(to_merge_elements, element_idx) - ) + merged_elements.append( + self._merge_single_values(to_merge_elements, element_idx) + ) - return tuple(merged_elements) + return tuple(merged_elements) + except OnlyTupleSynchronizationSupported: + raise RuntimeError('[DistributedBatch] No proper collate function set.') @abstractmethod def _merge_single_values(self, values: List, value_index: int): @@ -91,23 +104,52 @@ class CollateDistributedBatch(DistributedBatch[LocalT]): """ An implementation of :class:`DistributedBatch` in which the `_merge_tuples` mechanism is given as a callable function. + + This assumes that local batches are locally pre-collated and + will thus unroll them before calling the given function. """ def __init__(self, name: str, initial_local_value: LocalT, tuples_collate_fn: Optional[Callable[[List], LocalT]], - single_values_collate_fn: Callable[[Any, int], Any]): + single_values_collate_fn: Optional[Callable[[Any, int], Any]]): super().__init__(name, initial_local_value) self.tuples_collate_fn = tuples_collate_fn self.single_values_collate_fn = single_values_collate_fn + def _unroll_minibatch(self, tuples: List[LocalT]) -> List[LocalT]: + unrolled_elements = [] + for local_tuple in tuples: + n_elements = len(local_tuple) + mb_size = len(local_tuple[0]) + + for mb_element_idx in range(mb_size): + mb_element = [] + for tuple_element_idx in range(n_elements): + mb_element.append(local_tuple[tuple_element_idx][mb_element_idx]) + unrolled_elements.append(tuple(mb_element)) + return unrolled_elements + + def _unroll_value(self, collated_values: List[Iterable[Any]]) -> Any: + unrolled_values = [] + for val_batch in collated_values: + unrolled_values.extend(val_batch) + + return unrolled_values + def _merge_tuples(self, tuples: List[LocalT]): - if self.tuples_collate_fn is None: - return super()._merge_tuples(tuples) + if self.tuples_collate_fn is not None: + unrolled_elements = self._unroll_minibatch(tuples) + + return self.tuples_collate_fn(unrolled_elements) - return self.tuples_collate_fn(tuples) + return super()._merge_tuples(tuples) def _merge_single_values(self, values: List, value_index: int): - return self.single_values_collate_fn(values, value_index) + if self.single_values_collate_fn is None: + raise OnlyTupleSynchronizationSupported() + + unrolled_elements = self._unroll_value(values) + return self.single_values_collate_fn(unrolled_elements, value_index) def make_classification_distributed_batch(name: str) -> \ @@ -117,7 +159,7 @@ def make_classification_distributed_batch(name: str) -> \ are Tensors. Values are obtained by concatenating these tensors. 
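A torch-only sketch of the unroll-then-recollate flow implemented by `_unroll_minibatch` above: each process contributes an already collated batch, the batches are flattened back into single examples, and the examples are re-collated with a per-sample collate function (which is why the classification helper below now uses `torch.stack` instead of `torch.cat`). Shapes and values are invented.

import torch

# Two locally collated (mb_x, mb_y) batches, as if produced by two processes.
batch_rank0 = (torch.zeros(2, 3), torch.tensor([0, 1]))
batch_rank1 = (torch.ones(3, 3), torch.tensor([2, 3, 4]))

def unroll(batches):
    # Mirrors _unroll_minibatch: flatten pre-collated batches back into
    # single (x, y) examples.
    examples = []
    for mb in batches:
        for i in range(len(mb[0])):
            examples.append(tuple(feature[i] for feature in mb))
    return examples

examples = unroll([batch_rank0, batch_rank1])
global_x = torch.stack([ex[0] for ex in examples])   # per-sample collate
global_y = torch.stack([ex[1] for ex in examples])
print(global_x.shape, global_y.shape)   # torch.Size([5, 3]) torch.Size([5])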
""" return CollateDistributedBatch( - name, None, None, lambda x, y: torch.cat(x) + name, None, None, lambda x, y: torch.stack(x) ) diff --git a/avalanche/distributed/distributed_consistency_verification.py b/avalanche/distributed/distributed_consistency_verification.py new file mode 100644 index 000000000..39bfbe211 --- /dev/null +++ b/avalanche/distributed/distributed_consistency_verification.py @@ -0,0 +1,74 @@ +from typing import Tuple, TYPE_CHECKING + +import torch +from torch import Tensor +from torch.nn import Module + +if TYPE_CHECKING: + from avalanche.benchmarks import GenericCLScenario + + +def hash_benchmark(benchmark: 'GenericCLScenario') -> str: + import hashlib + import io + + hash_engine = hashlib.sha256() + for stream_name, stream in benchmark.streams.items(): + hash_engine.update(stream_name.encode()) + for experience in stream: + exp_dataset = experience.dataset + dataset_content = exp_dataset[:] + for tuple_elem in dataset_content: + # https://stackoverflow.com/a/63880190 + buff = io.BytesIO() + torch.save(tuple_elem, buff) + buff.seek(0) + hash_engine.update(buff.read()) + return hash_engine.hexdigest() + + +def hash_minibatch(minibatch: Tuple[Tensor]) -> str: + import hashlib + import io + + hash_engine = hashlib.sha256() + for tuple_elem in minibatch: + buff = io.BytesIO() + torch.save(tuple_elem, buff) + buff.seek(0) + hash_engine.update(buff.read()) + return hash_engine.hexdigest() + + +def hash_tensor(tensor: Tensor) -> str: + import hashlib + import io + + hash_engine = hashlib.sha256() + buff = io.BytesIO() + torch.save(tensor, buff) + buff.seek(0) + hash_engine.update(buff.read()) + return hash_engine.hexdigest() + + +def hash_model(model: Module) -> str: + import hashlib + import io + + hash_engine = hashlib.sha256() + for name, param in model.named_parameters(): + hash_engine.update(name.encode()) + buff = io.BytesIO() + torch.save(param, buff) + buff.seek(0) + hash_engine.update(buff.read()) + return hash_engine.hexdigest() + + +__all__ = [ + 'hash_benchmark', + 'hash_minibatch', + 'hash_tensor', + 'hash_model' +] diff --git a/avalanche/distributed/distributed_helper.py b/avalanche/distributed/distributed_helper.py index 52b4578b9..46f36309a 100644 --- a/avalanche/distributed/distributed_helper.py +++ b/avalanche/distributed/distributed_helper.py @@ -1,11 +1,9 @@ import os -import random +import pickle import warnings -from collections import OrderedDict from io import BytesIO -from typing import Optional, List, Tuple +from typing import Optional, List -import numpy as np import torch from torch import Tensor from torch.distributed import init_process_group @@ -13,9 +11,7 @@ from torch.nn.parallel import DistributedDataParallel from typing_extensions import Literal -from avalanche.benchmarks import GenericCLScenario - -import pickle +from avalanche.distributed.distributed_consistency_verification import hash_tensor class _Singleton(type): @@ -28,7 +24,7 @@ def __call__(cls, *args, **kwargs): return cls._instances[cls] -class _RollingSeedContext(object): +class RollingSeedContext(object): """ Implement seed alignment by storing random number generators state. 
@@ -38,20 +34,19 @@ class _RollingSeedContext(object): - change the global state of random number generators """ def __init__(self): - self.generators_state = None + self.rng_manager_state = None def save_generators_state(self): - self.generators_state = dict() - for gen_name, gen_def in DistributedHelper.random_generators.items(): - self.generators_state[gen_name] = gen_def['save_state']() + from avalanche.training.determinism.rng_manager import RNGManager + self.rng_manager_state = RNGManager.__getstate__() def load_generators_state(self): - for gen_name, gen_def in DistributedHelper.random_generators.items(): - gen_def['load_state'](self.generators_state[gen_name]) + from avalanche.training.determinism.rng_manager import RNGManager + self.rng_manager_state = RNGManager.__setstate__(self.rng_manager_state) def step_random_generators(self): - for gen_name, gen_def in DistributedHelper.random_generators.items(): - gen_def['step']() + from avalanche.training.determinism.rng_manager import RNGManager + RNGManager.step_generators() def __enter__(self): self.save_generators_state() @@ -61,11 +56,11 @@ def __exit__(self, *_): self.step_random_generators() -class _BroadcastSeedContext(object): +class BroadcastSeedContext(object): """ Implement seed alignment by broadcasting a new seed from the main process. - This is usually slower than using :class:`_RollingSeedContext`. + This is usually slower than using :class:`RollingSeedContext`. """ def __init__(self): pass @@ -90,9 +85,9 @@ def __init__( seed_alignment: Literal["rolling", "broadcast"] = 'rolling', final_barrier: bool = False): if seed_alignment == 'rolling': - self._seed_aligner = _RollingSeedContext() + self._seed_aligner = RollingSeedContext() else: - self._seed_aligner = _BroadcastSeedContext() + self._seed_aligner = BroadcastSeedContext() self._final_barrier = final_barrier @@ -117,35 +112,14 @@ class _DistributedHelperCls(object): __metaclass__ = _Singleton def __init__(self): - self.use_cuda = True - - self.random_generators = OrderedDict() - - self.register_random_generator('torch', { - 'seed': torch.random.manual_seed, - 'save_state': torch.random.get_rng_state, - 'load_state': torch.random.set_rng_state, - 'step': lambda: torch.rand(1) - }) - - self.register_random_generator('numpy', { - 'seed': np.random.seed, - 'save_state': np.random.get_state, - 'load_state': np.random.set_state, - 'step': lambda: np.random.rand(1) - }) - - self.register_random_generator('random', { - 'seed': random.seed, - 'save_state': random.getstate, - 'load_state': random.setstate, - 'step': random.random - }) + self.use_cuda = False def init_distributed(self, random_seed, backend=None, use_cuda=True): if self.is_distributed: raise RuntimeError('Distributed API already initialized') + use_cuda = use_cuda and torch.cuda.is_available() + if backend is None: if use_cuda: backend = 'nccl' @@ -166,7 +140,7 @@ def init_distributed(self, random_seed, backend=None, use_cuda=True): self.set_random_seeds(random_seed) self.use_cuda = use_cuda - if use_cuda or backend == 'nccl': + if use_cuda or backend == 'nccl': # TODO: remove in final release # https://github.com/pytorch/pytorch/issues/6351 torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False @@ -180,7 +154,7 @@ def get_device_id(self): else: device_id = 0 - if self.use_cuda and torch.cuda.is_available(): + if self.use_cuda: return device_id return -1 @@ -191,7 +165,7 @@ def make_device(self): else: device_id = 0 - if self.use_cuda and device_id >= 0 and torch.cuda.is_available(): + 
if self.use_cuda and device_id >= 0: ref_device = torch.device(f'cuda:{device_id}') torch.cuda.set_device(ref_device) else: @@ -220,16 +194,9 @@ def unwrap_model(self, model: Module) -> Module: return model - def register_random_generator(self, name: str, rng_def: dict): - if 'save_state' not in rng_def or \ - 'load_state' not in rng_def or 'step' not in rng_def: - raise ValueError('Invalid random number generator definition') - - self.random_generators[name] = rng_def - def set_random_seeds(self, random_seed): - for gen_name, gen_dict in self.random_generators.items(): - gen_dict['seed'](random_seed) + from avalanche.training.determinism.rng_manager import RNGManager + RNGManager.set_random_seeds(random_seed) def align_seeds(self): if not self.is_distributed: @@ -462,64 +429,6 @@ def forced_cuda_comm(self) -> bool: return self.backend == 'nccl' -def hash_benchmark(benchmark: GenericCLScenario) -> str: - import hashlib - import io - - hash_engine = hashlib.sha256() - for stream_name, stream in benchmark.streams.items(): - hash_engine.update(stream_name.encode()) - for experience in stream: - exp_dataset = experience.dataset - dataset_content = exp_dataset[:] - for tuple_elem in dataset_content: - # https://stackoverflow.com/a/63880190 - buff = io.BytesIO() - torch.save(tuple_elem, buff) - buff.seek(0) - hash_engine.update(buff.read()) - return hash_engine.hexdigest() - - -def hash_minibatch(minibatch: Tuple[Tensor]) -> str: - import hashlib - import io - - hash_engine = hashlib.sha256() - for tuple_elem in minibatch: - buff = io.BytesIO() - torch.save(tuple_elem, buff) - buff.seek(0) - hash_engine.update(buff.read()) - return hash_engine.hexdigest() - - -def hash_tensor(tensor: Tensor) -> str: - import hashlib - import io - - hash_engine = hashlib.sha256() - buff = io.BytesIO() - torch.save(tensor, buff) - buff.seek(0) - hash_engine.update(buff.read()) - return hash_engine.hexdigest() - - -def hash_model(model: Module) -> str: - import hashlib - import io - - hash_engine = hashlib.sha256() - for name, param in model.named_parameters(): - hash_engine.update(name.encode()) - buff = io.BytesIO() - torch.save(param, buff) - buff.seek(0) - hash_engine.update(buff.read()) - return hash_engine.hexdigest() - - DistributedHelper = _DistributedHelperCls() @@ -549,6 +458,8 @@ def find_class(self, module, name): __all__ = [ + 'RollingSeedContext', + 'BroadcastSeedContext', 'DistributedHelper', '_DistributedHelperCls' ] diff --git a/avalanche/distributed/strategies/distributed_mbatch_strategy.py b/avalanche/distributed/strategies/distributed_mbatch_strategy.py index f374807c2..3b9d3a3bc 100644 --- a/avalanche/distributed/strategies/distributed_mbatch_strategy.py +++ b/avalanche/distributed/strategies/distributed_mbatch_strategy.py @@ -1,6 +1,6 @@ from typing import Callable, List, Any - +from avalanche.benchmarks.utils import AvalancheDataset from avalanche.benchmarks.utils.collate_functions import \ classification_collate_mbatches_fn, classification_single_values_collate_fn from avalanche.distributed import CollateDistributedBatch @@ -25,6 +25,8 @@ def __init__(self): classification_single_values_collate_fn ) + self._adapted_dataset = None + self._use_local_contexts.append(self.use_local_input_batch) self._use_local_contexts.append(self.use_local_output_batch) @@ -100,6 +102,8 @@ def reset_distributed_mb_output(self): self._mb_output.reset_distributed_value() # --- END OUTPUT MINIBATCH PROPERTY --- + # TODO: adapt collate functions + # --- START COLLATE FUNCTIONS (INPUT MB) --- @property def 
input_batch_collate_fn(self): @@ -147,6 +151,24 @@ def use_local_output_batch(self, *args, **kwargs): return self._mb_output.use_local_value(*args, **kwargs) # --- END LOCAL CONTEXT MANAGERS --- + # --- START - GET COLLATE FUNCTIONS FROM DATASET --- + @property + def adapted_dataset(self): + return self._adapted_dataset + + @adapted_dataset.setter + def adapted_dataset(self, dataset: AvalancheDataset): + # Every time a new dataset is set, the related collate + # function is retrieved and set for sync-ing distributed + # input/output minibatch fields. + self._adapted_dataset = dataset + if self._adapted_dataset is not None: + new_collate = self._adapted_dataset.collate_fn + self.input_batch_collate_fn = new_collate + self.input_batch_single_values_collate_fn = None + + # --- END - GET COLLATE FUNCTIONS FROM DATASET --- + __all__ = [ 'DistributedMiniBatchStrategySupport' diff --git a/avalanche/distributed/strategies/distributed_strategy_support.py b/avalanche/distributed/strategies/distributed_strategy_support.py index b67501b2c..a595aa6ca 100644 --- a/avalanche/distributed/strategies/distributed_strategy_support.py +++ b/avalanche/distributed/strategies/distributed_strategy_support.py @@ -26,7 +26,7 @@ def use_local(self, *args, **kwargs): Examples of distributed-critical fields are `model`, `mbatch`, `mb_output`, `loss`. - Beware that this is method will modify the behavior of getters of ALL + Beware that this method will modify the behavior of getters of ALL such properties. This may not be desirable. Use the field-specific `use_local_*` context managers to control the behavior of these fields in a finer way. diff --git a/avalanche/training/determinism/rng_manager.py b/avalanche/training/determinism/rng_manager.py index 5052cacdc..9b7b0208a 100644 --- a/avalanche/training/determinism/rng_manager.py +++ b/avalanche/training/determinism/rng_manager.py @@ -1,4 +1,3 @@ -import hashlib import random from collections import OrderedDict diff --git a/avalanche/training/supervised/ar1.py b/avalanche/training/supervised/ar1.py index 882aa97f2..203248a6c 100644 --- a/avalanche/training/supervised/ar1.py +++ b/avalanche/training/supervised/ar1.py @@ -272,7 +272,7 @@ def make_train_dataloader(self, num_workers=0, shuffle=True, **kwargs): def training_epoch(self, **kwargs): for mb_it, self.mbatch in enumerate(self.dataloader): - self._unpack_minibatch() + self.unpack_minibatch() self._before_training_iteration(**kwargs) self.optimizer.zero_grad() diff --git a/avalanche/training/supervised/naive_object_detection.py b/avalanche/training/supervised/naive_object_detection.py index 700b4f57c..fbdde7ff2 100644 --- a/avalanche/training/supervised/naive_object_detection.py +++ b/avalanche/training/supervised/naive_object_detection.py @@ -157,7 +157,6 @@ def make_train_dataloader( batch_size=self.train_mb_size, shuffle=shuffle, pin_memory=pin_memory, - collate_mbatches=detection_collate_mbatches_fn, collate_fn=detection_collate_fn, **other_dataloader_args ) @@ -192,8 +191,8 @@ def criterion(self): Beware that the loss can only be obtained for the training phase as no loss dictionary is returned when evaluating. 
""" - with self.local_mb_output(): - with self.local_mbatch(): + with self.use_local_output_batch(): + with self.use_local_input_batch(): if self.is_training: return sum( loss for loss in self.detection_loss_dict.values()) @@ -230,8 +229,7 @@ def _unpack_minibatch(self): targets = [ {k: v.to(self.device) for k, v in t.items()} for t in self.mbatch[1] ] - self.mbatch[0] = images - self.mbatch[1] = targets + self.mbatch = (images, targets, *self.mbatch[2:]) def _backward(self): if self.scaler is not None: diff --git a/avalanche/training/templates/base_sgd.py b/avalanche/training/templates/base_sgd.py index 61243c118..4e5524805 100644 --- a/avalanche/training/templates/base_sgd.py +++ b/avalanche/training/templates/base_sgd.py @@ -1,28 +1,27 @@ -from typing import Iterable, Sequence, Optional, Union, List -from pkg_resources import parse_version +from typing import Iterable, Sequence, Optional, Union, List, final import torch +from pkg_resources import parse_version from torch.nn import Module, CrossEntropyLoss from torch.optim import Optimizer -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, DistributedSampler +from avalanche.benchmarks import CLExperience, CLStream +from avalanche.benchmarks.utils.data_loader import TaskBalancedDataLoader, \ + collate_from_data_or_kwargs +from avalanche.core import BaseSGDPlugin from avalanche.distributed import DistributedHelper from avalanche.distributed.strategies import \ DistributedMiniBatchStrategySupport, DistributedLossStrategySupport -from avalanche.benchmarks import ClassificationExperience -from avalanche.benchmarks import CLExperience, CLStream -from avalanche.core import BaseSGDPlugin from avalanche.training.plugins import SupervisedPlugin, EvaluationPlugin from avalanche.training.plugins.clock import Clock from avalanche.training.plugins.evaluation import default_evaluator from avalanche.training.templates.base import BaseTemplate, ExpSequence -from avalanche.models.utils import avalanche_model_adaptation -from avalanche.benchmarks.utils.data_loader import TaskBalancedDataLoader, \ - collate_from_data_or_kwargs from avalanche.training.utils import trigger_plugins -class BaseSGDTemplate(BaseTemplate): +class BaseSGDTemplate(BaseTemplate, DistributedMiniBatchStrategySupport, + DistributedLossStrategySupport): """Base SGD class for continual learning skeletons. **Training loop** @@ -165,12 +164,6 @@ def eval(self, exp_list: Union[CLExperience, CLStream], **kwargs): super().eval(exp_list, **kwargs) return self.evaluator.get_last_metrics() - def _train_exp( - self, experience: CLExperience, eval_streams, **kwargs - ): - # Should be implemented in Observation Type - raise NotImplementedError() - def _eval_exp(self, **kwargs): self.eval_epoch(**kwargs) @@ -199,8 +192,19 @@ def training_epoch(self, **kwargs): # Should be implemented in Update Type raise NotADirectoryError() + @final def backward(self): - """Run the backward pass.""" + """ + Run the backward pass. + This method should not be overridden by child classes. + Consider overriding :meth:`_backward` instead. + """ + with self.use_local_loss(): + self._backward() + self.reset_distributed_loss() + + def _backward(self): + """ Implementation of the backward pass. 
""" self.loss.backward() def optimizer_step(self): @@ -210,7 +214,7 @@ def optimizer_step(self): def eval_epoch(self, **kwargs): """Evaluation loop over the current `self.dataloader`.""" for self.mbatch in self.dataloader: - self._unpack_minibatch() + self.unpack_minibatch() self._before_eval_iteration(**kwargs) self._before_eval_forward(**kwargs) @@ -222,6 +226,12 @@ def eval_epoch(self, **kwargs): # ==================================================================> NEW + def wrap_distributed_model(self, model): + """ + Prepare a model for distributed training/eval. + """ + return DistributedHelper.wrap_model(model) + def check_model_and_optimizer(self): # Should be implemented in observation type raise NotImplementedError() @@ -323,6 +333,9 @@ def make_train_dataloader( :param shuffle: True if the data should be shuffled, False otherwise. :param pin_memory: If True, the data loader will copy Tensors into CUDA pinned memory before returning them. Defaults to True. + :param persistent_workers: If True, the data loader will not shut down + the worker processes after a dataset has been consumed once. + Please refer to PyTorch `DataLoader` class for more details. """ other_dataloader_args = {} @@ -364,11 +377,17 @@ def make_eval_dataloader( collate_from_data_or_kwargs(self.adapted_dataset, other_dataloader_args) + sampler = None + if DistributedHelper.is_distributed: + sampler = DistributedSampler( + self.adapted_dataset, shuffle=False, drop_last=False) + self.dataloader = DataLoader( self.adapted_dataset, num_workers=num_workers, batch_size=self.eval_mb_size, pin_memory=pin_memory, + sampler=sampler, **other_dataloader_args ) @@ -377,6 +396,17 @@ def eval_dataset_adaptation(self, **kwargs): self.adapted_dataset = self.experience.dataset self.adapted_dataset = self.adapted_dataset.eval() + @final + def unpack_minibatch(self): + """ + Move minibatch elements to device. + This method should not be overridden by child classes. + Consider overriding :meth:`_unpack_minibatch` instead. + """ + with self.use_local_input_batch(): + self._unpack_minibatch() + self.reset_distributed_mbatch() + def _unpack_minibatch(self): """Move to device""" # First verify the mini-batch diff --git a/avalanche/training/templates/observation_type/batch_observation.py b/avalanche/training/templates/observation_type/batch_observation.py index 4ec073849..ccbabe3a6 100644 --- a/avalanche/training/templates/observation_type/batch_observation.py +++ b/avalanche/training/templates/observation_type/batch_observation.py @@ -1,25 +1,36 @@ -from typing import Iterable +from typing import final -from avalanche.benchmarks import CLExperience from avalanche.models.dynamic_optimizers import reset_optimizer from avalanche.models.utils import avalanche_model_adaptation class BatchObservation: + + @final def model_adaptation(self, model=None): """Adapts the model to the current data. + Calls the :class:`~avalanche.models.DynamicModule`s adaptation. + This method should not be overridden by child classes. + Consider overriding :meth:`_model_adaptation` instead. + """ + with self.use_local_model(): + return self._model_adaptation(model=model) + + def _model_adaptation(self, model=None): + """Adapts the model to the current data. Calls the :class:`~avalanche.models.DynamicModule`s adaptation. """ if model is None: model = self.model avalanche_model_adaptation(model, self.experience) + return model.to(self.device) def make_optimizer(self): """Optimizer initialization. - Called before each training experiene to configure the optimizer. 
+ Called before each training experience to configure the optimizer. """ # we reset the optimizer's state after each experience. # This allows to add new parameters (new heads) and @@ -27,5 +38,7 @@ def make_optimizer(self): reset_optimizer(self.optimizer, self.model) def check_model_and_optimizer(self): - self.model = self.model_adaptation() - self.make_optimizer() + with self.use_local_model(): + self.model = self.model_adaptation() + self.model = self.wrap_distributed_model(self.model) + self.make_optimizer() diff --git a/avalanche/training/templates/observation_type/online_observation.py b/avalanche/training/templates/observation_type/online_observation.py index d3dbfaac5..aa8b4565c 100644 --- a/avalanche/training/templates/observation_type/online_observation.py +++ b/avalanche/training/templates/observation_type/online_observation.py @@ -1,4 +1,4 @@ -from typing import Iterable +from typing import Iterable, final from avalanche.benchmarks import OnlineCLExperience from avalanche.models.dynamic_optimizers import reset_optimizer @@ -7,6 +7,7 @@ class OnlineObservation: + def make_optimizer(self): """Optimizer initialization. @@ -26,8 +27,18 @@ def make_optimizer(self): self.model.parameters(), reset_state=False) + @final def model_adaptation(self, model=None): """Adapts the model to the current data. + Calls the :class:`~avalanche.models.DynamicModule`s adaptation. + This method should not be overridden by child classes. + Consider overriding :meth:`_model_adaptation` instead. + """ + with self.use_local_model(): + return self._model_adaptation(model=model) + + def _model_adaptation(self, model=None): + """Adapts the model to the current data. Calls the :class:`~avalanche.models.DynamicModule`s adaptation. """ @@ -53,14 +64,17 @@ def model_adaptation(self, model=None): return model.to(self.device) def check_model_and_optimizer(self): - # If strategy has access to the task boundaries, and the current - # sub-experience is the first sub-experience in the online (sub-)stream, - # then adapt the model with the full origin experience: - if self.experience.access_task_boundaries: - if self.experience.is_first_subexp: + with self.use_local_model(): + # If strategy has access to the task boundaries, and the current + # sub-experience is the first sub-experience in the online (sub-)stream, + # then adapt the model with the full origin experience: + if self.experience.access_task_boundaries: + if self.experience.is_first_subexp: + self.model = self.model_adaptation() + self.model = self.wrap_distributed_model(self.model) + self.make_optimizer() + # Otherwise, adapt to the current sub-experience: + else: self.model = self.model_adaptation() + self.model = self.wrap_distributed_model(self.model) self.make_optimizer() - # Otherwise, adapt to the current sub-experience: - else: - self.model = self.model_adaptation() - self.make_optimizer() diff --git a/avalanche/training/templates/problem_type/supervised_problem.py b/avalanche/training/templates/problem_type/supervised_problem.py index 9432e04ef..66dac0e69 100644 --- a/avalanche/training/templates/problem_type/supervised_problem.py +++ b/avalanche/training/templates/problem_type/supervised_problem.py @@ -1,3 +1,6 @@ +from typing import final + +from avalanche.distributed.strategies import DistributedMiniBatchStrategySupport, DistributedModelStrategySupport from avalanche.models import avalanche_forward @@ -20,10 +23,23 @@ def mb_task_id(self): def criterion(self): """Loss function for supervised problems.""" - return self._criterion(self.mb_output, 
self.mb_y) + with self.use_local_output_batch(): # Force self.mb_output to be from local batch + with self.use_local_input_batch(): # Force self.mb_y to be from local batch + + return self._criterion(self.mb_output, self.mb_y) + @final def forward(self): - """Compute the model's output given the current mini-batch.""" + """ + Compute the model's output given the current mini-batch. + This method should not be overridden by child classes. + Consider overriding :meth:`_forward` instead. + """ + with self.use_local_input_batch(): + return self._forward() + + def _forward(self): + """Implementation of the forward pass.""" return avalanche_forward(self.model, self.mb_x, self.mb_task_id) def _check_minibatch(self): diff --git a/avalanche/training/templates/update_type/meta_update.py b/avalanche/training/templates/update_type/meta_update.py index d387db9c0..b0bba9727 100644 --- a/avalanche/training/templates/update_type/meta_update.py +++ b/avalanche/training/templates/update_type/meta_update.py @@ -12,7 +12,7 @@ def training_epoch(self, **kwargs): if self._stop_training: break - self._unpack_minibatch() + self.unpack_minibatch() self._before_training_iteration(**kwargs) self.optimizer.zero_grad() diff --git a/avalanche/training/templates/update_type/sgd_update.py b/avalanche/training/templates/update_type/sgd_update.py index d85365f49..e81d8e124 100644 --- a/avalanche/training/templates/update_type/sgd_update.py +++ b/avalanche/training/templates/update_type/sgd_update.py @@ -10,11 +10,10 @@ def training_epoch(self, **kwargs): if self._stop_training: break - self._unpack_minibatch() + self.unpack_minibatch() self._before_training_iteration(**kwargs) self.optimizer.zero_grad() - self.loss = 0 # Forward self._before_forward(**kwargs) @@ -22,7 +21,7 @@ def training_epoch(self, **kwargs): self._after_forward(**kwargs) # Loss & Backward - self.loss += self.criterion() + self.loss = self.criterion() self._before_backward(**kwargs) self.backward() diff --git a/avalanche/training/utils.py b/avalanche/training/utils.py index 4d0800c5d..1f7e76d7c 100644 --- a/avalanche/training/utils.py +++ b/avalanche/training/utils.py @@ -421,6 +421,7 @@ def __str__(self): __all__ = [ + "trigger_plugins", "load_all_dataset", "zerolike_params_dict", "copy_params_dict", diff --git a/examples/detection.py b/examples/detection.py index 30abb29d1..cec1329c1 100644 --- a/examples/detection.py +++ b/examples/detection.py @@ -15,41 +15,29 @@ stream of experiences is obtained by splitting the dataset in equal parts. 
""" +import argparse import logging from pathlib import Path from typing import Union +import torch +import torchvision from torch.utils.data import random_split, Subset +from torchvision.models.detection.faster_rcnn import FastRCNNPredictor from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor +from torchvision.transforms import ToTensor -from avalanche.benchmarks import StreamUserDef -from avalanche.benchmarks.datasets import LvisDataset, PennFudanDataset -from avalanche.benchmarks.scenarios.detection_scenario import ( - DetectionCLScenario, -) -from avalanche.benchmarks.utils import ( - make_classification_dataset, - classification_subset, -) -from avalanche.training.supervised.naive_object_detection import ( - ObjectDetectionTemplate, -) - +from avalanche.benchmarks.datasets import PennFudanDataset from avalanche.evaluation.metrics import ( - make_lvis_metrics, timing_metrics, loss_metrics, - DetectionMetrics, ) +from avalanche.evaluation.metrics.detection import DetectionMetrics from avalanche.logging import InteractiveLogger from avalanche.training.plugins import LRSchedulerPlugin, EvaluationPlugin -import argparse -import torch -from torchvision.transforms import ToTensor -import torchvision -from torchvision.models.detection.faster_rcnn import FastRCNNPredictor - - +from avalanche.training.supervised.naive_object_detection import ( + ObjectDetectionTemplate, +) # This sets the root logger to write to stdout (your console). # Your script/app needs to call this somewhere at least once. from examples.detection_examples_utils import split_detection_benchmark diff --git a/examples/detection_examples_utils.py b/examples/detection_examples_utils.py index b46727066..b13b423fe 100644 --- a/examples/detection_examples_utils.py +++ b/examples/detection_examples_utils.py @@ -5,8 +5,7 @@ DetectionCLScenario, ) from avalanche.benchmarks.utils import ( - make_classification_dataset, - classification_subset, + make_detection_dataset, detection_subset, ) @@ -44,12 +43,12 @@ def split_detection_benchmark( exp_n_imgs = len(train_dataset) // n_experiences remaining = len(train_dataset) % n_experiences - train_dataset_avl = make_classification_dataset( + train_dataset_avl = make_detection_dataset( train_dataset, transform_groups=transform_groups, initial_transform_group="train", ) - test_dataset_avl = make_classification_dataset( + test_dataset_avl = make_detection_dataset( test_dataset, transform_groups=transform_groups, initial_transform_group="eval", @@ -73,9 +72,9 @@ def split_detection_benchmark( last_slice_idx = 0 for exp_id in range(n_experiences): n_imgs = exp_sz[exp_id] - idx_range = train_indices[last_slice_idx : last_slice_idx + n_imgs] + idx_range = train_indices[last_slice_idx:last_slice_idx + n_imgs] train_exps_datasets.append( - classification_subset(train_dataset_avl, indices=idx_range) + detection_subset(train_dataset_avl, indices=idx_range) ) last_slice_idx += n_imgs @@ -100,4 +99,6 @@ def split_detection_benchmark( ) -__all__ = ["split_detection_benchmark"] +__all__ = [ + "split_detection_benchmark" +] diff --git a/tests/distributed/test_distributed_batch.py b/tests/distributed/test_distributed_batch.py index 881c76b44..12d492cb0 100644 --- a/tests/distributed/test_distributed_batch.py +++ b/tests/distributed/test_distributed_batch.py @@ -1,12 +1,14 @@ import contextlib import os import unittest +from typing import Tuple import torch from torch import Tensor +from torch.utils.data import default_collate from avalanche.distributed import DistributedHelper, \ - 
make_classification_distributed_batch + make_classification_distributed_batch, CollateDistributedBatch @contextlib.contextmanager @@ -41,8 +43,11 @@ def test_classification_batch(self): distrib_val = dt.value self.assertEqual(2, len(distrib_val)) + self.assertIsInstance(distrib_val, tuple) self.assertSequenceEqual((8*DistributedHelper.world_size, 1, 28, 28), distrib_val[0].shape) + self.assertIsInstance(distrib_val[0], Tensor) + self.assertIsInstance(distrib_val[1], Tensor) for rank in range(DistributedHelper.world_size): expect = torch.full((8,), rank, @@ -68,6 +73,36 @@ def test_unsupervised_classification_batch(self): self.assertSequenceEqual((8*DistributedHelper.world_size, 1, 28, 28), distrib_val.shape) + @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + 'Distributed tests ignored') + def test_tuple_merge_batch_vanilla_collate(self): + dt: CollateDistributedBatch[Tuple[Tensor, Tensor]] = CollateDistributedBatch( + 'mb', + None, + default_collate, + None) + + self.assertEqual(None, dt.local_value) + self.assertEqual(None, dt.value) + + batch = (torch.ones((8, 1, 28, 28)), + torch.full( + (8,), fill_value=DistributedHelper.rank, dtype=torch.long)) + + dt.value = batch + + distrib_val = dt.value + + self.assertEqual(2, len(distrib_val)) + self.assertSequenceEqual((8 * DistributedHelper.world_size, 1, 28, 28), + distrib_val[0].shape) + for rank in range(DistributedHelper.world_size): + expect = torch.full((8,), + rank, + dtype=torch.long) + self.assertTrue(torch.equal(expect, + distrib_val[1][8 * rank:8 * (rank + 1)])) + if __name__ == "__main__": with manage_output(): diff --git a/tests/distributed/test_distributed_helper.py b/tests/distributed/test_distributed_helper.py new file mode 100644 index 000000000..2cde8476e --- /dev/null +++ b/tests/distributed/test_distributed_helper.py @@ -0,0 +1,89 @@ +import contextlib +import os +import random +import unittest + +import torch +import torch.distributed as dst + +from avalanche.distributed import DistributedHelper +from avalanche.distributed.distributed_helper import RollingSeedContext, BroadcastSeedContext + +from avalanche.training.determinism.rng_manager import RNGManager + + +@contextlib.contextmanager +def manage_output(): + if os.environ['LOCAL_RANK'] != 0: + with contextlib.redirect_stderr(None): + with contextlib.redirect_stdout(None): + yield + else: + yield + + +class DistributedHelperTests(unittest.TestCase): + + def setUp(self) -> None: + self.use_gpu_in_tests = os.environ.get('USE_GPU', 'false').lower() in ['1', 'true'] + self.use_gpu_in_tests = self.use_gpu_in_tests and torch.cuda.is_available() + DistributedHelper.init_distributed(1234, use_cuda=self.use_gpu_in_tests) + + @unittest.skipIf(os.environ.get('DISTRIBUTED_TESTS', 'false').lower() not in ['1', 'true'], + 'Distributed tests ignored') + def test_device_id(self): + if self.use_gpu_in_tests: + print('Verify GPU') + self.assertEqual(dst.get_rank(), DistributedHelper.get_device_id()) + self.assertEqual(torch.device(f'cuda:{dst.get_rank()}'), DistributedHelper.make_device()) + else: + self.assertEqual(-1, DistributedHelper.get_device_id()) + self.assertEqual(torch.device('cpu'), DistributedHelper.make_device()) + + @unittest.skipIf(os.environ.get('DISTRIBUTED_TESTS', 'false').lower() not in ['1', 'true'], + 'Distributed tests ignored') + def test_fields(self): + self.assertEqual(dst.get_rank(), DistributedHelper.rank) + self.assertEqual(dst.get_world_size(), DistributedHelper.world_size) + self.assertEqual(True, DistributedHelper.is_distributed) + 
self.assertEqual(dst.get_rank() == 0, DistributedHelper.is_main_process)
+
+        if self.use_gpu_in_tests:
+            print('Verify GPU')
+            self.assertEqual('nccl', DistributedHelper.backend)
+            self.assertTrue(DistributedHelper.forced_cuda_comm)
+        else:
+            self.assertEqual('gloo', DistributedHelper.backend)
+            self.assertFalse(DistributedHelper.forced_cuda_comm)
+
+    @unittest.skipIf(os.environ.get('DISTRIBUTED_TESTS', 'false').lower() not in ['1', 'true'],
+                     'Distributed tests ignored')
+    def test_rolling_seed_aligner(self):
+        RNGManager.set_random_seeds(4321)
+
+        with RollingSeedContext():
+            RNGManager.set_random_seeds(1234 + DistributedHelper.rank)
+            random.randint(0, 2 ** 64 - 1)
+
+        final_value = random.randint(0, 2 ** 64 - 1)
+        self.assertEqual(14732185405572191734, final_value)
+
+    @unittest.skipIf(os.environ.get('DISTRIBUTED_TESTS', 'false').lower() not in ['1', 'true'],
+                     'Distributed tests ignored')
+    def test_broadcast_seed_aligner(self):
+        RNGManager.set_random_seeds(4321)
+
+        with BroadcastSeedContext():
+            RNGManager.set_random_seeds(1234 + DistributedHelper.rank)
+            random.randint(0, 2 ** 64 - 1)
+
+        final_value = random.randint(0, 2 ** 64 - 1)
+        self.assertEqual(15306775005444441373, final_value)
+
+
+if __name__ == "__main__":
+    with manage_output():
+        verbosity = 1
+        if DistributedHelper.rank > 0:
+            verbosity = 0
+        unittest.main(verbosity=verbosity)
diff --git a/tests/run_dist_tests.py b/tests/run_dist_tests.py
index c17718fa9..e746c6b05 100644
--- a/tests/run_dist_tests.py
+++ b/tests/run_dist_tests.py
@@ -38,6 +38,15 @@ def run_distributed_suites():
    success = True
    exited = False

+    use_gpu_in_tests = os.environ.get('USE_GPU', 'false').lower() in ['1', 'true']
+    if use_gpu_in_tests:
+        print('Running tests using GPUs')
+        import torch
+        nproc_per_node = torch.cuda.device_count()
+    else:
+        print('Running tests using CPU only')
+        nproc_per_node = 4
+
    for case_name in cases_names:
        if exited:
            print('Exiting due to keyboard interrupt')
@@ -46,7 +55,7 @@ def run_distributed_suites():
        try:
            p = Popen(
                ['python', '-m', 'torch.distributed.run', '--nnodes=1',
-                 '--nproc_per_node=4', '-m', 'unittest', case_name],
+                 f'--nproc_per_node={nproc_per_node}', '-m', 'unittest', case_name],
                stdout=sys.stdout, stderr=sys.stderr)
            p.communicate()
        except KeyboardInterrupt:
diff --git a/tests/training/test_supervised_regression.py b/tests/training/test_supervised_regression.py
index 6fc521c93..88b1d6020 100644
--- a/tests/training/test_supervised_regression.py
+++ b/tests/training/test_supervised_regression.py
@@ -317,7 +317,7 @@ def training_epoch(self, **kwargs):
            if self._stop_training:
                break

-            self._unpack_minibatch()
+            self.unpack_minibatch()
            trigger_plugins(self, "before_training_iteration")
            self.optimizer.zero_grad()
@@ -354,7 +354,7 @@ def eval_dataset_adaptation(self, **kwargs):
    def eval_epoch(self, **kwargs):
        """Evaluation loop over the current `self.dataloader`."""
        for self.mbatch in self.dataloader:
-            self._unpack_minibatch()
+            self.unpack_minibatch()
            trigger_plugins(self, "before_eval_iteration")
            trigger_plugins(self, "before_eval_forward")

From 1717b8d47d8d2c919362649e588e98a26dae7b3e Mon Sep 17 00:00:00 2001
From: Lorenzo Pellegrini
Date: Wed, 23 Nov 2022 15:53:47 +0000
Subject: [PATCH 11/16] Improved management of dataloader arguments in strategies. Improved default loggers creation. Added distributed training integration unit tests.
--- avalanche/benchmarks/utils/data_loader.py | 17 +- .../distributed_consistency_verification.py | 46 +++-- .../strategies/distributed_mbatch_strategy.py | 47 +++-- avalanche/logging/base_logger.py | 2 +- avalanche/training/supervised/ar1.py | 14 +- avalanche/training/supervised/cumulative.py | 2 +- avalanche/training/supervised/deep_slda.py | 2 +- avalanche/training/supervised/icarl.py | 2 +- .../training/supervised/joint_training.py | 2 +- avalanche/training/supervised/lamaml.py | 2 +- .../supervised/naive_object_detection.py | 63 +++++-- .../training/supervised/strategy_wrappers.py | 32 ++-- .../supervised/strategy_wrappers_online.py | 4 +- avalanche/training/templates/base_sgd.py | 114 +++++++++--- tests/distributed/distributed_test_utils.py | 35 ++++ tests/distributed/test_distributed_batch.py | 24 +-- tests/distributed/test_distributed_helper.py | 29 +-- tests/distributed/test_distributed_model.py | 20 +- .../test_distributed_strategy_support.py | 175 ++++++++++++++++-- tests/distributed/test_distributed_tensor.py | 24 +-- tests/run_dist_tests.py | 21 ++- tests/training/test_online_strategies.py | 4 +- 22 files changed, 481 insertions(+), 200 deletions(-) create mode 100644 tests/distributed/distributed_test_utils.py diff --git a/avalanche/benchmarks/utils/data_loader.py b/avalanche/benchmarks/utils/data_loader.py index fa3f4740d..7c0b582fa 100644 --- a/avalanche/benchmarks/utils/data_loader.py +++ b/avalanche/benchmarks/utils/data_loader.py @@ -234,6 +234,7 @@ def __iter__(self): removed_dataloaders_idxs.append(tid) continue mb_curr.extend(batch) + yield self.collate_fn(mb_curr) # clear empty data-loaders @@ -308,6 +309,7 @@ def __iter__(self): for tid, t_loader in enumerate(iter_dataloaders): batch = next(t_loader) mb_curr.append(batch) + yield self.collate_mbatches(mb_curr) def __len__(self): @@ -614,13 +616,22 @@ def _make_data_loader( data_loader_args['persistent_workers'] = False if DistributedHelper.is_distributed and distributed_sampling: + # Note: shuffle only goes in the sampler, while + # drop_last must be passed to both the sampler + # and the DataLoader + drop_last = data_loader_args.pop("drop_last", False) sampler = DistributedSampler( dataset, - shuffle=data_loader_args.pop("shuffle", False), - drop_last=data_loader_args.pop("drop_last", False), + shuffle=data_loader_args.pop("shuffle", True), + drop_last=drop_last, ) + data_loader = DataLoader( - dataset, sampler=sampler, batch_size=batch_size, **data_loader_args + dataset, + sampler=sampler, + batch_size=batch_size, + drop_last=drop_last, + **data_loader_args ) else: sampler = None diff --git a/avalanche/distributed/distributed_consistency_verification.py b/avalanche/distributed/distributed_consistency_verification.py index 39bfbe211..502c37448 100644 --- a/avalanche/distributed/distributed_consistency_verification.py +++ b/avalanche/distributed/distributed_consistency_verification.py @@ -1,36 +1,49 @@ +import hashlib +import io + from typing import Tuple, TYPE_CHECKING import torch from torch import Tensor from torch.nn import Module +from torch.utils.data import Dataset, DataLoader if TYPE_CHECKING: from avalanche.benchmarks import GenericCLScenario def hash_benchmark(benchmark: 'GenericCLScenario') -> str: - import hashlib - import io - hash_engine = hashlib.sha256() for stream_name, stream in benchmark.streams.items(): hash_engine.update(stream_name.encode()) for experience in stream: exp_dataset = experience.dataset - dataset_content = exp_dataset[:] - for tuple_elem in dataset_content: - # 
https://stackoverflow.com/a/63880190 - buff = io.BytesIO() - torch.save(tuple_elem, buff) - buff.seek(0) - hash_engine.update(buff.read()) + hash_dataset(exp_dataset, hash_engine=hash_engine) return hash_engine.hexdigest() -def hash_minibatch(minibatch: Tuple[Tensor]) -> str: - import hashlib - import io +def hash_dataset(dataset: 'Dataset', *, hash_engine=None, num_workers=0) -> str: + from avalanche.distributed import DistributedHelper + if hash_engine is None: + hash_engine = hashlib.sha256() + + data_loader = DataLoader( + dataset, + collate_fn=lambda batch: tuple(zip(*batch)), + num_workers=num_workers + ) + for loaded_elem in data_loader: + example = tuple(tuple_element[0] for tuple_element in loaded_elem) + # https://stackoverflow.com/a/63880190 + buff = io.BytesIO() + torch.save(example, buff) + buff.seek(0) + hash_engine.update(buff.read()) + return hash_engine.hexdigest() + + +def hash_minibatch(minibatch: Tuple[Tensor]) -> str: hash_engine = hashlib.sha256() for tuple_elem in minibatch: buff = io.BytesIO() @@ -41,9 +54,6 @@ def hash_minibatch(minibatch: Tuple[Tensor]) -> str: def hash_tensor(tensor: Tensor) -> str: - import hashlib - import io - hash_engine = hashlib.sha256() buff = io.BytesIO() torch.save(tensor, buff) @@ -53,9 +63,6 @@ def hash_tensor(tensor: Tensor) -> str: def hash_model(model: Module) -> str: - import hashlib - import io - hash_engine = hashlib.sha256() for name, param in model.named_parameters(): hash_engine.update(name.encode()) @@ -68,6 +75,7 @@ def hash_model(model: Module) -> str: __all__ = [ 'hash_benchmark', + 'hash_dataset', 'hash_minibatch', 'hash_tensor', 'hash_model' diff --git a/avalanche/distributed/strategies/distributed_mbatch_strategy.py b/avalanche/distributed/strategies/distributed_mbatch_strategy.py index 3b9d3a3bc..22a2e4e72 100644 --- a/avalanche/distributed/strategies/distributed_mbatch_strategy.py +++ b/avalanche/distributed/strategies/distributed_mbatch_strategy.py @@ -1,8 +1,8 @@ -from typing import Callable, List, Any +from typing import Callable, List, Any, Optional, Union from avalanche.benchmarks.utils import AvalancheDataset from avalanche.benchmarks.utils.collate_functions import \ - classification_collate_mbatches_fn, classification_single_values_collate_fn + classification_collate_mbatches_fn, classification_single_values_collate_fn, Collate, ClassificationCollate from avalanche.distributed import CollateDistributedBatch from avalanche.distributed.strategies import DistributedStrategySupport @@ -11,6 +11,8 @@ class DistributedMiniBatchStrategySupport(DistributedStrategySupport): def __init__(self): super().__init__() + + default_collate_impl = ClassificationCollate() self._mbatch = CollateDistributedBatch( 'mbatch', None, @@ -25,7 +27,8 @@ def __init__(self): classification_single_values_collate_fn ) - self._adapted_dataset = None + self._adapted_dataset: Optional[AvalancheDataset] = None + self._collate_fn: Optional[Union[Collate, Callable]] = None self._use_local_contexts.append(self.use_local_input_batch) self._use_local_contexts.append(self.use_local_output_batch) @@ -102,8 +105,6 @@ def reset_distributed_mb_output(self): self._mb_output.reset_distributed_value() # --- END OUTPUT MINIBATCH PROPERTY --- - # TODO: adapt collate functions - # --- START COLLATE FUNCTIONS (INPUT MB) --- @property def input_batch_collate_fn(self): @@ -121,7 +122,6 @@ def input_batch_single_values_collate_fn(self): def input_batch_single_values_collate_fn( self, single_values_collate_fn: Callable[[List], Any]): 
self._mbatch.single_values_collate_fn = single_values_collate_fn - # --- END COLLATE FUNCTIONS (INPUT MB) --- # --- START COLLATE FUNCTIONS (OUTPUT MB) --- @@ -152,20 +152,45 @@ def use_local_output_batch(self, *args, **kwargs): # --- END LOCAL CONTEXT MANAGERS --- # --- START - GET COLLATE FUNCTIONS FROM DATASET --- + @property + def collate_fn(self): + """ + The collate function used to merge the values obtained from the + dataset into a minibatch. + + This value is obtained from the adapted dataset directly. + """ + return self._collate_fn + + @collate_fn.setter + def collate_fn(self, new_collate): + self._collate_fn = new_collate + + if isinstance(new_collate, Collate): + self.input_batch_collate_fn = new_collate.collate_fn + self.input_batch_single_values_collate_fn = new_collate.collate_single_value_fn + else: + self.input_batch_collate_fn = new_collate + self.input_batch_single_values_collate_fn = None + @property def adapted_dataset(self): return self._adapted_dataset @adapted_dataset.setter - def adapted_dataset(self, dataset: AvalancheDataset): + def adapted_dataset(self, dataset: Optional[AvalancheDataset]): # Every time a new dataset is set, the related collate # function is retrieved and set for sync-ing distributed # input/output minibatch fields. self._adapted_dataset = dataset - if self._adapted_dataset is not None: - new_collate = self._adapted_dataset.collate_fn - self.input_batch_collate_fn = new_collate - self.input_batch_single_values_collate_fn = None + if self._adapted_dataset is None: + return + + new_collate = self._adapted_dataset.collate_fn + if new_collate is None: + return + + self.collate_fn = new_collate # --- END - GET COLLATE FUNCTIONS FROM DATASET --- diff --git a/avalanche/logging/base_logger.py b/avalanche/logging/base_logger.py index 8c746e0aa..8598b219b 100644 --- a/avalanche/logging/base_logger.py +++ b/avalanche/logging/base_logger.py @@ -7,7 +7,6 @@ if TYPE_CHECKING: from avalanche.evaluation.metric_results import MetricValue - from avalanche.training.templates import SupervisedTemplate class BaseLogger(ABC): @@ -32,6 +31,7 @@ def __init__(self): super().__init__() if not DistributedHelper.is_main_process: + raise RuntimeError( 'You are creating a logger in a non-main process during a ' 'distributed training session. ' diff --git a/avalanche/training/supervised/ar1.py b/avalanche/training/supervised/ar1.py index 203248a6c..18cd631b3 100644 --- a/avalanche/training/supervised/ar1.py +++ b/avalanche/training/supervised/ar1.py @@ -60,7 +60,7 @@ def __init__( eval_mb_size: int = 128, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, ): """ @@ -261,13 +261,19 @@ def make_train_dataloader(self, num_workers=0, shuffle=True, **kwargs): if hasattr(self.adapted_dataset, "collate_fn") else None ) + + other_dataloader_args = self._obtain_common_dataloader_parameters( + batch_size=current_batch_mb_size, + num_workers=num_workers, + shuffle=shuffle, + **kwargs + ) + # AR1 only supports SIT scenarios (no task labels). 
self.dataloader = DataLoader( self.adapted_dataset, - num_workers=num_workers, - batch_size=current_batch_mb_size, - shuffle=shuffle, collate_fn=collate_fn, + **other_dataloader_args ) def training_epoch(self, **kwargs): diff --git a/avalanche/training/supervised/cumulative.py b/avalanche/training/supervised/cumulative.py index f2ae3981b..d413d6a15 100644 --- a/avalanche/training/supervised/cumulative.py +++ b/avalanche/training/supervised/cumulative.py @@ -28,7 +28,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, ): """Init. diff --git a/avalanche/training/supervised/deep_slda.py b/avalanche/training/supervised/deep_slda.py index 9cb33e94d..36384667d 100644 --- a/avalanche/training/supervised/deep_slda.py +++ b/avalanche/training/supervised/deep_slda.py @@ -37,7 +37,7 @@ def __init__( eval_mb_size: int = 1, device="cpu", plugins: Optional[Sequence["SupervisedPlugin"]] = None, - evaluator=default_evaluator(), + evaluator=default_evaluator, eval_every=-1, ): """Init function for the SLDA model. diff --git a/avalanche/training/supervised/icarl.py b/avalanche/training/supervised/icarl.py index 91125afa2..05d7d04f1 100644 --- a/avalanche/training/supervised/icarl.py +++ b/avalanche/training/supervised/icarl.py @@ -42,7 +42,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, ): """Init. diff --git a/avalanche/training/supervised/joint_training.py b/avalanche/training/supervised/joint_training.py index e581d798e..68bb49c5f 100644 --- a/avalanche/training/supervised/joint_training.py +++ b/avalanche/training/supervised/joint_training.py @@ -54,7 +54,7 @@ def __init__( eval_mb_size: int = 1, device="cpu", plugins: Optional[Sequence["SupervisedPlugin"]] = None, - evaluator=default_evaluator(), + evaluator=default_evaluator, eval_every=-1, ): """Init. diff --git a/avalanche/training/supervised/lamaml.py b/avalanche/training/supervised/lamaml.py index 7da505094..41f229192 100644 --- a/avalanche/training/supervised/lamaml.py +++ b/avalanche/training/supervised/lamaml.py @@ -39,7 +39,7 @@ def __init__( eval_mb_size: int = 1, device="cpu", plugins: Optional[Sequence["SupervisedPlugin"]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, peval_mode="epoch", ): diff --git a/avalanche/training/supervised/naive_object_detection.py b/avalanche/training/supervised/naive_object_detection.py index fbdde7ff2..cd0e18934 100644 --- a/avalanche/training/supervised/naive_object_detection.py +++ b/avalanche/training/supervised/naive_object_detection.py @@ -56,7 +56,7 @@ def __init__( eval_mb_size: int = 1, device="cpu", plugins: Optional[Sequence["SupervisedPlugin"]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, peval_mode="epoch", scaler=None, @@ -127,7 +127,7 @@ def make_train_dataloader( self, num_workers=0, shuffle=True, - pin_memory=True, + pin_memory=None, persistent_workers=False, **kwargs ): @@ -139,45 +139,70 @@ def make_train_dataloader( :param num_workers: number of thread workers for the data loading. :param shuffle: True if the data should be shuffled, False otherwise. :param pin_memory: If True, the data loader will copy Tensors into CUDA - pinned memory before returning them. 
Defaults to True. + pinned memory before returning them. Defaults to None, which means + that the value will be determined by looking at the strategy `device` + field. :param persistent_workers: If True, the data loader will not shutdown the worker processes after a dataset has been consumed once. Used only if `PyTorch >= 1.7.0`. """ - other_dataloader_args = {} - - if parse_version(torch.__version__) >= parse_version("1.7.0"): - other_dataloader_args["persistent_workers"] = persistent_workers + other_dataloader_args = self._obtain_common_dataloader_parameters( + batch_size=self.train_mb_size, + num_workers=num_workers, + shuffle=shuffle, + pin_memory=pin_memory, + persistent_workers=persistent_workers, + **kwargs + ) self.dataloader = TaskBalancedDataLoader( self.adapted_dataset, oversample_small_groups=True, - num_workers=num_workers, - batch_size=self.train_mb_size, - shuffle=shuffle, - pin_memory=pin_memory, collate_fn=detection_collate_fn, **other_dataloader_args ) - def make_eval_dataloader(self, num_workers=0, pin_memory=True, **kwargs): + def make_eval_dataloader( + self, + num_workers=0, + shuffle=False, + pin_memory=None, + persistent_workers=False, + drop_last=False, + **kwargs): + """ - Initializes the eval data loader. :param num_workers: How many subprocesses to use for data loading. 0 means that the data will be loaded in the main process. (default: 0). + :param shuffle: True if the data should be shuffled, False otherwise. :param pin_memory: If True, the data loader will copy Tensors into CUDA - pinned memory before returning them. Defaults to True. - :param kwargs: - :return: + pinned memory before returning them. Defaults to None, which means + that the value will be determined by looking at the strategy `device` + field. + :param persistent_workers: If True, the data loader will not shut down + the worker processes after a dataset has been consumed once. + Please refer to PyTorch `DataLoader` class for more details. + :param drop_last: If True, the last batch will be skipped if not of size + equal to the eval minibatch size. + :param kwargs: Other dataloader parameters. 
""" - self.dataloader = DataLoader( - self.adapted_dataset, - num_workers=num_workers, + + other_dataloader_args = self._obtain_common_dataloader_parameters( batch_size=self.eval_mb_size, + num_workers=num_workers, + shuffle=shuffle, pin_memory=pin_memory, + persistent_workers=persistent_workers, + drop_last=drop_last, + **kwargs + ) + + self.dataloader = DataLoader( + self.adapted_dataset, collate_fn=detection_collate_fn, + **other_dataloader_args ) def criterion(self): diff --git a/avalanche/training/supervised/strategy_wrappers.py b/avalanche/training/supervised/strategy_wrappers.py index 72cd68dd0..183a4bc9c 100644 --- a/avalanche/training/supervised/strategy_wrappers.py +++ b/avalanche/training/supervised/strategy_wrappers.py @@ -61,7 +61,7 @@ def __init__( eval_mb_size: Optional[int] = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -117,7 +117,7 @@ def __init__( eval_mb_size: int = 1, device="cpu", plugins: Optional[Sequence["SupervisedPlugin"]] = None, - evaluator=default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -172,7 +172,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -236,7 +236,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -311,7 +311,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, generator_strategy: BaseTemplate = None, replay_size: int = None, @@ -435,7 +435,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = get_default_vae_logger(), + evaluator=get_default_vae_logger, eval_every=-1, **base_kwargs ): @@ -502,7 +502,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -568,7 +568,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -633,7 +633,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -701,7 +701,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -769,7 +769,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -839,7 +839,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - 
evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -925,7 +925,7 @@ def __init__( eval_mb_size: int = 1, device="cpu", plugins: Optional[Sequence["SupervisedPlugin"]] = None, - evaluator=default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -1000,7 +1000,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -1073,7 +1073,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): @@ -1140,7 +1140,7 @@ def __init__( eval_mb_size: int = 1, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator=default_evaluator, eval_every=-1, **base_kwargs ): diff --git a/avalanche/training/supervised/strategy_wrappers_online.py b/avalanche/training/supervised/strategy_wrappers_online.py index d757e2401..45b3b473f 100644 --- a/avalanche/training/supervised/strategy_wrappers_online.py +++ b/avalanche/training/supervised/strategy_wrappers_online.py @@ -8,7 +8,7 @@ # E-mail: contact@continualai.org # # Website: avalanche.continualai.org # ################################################################################ -from typing import Optional, Sequence, List, Union +from typing import Optional, Sequence, List, Union, Callable from torch.nn import Module, CrossEntropyLoss from torch.optim import Optimizer @@ -42,7 +42,7 @@ def __init__( eval_mb_size: int = None, device=None, plugins: Optional[List[SupervisedPlugin]] = None, - evaluator: EvaluationPlugin = default_evaluator, + evaluator=default_evaluator, eval_every=-1, ): """ diff --git a/avalanche/training/templates/base_sgd.py b/avalanche/training/templates/base_sgd.py index 4e5524805..0c9e5f5ba 100644 --- a/avalanche/training/templates/base_sgd.py +++ b/avalanche/training/templates/base_sgd.py @@ -1,4 +1,4 @@ -from typing import Iterable, Sequence, Optional, Union, List, final +from typing import Iterable, Sequence, Optional, Union, List, final, Callable import torch from pkg_resources import parse_version @@ -50,7 +50,7 @@ def __init__( eval_mb_size: Optional[int] = 1, device="cpu", plugins: Optional[List["SupervisedPlugin"]] = None, - evaluator: EvaluationPlugin = default_evaluator(), + evaluator: Union[EvaluationPlugin, Callable[[], EvaluationPlugin]] = default_evaluator, eval_every=-1, peval_mode="epoch", ): @@ -94,8 +94,10 @@ def __init__( if evaluator is None: evaluator = EvaluationPlugin() + elif isinstance(evaluator, Callable): + evaluator = evaluator() self.plugins.append(evaluator) - self.evaluator = evaluator + self.evaluator: EvaluationPlugin = evaluator """ EvaluationPlugin used for logging and metric computations. """ # Configure periodic evaluation. @@ -125,6 +127,14 @@ def __init__( use :attr:`.BaseTemplate.experience`. """ + self.collate_fn = None + """ + The collate function used to merge the values obtained from the + dataset into a minibatch. + + This value is obtained from the adapted dataset directly. + """ + self.dataloader = None """ Dataloader. 
""" @@ -298,6 +308,7 @@ def train_dataset_adaptation(self, **kwargs): self.adapted_dataset = self.experience.dataset self.adapted_dataset = self.adapted_dataset.train() + def _load_train_state(self, prev_state): super()._load_train_state(prev_state) self.adapted_dataset = prev_state["adapted_dataset"] @@ -316,11 +327,41 @@ def _before_eval_exp(self, **kwargs): super()._before_eval_exp(**kwargs) + def _obtain_common_dataloader_parameters(self, **kwargs): + """ + Utility function that returns the dictionary of parameters to be passed + to the train and eval dataloaders. + + The resulting dataset does not include the collate function. + + Overriding this function can be useful if particular/runtime computed + parameters are needed. However, when overriding, it is recommended to first + call this implementation (super) to obtain a base parameters dictionary. + + :param kwargs: The dataloader arguments as passed to the `train` + or `eval` method. + :return: A dictionary of parameters to be passed to the DataLoader class + or to one of the Avalanche dataloaders. + """ + other_dataloader_args = {} + + if 'persistent_workers' in kwargs: + if parse_version(torch.__version__) >= parse_version("1.7.0"): + other_dataloader_args["persistent_workers"] = kwargs['persistent_workers'] + + for k, v in kwargs.items(): + other_dataloader_args[k] = v + + if other_dataloader_args.get('pin_memory', None) is None: + other_dataloader_args['pin_memory'] = self.device.type == 'cuda' + + return other_dataloader_args + def make_train_dataloader( self, num_workers=0, shuffle=True, - pin_memory=True, + pin_memory=None, persistent_workers=False, **kwargs ): @@ -332,61 +373,78 @@ def make_train_dataloader( :param num_workers: number of thread workers for the data loading. :param shuffle: True if the data should be shuffled, False otherwise. :param pin_memory: If True, the data loader will copy Tensors into CUDA - pinned memory before returning them. Defaults to True. + pinned memory before returning them. Defaults to None, which means + that the value will be determined by looking at the strategy `device` + field. :param persistent_workers: If True, the data loader will not shut down the worker processes after a dataset has been consumed once. Please refer to PyTorch `DataLoader` class for more details. + :param kwargs: Other dataloader parameters. """ - other_dataloader_args = {} - - if parse_version(torch.__version__) >= parse_version("1.7.0"): - other_dataloader_args["persistent_workers"] = persistent_workers - for k, v in kwargs.items(): - other_dataloader_args[k] = v + other_dataloader_args = self._obtain_common_dataloader_parameters( + batch_size=self.train_mb_size, + num_workers=num_workers, + shuffle=shuffle, + pin_memory=pin_memory, + persistent_workers=persistent_workers, + **kwargs + ) self.dataloader = TaskBalancedDataLoader( self.adapted_dataset, oversample_small_groups=True, - num_workers=num_workers, - batch_size=self.train_mb_size, - shuffle=shuffle, - pin_memory=pin_memory, **other_dataloader_args ) def make_eval_dataloader( - self, num_workers=0, pin_memory=True, persistent_workers=False, **kwargs + self, + num_workers=0, + shuffle=False, + pin_memory=None, + persistent_workers=False, + drop_last=False, + **kwargs ): """ Initializes the eval data loader. :param num_workers: How many subprocesses to use for data loading. 0 means that the data will be loaded in the main process. (default: 0). + :param shuffle: True if the data should be shuffled, False otherwise. 
:param pin_memory: If True, the data loader will copy Tensors into CUDA - pinned memory before returning them. Defaults to True. - :param kwargs: - :return: + pinned memory before returning them. Defaults to None, which means + that the value will be determined by looking at the strategy `device` + field. + :param persistent_workers: If True, the data loader will not shut down + the worker processes after a dataset has been consumed once. + Please refer to PyTorch `DataLoader` class for more details. + :param drop_last: If True, the last batch will be skipped if not of size + equal to the eval minibatch size. + :param kwargs: Other dataloader parameters. """ - other_dataloader_args = {} - if parse_version(torch.__version__) >= parse_version("1.7.0"): - other_dataloader_args["persistent_workers"] = persistent_workers - for k, v in kwargs.items(): - other_dataloader_args[k] = v + other_dataloader_args = self._obtain_common_dataloader_parameters( + batch_size=self.eval_mb_size, + num_workers=num_workers, + shuffle=shuffle, + pin_memory=pin_memory, + persistent_workers=persistent_workers, + drop_last=drop_last, + **kwargs + ) collate_from_data_or_kwargs(self.adapted_dataset, other_dataloader_args) sampler = None if DistributedHelper.is_distributed: sampler = DistributedSampler( - self.adapted_dataset, shuffle=False, drop_last=False) + self.adapted_dataset, + shuffle=other_dataloader_args.pop('shuffle'), + drop_last=other_dataloader_args.get('drop_last')) self.dataloader = DataLoader( self.adapted_dataset, - num_workers=num_workers, - batch_size=self.eval_mb_size, - pin_memory=pin_memory, sampler=sampler, **other_dataloader_args ) diff --git a/tests/distributed/distributed_test_utils.py b/tests/distributed/distributed_test_utils.py new file mode 100644 index 000000000..2a833cd0e --- /dev/null +++ b/tests/distributed/distributed_test_utils.py @@ -0,0 +1,35 @@ +import contextlib +import os + +import torch + +from avalanche.distributed import DistributedHelper + + +def common_dst_tests_setup(): + use_gpu_in_tests = os.environ.get('USE_GPU', 'false').lower() in ['1', 'true'] + use_gpu_in_tests = use_gpu_in_tests and torch.cuda.is_available() + DistributedHelper.init_distributed(1234, use_cuda=use_gpu_in_tests) + return use_gpu_in_tests + + +def check_skip_distributed_test() -> bool: + return os.environ.get('DISTRIBUTED_TESTS', 'false').lower() \ + not in ['1', 'true'] + + +@contextlib.contextmanager +def suppress_dst_tests_output(): + if os.environ['LOCAL_RANK'] != 0: + with contextlib.redirect_stderr(None): + with contextlib.redirect_stdout(None): + yield + else: + yield + + +__all__ = [ + 'common_dst_tests_setup', + 'check_skip_distributed_test', + 'suppress_dst_tests_output' +] diff --git a/tests/distributed/test_distributed_batch.py b/tests/distributed/test_distributed_batch.py index 12d492cb0..3f2e7ce7e 100644 --- a/tests/distributed/test_distributed_batch.py +++ b/tests/distributed/test_distributed_batch.py @@ -1,5 +1,3 @@ -import contextlib -import os import unittest from typing import Tuple @@ -9,24 +7,16 @@ from avalanche.distributed import DistributedHelper, \ make_classification_distributed_batch, CollateDistributedBatch - - -@contextlib.contextmanager -def manage_output(): - if os.environ['LOCAL_RANK'] != 0: - with contextlib.redirect_stderr(None): - with contextlib.redirect_stdout(None): - yield - else: - yield +from tests.distributed.distributed_test_utils import check_skip_distributed_test, suppress_dst_tests_output, \ + common_dst_tests_setup class 
DistributedBatchesTests(unittest.TestCase): def setUp(self) -> None: - DistributedHelper.init_distributed(1234, use_cuda=False) + self.use_gpu_in_tests = common_dst_tests_setup() - @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_classification_batch(self): dt = make_classification_distributed_batch('mb') @@ -55,7 +45,7 @@ def test_classification_batch(self): self.assertTrue(torch.equal(expect, distrib_val[1][8*rank:8*(rank+1)])) - @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_unsupervised_classification_batch(self): dt = make_classification_distributed_batch('mb') @@ -73,7 +63,7 @@ def test_unsupervised_classification_batch(self): self.assertSequenceEqual((8*DistributedHelper.world_size, 1, 28, 28), distrib_val.shape) - @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_tuple_merge_batch_vanilla_collate(self): dt: CollateDistributedBatch[Tuple[Tensor, Tensor]] = CollateDistributedBatch( @@ -105,7 +95,7 @@ def test_tuple_merge_batch_vanilla_collate(self): if __name__ == "__main__": - with manage_output(): + with suppress_dst_tests_output(): verbosity = 1 if DistributedHelper.rank > 0: verbosity = 0 diff --git a/tests/distributed/test_distributed_helper.py b/tests/distributed/test_distributed_helper.py index 2cde8476e..9cc414be8 100644 --- a/tests/distributed/test_distributed_helper.py +++ b/tests/distributed/test_distributed_helper.py @@ -1,4 +1,3 @@ -import contextlib import os import random import unittest @@ -10,37 +9,26 @@ from avalanche.distributed.distributed_helper import RollingSeedContext, BroadcastSeedContext from avalanche.training.determinism.rng_manager import RNGManager - - -@contextlib.contextmanager -def manage_output(): - if os.environ['LOCAL_RANK'] != 0: - with contextlib.redirect_stderr(None): - with contextlib.redirect_stdout(None): - yield - else: - yield +from tests.distributed.distributed_test_utils import check_skip_distributed_test, suppress_dst_tests_output, \ + common_dst_tests_setup class DistributedHelperTests(unittest.TestCase): def setUp(self) -> None: - self.use_gpu_in_tests = os.environ.get('USE_GPU', 'false').lower() in ['1', 'true'] - self.use_gpu_in_tests = self.use_gpu_in_tests and torch.cuda.is_available() - DistributedHelper.init_distributed(1234, use_cuda=self.use_gpu_in_tests) + self.use_gpu_in_tests = common_dst_tests_setup() - @unittest.skipIf(os.environ.get('DISTRIBUTED_TESTS', 'false').lower() not in ['1', 'true'], + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_device_id(self): if self.use_gpu_in_tests: - print('Verify GPU') self.assertEqual(dst.get_rank(), DistributedHelper.get_device_id()) self.assertEqual(torch.device(f'cuda:{dst.get_rank()}'), DistributedHelper.make_device()) else: self.assertEqual(-1, DistributedHelper.get_device_id()) self.assertEqual(torch.device('cpu'), DistributedHelper.make_device()) - @unittest.skipIf(os.environ.get('DISTRIBUTED_TESTS', 'false').lower() not in ['1', 'true'], + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_fields(self): self.assertEqual(dst.get_rank(), DistributedHelper.rank) @@ -49,14 +37,13 @@ def test_fields(self): self.assertEqual(dst.get_rank() == 0, DistributedHelper.is_main_process) if 
self.use_gpu_in_tests: - print('Verify GPU') self.assertEqual('nccl', DistributedHelper.backend) self.assertTrue(DistributedHelper.forced_cuda_comm) else: self.assertEqual('gloo', DistributedHelper.backend) self.assertFalse(DistributedHelper.forced_cuda_comm) - @unittest.skipIf(os.environ.get('DISTRIBUTED_TESTS', 'false').lower() not in ['1', 'true'], + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_rolling_seed_aligner(self): RNGManager.set_random_seeds(4321) @@ -68,7 +55,7 @@ def test_rolling_seed_aligner(self): final_value = random.randint(0, 2 ** 64 - 1) self.assertEqual(14732185405572191734, final_value) - @unittest.skipIf(os.environ.get('DISTRIBUTED_TESTS', 'false').lower() not in ['1', 'true'], + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_broadcast_seed_aligner(self): RNGManager.set_random_seeds(4321) @@ -82,7 +69,7 @@ def test_broadcast_seed_aligner(self): if __name__ == "__main__": - with manage_output(): + with suppress_dst_tests_output(): verbosity = 1 if DistributedHelper.rank > 0: verbosity = 0 diff --git a/tests/distributed/test_distributed_model.py b/tests/distributed/test_distributed_model.py index 95c0ac1cf..b4b54a6b0 100644 --- a/tests/distributed/test_distributed_model.py +++ b/tests/distributed/test_distributed_model.py @@ -1,29 +1,19 @@ -import contextlib -import os import unittest from torch.nn.parallel import DistributedDataParallel from avalanche.distributed import DistributedHelper, DistributedModel from avalanche.models import SimpleMLP - - -@contextlib.contextmanager -def manage_output(): - if os.environ['LOCAL_RANK'] != 0: - with contextlib.redirect_stderr(None): - with contextlib.redirect_stdout(None): - yield - else: - yield +from tests.distributed.distributed_test_utils import check_skip_distributed_test, suppress_dst_tests_output, \ + common_dst_tests_setup class DistributedModelTests(unittest.TestCase): def setUp(self) -> None: - DistributedHelper.init_distributed(1234, use_cuda=False) + self.use_gpu_in_tests = common_dst_tests_setup() - @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_distributed_model(self): dt: DistributedModel = DistributedModel() @@ -73,7 +63,7 @@ def test_distributed_model(self): if __name__ == "__main__": - with manage_output(): + with suppress_dst_tests_output(): verbosity = 1 if DistributedHelper.rank > 0: verbosity = 0 diff --git a/tests/distributed/test_distributed_strategy_support.py b/tests/distributed/test_distributed_strategy_support.py index 686555a12..fc8fc4c9d 100644 --- a/tests/distributed/test_distributed_strategy_support.py +++ b/tests/distributed/test_distributed_strategy_support.py @@ -1,30 +1,28 @@ -import contextlib -import os -import time +import hashlib import unittest import torch +from torch import Tensor +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.utils.data import DistributedSampler, DataLoader from avalanche.distributed import DistributedHelper +from avalanche.distributed.distributed_consistency_verification import hash_dataset from avalanche.distributed.strategies import DistributedMiniBatchStrategySupport - - -@contextlib.contextmanager -def manage_output(): - if os.environ['LOCAL_RANK'] != 0: - with contextlib.redirect_stderr(None): - with contextlib.redirect_stdout(None): - yield - else: - yield +from avalanche.models import SimpleMLP +from avalanche.training import Naive +from 
tests.distributed.distributed_test_utils import check_skip_distributed_test, suppress_dst_tests_output, \ + common_dst_tests_setup +from tests.unit_tests_utils import get_fast_benchmark class DistributedStrategySupportTests(unittest.TestCase): def setUp(self) -> None: - DistributedHelper.init_distributed(1234, use_cuda=False) + self.use_gpu_in_tests = common_dst_tests_setup() - @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_use_local_works(self): uut = DistributedMiniBatchStrategySupport() @@ -75,9 +73,154 @@ def test_use_local_works(self): torch.full((10,), from_rank, dtype=torch.float32), got_mb_output[row_idx])) + def _check_loss_equal(self, uut): + local_loss = uut.local_loss + global_loss = uut.loss + + self.assertIsInstance(local_loss, Tensor) + self.assertIsInstance(global_loss, Tensor) + self.assertEqual(uut.device, local_loss.device) + self.assertEqual(uut.device, global_loss.device) + + all_losses = DistributedHelper.gather_all_objects(float(local_loss)) + # Note: the results of torch.mean are different from the ones + # of statistics.mean + self.assertAlmostEqual( + float(torch.mean(torch.as_tensor(all_losses))), + float(global_loss)) + + def _check_batches_equal(self, uut: Naive, rank: int, mb_size: int, mb_dist_size: int, input_size: int): + local_input_mb = uut.local_mbatch + global_input_mb = uut.mbatch + + self.assertEqual(3, len(local_input_mb)) + self.assertEqual(3, len(global_input_mb)) + + for mb_i, mb_elem in enumerate(local_input_mb): + self.assertIsInstance(mb_elem, Tensor) + self.assertEqual(uut.device, mb_elem.device) + + for mb_i, mb_elem in enumerate(global_input_mb): + self.assertIsInstance(mb_elem, Tensor) + self.assertEqual(uut.device, mb_elem.device) + + self.assertTrue(torch.equal(global_input_mb[0], uut.mb_x)) + self.assertTrue(torch.equal(global_input_mb[1], uut.mb_y)) + self.assertTrue(torch.equal(global_input_mb[2], uut.mb_task_id)) + + self.assertSequenceEqual(local_input_mb[0].shape, [mb_dist_size, input_size]) + self.assertSequenceEqual(local_input_mb[1].shape, [mb_dist_size]) + self.assertSequenceEqual(local_input_mb[2].shape, [mb_dist_size]) + + self.assertSequenceEqual(global_input_mb[0].shape, [mb_size, input_size]) + self.assertSequenceEqual(global_input_mb[1].shape, [mb_size]) + self.assertSequenceEqual(global_input_mb[2].shape, [mb_size]) + + global_index_start = mb_dist_size * rank + global_index_end = global_index_start + mb_dist_size + + for i in range(3): + self.assertTrue(torch.equal(local_input_mb[i], global_input_mb[i][global_index_start:global_index_end])) + + def _check_adapted_datasets_equal(self, uut: Naive): + local_adapted_dataset = uut.adapted_dataset + + DistributedHelper.check_equal_objects( + hash_dataset(local_adapted_dataset, num_workers=4, hash_engine=hashlib.sha1()) + ) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_naive_classification_dst(self): + self.assertTrue(DistributedHelper.is_distributed) + + input_size = 28 * 28 + # mb_size == 60, so that it can be tested using [1, 6] parallel processes + mb_size = 1*2*2*3*4*5 + model = SimpleMLP(input_size=input_size) + optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9) + criterion = CrossEntropyLoss() + device = DistributedHelper.make_device() + + # DST parameters adaptation + mb_size_dst = mb_size // DistributedHelper.world_size + + uut = Naive( + model, + optimizer, + criterion, + train_mb_size=mb_size_dst, + 
eval_mb_size=mb_size_dst, + train_epochs=2, + device=device + ) + + self.assertEqual(device, uut.device) + + if not DistributedHelper.is_main_process: + self.assertEqual(0, len(uut.evaluator.loggers)) + + benchmark = get_fast_benchmark( + n_samples_per_class=400, + n_features=input_size) + + for exp_idx, train_experience in enumerate(benchmark.train_stream): + # TODO: insert checks between iterations + metrics = uut.train(train_experience, drop_last=True) + self._check_batches_equal(uut, DistributedHelper.rank, mb_size, mb_size_dst, input_size) + self._check_loss_equal(uut) + if exp_idx < 2: + # Do it only for the first 2 experiences to speed up tests + self._check_adapted_datasets_equal(uut) + DistributedHelper.check_equal_objects(metrics) + + metrics = uut.eval(benchmark.test_stream, drop_last=True) + self._check_batches_equal(uut, DistributedHelper.rank, mb_size, mb_size_dst, input_size) + self._check_loss_equal(uut) + if exp_idx < 2: + # Do it only for the first 2 experiences to speed up tests + self._check_adapted_datasets_equal(uut) + DistributedHelper.check_equal_objects(metrics) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_pytorch_distributed_sampler(self): + """ + Only used to test the DistributedSampler class from PyTorch. + """ + self.assertTrue(DistributedHelper.is_distributed) + + input_size = 28 * 28 + mb_size = 210 # Can be tested using [1, 10] parallel processes + + # DST parameters adaptation + mb_size_dst = mb_size // DistributedHelper.world_size + + benchmark = get_fast_benchmark( + n_samples_per_class=175 * 4, + n_features=input_size) + + for train_experience in benchmark.train_stream: + dataset = train_experience.dataset + sampler = DistributedSampler( + dataset, + shuffle=True, + drop_last=True + ) + dataloader = DataLoader( + dataset, + batch_size=mb_size_dst, + sampler=sampler, + drop_last=True + ) + + for mb_x, mb_y, mb_t in dataloader: + self.assertSequenceEqual(mb_x.shape, [mb_size_dst, input_size]) + self.assertEqual(len(mb_y), mb_size_dst) + if __name__ == "__main__": - with manage_output(): + with suppress_dst_tests_output(): verbosity = 1 if DistributedHelper.rank > 0: verbosity = 0 diff --git a/tests/distributed/test_distributed_tensor.py b/tests/distributed/test_distributed_tensor.py index 3add6c554..fe366ac42 100644 --- a/tests/distributed/test_distributed_tensor.py +++ b/tests/distributed/test_distributed_tensor.py @@ -1,5 +1,3 @@ -import contextlib -import os import unittest import torch @@ -7,24 +5,16 @@ from avalanche.distributed import DistributedHelper from avalanche.distributed.distributed_tensor import \ DistributedMeanTensor - - -@contextlib.contextmanager -def manage_output(): - if os.environ['LOCAL_RANK'] != 0: - with contextlib.redirect_stderr(None): - with contextlib.redirect_stdout(None): - yield - else: - yield +from tests.distributed.distributed_test_utils import check_skip_distributed_test, suppress_dst_tests_output, \ + common_dst_tests_setup class DistributedTensorTests(unittest.TestCase): def setUp(self) -> None: - DistributedHelper.init_distributed(1234, use_cuda=False) + self.use_gpu_in_tests = common_dst_tests_setup() - @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_one_element_tensor(self): dt = DistributedMeanTensor('dt', torch.zeros((1,), dtype=torch.float32)) @@ -43,7 +33,7 @@ def test_one_element_tensor(self): self.assertEqual(i, float(dt.local_value)) 
self.assertEqual(expected / n, float(dt.value)) - @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_one_element_tensor_random(self): dt = DistributedMeanTensor('dt', torch.zeros((1,), dtype=torch.float32)) @@ -56,7 +46,7 @@ def test_one_element_tensor_random(self): self.assertTrue(torch.allclose(expected, torch.mean(dt.local_value))) self.assertTrue(torch.allclose(expected, dt.value)) - @unittest.skipIf(int(os.environ.get('DISTRIBUTED_TESTS', 0)) != 1, + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_unshaped_tensor(self): dt = DistributedMeanTensor('dt', @@ -81,7 +71,7 @@ def test_unshaped_tensor(self): if __name__ == "__main__": - with manage_output(): + with suppress_dst_tests_output(): verbosity = 1 if DistributedHelper.rank > 0: verbosity = 0 diff --git a/tests/run_dist_tests.py b/tests/run_dist_tests.py index e746c6b05..f1fffc415 100644 --- a/tests/run_dist_tests.py +++ b/tests/run_dist_tests.py @@ -6,6 +6,8 @@ from typing import Union, Set from unittest import TestSuite, TestCase +import click + os.environ['DISTRIBUTED_TESTS'] = '1' @@ -29,16 +31,27 @@ def get_distributed_test_cases(suite: Union[TestCase, TestSuite]) -> Set[str]: return found_cases -def run_distributed_suites(): +@click.command() +@click.argument('test_cases', nargs=-1) +def run_distributed_suites(test_cases): cases_names = get_distributed_test_cases( unittest.defaultTestLoader.discover('.')) # Don't change the path! cases_names = list(sorted(cases_names)) + print(cases_names) + if len(test_cases) > 0: + test_cases = set(test_cases) + cases_names = [x for x in cases_names if x in test_cases] + + if set(cases_names) != test_cases: + print('Some cases have not been found!', test_cases - set(cases_names)) + sys.exit(1) + print('Running', len(cases_names), 'tests') p = None success = True exited = False - use_gpu_in_tests = os.environ.get('USE_GPU', 0).lower() in ['1', 'true'] + use_gpu_in_tests = os.environ.get('USE_GPU', 'false').lower() in ['1', 'true'] if use_gpu_in_tests: print('Running tests using GPUs') import torch @@ -70,10 +83,10 @@ def run_distributed_suites(): if success: print('Tests completed successfully') - exit(0) + sys.exit(0) else: print('Tests terminated with errors') - exit(1) + sys.exit(1) if __name__ == '__main__': diff --git a/tests/training/test_online_strategies.py b/tests/training/test_online_strategies.py index fdced8935..f1820371e 100644 --- a/tests/training/test_online_strategies.py +++ b/tests/training/test_online_strategies.py @@ -53,7 +53,7 @@ def test_naive(self): train_mb_size=1, device=self.device, eval_mb_size=50, - evaluator=default_evaluator(), + evaluator=default_evaluator, ) ocl_benchmark = OnlineCLScenario(benchmark_streams, access_task_boundaries=True) @@ -68,7 +68,7 @@ def test_naive(self): train_mb_size=1, device=self.device, eval_mb_size=50, - evaluator=default_evaluator(), + evaluator=default_evaluator, ) ocl_benchmark = OnlineCLScenario(benchmark_streams, access_task_boundaries=False) From da5c58cd47ebde41a47c969925b0741a44507b69 Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Wed, 23 Nov 2022 17:20:26 +0000 Subject: [PATCH 12/16] Improved distributed strategy unit tests. Fixed PEP8 issues. 
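
Note: this patch also normalizes objects with base_typed() before comparing
them across ranks in DistributedHelper.check_equal_objects(), so that metric
dictionaries mixing tensors, numpy scalars and plain Python numbers are
compared by content rather than by type. A minimal usage sketch follows; it
is not part of the patch, assumes the helper names introduced here, and is
meant to be launched through torch.distributed.run:

    import torch

    from avalanche.distributed import DistributedHelper
    from avalanche.distributed.distributed_helper import base_typed

    DistributedHelper.init_distributed(1234, use_cuda=False)

    # Tensors (and numpy values) are converted to plain Python values,
    # so the cross-rank equality check does not fail on type differences.
    local_metrics = {'Loss_Stream': torch.tensor(0.5),
                     'Top1_Acc_Stream': 0.5}
    assert base_typed(local_metrics) == {'Loss_Stream': 0.5,
                                         'Top1_Acc_Stream': 0.5}

    # Raises ValueError if any rank holds a different (normalized) value.
    DistributedHelper.check_equal_objects(local_metrics)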
--- .../scenarios/classification_scenario.py | 3 +- .../scenarios/detection_scenario.py | 6 +- .../scenarios/lazy_dataset_sequence.py | 3 +- .../benchmarks/utils/collate_functions.py | 19 ++- avalanche/benchmarks/utils/data_attribute.py | 6 +- .../benchmarks/utils/detection_dataset.py | 22 ++-- avalanche/benchmarks/utils/flat_data.py | 6 +- avalanche/distributed/distributed_batch.py | 6 +- .../distributed_consistency_verification.py | 1 - avalanche/distributed/distributed_helper.py | 44 ++++++- .../strategies/distributed_mbatch_strategy.py | 13 ++- avalanche/training/supervised/cumulative.py | 4 +- .../supervised/naive_object_detection.py | 11 +- avalanche/training/templates/base_sgd.py | 20 ++-- .../observation_type/online_observation.py | 7 +- .../problem_type/supervised_problem.py | 7 +- tests/distributed/distributed_test_utils.py | 3 +- tests/distributed/test_distributed_batch.py | 20 ++-- tests/distributed/test_distributed_helper.py | 13 ++- tests/distributed/test_distributed_model.py | 3 +- .../test_distributed_strategy_support.py | 110 +++++++++++++++--- tests/distributed/test_distributed_tensor.py | 3 +- tests/run_dist_tests.py | 9 +- 23 files changed, 246 insertions(+), 93 deletions(-) diff --git a/avalanche/benchmarks/scenarios/classification_scenario.py b/avalanche/benchmarks/scenarios/classification_scenario.py index 3e4617871..3d875b64a 100644 --- a/avalanche/benchmarks/scenarios/classification_scenario.py +++ b/avalanche/benchmarks/scenarios/classification_scenario.py @@ -31,7 +31,8 @@ from avalanche.benchmarks.scenarios.lazy_dataset_sequence import ( LazyDatasetSequence, ) -from avalanche.benchmarks.utils import make_classification_dataset, AvalancheDataset +from avalanche.benchmarks.utils import \ + make_classification_dataset, AvalancheDataset from avalanche.benchmarks.utils.dataset_utils import manage_advanced_indexing TGenericCLClassificationScenario = TypeVar( diff --git a/avalanche/benchmarks/scenarios/detection_scenario.py b/avalanche/benchmarks/scenarios/detection_scenario.py index b90f51a61..18cef4d08 100644 --- a/avalanche/benchmarks/scenarios/detection_scenario.py +++ b/avalanche/benchmarks/scenarios/detection_scenario.py @@ -106,10 +106,12 @@ def classes_in_experience( return _LazyStreamClassesInDetectionExps(self) -class _LazyStreamClassesInDetectionExps(Mapping[str, Sequence[Optional[Set[int]]]]): +class _LazyStreamClassesInDetectionExps(Mapping[str, + Sequence[Optional[Set[int]]]]): def __init__(self, benchmark: GenericCLScenario): self._benchmark = benchmark - self._default_lcie = _LazyClassesInDetectionExps(benchmark, stream="train") + self._default_lcie = _LazyClassesInDetectionExps( + benchmark, stream="train") def __len__(self): return len(self._benchmark.stream_definitions) diff --git a/avalanche/benchmarks/scenarios/lazy_dataset_sequence.py b/avalanche/benchmarks/scenarios/lazy_dataset_sequence.py index 044dfd9af..46ce1b8c5 100644 --- a/avalanche/benchmarks/scenarios/lazy_dataset_sequence.py +++ b/avalanche/benchmarks/scenarios/lazy_dataset_sequence.py @@ -12,7 +12,8 @@ from collections import defaultdict from typing import Sequence, Iterable, Dict, Optional, Iterator -from avalanche.benchmarks.utils import make_classification_dataset, AvalancheDataset +from avalanche.benchmarks.utils import \ + make_classification_dataset, AvalancheDataset class LazyDatasetSequence(Sequence[make_classification_dataset]): diff --git a/avalanche/benchmarks/utils/collate_functions.py b/avalanche/benchmarks/utils/collate_functions.py index e5ff22e52..2088423af 100644 --- 
a/avalanche/benchmarks/utils/collate_functions.py +++ b/avalanche/benchmarks/utils/collate_functions.py @@ -128,8 +128,9 @@ def collate_fn(self, batch: Sequence[ExampleT]) -> BatchT: the dataset. PyTorch official documentation described the default_collate_fn as: - "Function that takes in a batch of data and puts the elements within the batch - into a tensor with an additional outer dimension - batch size." + "Function that takes in a batch of data and puts the elements within + the batch into a tensor with an additional + outer dimension - batch size." :param batch: The list of examples. :return: The batch. @@ -177,10 +178,12 @@ def collate_single_value_batches_fn( This function expects a list of pre-batched features. - :param feature_batches: A list of batched features to be merged together. + :param feature_batches: A list of batched features to be merged + together. :param feature_idx: The index of the feature being batched. This may be useful to customize how features are merged. - :return: A batch of featured made by collating the input batched featured. + :return: A batch of features made by collating the input batched + features. """ pass @@ -198,7 +201,10 @@ class ClassificationCollate(Collate[Tuple[Tensor, ...], Tuple[Tensor, ...]]): def collate_fn(self, batch): return default_collate(batch) - def collate_single_value_fn(self, feature_batch: Sequence[Tensor], feature_idx): + def collate_single_value_fn( + self, + feature_batch: Sequence[Tensor], + feature_idx): return torch.stack(feature_batch) def collate_batches_fn(self, batches): @@ -217,7 +223,8 @@ def collate_single_value_batches_fn( return torch.cat(feature_batch, dim=0) -class DetectionCollate(Collate[Tuple[Tensor, Dict, int], Tuple[Tuple[Tensor], Tuple[Dict], Tuple[int]]]): +class DetectionCollate(Collate[Tuple[Tensor, Dict, int], + Tuple[Tuple[Tensor], Tuple[Dict], Tuple[int]]]): def collate_fn(self, batch): return detection_collate_fn(batch) diff --git a/avalanche/benchmarks/utils/data_attribute.py b/avalanche/benchmarks/utils/data_attribute.py index 9cc50a5a1..6780264d0 100644 --- a/avalanche/benchmarks/utils/data_attribute.py +++ b/avalanche/benchmarks/utils/data_attribute.py @@ -36,7 +36,11 @@ class labels. Data attributes can be efficiently concatenated and subsampled. """ - def __init__(self, data: IDataset[DataT], name: str = None, use_in_getitem: bool = False): + def __init__( + self, + data: IDataset[DataT], + name: str = None, + use_in_getitem: bool = False): """Data Attribute. :param data: a sequence of values, one for each sample. 
diff --git a/avalanche/benchmarks/utils/detection_dataset.py b/avalanche/benchmarks/utils/detection_dataset.py index b7045ed9e..5652e0a86 100644 --- a/avalanche/benchmarks/utils/detection_dataset.py +++ b/avalanche/benchmarks/utils/detection_dataset.py @@ -81,10 +81,12 @@ def __call__(self, input_value: Any) -> Any: ConcatDataset, ] -DetectionExampleT = Tuple[Tensor, TTargetType, int] # Image (tensor), target dict, task label +# Image (tensor), target dict, task label +DetectionExampleT = Tuple[Tensor, TTargetType, int] -class DetectionDataset(AvalancheDataset, IDatasetWithTargets[DetectionExampleT, TTargetType]): +class DetectionDataset(AvalancheDataset, + IDatasetWithTargets[DetectionExampleT, TTargetType]): def __init__(self, *args, **kwargs): # Here defined only to provide type hinting self.targets_task_labels: DataAttribute[int] = DataAttribute( @@ -144,9 +146,10 @@ def make_detection_dataset( This dataset applies input/target transformations, it supports slicing and advanced indexing and it also contains useful fields as - `targets`, which contains the pattern dictionaries, and `targets_task_labels`, - which contains the pattern task labels. The `task_set` field can be used to - obtain a the subset of patterns labeled with a given task label. + `targets`, which contains the pattern dictionaries, and + `targets_task_labels`, which contains the pattern task labels. + The `task_set` field can be used to obtain a the subset of patterns + labeled with a given task label. This dataset can also be used to apply several advanced operations involving transformations. For instance, it allows the user to add and replace @@ -366,7 +369,8 @@ def _detection_class_mapping_transform(class_mapping, example_target_dict): # example_target_dict["labels"] is a tensor containing one label # for each bounding box in the image. We need to remap each of them example_target_labels = example_target_dict["labels"] - example_mapped_labels = [class_mapping[int(el)] for el in example_target_labels] + example_mapped_labels = [class_mapping[int(el)] for el + in example_target_labels] if isinstance(example_target_labels, Tensor): example_mapped_labels = torch.as_tensor(example_mapped_labels) @@ -486,8 +490,10 @@ def detection_subset( if targets is None: targets = dataset.targets - tgs = [_detection_class_mapping_transform(class_mapping, example_target_dict) - for example_target_dict in targets] + tgs = [ + _detection_class_mapping_transform( + class_mapping, example_target_dict) + for example_target_dict in targets] targets = DataAttribute(tgs, "targets") diff --git a/avalanche/benchmarks/utils/flat_data.py b/avalanche/benchmarks/utils/flat_data.py index 46c20037b..eaa3214cb 100644 --- a/avalanche/benchmarks/utils/flat_data.py +++ b/avalanche/benchmarks/utils/flat_data.py @@ -72,7 +72,8 @@ def _get_indices(self): else: return list(range(len(self))) - def subset(self: FlatDataImplT, indices: Optional[List[int]]) -> FlatDataImplT: + def subset(self: FlatDataImplT, indices: Optional[List[int]]) \ + -> FlatDataImplT: """Subsampling operation. 
:param indices: indices of the new samples @@ -243,7 +244,8 @@ def __str__(self): ) -def _flatten_dataset_list(datasets: List[IDataset[DataT]]) -> List[IDataset[DataT]]: +def _flatten_dataset_list(datasets: List[IDataset[DataT]]) \ + -> List[IDataset[DataT]]: """Flatten dataset tree if possible.""" # Concat -> Concat branch # Flattens by borrowing the list of concatenated datasets diff --git a/avalanche/distributed/distributed_batch.py b/avalanche/distributed/distributed_batch.py index f33bf3de3..0fd3ed858 100644 --- a/avalanche/distributed/distributed_batch.py +++ b/avalanche/distributed/distributed_batch.py @@ -93,7 +93,8 @@ def _merge_tuples(self, tuples: List[LocalT]): return tuple(merged_elements) except OnlyTupleSynchronizationSupported: - raise RuntimeError('[DistributedBatch] No proper collate function set.') + raise RuntimeError( + '[DistributedBatch] No proper collate function set.') @abstractmethod def _merge_single_values(self, values: List, value_index: int): @@ -125,7 +126,8 @@ def _unroll_minibatch(self, tuples: List[LocalT]) -> List[LocalT]: for mb_element_idx in range(mb_size): mb_element = [] for tuple_element_idx in range(n_elements): - mb_element.append(local_tuple[tuple_element_idx][mb_element_idx]) + mb_element.append( + local_tuple[tuple_element_idx][mb_element_idx]) unrolled_elements.append(tuple(mb_element)) return unrolled_elements diff --git a/avalanche/distributed/distributed_consistency_verification.py b/avalanche/distributed/distributed_consistency_verification.py index 502c37448..25478b740 100644 --- a/avalanche/distributed/distributed_consistency_verification.py +++ b/avalanche/distributed/distributed_consistency_verification.py @@ -23,7 +23,6 @@ def hash_benchmark(benchmark: 'GenericCLScenario') -> str: def hash_dataset(dataset: 'Dataset', *, hash_engine=None, num_workers=0) -> str: - from avalanche.distributed import DistributedHelper if hash_engine is None: hash_engine = hashlib.sha256() diff --git a/avalanche/distributed/distributed_helper.py b/avalanche/distributed/distributed_helper.py index 46f36309a..5f8236609 100644 --- a/avalanche/distributed/distributed_helper.py +++ b/avalanche/distributed/distributed_helper.py @@ -2,7 +2,7 @@ import pickle import warnings from io import BytesIO -from typing import Optional, List +from typing import Optional, List, Any, Iterable, Dict import torch from torch import Tensor @@ -11,7 +11,8 @@ from torch.nn.parallel import DistributedDataParallel from typing_extensions import Literal -from avalanche.distributed.distributed_consistency_verification import hash_tensor +from avalanche.distributed.distributed_consistency_verification import \ + hash_tensor class _Singleton(type): @@ -370,13 +371,16 @@ def check_equal_objects(self, obj): if not DistributedHelper.is_distributed: return - output = [None for _ in range(self.world_size)] + output: List[Any] = [None for _ in range(self.world_size)] torch.distributed.all_gather_object(output, obj) + obj_bt = base_typed(obj) + for i, o in enumerate(output): - if obj != o: + o_bt = base_typed(o) + if obj_bt != o_bt: raise ValueError( - 'Different object ranks this={}, remote={}. ' + 'Different objects (ranks this={}, remote={}). 
' 'Got this={}, remote={}'.format( self.rank, i, obj, o)) @@ -429,6 +433,36 @@ def forced_cuda_comm(self) -> bool: return self.backend == 'nccl' +BASE_TYPES = [str, int, float, bool, type(None)] + + +def base_typed(obj): + """ + Improved version of https://stackoverflow.com/a/62420097 + """ + T = type(obj) + from_numpy = T.__module__ == 'numpy' + from_pytorch = T.__module__ == 'torch' + + if from_numpy or from_pytorch: + print(T.__module__) + return obj.tolist() + + if T in BASE_TYPES or callable(obj) or ((from_numpy or from_pytorch) + and not isinstance(T, Iterable)): + return obj + + if isinstance(obj, Dict): + return {base_typed(k): base_typed(v) for k, v in obj.items()} + elif isinstance(obj, Iterable): + base_items = [base_typed(item) for item in obj] + return base_items if (from_numpy or from_pytorch) else T(base_items) + + d = obj if T is dict else obj.__dict__ + + return {k: base_typed(v) for k, v in d.items()} + + DistributedHelper = _DistributedHelperCls() diff --git a/avalanche/distributed/strategies/distributed_mbatch_strategy.py b/avalanche/distributed/strategies/distributed_mbatch_strategy.py index 22a2e4e72..3f02a80de 100644 --- a/avalanche/distributed/strategies/distributed_mbatch_strategy.py +++ b/avalanche/distributed/strategies/distributed_mbatch_strategy.py @@ -2,7 +2,7 @@ from avalanche.benchmarks.utils import AvalancheDataset from avalanche.benchmarks.utils.collate_functions import \ - classification_collate_mbatches_fn, classification_single_values_collate_fn, Collate, ClassificationCollate + Collate, ClassificationCollate from avalanche.distributed import CollateDistributedBatch from avalanche.distributed.strategies import DistributedStrategySupport @@ -16,15 +16,15 @@ def __init__(self): self._mbatch = CollateDistributedBatch( 'mbatch', None, - classification_collate_mbatches_fn, - classification_single_values_collate_fn + default_collate_impl.collate_fn, + default_collate_impl.collate_single_value_fn ) self._mb_output = CollateDistributedBatch( 'mb_output', None, - classification_collate_mbatches_fn, - classification_single_values_collate_fn + default_collate_impl.collate_fn, + default_collate_impl.collate_single_value_fn ) self._adapted_dataset: Optional[AvalancheDataset] = None @@ -168,7 +168,8 @@ def collate_fn(self, new_collate): if isinstance(new_collate, Collate): self.input_batch_collate_fn = new_collate.collate_fn - self.input_batch_single_values_collate_fn = new_collate.collate_single_value_fn + self.input_batch_single_values_collate_fn = \ + new_collate.collate_single_value_fn else: self.input_batch_collate_fn = new_collate self.input_batch_single_values_collate_fn = None diff --git a/avalanche/training/supervised/cumulative.py b/avalanche/training/supervised/cumulative.py index d413d6a15..612d35fee 100644 --- a/avalanche/training/supervised/cumulative.py +++ b/avalanche/training/supervised/cumulative.py @@ -2,12 +2,10 @@ from torch.nn import Module from torch.optim import Optimizer -from torch.utils.data import ConcatDataset -from avalanche.benchmarks.utils import concat_classification_datasets from avalanche.benchmarks.utils.utils import concat_datasets +from avalanche.training.plugins import SupervisedPlugin from avalanche.training.plugins.evaluation import default_evaluator -from avalanche.training.plugins import SupervisedPlugin, EvaluationPlugin from avalanche.training.templates import SupervisedTemplate diff --git a/avalanche/training/supervised/naive_object_detection.py b/avalanche/training/supervised/naive_object_detection.py index 
cd0e18934..c549af229 100644 --- a/avalanche/training/supervised/naive_object_detection.py +++ b/avalanche/training/supervised/naive_object_detection.py @@ -140,8 +140,8 @@ def make_train_dataloader( :param shuffle: True if the data should be shuffled, False otherwise. :param pin_memory: If True, the data loader will copy Tensors into CUDA pinned memory before returning them. Defaults to None, which means - that the value will be determined by looking at the strategy `device` - field. + that the value will be determined by looking at the strategy + `device` field. :param persistent_workers: If True, the data loader will not shutdown the worker processes after a dataset has been consumed once. Used only if `PyTorch >= 1.7.0`. @@ -170,7 +170,8 @@ def make_eval_dataloader( pin_memory=None, persistent_workers=False, drop_last=False, - **kwargs): + **kwargs + ): """ :param num_workers: How many subprocesses to use for data loading. @@ -179,8 +180,8 @@ def make_eval_dataloader( :param shuffle: True if the data should be shuffled, False otherwise. :param pin_memory: If True, the data loader will copy Tensors into CUDA pinned memory before returning them. Defaults to None, which means - that the value will be determined by looking at the strategy `device` - field. + that the value will be determined by looking at the strategy + `device` field. :param persistent_workers: If True, the data loader will not shut down the worker processes after a dataset has been consumed once. Please refer to PyTorch `DataLoader` class for more details. diff --git a/avalanche/training/templates/base_sgd.py b/avalanche/training/templates/base_sgd.py index 0c9e5f5ba..01c48974d 100644 --- a/avalanche/training/templates/base_sgd.py +++ b/avalanche/training/templates/base_sgd.py @@ -50,7 +50,8 @@ def __init__( eval_mb_size: Optional[int] = 1, device="cpu", plugins: Optional[List["SupervisedPlugin"]] = None, - evaluator: Union[EvaluationPlugin, Callable[[], EvaluationPlugin]] = default_evaluator, + evaluator: Union[EvaluationPlugin, + Callable[[], EvaluationPlugin]] = default_evaluator, eval_every=-1, peval_mode="epoch", ): @@ -308,7 +309,6 @@ def train_dataset_adaptation(self, **kwargs): self.adapted_dataset = self.experience.dataset self.adapted_dataset = self.adapted_dataset.train() - def _load_train_state(self, prev_state): super()._load_train_state(prev_state) self.adapted_dataset = prev_state["adapted_dataset"] @@ -335,8 +335,9 @@ def _obtain_common_dataloader_parameters(self, **kwargs): The resulting dataset does not include the collate function. Overriding this function can be useful if particular/runtime computed - parameters are needed. However, when overriding, it is recommended to first - call this implementation (super) to obtain a base parameters dictionary. + parameters are needed. However, when overriding, it is recommended to + first call this implementation (super) to obtain a base dictionary of + parameters . :param kwargs: The dataloader arguments as passed to the `train` or `eval` method. @@ -347,7 +348,8 @@ def _obtain_common_dataloader_parameters(self, **kwargs): if 'persistent_workers' in kwargs: if parse_version(torch.__version__) >= parse_version("1.7.0"): - other_dataloader_args["persistent_workers"] = kwargs['persistent_workers'] + other_dataloader_args["persistent_workers"] = \ + kwargs['persistent_workers'] for k, v in kwargs.items(): other_dataloader_args[k] = v @@ -374,8 +376,8 @@ def make_train_dataloader( :param shuffle: True if the data should be shuffled, False otherwise. 
:param pin_memory: If True, the data loader will copy Tensors into CUDA pinned memory before returning them. Defaults to None, which means - that the value will be determined by looking at the strategy `device` - field. + that the value will be determined by looking at the strategy + `device` field. :param persistent_workers: If True, the data loader will not shut down the worker processes after a dataset has been consumed once. Please refer to PyTorch `DataLoader` class for more details. @@ -414,8 +416,8 @@ def make_eval_dataloader( :param shuffle: True if the data should be shuffled, False otherwise. :param pin_memory: If True, the data loader will copy Tensors into CUDA pinned memory before returning them. Defaults to None, which means - that the value will be determined by looking at the strategy `device` - field. + that the value will be determined by looking at the strategy + `device` field. :param persistent_workers: If True, the data loader will not shut down the worker processes after a dataset has been consumed once. Please refer to PyTorch `DataLoader` class for more details. diff --git a/avalanche/training/templates/observation_type/online_observation.py b/avalanche/training/templates/observation_type/online_observation.py index aa8b4565c..10590e8c2 100644 --- a/avalanche/training/templates/observation_type/online_observation.py +++ b/avalanche/training/templates/observation_type/online_observation.py @@ -1,4 +1,4 @@ -from typing import Iterable, final +from typing import final from avalanche.benchmarks import OnlineCLExperience from avalanche.models.dynamic_optimizers import reset_optimizer @@ -66,8 +66,9 @@ def _model_adaptation(self, model=None): def check_model_and_optimizer(self): with self.use_local_model(): # If strategy has access to the task boundaries, and the current - # sub-experience is the first sub-experience in the online (sub-)stream, - # then adapt the model with the full origin experience: + # sub-experience is the first sub-experience in the online + # (sub-)stream, then adapt the model with the full origin + # experience: if self.experience.access_task_boundaries: if self.experience.is_first_subexp: self.model = self.model_adaptation() diff --git a/avalanche/training/templates/problem_type/supervised_problem.py b/avalanche/training/templates/problem_type/supervised_problem.py index 66dac0e69..0bc94c19f 100644 --- a/avalanche/training/templates/problem_type/supervised_problem.py +++ b/avalanche/training/templates/problem_type/supervised_problem.py @@ -1,6 +1,5 @@ from typing import final -from avalanche.distributed.strategies import DistributedMiniBatchStrategySupport, DistributedModelStrategySupport from avalanche.models import avalanche_forward @@ -23,9 +22,9 @@ def mb_task_id(self): def criterion(self): """Loss function for supervised problems.""" - with self.use_local_output_batch(): # Force self.mb_output to be from local batch - with self.use_local_input_batch(): # Force self.mb_y to be from local batch - + # Force self.mb_output and self.mb_y to be from local batch + with self.use_local_output_batch(): + with self.use_local_input_batch(): return self._criterion(self.mb_output, self.mb_y) @final diff --git a/tests/distributed/distributed_test_utils.py b/tests/distributed/distributed_test_utils.py index 2a833cd0e..bbdb974fb 100644 --- a/tests/distributed/distributed_test_utils.py +++ b/tests/distributed/distributed_test_utils.py @@ -7,7 +7,8 @@ def common_dst_tests_setup(): - use_gpu_in_tests = os.environ.get('USE_GPU', 'false').lower() in ['1', 'true'] 
+ use_gpu_in_tests = os.environ.get('USE_GPU', 'false').lower() in [ + '1', 'true'] use_gpu_in_tests = use_gpu_in_tests and torch.cuda.is_available() DistributedHelper.init_distributed(1234, use_cuda=use_gpu_in_tests) return use_gpu_in_tests diff --git a/tests/distributed/test_distributed_batch.py b/tests/distributed/test_distributed_batch.py index 3f2e7ce7e..227d7de9c 100644 --- a/tests/distributed/test_distributed_batch.py +++ b/tests/distributed/test_distributed_batch.py @@ -7,7 +7,8 @@ from avalanche.distributed import DistributedHelper, \ make_classification_distributed_batch, CollateDistributedBatch -from tests.distributed.distributed_test_utils import check_skip_distributed_test, suppress_dst_tests_output, \ +from tests.distributed.distributed_test_utils import \ + check_skip_distributed_test, suppress_dst_tests_output, \ common_dst_tests_setup @@ -66,11 +67,12 @@ def test_unsupervised_classification_batch(self): @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_tuple_merge_batch_vanilla_collate(self): - dt: CollateDistributedBatch[Tuple[Tensor, Tensor]] = CollateDistributedBatch( - 'mb', - None, - default_collate, - None) + dt: CollateDistributedBatch[Tuple[Tensor, Tensor]] = \ + CollateDistributedBatch( + 'mb', + None, + default_collate, + None) self.assertEqual(None, dt.local_value) self.assertEqual(None, dt.value) @@ -90,8 +92,10 @@ def test_tuple_merge_batch_vanilla_collate(self): expect = torch.full((8,), rank, dtype=torch.long) - self.assertTrue(torch.equal(expect, - distrib_val[1][8 * rank:8 * (rank + 1)])) + self.assertTrue( + torch.equal( + expect, + distrib_val[1][8 * rank:8 * (rank + 1)])) if __name__ == "__main__": diff --git a/tests/distributed/test_distributed_helper.py b/tests/distributed/test_distributed_helper.py index 9cc414be8..add3ace28 100644 --- a/tests/distributed/test_distributed_helper.py +++ b/tests/distributed/test_distributed_helper.py @@ -1,4 +1,3 @@ -import os import random import unittest @@ -6,10 +5,12 @@ import torch.distributed as dst from avalanche.distributed import DistributedHelper -from avalanche.distributed.distributed_helper import RollingSeedContext, BroadcastSeedContext +from avalanche.distributed.distributed_helper import \ + RollingSeedContext, BroadcastSeedContext from avalanche.training.determinism.rng_manager import RNGManager -from tests.distributed.distributed_test_utils import check_skip_distributed_test, suppress_dst_tests_output, \ +from tests.distributed.distributed_test_utils import \ + check_skip_distributed_test, suppress_dst_tests_output, \ common_dst_tests_setup @@ -23,10 +24,12 @@ def setUp(self) -> None: def test_device_id(self): if self.use_gpu_in_tests: self.assertEqual(dst.get_rank(), DistributedHelper.get_device_id()) - self.assertEqual(torch.device(f'cuda:{dst.get_rank()}'), DistributedHelper.make_device()) + self.assertEqual(torch.device(f'cuda:{dst.get_rank()}'), + DistributedHelper.make_device()) else: self.assertEqual(-1, DistributedHelper.get_device_id()) - self.assertEqual(torch.device('cpu'), DistributedHelper.make_device()) + self.assertEqual(torch.device('cpu'), + DistributedHelper.make_device()) @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') diff --git a/tests/distributed/test_distributed_model.py b/tests/distributed/test_distributed_model.py index b4b54a6b0..c976c8d6c 100644 --- a/tests/distributed/test_distributed_model.py +++ b/tests/distributed/test_distributed_model.py @@ -4,7 +4,8 @@ from avalanche.distributed import DistributedHelper, 
DistributedModel from avalanche.models import SimpleMLP -from tests.distributed.distributed_test_utils import check_skip_distributed_test, suppress_dst_tests_output, \ +from tests.distributed.distributed_test_utils import \ + check_skip_distributed_test, suppress_dst_tests_output, \ common_dst_tests_setup diff --git a/tests/distributed/test_distributed_strategy_support.py b/tests/distributed/test_distributed_strategy_support.py index fc8fc4c9d..aee8e9836 100644 --- a/tests/distributed/test_distributed_strategy_support.py +++ b/tests/distributed/test_distributed_strategy_support.py @@ -1,4 +1,5 @@ import hashlib +import math import unittest import torch @@ -7,12 +8,19 @@ from torch.optim import SGD from torch.utils.data import DistributedSampler, DataLoader +from avalanche.core import SupervisedPlugin from avalanche.distributed import DistributedHelper -from avalanche.distributed.distributed_consistency_verification import hash_dataset +from avalanche.distributed.distributed_consistency_verification import \ + hash_dataset from avalanche.distributed.strategies import DistributedMiniBatchStrategySupport +from avalanche.evaluation.metrics import accuracy_metrics, loss_metrics, \ + confusion_matrix_metrics, topk_acc_metrics, class_accuracy_metrics, \ + amca_metrics from avalanche.models import SimpleMLP from avalanche.training import Naive -from tests.distributed.distributed_test_utils import check_skip_distributed_test, suppress_dst_tests_output, \ +from avalanche.training.plugins import EvaluationPlugin +from tests.distributed.distributed_test_utils import \ + check_skip_distributed_test, suppress_dst_tests_output, \ common_dst_tests_setup from tests.unit_tests_utils import get_fast_benchmark @@ -89,7 +97,8 @@ def _check_loss_equal(self, uut): float(torch.mean(torch.as_tensor(all_losses))), float(global_loss)) - def _check_batches_equal(self, uut: Naive, rank: int, mb_size: int, mb_dist_size: int, input_size: int): + def _check_batches_equal(self, uut: Naive, rank: int, mb_size: int, + mb_dist_size: int, input_size: int): local_input_mb = uut.local_mbatch global_input_mb = uut.mbatch @@ -108,11 +117,13 @@ def _check_batches_equal(self, uut: Naive, rank: int, mb_size: int, mb_dist_size self.assertTrue(torch.equal(global_input_mb[1], uut.mb_y)) self.assertTrue(torch.equal(global_input_mb[2], uut.mb_task_id)) - self.assertSequenceEqual(local_input_mb[0].shape, [mb_dist_size, input_size]) + self.assertSequenceEqual(local_input_mb[0].shape, + [mb_dist_size, input_size]) self.assertSequenceEqual(local_input_mb[1].shape, [mb_dist_size]) self.assertSequenceEqual(local_input_mb[2].shape, [mb_dist_size]) - self.assertSequenceEqual(global_input_mb[0].shape, [mb_size, input_size]) + self.assertSequenceEqual(global_input_mb[0].shape, + [mb_size, input_size]) self.assertSequenceEqual(global_input_mb[1].shape, [mb_size]) self.assertSequenceEqual(global_input_mb[2].shape, [mb_size]) @@ -120,13 +131,18 @@ def _check_batches_equal(self, uut: Naive, rank: int, mb_size: int, mb_dist_size global_index_end = global_index_start + mb_dist_size for i in range(3): - self.assertTrue(torch.equal(local_input_mb[i], global_input_mb[i][global_index_start:global_index_end])) + self.assertTrue( + torch.equal( + local_input_mb[i], + global_input_mb[i][global_index_start:global_index_end])) def _check_adapted_datasets_equal(self, uut: Naive): local_adapted_dataset = uut.adapted_dataset DistributedHelper.check_equal_objects( - hash_dataset(local_adapted_dataset, num_workers=4, hash_engine=hashlib.sha1()) + 
hash_dataset(local_adapted_dataset, + num_workers=4, + hash_engine=hashlib.sha1()) ) @unittest.skipIf(check_skip_distributed_test(), @@ -135,8 +151,8 @@ def test_naive_classification_dst(self): self.assertTrue(DistributedHelper.is_distributed) input_size = 28 * 28 - # mb_size == 60, so that it can be tested using [1, 6] parallel processes - mb_size = 1*2*2*3*4*5 + # mb_size == 60 so that it can be tested using [1, 6] parallel processes + mb_size = 1*2*2*3*5 model = SimpleMLP(input_size=input_size) optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9) criterion = CrossEntropyLoss() @@ -145,6 +161,48 @@ def test_naive_classification_dst(self): # DST parameters adaptation mb_size_dst = mb_size // DistributedHelper.world_size + class IterationCheckerPlugin(SupervisedPlugin): + + def __init__(self, test_suite): + super().__init__() + self.test_suite = test_suite + + def after_training_iteration(self, strategy, *args, **kwargs): + self._check_aligned(strategy) + + def after_eval_iteration(self, strategy, *args, **kwargs): + self._check_aligned(strategy) + + def _check_aligned(self, strategy: Naive): + + is_last_iteration = strategy.clock.train_epoch_iterations == \ + (len(strategy.dataloader) - 1) + if is_last_iteration: + return + + self.test_suite._check_batches_equal( + strategy, + DistributedHelper.rank, + mb_size, + mb_size_dst, + input_size) + self.test_suite._check_loss_equal(strategy) + + metrics = EvaluationPlugin( + accuracy_metrics(minibatch=True, epoch=True, + experience=True, stream=True), + loss_metrics(minibatch=True, epoch=True, + experience=True, stream=True), + confusion_matrix_metrics(save_image=False, + stream=True), + topk_acc_metrics(minibatch=True, epoch=True, + experience=True, stream=True), + class_accuracy_metrics(minibatch=True, epoch=True, + experience=True, stream=True), + amca_metrics(), + loggers='default' + ) + uut = Naive( model, optimizer, @@ -152,7 +210,9 @@ def test_naive_classification_dst(self): train_mb_size=mb_size_dst, eval_mb_size=mb_size_dst, train_epochs=2, - device=device + device=device, + plugins=[IterationCheckerPlugin(self)], + evaluator=metrics ) self.assertEqual(device, uut.device) @@ -161,13 +221,31 @@ def test_naive_classification_dst(self): self.assertEqual(0, len(uut.evaluator.loggers)) benchmark = get_fast_benchmark( - n_samples_per_class=400, + n_samples_per_class=250, n_features=input_size) for exp_idx, train_experience in enumerate(benchmark.train_stream): - # TODO: insert checks between iterations - metrics = uut.train(train_experience, drop_last=True) - self._check_batches_equal(uut, DistributedHelper.rank, mb_size, mb_size_dst, input_size) + metrics = uut.train(train_experience, drop_last=False) + + # Check that drop_last=False works correctly + train_dataset_sz = len(uut.adapted_dataset) + world_size = DistributedHelper.world_size + last_mb_size_without_dropping = \ + math.ceil(train_dataset_sz / world_size) * world_size % mb_size + if last_mb_size_without_dropping == 0: + # Corner case: no drop needed + last_mb_size_without_dropping = mb_size + last_mb_size_without_dropping_dst = \ + last_mb_size_without_dropping // world_size + + self._check_batches_equal( + uut, + DistributedHelper.rank, + last_mb_size_without_dropping, + last_mb_size_without_dropping_dst, + input_size) + + # Other checks self._check_loss_equal(uut) if exp_idx < 2: # Do it only for the first 2 experiences to speed up tests @@ -175,7 +253,9 @@ def test_naive_classification_dst(self): DistributedHelper.check_equal_objects(metrics) metrics = 
uut.eval(benchmark.test_stream, drop_last=True) - self._check_batches_equal(uut, DistributedHelper.rank, mb_size, mb_size_dst, input_size) + # Also checks that drop_last=True works correctly + self._check_batches_equal(uut, DistributedHelper.rank, mb_size, + mb_size_dst, input_size) self._check_loss_equal(uut) if exp_idx < 2: # Do it only for the first 2 experiences to speed up tests diff --git a/tests/distributed/test_distributed_tensor.py b/tests/distributed/test_distributed_tensor.py index fe366ac42..e4ca40cad 100644 --- a/tests/distributed/test_distributed_tensor.py +++ b/tests/distributed/test_distributed_tensor.py @@ -5,7 +5,8 @@ from avalanche.distributed import DistributedHelper from avalanche.distributed.distributed_tensor import \ DistributedMeanTensor -from tests.distributed.distributed_test_utils import check_skip_distributed_test, suppress_dst_tests_output, \ +from tests.distributed.distributed_test_utils import \ + check_skip_distributed_test, suppress_dst_tests_output, \ common_dst_tests_setup diff --git a/tests/run_dist_tests.py b/tests/run_dist_tests.py index f1fffc415..716f000f1 100644 --- a/tests/run_dist_tests.py +++ b/tests/run_dist_tests.py @@ -43,7 +43,8 @@ def run_distributed_suites(test_cases): cases_names = [x for x in cases_names if x in test_cases] if set(cases_names) != test_cases: - print('Some cases have not been found!', test_cases - set(cases_names)) + print('Some cases have not been found!', + test_cases - set(cases_names)) sys.exit(1) print('Running', len(cases_names), 'tests') @@ -51,7 +52,8 @@ def run_distributed_suites(test_cases): success = True exited = False - use_gpu_in_tests = os.environ.get('USE_GPU', 'false').lower() in ['1', 'true'] + use_gpu_in_tests = os.environ.get('USE_GPU', 'false').lower() in [ + '1', 'true'] if use_gpu_in_tests: print('Running tests using GPUs') import torch @@ -68,7 +70,8 @@ def run_distributed_suites(test_cases): try: p = Popen( ['python', '-m', 'torch.distributed.run', '--nnodes=1', - f'--nproc_per_node={nproc_per_node}', '-m', 'unittest', case_name], + f'--nproc_per_node={nproc_per_node}', + '-m', 'unittest', case_name], stdout=sys.stdout, stderr=sys.stderr) p.communicate() except KeyboardInterrupt: From cdcd8c4f846db7854ddbc7087fb1c6a36cf4812c Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Wed, 23 Nov 2022 17:23:37 +0000 Subject: [PATCH 13/16] Aligned environment update action content. --- .github/workflows/environment-update.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/environment-update.yml b/.github/workflows/environment-update.yml index bad34100d..0097a6eb6 100644 --- a/.github/workflows/environment-update.yml +++ b/.github/workflows/environment-update.yml @@ -57,6 +57,7 @@ jobs: shell: bash -l -c "conda run -n avalanche-env --no-capture-output bash {0}" run: | python -m unittest discover tests && + echo "Running checkpointing tests..." && bash ./tests/checkpointing/test_checkpointing.sh && python ./tests/run_dist_tests.py - name: checkout avalanche-docker repo From 2a93ad8d088f0adc365b4db6ff8f02c7401ea195 Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Sun, 11 Dec 2022 15:35:08 +0100 Subject: [PATCH 14/16] Fix multitask issues. Improve distributed training support and tests. 
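
The multi-task fixes assume that dynamic modules are adapted on the bare
(unwrapped) model: avalanche_model_adaptation() now raises if it receives a
model already wrapped in DistributedDataParallel, while wrap_model() returns
the model unchanged when no distributed backend is active. An illustrative
sketch of the intended ordering follows; it is not part of the patch, the
strategy templates handle this internally, and the benchmark and model used
here are placeholders only:

    from avalanche.distributed import DistributedHelper
    from avalanche.models.dynamic_modules import IncrementalClassifier
    from avalanche.models.utils import avalanche_model_adaptation
    from tests.unit_tests_utils import get_fast_benchmark

    benchmark = get_fast_benchmark(n_features=28 * 28)
    model = IncrementalClassifier(in_features=28 * 28)

    for experience in benchmark.train_stream:
        # Grow the classifier on the bare model first...
        avalanche_model_adaptation(model, experience)
        # ...then wrap it for this experience. wrap_model() simply returns
        # the model unchanged when torch.distributed is not initialized.
        wrapped = DistributedHelper.wrap_model(model)
        # Forward/backward passes go through `wrapped`; the wrapper is
        # discarded before the next adaptation step.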
--- .../distributed_consistency_verification.py | 43 ++- avalanche/distributed/distributed_helper.py | 46 ++- avalanche/models/dynamic_modules.py | 39 ++- avalanche/models/utils.py | 12 +- examples/distributed_training.py | 2 +- tests/distributed/check_metrics_aligned.py | 33 ++ .../distributed/distributed_training_main.py | 287 ++++++++++++++++++ tests/distributed/test_distributed.sh | 73 +++++ tests/distributed/test_distributed_helper.py | 72 +++++ tests/distributed/test_distributed_model.py | 111 ++++++- tests/run_dist_tests.py | 9 +- tests/unit_tests_utils.py | 2 +- 12 files changed, 695 insertions(+), 34 deletions(-) create mode 100644 tests/distributed/check_metrics_aligned.py create mode 100644 tests/distributed/distributed_training_main.py create mode 100755 tests/distributed/test_distributed.sh diff --git a/avalanche/distributed/distributed_consistency_verification.py b/avalanche/distributed/distributed_consistency_verification.py index 25478b740..71c0e8602 100644 --- a/avalanche/distributed/distributed_consistency_verification.py +++ b/avalanche/distributed/distributed_consistency_verification.py @@ -12,13 +12,19 @@ from avalanche.benchmarks import GenericCLScenario -def hash_benchmark(benchmark: 'GenericCLScenario') -> str: - hash_engine = hashlib.sha256() - for stream_name, stream in benchmark.streams.items(): +def hash_benchmark(benchmark: 'GenericCLScenario', *, + hash_engine=None, num_workers=0) -> str: + if hash_engine is None: + hash_engine = hashlib.sha256() + + for stream_name in sorted(benchmark.streams.keys()): + stream = benchmark.streams[stream_name] hash_engine.update(stream_name.encode()) for experience in stream: exp_dataset = experience.dataset - hash_dataset(exp_dataset, hash_engine=hash_engine) + hash_dataset(exp_dataset, + hash_engine=hash_engine, + num_workers=num_workers) return hash_engine.hexdigest() @@ -42,8 +48,10 @@ def hash_dataset(dataset: 'Dataset', *, hash_engine=None, num_workers=0) -> str: return hash_engine.hexdigest() -def hash_minibatch(minibatch: Tuple[Tensor]) -> str: - hash_engine = hashlib.sha256() +def hash_minibatch(minibatch: Tuple[Tensor], *, hash_engine=None) -> str: + if hash_engine is None: + hash_engine = hashlib.sha256() + for tuple_elem in minibatch: buff = io.BytesIO() torch.save(tuple_elem, buff) @@ -52,8 +60,10 @@ def hash_minibatch(minibatch: Tuple[Tensor]) -> str: return hash_engine.hexdigest() -def hash_tensor(tensor: Tensor) -> str: - hash_engine = hashlib.sha256() +def hash_tensor(tensor: Tensor, *, hash_engine=None) -> str: + if hash_engine is None: + hash_engine = hashlib.sha256() + buff = io.BytesIO() torch.save(tensor, buff) buff.seek(0) @@ -61,14 +71,25 @@ def hash_tensor(tensor: Tensor) -> str: return hash_engine.hexdigest() -def hash_model(model: Module) -> str: - hash_engine = hashlib.sha256() +def hash_model(model: Module, include_buffers=True, *, hash_engine=None) -> str: + if hash_engine is None: + hash_engine = hashlib.sha256() + for name, param in model.named_parameters(): hash_engine.update(name.encode()) buff = io.BytesIO() - torch.save(param, buff) + torch.save(param.detach().cpu(), buff) buff.seek(0) hash_engine.update(buff.read()) + + if include_buffers: + for name, model_buffer in model.named_buffers(): + hash_engine.update(name.encode()) + buff = io.BytesIO() + torch.save(model_buffer.detach().cpu(), buff) + buff.seek(0) + hash_engine.update(buff.read()) + return hash_engine.hexdigest() diff --git a/avalanche/distributed/distributed_helper.py b/avalanche/distributed/distributed_helper.py index 
5f8236609..64dad60d3 100644 --- a/avalanche/distributed/distributed_helper.py +++ b/avalanche/distributed/distributed_helper.py @@ -114,6 +114,7 @@ class _DistributedHelperCls(object): def __init__(self): self.use_cuda = False + self._dev_map = _DistributedHelperCls._make_map('cpu') def init_distributed(self, random_seed, backend=None, use_cuda=True): if self.is_distributed: @@ -131,12 +132,14 @@ def init_distributed(self, random_seed, backend=None, use_cuda=True): warnings.warn( 'Bad configuration: using NCCL, but you set use_cuda=False!') + could_initialize_distributed = False if os.environ.get('LOCAL_RANK', None) is None: warnings.warn( 'Torch distributed could not be initialized ' '(missing environment configuration)') else: init_process_group(backend=backend) + could_initialize_distributed = True self.set_random_seeds(random_seed) self.use_cuda = use_cuda @@ -146,8 +149,13 @@ def init_distributed(self, random_seed, backend=None, use_cuda=True): torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False - self.make_device() # Force-init the default CUDA device (if any) - return True + # Force-init the default CUDA device (if any) + reference_device = self.make_device(set_cuda_device=True) + + # Create map for device placement of unpickled tensors + self._dev_map = _DistributedHelperCls._make_map(reference_device) + + return could_initialize_distributed def get_device_id(self): if self.is_distributed: @@ -160,7 +168,7 @@ def get_device_id(self): return -1 - def make_device(self): + def make_device(self, set_cuda_device=False): if self.is_distributed: device_id = self.rank else: @@ -168,7 +176,8 @@ def make_device(self): if self.use_cuda and device_id >= 0: ref_device = torch.device(f'cuda:{device_id}') - torch.cuda.set_device(ref_device) + if set_cuda_device: + torch.cuda.set_device(ref_device) else: ref_device = torch.device('cpu') return ref_device @@ -183,9 +192,12 @@ def wrap_model(self, model: Module) -> Module: # (an int, a device object or a str) # If not set, output_device defaults to device_ids[0] return DistributedDataParallel( - model, device_ids=[self.make_device()]) + model, device_ids=[self.make_device()], + find_unused_parameters=True) else: - return DistributedDataParallel(model) + return DistributedDataParallel( + model, + find_unused_parameters=True) else: return model @@ -432,6 +444,25 @@ def backend(self) -> str: def forced_cuda_comm(self) -> bool: return self.backend == 'nccl' + @property + def device_map(self) -> Dict[str, str]: + return self._dev_map + + @staticmethod + def _make_map(device_or_map) -> Dict[str, str]: + # TODO: borrowed from checkpointing plugins + # it would be better to have a single function in a shared utils + if not isinstance(device_or_map, (torch.device, str)): + return device_or_map + + device = torch.device(device_or_map) + map_location = dict() + + map_location['cpu'] = 'cpu' + for cuda_idx in range(100): + map_location[f'cuda:{cuda_idx}'] = str(device) + return map_location + BASE_TYPES = [str, int, float, bool, type(None)] @@ -445,7 +476,6 @@ def base_typed(obj): from_pytorch = T.__module__ == 'torch' if from_numpy or from_pytorch: - print(T.__module__) return obj.tolist() if T in BASE_TYPES or callable(obj) or ((from_numpy or from_pytorch) @@ -468,7 +498,7 @@ def base_typed(obj): def fix(): return lambda b: torch.load(BytesIO(b), - map_location=DistributedHelper.make_device()) + map_location=DistributedHelper.device_map) class MappedUnpickler(pickle.Unpickler): diff --git a/avalanche/models/dynamic_modules.py 
b/avalanche/models/dynamic_modules.py index dbac376d5..7d4df00e2 100644 --- a/avalanche/models/dynamic_modules.py +++ b/avalanche/models/dynamic_modules.py @@ -74,6 +74,11 @@ def eval_adaptation(self, experience: CLExperience): """ pass + @property + def model_device(self): + """Returns the device of the model.""" + return next(self.parameters()).device + class MultiTaskModule(DynamicModule): """Base pytorch Module with support for task labels. @@ -216,7 +221,7 @@ def __init__( self.mask_value = mask_value self.classifier = torch.nn.Linear(in_features, initial_out_features) - au_init = torch.zeros(initial_out_features, dtype=torch.bool) + au_init = torch.zeros(initial_out_features, dtype=torch.int8) self.register_buffer("active_units", au_init) @torch.no_grad() @@ -226,6 +231,7 @@ def adaptation(self, experience: CLExperience): :param experience: data from the current experience. :return: """ + device = self.model_device in_features = self.classifier.in_features old_nclasses = self.classifier.out_features curr_classes = experience.classes_in_this_experience @@ -235,7 +241,11 @@ def adaptation(self, experience: CLExperience): if self.masking: if old_nclasses != new_nclasses: # expand active_units mask old_act_units = self.active_units - self.active_units = torch.zeros(new_nclasses, dtype=torch.bool) + self.active_units = torch.zeros( + new_nclasses, + dtype=torch.int8, + device=device) + self.active_units[: old_act_units.shape[0]] = old_act_units # update with new active classes if self.training: @@ -245,7 +255,7 @@ def adaptation(self, experience: CLExperience): if old_nclasses == new_nclasses: return old_w, old_b = self.classifier.weight, self.classifier.bias - self.classifier = torch.nn.Linear(in_features, new_nclasses) + self.classifier = torch.nn.Linear(in_features, new_nclasses).to(device) self.classifier.weight[:old_nclasses] = old_w self.classifier.bias[:old_nclasses] = old_b @@ -318,14 +328,14 @@ def __init__( self.classifiers["0"] = first_head self.max_class_label = max(self.max_class_label, initial_out_features) - au_init = torch.zeros(initial_out_features, dtype=torch.bool) + au_init = torch.zeros(initial_out_features, dtype=torch.int8) self.register_buffer("active_units_T0", au_init) @property def active_units(self): res = {} for tid in self.known_train_tasks_labels: - mask = getattr(self, f"active_units_T{tid}") + mask = getattr(self, f"active_units_T{tid}").to(torch.bool) au = torch.arange(0, mask.shape[0])[mask].tolist() res[tid] = au return res @@ -334,7 +344,7 @@ def active_units(self): def task_masks(self): res = {} for tid in self.known_train_tasks_labels: - res[tid] = getattr(self, f"active_units_T{tid}") + res[tid] = getattr(self, f"active_units_T{tid}").to(torch.bool) return res def adaptation(self, experience: CLExperience): @@ -344,6 +354,7 @@ def adaptation(self, experience: CLExperience): :return: """ super().adaptation(experience) + device = self.model_device curr_classes = experience.classes_in_this_experience task_labels = experience.task_labels if isinstance(task_labels, ConstantSequence): @@ -356,11 +367,13 @@ def adaptation(self, experience: CLExperience): if tid not in self.classifiers: # create new head new_head = IncrementalClassifier( self.in_features, self.starting_out_features - ) + ).to(device) self.classifiers[tid] = new_head au_init = torch.zeros( - self.starting_out_features, dtype=torch.bool + self.starting_out_features, + dtype=torch.int8, + device=device ) self.register_buffer(f"active_units_T{tid}", au_init) @@ -388,7 +401,9 @@ def 
adaptation(self, experience: CLExperience): if old_nunits != new_nclasses: # expand active_units mask old_act_units = self._buffers[au_name] self._buffers[au_name] = torch.zeros( - new_nclasses, dtype=torch.bool + new_nclasses, + dtype=torch.int8, + device=device ) self._buffers[au_name][ : old_act_units.shape[0] @@ -405,6 +420,7 @@ def forward_single_task(self, x, task_label): :param task_label: :return: """ + device = self.model_device task_label = str(task_label) out = self.classifiers[task_label](x) if self.masking: @@ -413,7 +429,10 @@ def forward_single_task(self, x, task_label): nunits, oldsize = out.shape[-1], curr_au.shape[0] if oldsize < nunits: # we have to update the mask old_mask = self._buffers[au_name] - self._buffers[au_name] = torch.zeros(nunits, dtype=torch.bool) + self._buffers[au_name] = torch.zeros( + nunits, + dtype=torch.int8, + device=device) self._buffers[au_name][:oldsize] = old_mask curr_au = self._buffers[au_name] out[..., torch.logical_not(curr_au)] = self.mask_value diff --git a/avalanche/models/utils.py b/avalanche/models/utils.py index 5a1ef3153..b40f88191 100644 --- a/avalanche/models/utils.py +++ b/avalanche/models/utils.py @@ -1,19 +1,29 @@ from avalanche.benchmarks.utils import make_classification_dataset from avalanche.models.dynamic_modules import MultiTaskModule, DynamicModule import torch.nn as nn +from torch.nn.parallel import DistributedDataParallel from collections import OrderedDict from avalanche.benchmarks.scenarios import CLExperience +def is_multi_task_module(model: nn.Module): + return isinstance(model, MultiTaskModule) or \ + (isinstance(model, DistributedDataParallel) and + isinstance(model.module, MultiTaskModule)) + + def avalanche_forward(model, x, task_labels): - if isinstance(model, MultiTaskModule): + if is_multi_task_module(model): return model(x, task_labels) else: # no task labels return model(x) def avalanche_model_adaptation(model: nn.Module, experience: CLExperience): + if isinstance(model, DistributedDataParallel): + raise RuntimeError('The model is wrapped in DistributedDataParallel. ' + 'Please unwrap it before calling this method.') for module in model.modules(): if isinstance(module, DynamicModule): module.adaptation(experience) diff --git a/examples/distributed_training.py b/examples/distributed_training.py index 06c15cecd..486a90bd8 100644 --- a/examples/distributed_training.py +++ b/examples/distributed_training.py @@ -106,7 +106,7 @@ def main(args): if DistributedHelper.is_main_process: # Loggers should be created in the main process only loggers.append(TensorboardLogger( - tb_log_dir=f'./logs/{args.exp_name}')) + tb_log_dir=f'./distributed_training_logs/{args.exp_name}')) # Metrics should be created as usual, with no differences between main and # non-main processes. diff --git a/tests/distributed/check_metrics_aligned.py b/tests/distributed/check_metrics_aligned.py new file mode 100644 index 000000000..80c97369d --- /dev/null +++ b/tests/distributed/check_metrics_aligned.py @@ -0,0 +1,33 @@ +import os +import pickle +import sys + + +def load_pickles(directory): + # Load the pickle files into a list of dictionaries. + files = os.listdir(directory) + files.sort() + data = [] + for f in files: + with open(os.path.join(directory, f), 'rb') as fh: + data.append(pickle.load(fh)) + + return data + + +def check_metrics_aligned(directory1, directory2): + data1 = load_pickles(directory1) + data2 = load_pickles(directory2) + assert len(data1) == len(data2) + + # Check that the metrics are aligned. 
+ for i in range(len(data1)): + if data1[i] != data2[i]: + print('Metrics are not aligned for experience {}'.format(i)) + sys.exit(1) + + print('Metrics are aligned') + + +if __name__ == '__main__': + check_metrics_aligned(sys.argv[1], sys.argv[2]) diff --git a/tests/distributed/distributed_training_main.py b/tests/distributed/distributed_training_main.py new file mode 100644 index 000000000..85f960266 --- /dev/null +++ b/tests/distributed/distributed_training_main.py @@ -0,0 +1,287 @@ +################################################################################ +# Copyright (c) 2021 ContinualAI. # +# Copyrights licensed under the MIT License. # +# See the accompanying LICENSE file for terms. # +# # +# Date: 06-12-2022 # +# Author(s): Lorenzo Pellegrini # +# E-mail: contact@continualai.org # +# Website: avalanche.continualai.org # +################################################################################ + +""" +This is a deterministic version of the script with the same name found in the +examples folder. + +Used in unit tests. + +Adapted from the one used for unit testing the checkpointing functionality. +""" + + +import argparse +import os +import sys +import time +import pickle +from pathlib import Path +from typing import Sequence + +import torch +from torch.nn import CrossEntropyLoss +from torch.optim import SGD +from torch.optim.lr_scheduler import ReduceLROnPlateau + +from avalanche.benchmarks import CLExperience, \ + SplitCIFAR100, SplitMNIST, SplitFMNIST, SplitCIFAR10 +from avalanche.distributed import DistributedHelper +from avalanche.distributed.distributed_consistency_verification import \ + hash_benchmark, hash_model +from avalanche.evaluation.metrics import accuracy_metrics, loss_metrics, \ + class_accuracy_metrics +from avalanche.logging import InteractiveLogger, TensorboardLogger, \ + WandBLogger, TextLogger +from avalanche.models import SimpleMLP, as_multitask +from avalanche.training import Naive +from avalanche.training.plugins import EvaluationPlugin, CWRStarPlugin, \ + ReplayPlugin, GDumbPlugin, LwFPlugin, SynapticIntelligencePlugin, \ + EWCPlugin, LRSchedulerPlugin, SupervisedPlugin +from tests.unit_tests_utils import get_fast_benchmark + +OVERALL_MB_SIZE = 192 +BENCHMARK_HASH = \ + '8ac6f78597e6f7279c601f1f75113aec6c56abd1518e3386a6729c7be9262cdd' +MODEL_HASH = \ + 'cbb45bc281908892402fda9794e82d71c3593631f76229f1f396fa7a936affaa' + + +class CheckModelAlignedPlugin(SupervisedPlugin): + def after_update(self, strategy, *args, **kwargs): + DistributedHelper.check_equal_objects( + hash_model(strategy.model, include_buffers=True)) + + +def main(args): + torch.use_deterministic_algorithms(True) + + is_dist = DistributedHelper.init_distributed( + random_seed=4321, use_cuda=args.cuda + ) + + rank = DistributedHelper.rank + world_size = DistributedHelper.world_size + device = DistributedHelper.make_device() + print(f'Current process rank: {rank}/{world_size}, ' + f'will use device: {device}') + + if not DistributedHelper.is_main_process: + # Suppress the output of non-main processes + # This prevents the output from being duplicated in the console + sys.stdout = open(os.devnull, 'w') + sys.stderr = open(os.devnull, 'w') + + # --- SCENARIO CREATION + use_tasks = 'si' not in args.plugins and 'cwr' not in args.plugins \ + and args.benchmark != 'Stream51' + input_size = 32*32*3 + + if args.benchmark == 'TestBenchmark': + input_size = 28 * 28 * 1 + scenario = get_fast_benchmark( + use_task_labels=use_tasks, + n_features=input_size, + n_samples_per_class=256, + 
seed=1337 + ) + + if use_tasks: + # print(hash_benchmark(scenario, num_workers=4)) + assert hash_benchmark(scenario, num_workers=4) == BENCHMARK_HASH + print('Benchmark hash is correct.') + elif args.benchmark == 'SplitMNIST': + scenario = SplitMNIST(n_experiences=5, return_task_id=True) + input_size = 28*28*1 + elif args.benchmark == 'SplitFMNIST': + scenario = SplitFMNIST(n_experiences=5, return_task_id=True) + input_size = 28*28*1 + elif args.benchmark == 'SplitCifar100': + scenario = SplitCIFAR100(n_experiences=5, return_task_id=use_tasks) + elif args.benchmark == 'SplitCifar10': + scenario = SplitCIFAR10(n_experiences=5, return_task_id=use_tasks) + else: + raise ValueError('Unrecognized benchmark name from CLI.') + train_stream: Sequence[CLExperience] = scenario.train_stream + test_stream: Sequence[CLExperience] = scenario.test_stream + + print('Testing using the', args.benchmark, 'benchmark') + for train_exp in train_stream: + print('Train experience', train_exp.current_experience, + 'has', len(train_exp.dataset), 'samples') + + for test_exp in test_stream: + print('Test experience', test_exp.current_experience, + 'has', len(test_exp.dataset), 'samples') + # --------- + + # MODEL CREATION + if use_tasks: + model = SimpleMLP(input_size=input_size, + num_classes=scenario.n_classes // 5) + model = as_multitask(model, 'classifier') + if args.benchmark == 'TestBenchmark' and use_tasks: + # print(hash_model(model)) + assert hash_model(model) == MODEL_HASH + print('Model hash is correct.') + else: + model = SimpleMLP(input_size=input_size, num_classes=scenario.n_classes) + + DistributedHelper.check_equal_objects( + hash_model(model, include_buffers=True)) + DistributedHelper.check_equal_objects( + hash_benchmark(scenario, num_workers=4)) + + optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9) + criterion = CrossEntropyLoss() + + # CREATE THE STRATEGY INSTANCE (NAIVE) + + # Adapt the minibatch size + mb_size = OVERALL_MB_SIZE // DistributedHelper.world_size + + plugins = [ + CheckModelAlignedPlugin() + ] + + cli_plugins = [] + cli_plugin_names = '_'.join(args.plugins) + for cli_plugin in args.plugins: + if cli_plugin == 'cwr': + plugin_instance = CWRStarPlugin( + model, freeze_remaining_model=True) + elif cli_plugin == 'replay': + plugin_instance = ReplayPlugin(mem_size=500) + elif cli_plugin == 'gdumb': + plugin_instance = GDumbPlugin(mem_size=500) + elif cli_plugin == 'lwf': + plugin_instance = LwFPlugin() + elif cli_plugin == 'si': + plugin_instance = SynapticIntelligencePlugin(0.001) + elif cli_plugin == 'ewc': + plugin_instance = EWCPlugin(0.001) + elif cli_plugin == 'reduccre_on_plateau': + plugin_instance = LRSchedulerPlugin( + ReduceLROnPlateau(optimizer), step_granularity='iteration', + metric='train_loss' + ) + else: + raise ValueError('Unrecognized plugin name from CLI.') + print('Adding plugin', plugin_instance) + cli_plugins.append(plugin_instance) + plugins += cli_plugins + + loggers = [] + if DistributedHelper.is_main_process: + use_cuda_str = 'cuda' if args.cuda else 'cpu' + is_dist_str = 'distributed' if is_dist else 'single' + log_location: Path = Path('logs') / \ + (f'distributed_{args.benchmark}_' + + f'{use_cuda_str}_{is_dist_str}_{cli_plugin_names}') + + # Loggers should be created in the main process only + os.makedirs(log_location, exist_ok=True) + loggers = [ + TextLogger(open(log_location / 'log.txt', 'w')), + InteractiveLogger(), + TensorboardLogger(log_location) + ] + + if args.wandb: + loggers.append(WandBLogger( + 
project_name='AvalancheDistributedTraining', + run_name=f'distributed_{args.benchmark}_' + f'{use_cuda_str}_{is_dist_str}_' + f'{cli_plugin_names}' + )) + Path(args.log_metrics_to).mkdir(parents=True, exist_ok=True) + + # Metrics should be created as usual, with no differences between main and + # non-main processes. + evaluation_plugin = EvaluationPlugin( + accuracy_metrics(minibatch=False, epoch=True, + experience=True, stream=True), + loss_metrics(minibatch=False, epoch=True, + experience=True, stream=True), + class_accuracy_metrics( + stream=True + ), + loggers=loggers + ) + + cl_strategy = Naive( + model=model, + optimizer=optimizer, + criterion=criterion, + train_mb_size=mb_size, + train_epochs=2, + eval_mb_size=mb_size, + device=device, + plugins=plugins, + evaluator=evaluation_plugin + ) + + start_time = time.time() + + # TRAINING LOOP + + for experience in train_stream: + cl_strategy.train( + experience, + num_workers=8, + drop_last=True, + shuffle=False) + + metrics = cl_strategy.eval( + test_stream, + num_workers=8, + drop_last=True, + shuffle=False) + + if DistributedHelper.is_main_process: + with open(Path(args.log_metrics_to) / + f'metrics_exp' + f'{experience.current_experience}.pkl', 'wb') as f: + pickle.dump(metrics, f) + + print('Training+eval took', time.time() - start_time) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--cuda', + default=False, + action='store_true', + help="If set, use GPUs." + ) + parser.add_argument( + "--benchmark", + type=str, + default='SplitCifar100', + help="The benchmark to use." + ) + parser.add_argument( + "--log_metrics_to", + type=str, + default='./metrics' + ) + parser.add_argument( + "--wandb", + action='store_true' + ) + parser.add_argument( + "--plugins", + nargs='*', + required=False, + default=[] + ) + main(parser.parse_args()) diff --git a/tests/distributed/test_distributed.sh b/tests/distributed/test_distributed.sh new file mode 100755 index 000000000..9fbb606b4 --- /dev/null +++ b/tests/distributed/test_distributed.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +# Script used to automatically test various combinations of plugins when used with +# the distributed training functionality. +set -euo pipefail +cd tests/distributed +rm -rf logs +rm -rf metrics_no_distributed +rm -rf metrics_distributed + +export PYTHONUNBUFFERED=1 +export PYTHONPATH=../.. +export CUBLAS_WORKSPACE_CONFIG=:4096:8 + +BENCHMARK="TestBenchmark" + +# Config from env +# https://blog.stigok.com/2022/02/08/parsing-boolean-string-statements-in-bash.html +function str_bool { + local str="${1:-false}" + local pat='^(true|1|yes)$' + if [[ ${str,,} =~ $pat ]] + then + echo 'true' + else + echo 'false' + fi +} + +RUN_FAST_TESTS=$(str_bool "${FAST_TEST:-False}") +RUN_GPU_TESTS=$(str_bool "${USE_GPU:-False}") + +TESTS_PARALLELISM=4 + +GPU_PARAM="" + +if [ "$RUN_GPU_TESTS" = "true" ] +then + GPU_PARAM="--cuda" + TESTS_PARALLELISM=$(nvidia-smi -L | wc -l) + echo "Auto-detected $TESTS_PARALLELISM GPUs." 
+fi + +EXP_RUN_LINE="torchrun --standalone --nnodes=1 --nproc_per_node=$TESTS_PARALLELISM" + +run_and_check() { + set -x + # Without distributed training + python distributed_training_main.py $GPU_PARAM \ + --plugins "$@" --benchmark $BENCHMARK --log_metrics_to './metrics_no_distributed' + + # Run distributed training + $EXP_RUN_LINE distributed_training_main.py $GPU_PARAM \ + --plugins "$@" --benchmark $BENCHMARK --log_metrics_to './metrics_distributed' + + #python -u check_metrics_aligned.py \ + # "./metrics_no_distributed" "./metrics_distributed" + + rm -r metrics_no_distributed + rm -r metrics_distributed + rm -r logs + set +x +} + +run_and_check "replay" + +if [ "$RUN_FAST_TESTS" = "false" ] +then + echo "Running slow tests..." + run_and_check "lwf" + run_and_check "ewc" + run_and_check "gdumb" + run_and_check "cwr" "replay" +fi diff --git a/tests/distributed/test_distributed_helper.py b/tests/distributed/test_distributed_helper.py index add3ace28..0c49bd45d 100644 --- a/tests/distributed/test_distributed_helper.py +++ b/tests/distributed/test_distributed_helper.py @@ -3,10 +3,13 @@ import torch import torch.distributed as dst +from torch.nn import Module +from torch.nn.parallel import DistributedDataParallel from avalanche.distributed import DistributedHelper from avalanche.distributed.distributed_helper import \ RollingSeedContext, BroadcastSeedContext +from avalanche.models import SimpleMLP, as_multitask from avalanche.training.determinism.rng_manager import RNGManager from tests.distributed.distributed_test_utils import \ @@ -31,6 +34,75 @@ def test_device_id(self): self.assertEqual(torch.device('cpu'), DistributedHelper.make_device()) + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_wrap_model(self): + mb_size = 1*2*2*3*5 + num_classes = 11 + torch.manual_seed(1234 + DistributedHelper.rank) + mb_x = torch.randn((mb_size, 32)) + model = SimpleMLP(num_classes=num_classes, input_size=32) + self.assertIsInstance(model, Module) + + device = DistributedHelper.make_device() + + if device.type == 'cuda': + # Additional test: must raise an error if the model + # is not already in the correct device + with self.assertRaises(Exception): + model_wrapped = DistributedHelper.wrap_model(model) + + model = model.to(device) + + model_wrapped = DistributedHelper.wrap_model(model) + self.assertIsInstance(model_wrapped, DistributedDataParallel) + self.assertNotIsInstance(model, DistributedDataParallel) + + device = DistributedHelper.make_device() + mb_x = mb_x.to(device) + model = model.to(device) + + model.eval() + model_wrapped.eval() + + with torch.no_grad(): + mb_out1 = model(mb_x).detach() + self.assertEqual(mb_out1.device, device) + self.assertSequenceEqual([mb_size, num_classes], mb_out1.shape) + + mb_out2 = model_wrapped(mb_x).detach() + self.assertEqual(mb_out2.device, device) + self.assertSequenceEqual([mb_size, num_classes], mb_out2.shape) + + self.assertTrue(torch.equal(mb_out1, mb_out2)) + + mb_out_all = DistributedHelper.cat_all(mb_out2) + + start_idx = mb_size * DistributedHelper.rank + end_idx = start_idx + mb_size + + self.assertTrue(torch.equal(mb_out1, + mb_out_all[start_idx: end_idx])) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_broadcast(self): + ts = torch.full((10,), DistributedHelper.rank, dtype=torch.long) + DistributedHelper.broadcast(ts) + self.assertTrue(torch.equal(ts, torch.zeros((10,), dtype=torch.long))) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests 
ignored') + def test_check_equal_tensors(self): + torch.manual_seed(1234) + ts = torch.randn((100,)) + DistributedHelper.check_equal_tensors(ts) + + torch.manual_seed(1234 + DistributedHelper.rank) + ts = torch.randn((100,)) + with self.assertRaises(Exception): + DistributedHelper.check_equal_tensors(ts) + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_fields(self): diff --git a/tests/distributed/test_distributed_model.py b/tests/distributed/test_distributed_model.py index c976c8d6c..afd50f3fc 100644 --- a/tests/distributed/test_distributed_model.py +++ b/tests/distributed/test_distributed_model.py @@ -1,12 +1,17 @@ import unittest +import torch from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader from avalanche.distributed import DistributedHelper, DistributedModel from avalanche.models import SimpleMLP +from avalanche.models.helper_method import as_multitask +from avalanche.models.utils import avalanche_forward, avalanche_model_adaptation from tests.distributed.distributed_test_utils import \ check_skip_distributed_test, suppress_dst_tests_output, \ common_dst_tests_setup +from tests.unit_tests_utils import get_fast_benchmark class DistributedModelTests(unittest.TestCase): @@ -23,13 +28,88 @@ def test_distributed_model(self): self.assertIsNone(dt.value) self.assertIsNone(dt.distributed_value) + device = DistributedHelper.make_device() + + dt.model = model + + self.assertEqual(model, dt.local_value) + self.assertEqual(model, dt.value) + self.assertEqual(model, dt.distributed_value) + + if device.type == 'cuda': + # Additional test: must raise an error if the model + # is not already in the correct device + with self.assertRaises(Exception): + wrapped = DistributedDataParallel( + model, + device_ids=[device]) + + model = model.to(device) + wrapped = DistributedDataParallel( + model, + device_ids=[device]) + + dt.model = wrapped + + self.assertEqual(model, dt.local_value) + self.assertNotIsInstance(dt.local_value, DistributedDataParallel) + + self.assertIsInstance(dt.value, DistributedDataParallel) + self.assertEqual(wrapped, dt.value) + self.assertEqual(wrapped, dt.distributed_value) + + dt.reset_distributed_value() + + self.assertEqual(model, dt.local_value) + self.assertEqual(model, dt.value) + self.assertEqual(model, dt.distributed_value) + + self.assertNotIsInstance(dt.value, DistributedDataParallel) + + dt.reset_distributed_value() + self.assertIsNotNone(dt.local_value) + + dt.value = wrapped + dt.distributed_model = None + + self.assertIsNotNone(dt.local_value) + + dt.value = None + + self.assertIsNone(dt.local_value) + self.assertIsNone(dt.distributed_value) + self.assertIsNone(dt.value) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_distributed_model_multitask(self): + dt: DistributedModel = DistributedModel() + model = SimpleMLP() + model = as_multitask(model, 'classifier') + self.assertIsNone(dt.local_value) + self.assertIsNone(dt.value) + self.assertIsNone(dt.distributed_value) + + device = DistributedHelper.make_device() + dt.model = model self.assertEqual(model, dt.local_value) self.assertEqual(model, dt.value) self.assertEqual(model, dt.distributed_value) - wrapped = DistributedDataParallel(model) + if device.type == 'cuda': + # Additional test: must raise an error if the model + # is not already in the correct device + with self.assertRaises(Exception): + wrapped = DistributedDataParallel( + model, + device_ids=[device]) + + model = model.to(device) 
+ wrapped = DistributedDataParallel( + model, + device_ids=[device]) dt.model = wrapped @@ -62,6 +142,35 @@ def test_distributed_model(self): self.assertIsNone(dt.distributed_value) self.assertIsNone(dt.value) + # test model adaptation + input_size = 28 * 28 * 1 + scenario = get_fast_benchmark( + use_task_labels=True, + n_features=input_size, + n_samples_per_class=256, + seed=1337 + ) + avalanche_model_adaptation(model, scenario.train_stream[1]) + model.eval() + dt.value = model + + wrapped = DistributedDataParallel(model, device_ids=[device]) + dt.model = wrapped + + self.assertEqual(model, dt.local_value) + loader = DataLoader(scenario.train_stream[1].dataset, batch_size=32) + with torch.no_grad(): + for x, y, t in loader: + x = x.to(device) + y = y.to(device) + t = t.to(device) + self.assertEqual([1] * len(t), t.tolist()) + out_mb = avalanche_forward(dt.model, x, t) + DistributedHelper.check_equal_tensors(out_mb) + out_mb_local = avalanche_forward(dt.local_value, x, t) + DistributedHelper.check_equal_tensors(out_mb_local) + self.assertTrue(torch.equal(out_mb, out_mb_local)) + if __name__ == "__main__": with suppress_dst_tests_output(): diff --git a/tests/run_dist_tests.py b/tests/run_dist_tests.py index 716f000f1..bd7a769b3 100644 --- a/tests/run_dist_tests.py +++ b/tests/run_dist_tests.py @@ -51,6 +51,7 @@ def run_distributed_suites(test_cases): p = None success = True exited = False + failed_test_cases = set() use_gpu_in_tests = os.environ.get('USE_GPU', 'false').lower() in [ '1', 'true'] @@ -84,11 +85,17 @@ def run_distributed_suites(test_cases): success = success and exit_code == 0 p = None + if exit_code != 0: + failed_test_cases.add(case_name) + if success: print('Tests completed successfully') sys.exit(0) else: - print('Tests terminated with errors') + print('The following tests terminated with errors:') + for failed_case in sorted(failed_test_cases): + print(failed_case) + sys.exit(1) diff --git a/tests/unit_tests_utils.py b/tests/unit_tests_utils.py index bd6885d79..11032420a 100644 --- a/tests/unit_tests_utils.py +++ b/tests/unit_tests_utils.py @@ -29,7 +29,7 @@ if "UPDATE_METRICS" in os.environ: UPDATE_METRICS = os.environ["UPDATE_METRICS"].lower() == "true" -print(f"UPDATE_METRICS: {UPDATE_METRICS}") +# print(f"UPDATE_METRICS: {UPDATE_METRICS}") def is_github_action(): From 1174f33f95cf45314a59a1fd6247d7525444aa01 Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Tue, 10 Jan 2023 16:20:42 +0100 Subject: [PATCH 15/16] Added additional unit tests. Issue with all_gather to be fixed. 
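For reference, a minimal usage sketch of the DistributedHelper collectives exercised by the new tests follows. This is an illustrative example, not part of the patch: it assumes a torchrun launch (so that LOCAL_RANK is set) and a hypothetical sketch.py file, and it only relies on helpers that appear in the diffs of this series (init_distributed, make_device, gather_all, cat_all, broadcast_object).

    # Run with: torchrun --standalone --nnodes=1 --nproc_per_node=2 sketch.py
    import torch

    from avalanche.distributed import DistributedHelper

    # Initialize torch.distributed (gloo or nccl) and seed every process.
    DistributedHelper.init_distributed(random_seed=1234, use_cuda=False)

    # Reference device for this process (cpu, or cuda:<rank> when use_cuda=True).
    device = DistributedHelper.make_device()

    # Each rank builds a tensor whose first dimension depends on its rank.
    local_t = torch.full((10 + DistributedHelper.rank, 5),
                         DistributedHelper.rank,
                         dtype=torch.long).to(device)

    # gather_all returns one tensor per rank, even when shapes differ
    # across processes.
    gathered = DistributedHelper.gather_all(local_t)
    assert len(gathered) == DistributedHelper.world_size

    # cat_all concatenates the per-rank tensors along the first dimension.
    merged = DistributedHelper.cat_all(local_t)

    # Plain Python objects can be broadcast from the main process (rank 0).
    config = DistributedHelper.broadcast_object({'num_epochs': 2})

    if DistributedHelper.is_main_process:
        print(merged.shape, config)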
--- avalanche/distributed/distributed_helper.py | 48 ++- tests/distributed/distributed_test_utils.py | 6 + tests/distributed/test_distributed_helper.py | 291 ++++++++++++++++++- 3 files changed, 334 insertions(+), 11 deletions(-) diff --git a/avalanche/distributed/distributed_helper.py b/avalanche/distributed/distributed_helper.py index 64dad60d3..c120cbf9f 100644 --- a/avalanche/distributed/distributed_helper.py +++ b/avalanche/distributed/distributed_helper.py @@ -2,14 +2,21 @@ import pickle import warnings from io import BytesIO -from typing import Optional, List, Any, Iterable, Dict +from typing import Optional, List, Any, Iterable, Dict, TypeVar import torch from torch import Tensor -from torch.distributed import init_process_group from torch.nn.modules import Module from torch.nn.parallel import DistributedDataParallel from typing_extensions import Literal +from torch.distributed import ( + init_process_group, + broadcast_object_list +) + + +BroadcastT = TypeVar('BroadcastT') + from avalanche.distributed.distributed_consistency_verification import \ hash_tensor @@ -240,8 +247,20 @@ def broadcast(self, tensor: Tensor, src=0): tensor = self._revert_to_original_device(tensor_distrib, orig_data) return tensor + + def broadcast_object(self, obj: BroadcastT, src=0) -> BroadcastT: + if not self.is_distributed: + return obj + + io_list = [obj] + + broadcast_object_list(io_list, src=src) + return io_list[0] def cat_all(self, tensor: Tensor): + # TODO: use all_gather_into_tensor (if available and + # if NCCL and tensor.device == 'default device') + if not self.is_distributed: return tensor @@ -258,8 +277,16 @@ def gather_all( self, tensor: Tensor, out_tensors: Optional[List[Tensor]] = None, - different_shape0: bool = None, - different_shape1_n: bool = None): + different_shape0: Optional[bool] = None, + different_shape1_n: Optional[bool] = None): + """ + Gather all for tensors only. + + Note: differently from the original Pytorch function, which requires that input tensor is to be moved + to the default device (forced to CUDA if using NCCL), this function also manages input tensors residing on + arbitrary devices. The resulting list of tensors will be moved to the same device + of the input tensor. + """ if not self.is_distributed: return [tensor] @@ -297,7 +324,10 @@ def gather_all( else: # TODO: needs unit test (especially for 0-shaped tensors) # Same size for all tensors - tensor_size = torch.tensor(tensor.shape, dtype=torch.int64) + if len(tensor.shape) > 0: + tensor_size = torch.tensor(tensor.shape, dtype=torch.int64) + else: + tensor_size = torch.tensor([0], dtype=torch.int64) all_tensors_shape = \ [tensor_size for _ in range(self.world_size)] @@ -353,12 +383,16 @@ def gather_all( orig_device = tensor.device tensor, _ = self._prepare_for_distributed_comm(tensor) out_tensors = [self._prepare_for_distributed_comm(t)[0] - for t in out_tensors] + for t in out_tensors] torch.distributed.all_gather(out_tensors, tensor) out_tensors = [t.to(orig_device) for t in out_tensors] return out_tensors - def gather_all_objects(self, obj): + def gather_all_objects(self, obj: BroadcastT) -> List[BroadcastT]: + """ + Gather all for objects. This will also take care of moving cuda tensors + (even the ones nested inside objects) to the correct default device. 
+ """ out_list = [None for _ in range(self.world_size)] torch.distributed.all_gather_object(out_list, obj) return out_list diff --git a/tests/distributed/distributed_test_utils.py b/tests/distributed/distributed_test_utils.py index bbdb974fb..4e17e8f4b 100644 --- a/tests/distributed/distributed_test_utils.py +++ b/tests/distributed/distributed_test_utils.py @@ -19,6 +19,11 @@ def check_skip_distributed_test() -> bool: not in ['1', 'true'] +def check_skip_distributed_slow_test() -> bool: + return check_skip_distributed_test() or \ + os.environ.get('FAST_TEST', 'false').lower() in ['1', 'true'] + + @contextlib.contextmanager def suppress_dst_tests_output(): if os.environ['LOCAL_RANK'] != 0: @@ -32,5 +37,6 @@ def suppress_dst_tests_output(): __all__ = [ 'common_dst_tests_setup', 'check_skip_distributed_test', + 'check_skip_distributed_slow_test', 'suppress_dst_tests_output' ] diff --git a/tests/distributed/test_distributed_helper.py b/tests/distributed/test_distributed_helper.py index 0c49bd45d..8091f6029 100644 --- a/tests/distributed/test_distributed_helper.py +++ b/tests/distributed/test_distributed_helper.py @@ -1,19 +1,29 @@ +import itertools +import os import random +import shutil +import tempfile +import time +import timeit import unittest +import numpy as np import torch import torch.distributed as dst from torch.nn import Module from torch.nn.parallel import DistributedDataParallel +from avalanche.benchmarks.generators.benchmark_generators import dataset_benchmark +from avalanche.benchmarks.utils.classification_dataset import make_tensor_classification_dataset from avalanche.distributed import DistributedHelper from avalanche.distributed.distributed_helper import \ RollingSeedContext, BroadcastSeedContext from avalanche.models import SimpleMLP, as_multitask +from avalanche.models.utils import avalanche_model_adaptation from avalanche.training.determinism.rng_manager import RNGManager from tests.distributed.distributed_test_utils import \ - check_skip_distributed_test, suppress_dst_tests_output, \ + check_skip_distributed_slow_test, check_skip_distributed_test, suppress_dst_tests_output, \ common_dst_tests_setup @@ -41,7 +51,10 @@ def test_wrap_model(self): num_classes = 11 torch.manual_seed(1234 + DistributedHelper.rank) mb_x = torch.randn((mb_size, 32)) + mb_y = torch.randint(0, num_classes, (mb_size,)) + mb_t = torch.full((mb_size,), 1) model = SimpleMLP(num_classes=num_classes, input_size=32) + model = as_multitask(model, 'classifier') self.assertIsInstance(model, Module) device = DistributedHelper.make_device() @@ -60,17 +73,30 @@ def test_wrap_model(self): device = DistributedHelper.make_device() mb_x = mb_x.to(device) + mb_y = mb_y.to(device) + mb_t = mb_t.to(device) model = model.to(device) model.eval() model_wrapped.eval() + benchmark = dataset_benchmark( + [make_tensor_classification_dataset( + mb_x, mb_y, mb_t, task_labels=mb_t.tolist() + )], + [make_tensor_classification_dataset( + mb_x, mb_y, mb_t, task_labels=mb_t.tolist() + )] + ) + + avalanche_model_adaptation(model, benchmark.train_stream[0]) + with torch.no_grad(): - mb_out1 = model(mb_x).detach() + mb_out1 = model(mb_x, mb_t).detach() self.assertEqual(mb_out1.device, device) self.assertSequenceEqual([mb_size, num_classes], mb_out1.shape) - mb_out2 = model_wrapped(mb_x).detach() + mb_out2 = model_wrapped(mb_x, mb_t).detach() self.assertEqual(mb_out2.device, device) self.assertSequenceEqual([mb_size, num_classes], mb_out2.shape) @@ -83,14 +109,197 @@ def test_wrap_model(self): self.assertTrue(torch.equal(mb_out1, 
mb_out_all[start_idx: end_idx])) + + self.assertTrue(model is DistributedHelper.unwrap_model(model_wrapped)) @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') - def test_broadcast(self): + def test_broadcast_tensor_or_objects(self): ts = torch.full((10,), DistributedHelper.rank, dtype=torch.long) DistributedHelper.broadcast(ts) self.assertTrue(torch.equal(ts, torch.zeros((10,), dtype=torch.long))) + device = DistributedHelper.make_device() + ts = ts.to(device) + + my_object = {'a': DistributedHelper.rank, 'b': ts} + my_object_from_main = DistributedHelper.broadcast_object(my_object) + + expect = { + 'a': 0, + 'b': torch.full((10,), 0, dtype=torch.long).tolist()} + + self.assertEqual(device, my_object_from_main['b'].device) + my_object_from_main['b'] = my_object_from_main['b'].tolist() + self.assertEqual(expect, my_object_from_main) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_gather_all_objects(self): + ts = torch.full((10,), DistributedHelper.rank, dtype=torch.long) + + device = DistributedHelper.make_device() + ts = ts.to(device) + + my_object = {'a': DistributedHelper.rank, 'b': ts} + all_objects = DistributedHelper.gather_all_objects(my_object) + self.assertIsInstance(all_objects, list) + self.assertEqual(DistributedHelper.world_size, len(all_objects)) + + for rank in range(DistributedHelper.world_size): + expect = { + 'a': 0, + 'b': torch.full((10,), rank, dtype=torch.long).tolist()} + + self.assertEqual(device, all_objects[rank]['b'].device) + all_objects[rank]['b'] = all_objects[rank]['b'].tolist() + self.assertEqual(expect, all_objects[rank]) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_cat_all(self): + ts = torch.full((10, 5), DistributedHelper.rank, dtype=torch.long) + device = DistributedHelper.make_device() + + if device.type == 'cuda': + # Additional test: the tensor do not need to be on the default device + DistributedHelper.cat_all(ts) + + ts = ts.to(device) + + concatenated_tensor = DistributedHelper.cat_all(ts) + + self.assertEqual(device, concatenated_tensor.device) + + expect = torch.empty((DistributedHelper.world_size * 10, 5), dtype=torch.long).to(device) + for rank in range(DistributedHelper.world_size): + expect[rank * 10: (rank + 1) * 10] = rank + + self.assertTrue(torch.equal(concatenated_tensor, expect)) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_gather_all_same_size(self): + # TODO: implement test + ts = torch.full((10, 5), DistributedHelper.rank, dtype=torch.long) + device = DistributedHelper.make_device() + + if device.type == 'cuda': + # Additional test: the tensor do not need to be on the default device + DistributedHelper.gather_all(ts) + + # On the other hand, PyTorch all_gather requires tensors to be on the default device + with self.assertRaises(Exception): + + out_t = [torch.empty_like(ts) for _ in range(DistributedHelper.world_size)] + torch.distributed.all_gather(out_t, ts) + + # ... 
while this should work + out_t = [torch.empty_like(ts).to(device) for _ in range(DistributedHelper.world_size)] + torch.distributed.all_gather(out_t, ts.to(device)) + + ts = ts.to(device) + + for different_shape0, different_shape1_n in itertools.product([None, False], [None, False]): + with self.subTest(different_shape0=different_shape0, different_shape1_n=different_shape1_n): + tensor_list = DistributedHelper.gather_all(ts, different_shape0=different_shape0, different_shape1_n=different_shape1_n) + + self.assertEqual(DistributedHelper.world_size, len(tensor_list)) + + for t in tensor_list: + self.assertEqual(device, t.device) + + for rank in range(DistributedHelper.world_size): + expect = torch.full((10, 5), rank, dtype=torch.long).to(device) + self.assertTrue(torch.equal(tensor_list[rank], expect)) + + @unittest.skipIf(check_skip_distributed_slow_test(), + 'Distributed tests ignored') + def test_gather_all_performance_known_same_shape(self): + ts = torch.full((128, 224, 224, 3), DistributedHelper.rank, dtype=torch.float32) + device = DistributedHelper.make_device() + ts = ts.to(device) + + resulting_tensors = [torch.empty_like(ts).to(device) for _ in range(DistributedHelper.world_size)] + + from tqdm import tqdm + n_times = 30 + torch.distributed.all_gather(resulting_tensors, ts) + start_time = time.time() + for _ in tqdm(range(n_times)): + torch.distributed.all_gather(resulting_tensors, ts) + end_time = time.time() + print('Time taken by PyTorch all_gather', end_time-start_time, 'avg', (end_time-start_time) / n_times) + + start_time = time.time() + out_list = [None for _ in range(DistributedHelper.world_size)] + torch.distributed.all_gather_object(out_list, ts) + + for _ in tqdm(range(n_times)): + torch.distributed.all_gather_object(out_list, ts) + end_time = time.time() + print('Time taken by PyTorch all_gather_object', end_time-start_time, 'avg', (end_time-start_time) / n_times) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_gather_all_same_dim0(self): + ts = torch.full((10, DistributedHelper.rank+1), DistributedHelper.rank, dtype=torch.long) + device = DistributedHelper.make_device() + + ts = ts.to(device) + + for different_shape0, different_shape1_n in itertools.product([None, False], [None, True]): + with self.subTest(different_shape0=different_shape0, different_shape1_n=different_shape1_n): + tensor_list = DistributedHelper.gather_all(ts, different_shape0=different_shape0, different_shape1_n=different_shape1_n) + self.assertEqual(DistributedHelper.world_size, len(tensor_list)) + + for t in tensor_list: + self.assertEqual(device, t.device) + + for rank in range(DistributedHelper.world_size): + expect = torch.full((10, rank+1), rank, dtype=torch.long).to(device) + self.assertTrue(torch.equal(tensor_list[rank], expect)) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_gather_all_same_dim1_n(self): + ts = torch.full((10+DistributedHelper.rank, 5), DistributedHelper.rank, dtype=torch.long) + device = DistributedHelper.make_device() + + ts = ts.to(device) + + for different_shape0, different_shape1_n in itertools.product([None, True], [None, False]): + with self.subTest(different_shape0=different_shape0, different_shape1_n=different_shape1_n): + tensor_list = DistributedHelper.gather_all(ts, different_shape0=different_shape0, different_shape1_n=different_shape1_n) + self.assertEqual(DistributedHelper.world_size, len(tensor_list)) + + for t in tensor_list: + self.assertEqual(device, t.device) + + 
for rank in range(DistributedHelper.world_size): + expect = torch.full((10+DistributedHelper.rank, 5), rank, dtype=torch.long).to(device) + self.assertTrue(torch.equal(tensor_list[rank], expect)) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_gather_all_zero_shaped(self): + ts = torch.full(tuple(), DistributedHelper.rank, dtype=torch.long) + device = DistributedHelper.make_device() + + ts = ts.to(device) + + for different_shape0, different_shape1_n in itertools.product([None, False, True], [None, False, True]): + with self.subTest(different_shape0=different_shape0, different_shape1_n=different_shape1_n): + tensor_list = DistributedHelper.gather_all(ts, different_shape0=different_shape0, different_shape1_n=different_shape1_n) + self.assertEqual(DistributedHelper.world_size, len(tensor_list)) + + for t in tensor_list: + self.assertEqual(device, t.device) + + for rank in range(DistributedHelper.world_size): + expect = torch.full(tuple(), rank, dtype=torch.long).to(device) + self.assertTrue(torch.equal(tensor_list[rank], expect)) + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_check_equal_tensors(self): @@ -118,6 +327,30 @@ def test_fields(self): self.assertEqual('gloo', DistributedHelper.backend) self.assertFalse(DistributedHelper.forced_cuda_comm) + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_set_random_seeds_and_align(self): + DistributedHelper.set_random_seeds(5678) + + self.assertEqual(297076, np.random.randint(0, 1000000)) + self.assertEqual(643380, torch.randint(0, 1000000, (1,)).item()) + self.assertEqual(683410, random.randint(0, 1000000)) + + if DistributedHelper.is_main_process: + np.random.randint(0, 1000000) + torch.randint(0, 1000000, (1,)) + random.randint(0, 1000000) + + DistributedHelper.align_seeds() + + ref_values = ( + int(np.random.randint(0, 1000000)), + int(torch.randint(0, 1000000, (1,))), + int(random.randint(0, 1000000)) + ) + + DistributedHelper.check_equal_objects(ref_values) + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_rolling_seed_aligner(self): @@ -141,6 +374,56 @@ def test_broadcast_seed_aligner(self): final_value = random.randint(0, 2 ** 64 - 1) self.assertEqual(15306775005444441373, final_value) + + @unittest.skipIf(check_skip_distributed_test(), + 'Distributed tests ignored') + def test_main_process_first(self): + tmpdirname = '' + try: + my_rank = DistributedHelper.rank + if DistributedHelper.is_main_process: + tmpdirname = tempfile.mkdtemp() + + tmpdirname = DistributedHelper.broadcast_object(tmpdirname) + + #print('Entering exclusive section', my_rank) + with DistributedHelper.main_process_first(): + #print('Entered exclusive section', my_rank) + + for _ in range(2): + #print('Checking files before', my_rank) + time.sleep(0.1 + my_rank * 0.05) + files = list(os.listdir(tmpdirname)) + if DistributedHelper.is_main_process: + self.assertEqual(0, len(files)) + else: + self.assertIn(f'rank0', files) + self.assertNotIn(f'rank{my_rank}', files) + + #print('Writing my file', my_rank) + with open(os.path.join(tmpdirname, f'rank{my_rank}'), 'w') as f: + f.write('ok') + + for _ in range(2): + #print('Checking files after', my_rank) + time.sleep(0.1 + my_rank * 0.05) + files = list(os.listdir(tmpdirname)) + if DistributedHelper.is_main_process: + self.assertEqual(1, len(files)) + self.assertIn(f'rank0', files) + else: + self.assertIn(f'rank0', files) + self.assertIn(f'rank{my_rank}', files) + 
#print('Exiting exclusive section', my_rank) + + DistributedHelper.barrier() + files = set(os.listdir(tmpdirname)) + expect = set([f'rank{rnk}' for rnk in range(DistributedHelper.world_size)]) + self.assertSetEqual(expect, files) + DistributedHelper.barrier() + finally: + if tmpdirname is not None and DistributedHelper.is_main_process: + shutil.rmtree(tmpdirname) if __name__ == "__main__": From 6a3dd1f97a5920090f42c069b3280c549030a229 Mon Sep 17 00:00:00 2001 From: Lorenzo Pellegrini Date: Mon, 16 Jan 2023 17:22:19 +0100 Subject: [PATCH 16/16] Tests for DistributedHelper. Distributed support field in plugins. --- .github/workflows/environment-update.yml | 3 +- avalanche/core.py | 7 + avalanche/distributed/distributed_helper.py | 203 +++++++++-------- avalanche/models/dynamic_modules.py | 3 +- avalanche/training/plugins/clock.py | 2 + avalanche/training/plugins/cwr_star.py | 35 +-- avalanche/training/plugins/evaluation.py | 2 + avalanche/training/plugins/ewc.py | 13 ++ avalanche/training/plugins/gdumb.py | 4 +- avalanche/training/plugins/lwf.py | 13 +- avalanche/training/plugins/replay.py | 2 + avalanche/training/templates/base.py | 43 ++++ avalanche/training/templates/base_sgd.py | 2 + .../observation_type/batch_observation.py | 7 +- .../distributed/distributed_training_main.py | 32 ++- tests/distributed/test_distributed.sh | 9 +- tests/distributed/test_distributed_helper.py | 204 ++++++++++++------ .../test_distributed_strategy_support.py | 2 + tests/run_dist_tests.py | 2 +- .../test_avalanche_classification_dataset.py | 2 +- 20 files changed, 388 insertions(+), 202 deletions(-) diff --git a/.github/workflows/environment-update.yml b/.github/workflows/environment-update.yml index 0097a6eb6..247533337 100644 --- a/.github/workflows/environment-update.yml +++ b/.github/workflows/environment-update.yml @@ -59,7 +59,8 @@ jobs: python -m unittest discover tests && echo "Running checkpointing tests..." && bash ./tests/checkpointing/test_checkpointing.sh && - python ./tests/run_dist_tests.py + echo "Running distributed training tests..." && + python ./tests/run_dist_tests.py && - name: checkout avalanche-docker repo if: always() uses: actions/checkout@v3 diff --git a/avalanche/core.py b/avalanche/core.py index ac13aac9f..1441c0754 100644 --- a/avalanche/core.py +++ b/avalanche/core.py @@ -27,6 +27,13 @@ class BasePlugin(Generic[Template], ABC): and loggers. """ + supports_distributed = False + """ + A class-level attribute that indicates whether the plugin is supported + in distributed training. If False, Avalanche will warn when the plugin + is used in distributed training. + """ + def __init__(self): pass diff --git a/avalanche/distributed/distributed_helper.py b/avalanche/distributed/distributed_helper.py index c120cbf9f..ef04e19bd 100644 --- a/avalanche/distributed/distributed_helper.py +++ b/avalanche/distributed/distributed_helper.py @@ -190,6 +190,7 @@ def make_device(self, set_cuda_device=False): return ref_device def wrap_model(self, model: Module) -> Module: + # Note: find_unused_parameters is needed for multi task models. 
if self.is_distributed: if self.forced_cuda_comm or self.use_cuda: # forced_cuda_comm is True if using NCCL; use_cuda may be true @@ -264,8 +265,7 @@ def cat_all(self, tensor: Tensor): if not self.is_distributed: return tensor - gathered_tensors = self.gather_all( - tensor, different_shape0=True, different_shape1_n=False) + gathered_tensors = self.gather_all(tensor) for i, t in enumerate(gathered_tensors): if len(t.shape) == 0: # Tensor with 0-length shape @@ -273,124 +273,140 @@ def cat_all(self, tensor: Tensor): return torch.cat(gathered_tensors) + def gather_tensor_shapes(self, tensor: Tensor, max_shape_len=10) \ + -> List[List[int]]: + """ + Gathers the shapes of all the tensors. + """ + # Tensor differ by whole shape + tensor_size = torch.zeros(max_shape_len, dtype=torch.int64) + for i in range(len(tensor.shape)): + tensor_size[i] = tensor.shape[i] + all_tensors_shape = [ + self._prepare_for_distributed_comm( + torch.zeros_like(tensor_size))[0] + for _ in range(self.world_size)] + tensor_size, _ = self._prepare_for_distributed_comm(tensor_size) + + torch.distributed.all_gather(all_tensors_shape, tensor_size) + + all_tensors_shape = [t.cpu() for t in all_tensors_shape] + + # Trim shape + for i, t in enumerate(all_tensors_shape): + for x in range(len(t)): + if t[x] == 0: + if x == 0: + # Tensor with 0-length shape + all_tensors_shape[i] = t[:x+1] + else: + all_tensors_shape[i] = t[:x] + + break + + return [t_shape.tolist() for t_shape in all_tensors_shape] + def gather_all( self, tensor: Tensor, - out_tensors: Optional[List[Tensor]] = None, - different_shape0: Optional[bool] = None, - different_shape1_n: Optional[bool] = None): + same_shape: bool = False, + shapes: Optional[List[List[int]]] = None): """ Gather all for tensors only. - Note: differently from the original Pytorch function, which requires that input tensor is to be moved - to the default device (forced to CUDA if using NCCL), this function also manages input tensors residing on - arbitrary devices. The resulting list of tensors will be moved to the same device - of the input tensor. + Note: differently from the original Pytorch function, which requires + that input tensor is to be moved to the default device (forced to + CUDA if using NCCL), this function also manages input tensors + residing on a different devics. The resulting list of tensors will + be moved to the same device of the input tensor. + + This will also manage tensors of different shapes. If you + are sure that the tensors will be of the same shape, consider + passing same_shape to speed up the communication. + + Beware that, if you are in need of concatenating multiple tensors, + method `cat_all` may be more suitable. """ if not self.is_distributed: return [tensor] - if different_shape0 is None or different_shape1_n is None: - warnings.warn('different_shape0 and different_shape1_n not set. 
' - 'This may lead to inefficiencies.') - - if different_shape0 is None: - different_shape0 = True - - if different_shape1_n is None: - different_shape1_n = True - # Based on: # https://discuss.pytorch.org/t/how-to-concatenate-different-size-tensors-from-distributed-processes/44819/4 - if out_tensors is None: - all_tensors_shape = None - if different_shape1_n: - # TODO: needs unit test (especially for 0-shaped tensors) - # Tensor differ by whole shape (not very common case) - tensor_size = torch.zeros(10, dtype=torch.int64) - for i in range(len(tensor.shape)): - tensor_size[i] = tensor.shape[i] - - elif different_shape0: - # Tensors differ by shape[0] (most common case) - if len(tensor.shape) > 0: - # Usual case - tensor_size = torch.tensor([tensor.shape[0]], - dtype=torch.int64) - else: - # Some tensors, especially loss tensors, have 0-length shape - tensor_size = torch.tensor([0], dtype=torch.int64) + if same_shape: + # Same size for all tensors + if len(tensor.shape) > 0: + tensor_size = list(tensor.shape) else: - # TODO: needs unit test (especially for 0-shaped tensors) - # Same size for all tensors - if len(tensor.shape) > 0: - tensor_size = torch.tensor(tensor.shape, dtype=torch.int64) - else: - tensor_size = torch.tensor([0], dtype=torch.int64) - all_tensors_shape = \ - [tensor_size for _ in range(self.world_size)] - - if all_tensors_shape is None: - all_tensors_shape = [ - self._prepare_for_distributed_comm( - torch.zeros_like(tensor_size))[0] - for _ in range(self.world_size)] - tensor_size, _ = self._prepare_for_distributed_comm(tensor_size) - - torch.distributed.all_gather(all_tensors_shape, tensor_size) - - all_tensors_shape = [t.cpu() for t in all_tensors_shape] - - if different_shape1_n: - # TODO: needs unit test (especially for 0-shaped tensors) - # Trim shape - for i, t in enumerate(all_tensors_shape): - for x in range(len(t)): - if t[x] == 0: - if x == 0: - # Tensor with 0-length shape - all_tensors_shape[i] = t[:x+1] - else: - all_tensors_shape[i] = t[:x] - - break - - elif different_shape0: - if len(tensor.shape[1:]) == 0: - # To manage tensors with 0-length shape - pass - else: - all_tensors_shape = \ - [torch.cat( - [t, - torch.as_tensor(tensor.shape[1:], - dtype=torch.int64)]) - for t in all_tensors_shape] - + tensor_size = [0] all_tensors_shape = \ - [t_shape.tolist() for t_shape in all_tensors_shape] - dtype = tensor.dtype + [tensor_size for _ in range(self.world_size)] + elif shapes is not None: + # Shapes given by the user + # make sure it is a list of lists + all_tensors_shape = [list(s) for s in shapes] + else: + # Tensor differ by whole shape + all_tensors_shape = self.gather_tensor_shapes(tensor) + + same_shape = all(all_tensors_shape[0] == x for x in all_tensors_shape) + orig_device = tensor.device - out_tensors = [] + if same_shape: + # Same shape: create identical tensors and proceed with all_gather + out_tensors = [torch.empty_like(tensor) for _ in all_tensors_shape] + else: + # Different shapes: create a tensors of the size of the bigger one + all_tensors_numel = [] + dtype = tensor.dtype for t_shape in all_tensors_shape: if t_shape[0] == 0 and len(t_shape) == 1: # Tensor with 0-length shape - out_tensors.append(torch.zeros(tuple(), dtype=dtype)) + curr_size = 1 else: - out_tensors.append(torch.zeros(*t_shape, dtype=dtype)) + curr_size = 1 + for t_s in t_shape: + curr_size *= t_s + all_tensors_numel.append(curr_size) + + max_numel = max(all_tensors_numel) + out_tensors = [torch.empty((max_numel,), dtype=dtype) + for _ in all_tensors_shape] + + tensor = 
tensor.flatten() + n_padding = max_numel - tensor.numel() + if n_padding > 0: + padding = torch.zeros((n_padding,), + dtype=tensor.dtype, + device=orig_device) + tensor = torch.cat((tensor, padding), dim=0) - orig_device = tensor.device tensor, _ = self._prepare_for_distributed_comm(tensor) out_tensors = [self._prepare_for_distributed_comm(t)[0] - for t in out_tensors] + for t in out_tensors] + torch.distributed.all_gather(out_tensors, tensor) + + if not same_shape: + # The tensors are flat and of the wrong dimension: re-shape them + for tensor_idx, (tensor_sz, tensor_numel, out_t) in \ + enumerate(zip(all_tensors_shape, + all_tensors_numel, + out_tensors)): + if tensor_sz[0] == 0: + # Tensor with 0-length shape + out_tensors[tensor_idx] = \ + out_t[:tensor_numel].reshape(tuple()) + else: + out_tensors[tensor_idx] = \ + out_t[:tensor_numel].reshape(tensor_sz) + out_tensors = [t.to(orig_device) for t in out_tensors] return out_tensors def gather_all_objects(self, obj: BroadcastT) -> List[BroadcastT]: """ - Gather all for objects. This will also take care of moving cuda tensors + Gather all for objects. This will also take care of moving cuda tensors (even the ones nested inside objects) to the correct default device. """ out_list = [None for _ in range(self.world_size)] @@ -401,10 +417,7 @@ def check_equal_tensors(self, tensor: Tensor): if not DistributedHelper.is_distributed: return - all_tensors = self.gather_all( - tensor, - different_shape0=True, - different_shape1_n=True) + all_tensors = self.gather_all(tensor) tensors_hashes = [hash_tensor(t) for t in all_tensors] diff --git a/avalanche/models/dynamic_modules.py b/avalanche/models/dynamic_modules.py index 7d4df00e2..f00ed393d 100644 --- a/avalanche/models/dynamic_modules.py +++ b/avalanche/models/dynamic_modules.py @@ -14,7 +14,6 @@ """ import torch from torch.nn import Module -import numpy as np from avalanche.benchmarks.utils.flat_data import ConstantSequence from avalanche.benchmarks.scenarios import CLExperience @@ -366,7 +365,7 @@ def adaptation(self, experience: CLExperience): # head adaptation if tid not in self.classifiers: # create new head new_head = IncrementalClassifier( - self.in_features, self.starting_out_features + self.in_features, self.starting_out_features, masking=False ).to(device) self.classifiers[tid] = new_head diff --git a/avalanche/training/plugins/clock.py b/avalanche/training/plugins/clock.py index 535ef3f72..1718beaf3 100644 --- a/avalanche/training/plugins/clock.py +++ b/avalanche/training/plugins/clock.py @@ -18,6 +18,8 @@ class Clock(SupervisedPlugin): wrong for plugins called after it. """ + supports_distributed = True + def __init__(self): """Init.""" super().__init__() diff --git a/avalanche/training/plugins/cwr_star.py b/avalanche/training/plugins/cwr_star.py index 6bd88c681..2495eae85 100644 --- a/avalanche/training/plugins/cwr_star.py +++ b/avalanche/training/plugins/cwr_star.py @@ -22,6 +22,8 @@ class CWRStarPlugin(SupervisedPlugin): This plugin does not use task identities. """ + supports_distributed = True + def __init__(self, model, cwr_layer_name=None, freeze_remaining_model=True): """ :param model: the model. 
@@ -47,23 +49,26 @@ def __init__(self, model, cwr_layer_name=None, freeze_remaining_model=True): self.cur_class = None def after_training_exp(self, strategy, **kwargs): - self.consolidate_weights() - self.set_consolidate_weights() + with strategy.use_local_model(): + self.consolidate_weights() + self.set_consolidate_weights() def before_training_exp(self, strategy, **kwargs): - if self.freeze_remaining_model and strategy.clock.train_exp_counter > 0: - self.freeze_other_layers() - - # Count current classes and number of samples for each of them. - data = strategy.experience.dataset - self.model.cur_j = examples_per_class(data.targets) - self.cur_class = [ - cls - for cls in set(self.model.cur_j.keys()) - if self.model.cur_j[cls] > 0 - ] - - self.reset_weights(self.cur_class) + with strategy.use_local_model(): + if self.freeze_remaining_model and \ + strategy.clock.train_exp_counter > 0: + self.freeze_other_layers() + + # Count current classes and number of samples for each of them. + data = strategy.experience.dataset + self.model.cur_j = examples_per_class(data.targets) + self.cur_class = [ + cls + for cls in set(self.model.cur_j.keys()) + if self.model.cur_j[cls] > 0 + ] + + self.reset_weights(self.cur_class) def consolidate_weights(self): """Mean-shift for the target layer weights""" diff --git a/avalanche/training/plugins/evaluation.py b/avalanche/training/plugins/evaluation.py index e5f12dd21..1606613a8 100644 --- a/avalanche/training/plugins/evaluation.py +++ b/avalanche/training/plugins/evaluation.py @@ -30,6 +30,8 @@ class EvaluationPlugin: This plugin also logs metrics using the provided loggers. """ + supports_distributed = True + def __init__( self, *metrics: Union["PluginMetric", Sequence["PluginMetric"]], diff --git a/avalanche/training/plugins/ewc.py b/avalanche/training/plugins/ewc.py index 255e686ec..fcbf0f6f1 100644 --- a/avalanche/training/plugins/ewc.py +++ b/avalanche/training/plugins/ewc.py @@ -23,6 +23,19 @@ class EWCPlugin(SupervisedPlugin): training set. This plugin does not use task identities. """ + supports_distributed = False + """ + EWC does not support distributed training. + + This is because the plugin needs to compute an additional component of the + loss function that involves model parameters. It is not possible, in + distributed training, to use model parameters to compute grad elements + outside the forward function. + This is a limitation of PyTorch DistributedDataParallel. + + Setting parameters like `find_unused_parameters` does not solve this problem.
+ """ + def __init__( self, ewc_lambda, diff --git a/avalanche/training/plugins/gdumb.py b/avalanche/training/plugins/gdumb.py index be44c8cdc..0c95224c7 100644 --- a/avalanche/training/plugins/gdumb.py +++ b/avalanche/training/plugins/gdumb.py @@ -21,6 +21,8 @@ class GDumbPlugin(SupervisedPlugin): https://www.robots.ox.ac.uk/~tvg/publications/2020/gdumb.pdf """ + supports_distributed = True + def __init__(self, mem_size: int = 200): super().__init__() self.mem_size = mem_size @@ -39,7 +41,7 @@ def before_train_dataset_adaptation( if self.init_model is None: self.init_model = copy.deepcopy(strategy.model) else: - strategy.model = copy.deepcopy(self.init_model) + strategy.model = copy.deepcopy(self.init_model) strategy.model_adaptation(self.init_model) def before_eval_dataset_adaptation( diff --git a/avalanche/training/plugins/lwf.py b/avalanche/training/plugins/lwf.py index ed5c5b8be..d63afbb9e 100644 --- a/avalanche/training/plugins/lwf.py +++ b/avalanche/training/plugins/lwf.py @@ -10,6 +10,8 @@ class LwFPlugin(SupervisedPlugin): When used with multi-headed models, all heads are distilled. """ + supports_distributed = True + def __init__(self, alpha=1, temperature=2): """ :param alpha: distillation hyperparameter. It can be either a float @@ -24,13 +26,16 @@ def before_backward(self, strategy, **kwargs): Add distillation loss """ - strategy.loss += self.lwf( - strategy.mb_x, strategy.mb_output, strategy.model - ) + with strategy.use_local_loss(): + with strategy.use_local_input_batch(): + with strategy.use_local_output_batch(): + strategy.loss += self.lwf( + strategy.mb_x, strategy.mb_output, strategy.model + ) def after_training_exp(self, strategy, **kwargs): """ Save a copy of the model after each experience and update self.prev_classes to include the newly learned classes. """ - self.lwf.update(strategy.experience, strategy.model) + self.lwf.update(strategy.experience, strategy.local_model) diff --git a/avalanche/training/plugins/replay.py b/avalanche/training/plugins/replay.py index f653a1834..22bca224b 100644 --- a/avalanche/training/plugins/replay.py +++ b/avalanche/training/plugins/replay.py @@ -44,6 +44,8 @@ class ReplayPlugin(SupervisedPlugin): in memory """ + supports_distributed = True + def __init__( self, mem_size: int = 200, diff --git a/avalanche/training/templates/base.py b/avalanche/training/templates/base.py index 4d80adb7b..b33cca462 100644 --- a/avalanche/training/templates/base.py +++ b/avalanche/training/templates/base.py @@ -1,3 +1,4 @@ +import sys import warnings from typing import Iterable, Sequence, Optional, Union, List @@ -6,6 +7,7 @@ from avalanche.benchmarks import CLExperience, CLStream from avalanche.core import BasePlugin +from avalanche.distributed.distributed_helper import DistributedHelper from avalanche.distributed.strategies import DistributedModelStrategySupport from avalanche.training.utils import trigger_plugins @@ -69,6 +71,12 @@ def __init__( self.current_eval_stream: Optional[ExpSequence] = None """ Current evaluation stream. """ + self._distributed_check: bool = False + """ + Internal flag used to verify the support for distributed + training only once. + """ + @property def is_eval(self): """True if the strategy is in evaluation mode.""" @@ -94,6 +102,12 @@ def train( If None: use training experiences for evaluation. Use [] if you do not want to evaluate during training. 
""" + if not self._distributed_check: + # Checks if the strategy elements are compatible with + # distributed training + self._check_distributed_training_compatibility() + self._distributed_check = True + self.is_training = True self._stop_training = False @@ -134,6 +148,12 @@ def eval( :return: dictionary containing last recorded value for each metric name """ + if not self._distributed_check: + # Checks if the strategy elements are compatible with + # distributed training + self._check_distributed_training_compatibility() + self._distributed_check = True + # eval can be called inside the train method. # Save the shared state here to restore before returning. prev_train_state = self._save_train_state() @@ -219,6 +239,29 @@ def is_callback(x): f"callbacks: {cb_p - cb_supported}", ) return + + def _check_distributed_training_compatibility(self): + """ + Check if strategy elements (plugins, ...) are compatible with + distributed training. + + This check does nothing if not training in distributed mode. + """ + if not DistributedHelper.is_distributed: + return True + + unsupported_plugins = [] + for plugin in self.plugins: + if not getattr(plugin, "supports_distributed", False): + unsupported_plugins.append(plugin) + + if len(unsupported_plugins) > 0: + warnings.warn('You are using plugins that are not compatible' + 'with distributed training:') + for plugin in unsupported_plugins: + print(type(plugin), file=sys.stderr) + + return len(unsupported_plugins) == 0 ######################################################### # Plugin Triggers # diff --git a/avalanche/training/templates/base_sgd.py b/avalanche/training/templates/base_sgd.py index 01c48974d..ddbfed5ff 100644 --- a/avalanche/training/templates/base_sgd.py +++ b/avalanche/training/templates/base_sgd.py @@ -542,6 +542,8 @@ class PeriodicEval(SupervisedPlugin): This plugin is automatically configured and added by the BaseTemplate. """ + supports_distributed = True + def __init__(self, eval_every=-1, peval_mode="epoch", do_initial=True): """Init. 
diff --git a/avalanche/training/templates/observation_type/batch_observation.py b/avalanche/training/templates/observation_type/batch_observation.py index ccbabe3a6..02cdb1889 100644 --- a/avalanche/training/templates/observation_type/batch_observation.py +++ b/avalanche/training/templates/observation_type/batch_observation.py @@ -41,4 +41,9 @@ def check_model_and_optimizer(self): with self.use_local_model(): self.model = self.model_adaptation() self.model = self.wrap_distributed_model(self.model) - self.make_optimizer() + self.make_optimizer() + + +__all__ = [ + 'BatchObservation' +] diff --git a/tests/distributed/distributed_training_main.py b/tests/distributed/distributed_training_main.py index 85f960266..d05e1e71b 100644 --- a/tests/distributed/distributed_training_main.py +++ b/tests/distributed/distributed_training_main.py @@ -56,6 +56,9 @@ class CheckModelAlignedPlugin(SupervisedPlugin): + + supports_distributed = True + def after_update(self, strategy, *args, **kwargs): DistributedHelper.check_equal_objects( hash_model(strategy.model, include_buffers=True)) @@ -114,13 +117,6 @@ def main(args): test_stream: Sequence[CLExperience] = scenario.test_stream print('Testing using the', args.benchmark, 'benchmark') - for train_exp in train_stream: - print('Train experience', train_exp.current_experience, - 'has', len(train_exp.dataset), 'samples') - - for test_exp in test_stream: - print('Test experience', test_exp.current_experience, - 'has', len(test_exp.dataset), 'samples') # --------- # MODEL CREATION @@ -168,7 +164,7 @@ def main(args): plugin_instance = SynapticIntelligencePlugin(0.001) elif cli_plugin == 'ewc': plugin_instance = EWCPlugin(0.001) - elif cli_plugin == 'reduccre_on_plateau': + elif cli_plugin == 'reduce_on_plateau': plugin_instance = LRSchedulerPlugin( ReduceLROnPlateau(optimizer), step_granularity='iteration', metric='train_loss' @@ -183,9 +179,11 @@ def main(args): if DistributedHelper.is_main_process: use_cuda_str = 'cuda' if args.cuda else 'cpu' is_dist_str = 'distributed' if is_dist else 'single' + eval_every = f'peval{args.eval_every}' + log_location: Path = Path('logs') / \ (f'distributed_{args.benchmark}_' + - f'{use_cuda_str}_{is_dist_str}_{cli_plugin_names}') + f'{use_cuda_str}_{is_dist_str}_{eval_every}_{cli_plugin_names}') # Loggers should be created in the main process only os.makedirs(log_location, exist_ok=True) @@ -200,7 +198,7 @@ def main(args): project_name='AvalancheDistributedTraining', run_name=f'distributed_{args.benchmark}_' f'{use_cuda_str}_{is_dist_str}_' - f'{cli_plugin_names}' + f'{eval_every}_{cli_plugin_names}' )) Path(args.log_metrics_to).mkdir(parents=True, exist_ok=True) @@ -224,6 +222,8 @@ def main(args): train_mb_size=mb_size, train_epochs=2, eval_mb_size=mb_size, + eval_every=args.eval_every, + peval_mode=args.eval_every_mode, device=device, plugins=plugins, evaluator=evaluation_plugin @@ -269,6 +269,18 @@ def main(args): default='SplitCifar100', help="The benchmark to use." ) + parser.add_argument( + "--eval_every", + type=int, + default=-1, + help="Evaluation frequency." + ) + parser.add_argument( + "--eval_every_mode", + type=str, + default="epoch", + help="Periodic evaluation mode (epoch, experience, iteration)." 
+ ) parser.add_argument( "--log_metrics_to", type=str, diff --git a/tests/distributed/test_distributed.sh b/tests/distributed/test_distributed.sh index 9fbb606b4..2f61bcf4f 100755 --- a/tests/distributed/test_distributed.sh +++ b/tests/distributed/test_distributed.sh @@ -44,14 +44,14 @@ EXP_RUN_LINE="torchrun --standalone --nnodes=1 --nproc_per_node=$TESTS_PARALLELI run_and_check() { set -x - # Without distributed training - python distributed_training_main.py $GPU_PARAM \ - --plugins "$@" --benchmark $BENCHMARK --log_metrics_to './metrics_no_distributed' - # Run distributed training $EXP_RUN_LINE distributed_training_main.py $GPU_PARAM \ --plugins "$@" --benchmark $BENCHMARK --log_metrics_to './metrics_distributed' + # Without distributed training + python distributed_training_main.py $GPU_PARAM \ + --plugins "$@" --benchmark $BENCHMARK --log_metrics_to './metrics_no_distributed' + #python -u check_metrics_aligned.py \ # "./metrics_no_distributed" "./metrics_distributed" @@ -67,7 +67,6 @@ if [ "$RUN_FAST_TESTS" = "false" ] then echo "Running slow tests..." run_and_check "lwf" - run_and_check "ewc" run_and_check "gdumb" run_and_check "cwr" "replay" fi diff --git a/tests/distributed/test_distributed_helper.py b/tests/distributed/test_distributed_helper.py index 8091f6029..6bafc9931 100644 --- a/tests/distributed/test_distributed_helper.py +++ b/tests/distributed/test_distributed_helper.py @@ -1,10 +1,8 @@ -import itertools import os import random import shutil import tempfile import time -import timeit import unittest import numpy as np @@ -12,8 +10,10 @@ import torch.distributed as dst from torch.nn import Module from torch.nn.parallel import DistributedDataParallel -from avalanche.benchmarks.generators.benchmark_generators import dataset_benchmark -from avalanche.benchmarks.utils.classification_dataset import make_tensor_classification_dataset +from avalanche.benchmarks.generators.benchmark_generators import \ + dataset_benchmark +from avalanche.benchmarks.utils.classification_dataset import \ + make_tensor_classification_dataset from avalanche.distributed import DistributedHelper from avalanche.distributed.distributed_helper import \ @@ -23,8 +23,8 @@ from avalanche.training.determinism.rng_manager import RNGManager from tests.distributed.distributed_test_utils import \ - check_skip_distributed_slow_test, check_skip_distributed_test, suppress_dst_tests_output, \ - common_dst_tests_setup + check_skip_distributed_slow_test, check_skip_distributed_test, \ + suppress_dst_tests_output, common_dst_tests_setup class DistributedHelperTests(unittest.TestCase): @@ -148,7 +148,7 @@ def test_gather_all_objects(self): for rank in range(DistributedHelper.world_size): expect = { - 'a': 0, + 'a': rank, 'b': torch.full((10,), rank, dtype=torch.long).tolist()} self.assertEqual(device, all_objects[rank]['b'].device) @@ -158,11 +158,14 @@ def test_gather_all_objects(self): @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_cat_all(self): - ts = torch.full((10, 5), DistributedHelper.rank, dtype=torch.long) + if DistributedHelper.rank == 0: + ts = torch.full((10+1, 5), DistributedHelper.rank, dtype=torch.long) + else: + ts = torch.full((10, 5), DistributedHelper.rank, dtype=torch.long) device = DistributedHelper.make_device() if device.type == 'cuda': - # Additional test: the tensor do not need to be on the default device + # Additional test: tensors do not need to be on the default device DistributedHelper.cat_all(ts) ts = ts.to(device) @@ -171,56 +174,67 @@ def 
test_cat_all(self): self.assertEqual(device, concatenated_tensor.device) - expect = torch.empty((DistributedHelper.world_size * 10, 5), dtype=torch.long).to(device) + expect = torch.empty((DistributedHelper.world_size * 10 + 1, 5), + dtype=torch.long).to(device) for rank in range(DistributedHelper.world_size): - expect[rank * 10: (rank + 1) * 10] = rank + if rank == 0: + expect[rank * 10: (rank + 1) * 10 + 1] = rank + else: + expect[1 + rank * 10: 1 + (rank + 1) * 10] = rank self.assertTrue(torch.equal(concatenated_tensor, expect)) @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_gather_all_same_size(self): - # TODO: implement test ts = torch.full((10, 5), DistributedHelper.rank, dtype=torch.long) device = DistributedHelper.make_device() if device.type == 'cuda': - # Additional test: the tensor do not need to be on the default device + # Additional test: tensors do not need to be on the default device DistributedHelper.gather_all(ts) - # On the other hand, PyTorch all_gather requires tensors to be on the default device + # On the other hand, PyTorch all_gather requires tensors to be on + # the default device with self.assertRaises(Exception): - out_t = [torch.empty_like(ts) for _ in range(DistributedHelper.world_size)] + out_t = [torch.empty_like(ts) + for _ in range(DistributedHelper.world_size)] torch.distributed.all_gather(out_t, ts) # ... while this should work - out_t = [torch.empty_like(ts).to(device) for _ in range(DistributedHelper.world_size)] + out_t = [torch.empty_like(ts).to(device) + for _ in range(DistributedHelper.world_size)] torch.distributed.all_gather(out_t, ts.to(device)) ts = ts.to(device) - for different_shape0, different_shape1_n in itertools.product([None, False], [None, False]): - with self.subTest(different_shape0=different_shape0, different_shape1_n=different_shape1_n): - tensor_list = DistributedHelper.gather_all(ts, different_shape0=different_shape0, different_shape1_n=different_shape1_n) + for same_shape in [False, True]: + print(f'same_shape={same_shape}') + # with self.subTest(same_shape=same_shape): + tensor_list = DistributedHelper.gather_all( + ts, same_shape=same_shape) - self.assertEqual(DistributedHelper.world_size, len(tensor_list)) + self.assertEqual(DistributedHelper.world_size, len(tensor_list)) - for t in tensor_list: - self.assertEqual(device, t.device) + for t in tensor_list: + self.assertEqual(device, t.device) - for rank in range(DistributedHelper.world_size): - expect = torch.full((10, 5), rank, dtype=torch.long).to(device) - self.assertTrue(torch.equal(tensor_list[rank], expect)) + for rank in range(DistributedHelper.world_size): + expect = torch.full((10, 5), rank, dtype=torch.long).to(device) + self.assertTrue(torch.equal(tensor_list[rank], expect)) @unittest.skipIf(check_skip_distributed_slow_test(), 'Distributed tests ignored') def test_gather_all_performance_known_same_shape(self): - ts = torch.full((128, 224, 224, 3), DistributedHelper.rank, dtype=torch.float32) + ts = torch.full((128, 224, 224, 3), + DistributedHelper.rank, + dtype=torch.float32) device = DistributedHelper.make_device() ts = ts.to(device) - resulting_tensors = [torch.empty_like(ts).to(device) for _ in range(DistributedHelper.world_size)] + resulting_tensors = [torch.empty_like(ts).to(device) + for _ in range(DistributedHelper.world_size)] from tqdm import tqdm n_times = 30 @@ -229,7 +243,8 @@ def test_gather_all_performance_known_same_shape(self): for _ in tqdm(range(n_times)): torch.distributed.all_gather(resulting_tensors, ts) 
end_time = time.time() - print('Time taken by PyTorch all_gather', end_time-start_time, 'avg', (end_time-start_time) / n_times) + print('Time taken by PyTorch all_gather', end_time-start_time, + 'avg', (end_time-start_time) / n_times) start_time = time.time() out_list = [None for _ in range(DistributedHelper.world_size)] @@ -238,47 +253,105 @@ def test_gather_all_performance_known_same_shape(self): for _ in tqdm(range(n_times)): torch.distributed.all_gather_object(out_list, ts) end_time = time.time() - print('Time taken by PyTorch all_gather_object', end_time-start_time, 'avg', (end_time-start_time) / n_times) + print('Time taken by PyTorch all_gather_object', end_time-start_time, + 'avg', (end_time-start_time) / n_times) + + @unittest.skipIf(check_skip_distributed_slow_test(), + 'Distributed tests ignored') + def test_gather_all_performance_sync_shape(self): + max_shape_size = 10 + shape = [128, 6, DistributedHelper.rank+1] + \ + ([3] * DistributedHelper.rank) + device = DistributedHelper.make_device() + + def shape_all_gather(): + ts = torch.zeros((max_shape_size,), dtype=torch.int64) + for i in range(len(shape)): + ts[i] = shape[i] + + ts = ts.to(device) + all_tensors_shape = [torch.empty_like(ts) + for _ in range(DistributedHelper.world_size)] + torch.distributed.all_gather(all_tensors_shape, ts) + all_tensors_shape = [t.cpu() for t in all_tensors_shape] + + for i, t in enumerate(all_tensors_shape): + for x in range(len(t)): + if t[x] == 0: + if x == 0: + # Tensor with 0-length shape + all_tensors_shape[i] = t[:x+1] + else: + all_tensors_shape[i] = t[:x] + break + + def shape_all_gather_objects(): + out_list = [None for _ in range(DistributedHelper.world_size)] + torch.distributed.all_gather_object(out_list, shape) + + from tqdm import tqdm + n_times = 1000 + shape_all_gather() + start_time = time.time() + for _ in tqdm(range(n_times)): + shape_all_gather() + end_time = time.time() + print('Time taken by PyTorch all_gather', end_time-start_time, + 'avg', (end_time-start_time) / n_times) + + start_time = time.time() + shape_all_gather_objects() + + for _ in tqdm(range(n_times)): + shape_all_gather_objects() + end_time = time.time() + print('Time taken by PyTorch all_gather_object', end_time-start_time, + 'avg', (end_time-start_time) / n_times) + @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_gather_all_same_dim0(self): - ts = torch.full((10, DistributedHelper.rank+1), DistributedHelper.rank, dtype=torch.long) + ts = torch.full((10, DistributedHelper.rank+1), + DistributedHelper.rank, + dtype=torch.long) device = DistributedHelper.make_device() ts = ts.to(device) - for different_shape0, different_shape1_n in itertools.product([None, False], [None, True]): - with self.subTest(different_shape0=different_shape0, different_shape1_n=different_shape1_n): - tensor_list = DistributedHelper.gather_all(ts, different_shape0=different_shape0, different_shape1_n=different_shape1_n) - self.assertEqual(DistributedHelper.world_size, len(tensor_list)) + tensor_list = DistributedHelper.gather_all(ts) + self.assertEqual(DistributedHelper.world_size, len(tensor_list)) - for t in tensor_list: - self.assertEqual(device, t.device) + for t in tensor_list: + self.assertEqual(device, t.device) - for rank in range(DistributedHelper.world_size): - expect = torch.full((10, rank+1), rank, dtype=torch.long).to(device) - self.assertTrue(torch.equal(tensor_list[rank], expect)) + for rank in range(DistributedHelper.world_size): + expect = torch.full((10, rank+1), + rank, + 
dtype=torch.long).to(device) + self.assertTrue(torch.equal(tensor_list[rank], expect)) @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') def test_gather_all_same_dim1_n(self): - ts = torch.full((10+DistributedHelper.rank, 5), DistributedHelper.rank, dtype=torch.long) + ts = torch.full((10+DistributedHelper.rank, 5), + DistributedHelper.rank, + dtype=torch.long) device = DistributedHelper.make_device() ts = ts.to(device) - for different_shape0, different_shape1_n in itertools.product([None, True], [None, False]): - with self.subTest(different_shape0=different_shape0, different_shape1_n=different_shape1_n): - tensor_list = DistributedHelper.gather_all(ts, different_shape0=different_shape0, different_shape1_n=different_shape1_n) - self.assertEqual(DistributedHelper.world_size, len(tensor_list)) + tensor_list = DistributedHelper.gather_all(ts) + self.assertEqual(DistributedHelper.world_size, len(tensor_list)) - for t in tensor_list: - self.assertEqual(device, t.device) + for t in tensor_list: + self.assertEqual(device, t.device) - for rank in range(DistributedHelper.world_size): - expect = torch.full((10+DistributedHelper.rank, 5), rank, dtype=torch.long).to(device) - self.assertTrue(torch.equal(tensor_list[rank], expect)) + for rank in range(DistributedHelper.world_size): + expect = torch.full((10+rank, 5), + rank, + dtype=torch.long).to(device) + self.assertTrue(torch.equal(tensor_list[rank], expect)) @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') @@ -288,17 +361,20 @@ def test_gather_all_zero_shaped(self): ts = ts.to(device) - for different_shape0, different_shape1_n in itertools.product([None, False, True], [None, False, True]): - with self.subTest(different_shape0=different_shape0, different_shape1_n=different_shape1_n): - tensor_list = DistributedHelper.gather_all(ts, different_shape0=different_shape0, different_shape1_n=different_shape1_n) - self.assertEqual(DistributedHelper.world_size, len(tensor_list)) + for same_shape in [False, True]: + print(f'same_shape={same_shape}') + # with self.subTest(same_shape=same_shape): + tensor_list = DistributedHelper.gather_all( + ts, + same_shape=same_shape) + self.assertEqual(DistributedHelper.world_size, len(tensor_list)) - for t in tensor_list: - self.assertEqual(device, t.device) + for t in tensor_list: + self.assertEqual(device, t.device) - for rank in range(DistributedHelper.world_size): - expect = torch.full(tuple(), rank, dtype=torch.long).to(device) - self.assertTrue(torch.equal(tensor_list[rank], expect)) + for rank in range(DistributedHelper.world_size): + expect = torch.full(tuple(), rank, dtype=torch.long).to(device) + self.assertTrue(torch.equal(tensor_list[rank], expect)) @unittest.skipIf(check_skip_distributed_test(), 'Distributed tests ignored') @@ -386,12 +462,9 @@ def test_main_process_first(self): tmpdirname = DistributedHelper.broadcast_object(tmpdirname) - #print('Entering exclusive section', my_rank) with DistributedHelper.main_process_first(): - #print('Entered exclusive section', my_rank) for _ in range(2): - #print('Checking files before', my_rank) time.sleep(0.1 + my_rank * 0.05) files = list(os.listdir(tmpdirname)) if DistributedHelper.is_main_process: @@ -400,12 +473,11 @@ def test_main_process_first(self): self.assertIn(f'rank0', files) self.assertNotIn(f'rank{my_rank}', files) - #print('Writing my file', my_rank) - with open(os.path.join(tmpdirname, f'rank{my_rank}'), 'w') as f: + with open(os.path.join(tmpdirname, f'rank{my_rank}'), 'w') \ + as f: f.write('ok') 
for _ in range(2): - #print('Checking files after', my_rank) time.sleep(0.1 + my_rank * 0.05) files = list(os.listdir(tmpdirname)) if DistributedHelper.is_main_process: @@ -414,11 +486,11 @@ def test_main_process_first(self): else: self.assertIn(f'rank0', files) self.assertIn(f'rank{my_rank}', files) - #print('Exiting exclusive section', my_rank) DistributedHelper.barrier() files = set(os.listdir(tmpdirname)) - expect = set([f'rank{rnk}' for rnk in range(DistributedHelper.world_size)]) + expect = set([f'rank{rnk}' + for rnk in range(DistributedHelper.world_size)]) self.assertSetEqual(expect, files) DistributedHelper.barrier() finally: diff --git a/tests/distributed/test_distributed_strategy_support.py b/tests/distributed/test_distributed_strategy_support.py index aee8e9836..45d7e67f1 100644 --- a/tests/distributed/test_distributed_strategy_support.py +++ b/tests/distributed/test_distributed_strategy_support.py @@ -163,6 +163,8 @@ def test_naive_classification_dst(self): class IterationCheckerPlugin(SupervisedPlugin): + supports_distributed = True + def __init__(self, test_suite): super().__init__() self.test_suite = test_suite diff --git a/tests/run_dist_tests.py b/tests/run_dist_tests.py index bd7a769b3..207c0c371 100644 --- a/tests/run_dist_tests.py +++ b/tests/run_dist_tests.py @@ -61,7 +61,7 @@ def run_distributed_suites(test_cases): nproc_per_node = torch.cuda.device_count() else: print('Running tests using CPU only') - nproc_per_node = 4 + nproc_per_node = 2 for case_name in cases_names: if exited: diff --git a/tests/test_avalanche_classification_dataset.py b/tests/test_avalanche_classification_dataset.py index 9dd5c972f..13bd81ec6 100644 --- a/tests/test_avalanche_classification_dataset.py +++ b/tests/test_avalanche_classification_dataset.py @@ -1713,7 +1713,7 @@ def test_replace_transforms(self): dataset_other = make_classification_dataset(dataset_reset) dataset_other = dataset_other.replace_current_transform_group( - (None, lambda l: l + 1) + (None, lambda val: val + 1) ) _, y6, _ = dataset_other[0]