diff --git a/avalanche/benchmarks/classic/cmnist.py b/avalanche/benchmarks/classic/cmnist.py index db6ef1297..107ad84b9 100644 --- a/avalanche/benchmarks/classic/cmnist.py +++ b/avalanche/benchmarks/classic/cmnist.py @@ -33,9 +33,8 @@ ) from avalanche.benchmarks.utils.data import make_avalanche_dataset -_default_mnist_train_transform = Compose([Normalize((0.1307,), (0.3081,))]) - -_default_mnist_eval_transform = Compose([Normalize((0.1307,), (0.3081,))]) +default_mnist_train_transform = Compose([Normalize((0.1307,), (0.3081,))]) +default_mnist_eval_transform = Compose([Normalize((0.1307,), (0.3081,))]) class PixelsPermutation(object): @@ -83,8 +82,8 @@ def SplitMNIST( shuffle: bool = True, class_ids_from_zero_in_each_exp: bool = False, class_ids_from_zero_from_first_exp: bool = False, - train_transform: Optional[Any] = _default_mnist_train_transform, - eval_transform: Optional[Any] = _default_mnist_eval_transform, + train_transform: Optional[Any] = default_mnist_train_transform, + eval_transform: Optional[Any] = default_mnist_eval_transform, dataset_root: Optional[Union[str, Path]] = None ): """ @@ -170,8 +169,8 @@ def PermutedMNIST( *, return_task_id=False, seed: Optional[int] = None, - train_transform: Optional[Any] = _default_mnist_train_transform, - eval_transform: Optional[Any] = _default_mnist_eval_transform, + train_transform: Optional[Any] = default_mnist_train_transform, + eval_transform: Optional[Any] = default_mnist_eval_transform, dataset_root: Optional[Union[str, Path]] = None ) -> NCScenario: """ @@ -268,8 +267,8 @@ def RotatedMNIST( return_task_id: bool = False, seed: Optional[int] = None, rotations_list: Optional[Sequence[int]] = None, - train_transform: Optional[Any] = _default_mnist_train_transform, - eval_transform: Optional[Any] = _default_mnist_eval_transform, + train_transform: Optional[Any] = default_mnist_train_transform, + eval_transform: Optional[Any] = default_mnist_eval_transform, dataset_root: Optional[Union[str, Path]] = None ) -> NCScenario: """Creates a Rotated MNIST benchmark. @@ -379,7 +378,13 @@ def RotatedMNIST( ) -__all__ = ["SplitMNIST", "PermutedMNIST", "RotatedMNIST"] +__all__ = [ + "SplitMNIST", + "PermutedMNIST", + "RotatedMNIST", + "default_mnist_train_transform", + "default_mnist_eval_transform", +] if __name__ == "__main__": diff --git a/avalanche/benchmarks/datasets/dataset_utils.py b/avalanche/benchmarks/datasets/dataset_utils.py index 575093176..e26c0af7e 100644 --- a/avalanche/benchmarks/datasets/dataset_utils.py +++ b/avalanche/benchmarks/datasets/dataset_utils.py @@ -48,7 +48,7 @@ def load_config_file(): def maybe_init_config_file(): - """Initialize Avalanche user's config file, if it does not exists yet. + """Initialize Avalanche user's config file, if it does not exist yet. 
The file is located in `~/.avalanche/config.json` """ diff --git a/avalanche/benchmarks/scenarios/__init__.py b/avalanche/benchmarks/scenarios/__init__.py index a3ecad792..1e72800bd 100644 --- a/avalanche/benchmarks/scenarios/__init__.py +++ b/avalanche/benchmarks/scenarios/__init__.py @@ -1,6 +1,7 @@ from .generic_scenario import * from .deprecated.dataset_scenario import * from .deprecated.classification_scenario import * +from .deprecated.generic_benchmark_creation import * from .deprecated.new_classes import * from .deprecated.new_instances import * diff --git a/avalanche/benchmarks/scenarios/dataset_scenario.py b/avalanche/benchmarks/scenarios/dataset_scenario.py index d00f2ac04..e7ad6a897 100644 --- a/avalanche/benchmarks/scenarios/dataset_scenario.py +++ b/avalanche/benchmarks/scenarios/dataset_scenario.py @@ -73,13 +73,12 @@ def __init__( self, *, dataset: TCLDataset, current_experience: Optional[int] = None ): super().__init__(current_experience=current_experience, origin_stream=None) - self._dataset: AvalancheDataset = dataset + self._dataset = dataset @property - def dataset(self) -> AvalancheDataset: + def dataset(self) -> TCLDataset: # dataset is a read-only property - data = self._dataset - return data + return self._dataset def _split_dataset_by_attribute( @@ -101,9 +100,9 @@ def _split_dataset_by_attribute( def split_validation_random( validation_size: Union[int, float], shuffle: bool, + dataset: TCLDataset, seed: Optional[int] = None, - dataset: Optional[AvalancheDataset] = None, -) -> Tuple[AvalancheDataset, AvalancheDataset]: +) -> Tuple[TCLDataset, TCLDataset]: """Splits an `AvalancheDataset` in two splits. The default splitting strategy used by @@ -119,7 +118,7 @@ def split_validation_random( a single parameter: the experience. Consider wrapping your custom splitting strategy with `partial` if more parameters are needed. - You can use this split strategy with methdos that require a custom + You can use this split strategy with methods that require a custom split strategy such as :func:`benchmark_with_validation_stream`to split a benchmark with:: @@ -133,11 +132,10 @@ def split_validation_random( Otherwise, the first instances will be allocated to the training dataset by leaving the last ones to the validation dataset. :param dataset: The dataset to split. + :param seed: The random seed for shuffling the dataset. :return: A tuple containing 2 elements: the new training and validation datasets. 
""" - if dataset is None: - raise ValueError("dataset must be provided") exp_indices = list(range(len(dataset))) if seed is None: diff --git a/avalanche/benchmarks/scenarios/deprecated/generators.py b/avalanche/benchmarks/scenarios/deprecated/generators.py index cb2fdf630..e411cb3e7 100644 --- a/avalanche/benchmarks/scenarios/deprecated/generators.py +++ b/avalanche/benchmarks/scenarios/deprecated/generators.py @@ -72,6 +72,7 @@ TCLDataset = TypeVar("TCLDataset", bound="AvalancheDataset") +# TODO: Nomenclature: experience vs task def nc_benchmark( train_dataset: Union[Sequence[SupportedDataset], SupportedDataset], test_dataset: Union[Sequence[SupportedDataset], SupportedDataset], diff --git a/avalanche/benchmarks/scenarios/deprecated/generic_benchmark_creation.py b/avalanche/benchmarks/scenarios/deprecated/generic_benchmark_creation.py index 351d7b6f3..1aed1bae3 100644 --- a/avalanche/benchmarks/scenarios/deprecated/generic_benchmark_creation.py +++ b/avalanche/benchmarks/scenarios/deprecated/generic_benchmark_creation.py @@ -31,7 +31,7 @@ from avalanche.benchmarks.utils.classification_dataset import ( _make_taskaware_tensor_classification_dataset, - _make_taskaware_classification_dataset, + _make_taskaware_classification_dataset, ClassificationDataset, ) from avalanche.benchmarks.utils import ( @@ -197,7 +197,7 @@ class LazyStreamDefinition(NamedTuple): This class is a named tuple containing the fields required for defining a lazily-created benchmark. - - exps_generator: The experiences generator. Can be a "yield"-based + - exps_generator: The experience's generator. Can be a "yield"-based generator, a custom sequence, a standard list or any kind of iterable returning :class:`AvalancheDataset`. - stream_length: The number of experiences in the stream. Must match the @@ -207,7 +207,7 @@ class LazyStreamDefinition(NamedTuple): can be used. """ - exps_generator: Iterable[TaskAwareClassificationDataset] + exps_generator: Iterable[ClassificationDataset] """ The experiences generator. Can be a "yield"-based generator, a custom sequence, a standard list or any kind of iterable returning diff --git a/avalanche/benchmarks/scenarios/deprecated/new_classes/nc_scenario.py b/avalanche/benchmarks/scenarios/deprecated/new_classes/nc_scenario.py index a5509b18d..4430f0ea4 100644 --- a/avalanche/benchmarks/scenarios/deprecated/new_classes/nc_scenario.py +++ b/avalanche/benchmarks/scenarios/deprecated/new_classes/nc_scenario.py @@ -556,7 +556,7 @@ class NCExperience(ClassificationExperience[TaskAwareSupervisedClassificationDat def __init__(self, origin_stream: NCStream, current_experience: int): """ Creates a ``NCExperience`` instance given the stream from this - experience was taken and and the current experience ID. + experience was taken and the current experience ID. :param origin_stream: The stream from which this experience was obtained. diff --git a/avalanche/benchmarks/scenarios/generic_scenario.py b/avalanche/benchmarks/scenarios/generic_scenario.py index 34da0d249..c6ba25225 100644 --- a/avalanche/benchmarks/scenarios/generic_scenario.py +++ b/avalanche/benchmarks/scenarios/generic_scenario.py @@ -36,7 +36,6 @@ slice_alike_object_to_indices, ) - # Typing T = TypeVar("T") TCov = TypeVar("TCov", covariant=True) @@ -275,6 +274,7 @@ def _check_unset_attribute(attribute_name: str, attribute_value: Any): ) +# TODO: itertools.cycle? 
class GeneratorMemo(Generic[T]): def __init__(self, generator: Generator[T, None, None]): self._generator: Optional[Generator[T, None, None]] = generator @@ -386,7 +386,7 @@ def __len__(self) -> int: :return: The number of experiences in this stream. """ - pass + ... class SequenceCLStream(SizedCLStream[TCLExperience], Sequence[TCLExperience], ABC): @@ -590,6 +590,13 @@ class CLScenario(Generic[TCLStream]): provide access to past, current, and future data. """ + # Define usual empty streams for typing + # TODO: If regarded unnecessary, the constructor magic should be removed + # and `scenario.streams['train']` yields the correct type + train_stream = CLStream('train', []) + test_stream = CLStream('test', []) + valid_stream = CLStream('valid', []) + def __init__(self, streams: Iterable[TCLStream]): """Creates an instance of a Continual Learning benchmark. @@ -603,7 +610,7 @@ def __init__(self, streams: Iterable[TCLStream]): @property def streams(self): - # we don't want in-place modifications so we return a copy + # we don't want in-place modifications, so we return a copy return copy(self._streams) @@ -612,7 +619,7 @@ def make_stream(name: str, exps: Iterable[CLExperience]) -> CLStream: Uses the correct class for generators, sized generators, and lists. - :param new_name: The name of the new stream. + :param name: The name of the new stream. :param exps: sequence of experiences. """ s_wrapped: CLStream diff --git a/avalanche/benchmarks/scenarios/online.py b/avalanche/benchmarks/scenarios/online.py index 1e51dfadc..4faafac7a 100644 --- a/avalanche/benchmarks/scenarios/online.py +++ b/avalanche/benchmarks/scenarios/online.py @@ -46,6 +46,7 @@ class CyclicSampler(Sampler): """Samples elements from [0,..,len(dataset)-1] in a cyclic manner.""" def __init__(self, n_samples, shuffle=True, rng=None): + super().__init__() self.n_samples = n_samples self.rng = rng self.shuffle = shuffle diff --git a/avalanche/benchmarks/scenarios/supervised.py b/avalanche/benchmarks/scenarios/supervised.py index c9a5f36ce..b01dc6a1f 100644 --- a/avalanche/benchmarks/scenarios/supervised.py +++ b/avalanche/benchmarks/scenarios/supervised.py @@ -11,6 +11,7 @@ """High-level benchmark generators for supervised scenarios such as class-incremental.""" import warnings +from collections.abc import Collection from copy import copy from typing import ( Iterable, @@ -22,6 +23,7 @@ ) import torch +from typing_extensions import Self from avalanche.benchmarks.utils.classification_dataset import ( ClassificationDataset, @@ -30,7 +32,7 @@ from avalanche.benchmarks.utils.data import AvalancheDataset from avalanche.benchmarks.utils.data_attribute import DataAttribute from .dataset_scenario import _split_dataset_by_attribute, DatasetExperience -from .generic_scenario import CLScenario, CLStream, EagerCLStream +from .generic_scenario import CLScenario, CLStream, EagerCLStream, CLExperience def class_incremental_benchmark( @@ -40,7 +42,7 @@ def class_incremental_benchmark( num_experiences: Optional[int] = None, num_classes_per_exp: Optional[Sequence[int]] = None, seed: Optional[int] = None, -) -> CLScenario: +) -> CLScenario[EagerCLStream[DatasetExperience[ClassificationDataset]]]: """Splits datasets according to a class-incremental scenario. Each dataset will create a stream with the same class order. 
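Note for reviewers: before the next hunk, a minimal usage sketch of `class_incremental_benchmark` as touched above. The dict-of-streams argument and the `num_experiences` / `num_classes_per_exp` alternatives follow the code in this diff; the import path and the MNIST wrapping are assumptions for illustration, not part of this patch.

```python
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor

from avalanche.benchmarks.scenarios.supervised import class_incremental_benchmark
from avalanche.benchmarks.utils import as_classification_dataset

# Wrap plain torchvision datasets (same pattern used in the benchmarks tutorial below).
train_data = as_classification_dataset(
    MNIST("./data", train=True, download=True, transform=ToTensor())
)
test_data = as_classification_dataset(
    MNIST("./data", train=False, download=True, transform=ToTensor())
)

# Split the 10 digit classes evenly into 5 experiences of 2 classes each...
bm = class_incremental_benchmark(
    {"train": train_data, "test": test_data}, num_experiences=5, seed=1234
)

# ...or give an explicit per-experience class count (here 2 + 2 + 6 = 10 classes).
bm_uneven = class_incremental_benchmark(
    {"train": train_data, "test": test_data}, num_classes_per_exp=[2, 2, 6]
)
```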
@@ -103,14 +105,14 @@ def class_incremental_benchmark( classes_exp_assignment = [] if num_experiences is not None: assert num_classes_per_exp is None, "BUG: num_classes_per_exp must be None" - curr_classess_per_exp: int = num_classes // num_experiences + curr_classes_per_exp: int = num_classes // num_experiences for eid in range(num_experiences): if eid == 0: - classes_exp_assignment.append(class_order[:curr_classess_per_exp]) + classes_exp_assignment.append(class_order[:curr_classes_per_exp]) else: # final exp will take reminder of classes if they don't divide equally - start_idx = curr_classess_per_exp * eid - end_idx = start_idx + curr_classess_per_exp + start_idx = curr_classes_per_exp * eid + end_idx = start_idx + curr_classes_per_exp classes_exp_assignment.append(class_order[start_idx:end_idx]) elif num_classes_per_exp is not None: num_curr = 0 @@ -120,7 +122,7 @@ def class_incremental_benchmark( num_curr += num_classes # create the streams using class_order to split the data - streams = [] + streams: List[EagerCLStream[DatasetExperience[ClassificationDataset]]] = [] for name, dd in datasets_dict.items(): curr_stream = [] data_by_class = _split_dataset_by_attribute(dd, "targets") @@ -339,12 +341,6 @@ def new_instances_benchmark( return CLScenario(streams=[train_stream, test_stream]) -__all__ = [ - "class_incremental_benchmark", - "new_instances_benchmark", -] - - class ClassesTimeline(Protocol): """Experience decorator that provides info about classes occurrence over time.""" @@ -381,7 +377,7 @@ def _decorate_benchmark(obj: CLScenario): new_streams.append(_decorate_stream(s)) return CLScenario(new_streams) - def _decorate_stream(obj: CLStream): + def _decorate_stream(obj: CLStream[DatasetExperience[ClassificationDataset]]): # TODO: support stream generators. Should return a new generators which applies # foo_decorate_exp every time a new experience is generated. new_stream = [] diff --git a/avalanche/benchmarks/scenarios/task_aware.py b/avalanche/benchmarks/scenarios/task_aware.py index cd3d07508..824e4fdd1 100644 --- a/avalanche/benchmarks/scenarios/task_aware.py +++ b/avalanche/benchmarks/scenarios/task_aware.py @@ -124,7 +124,7 @@ def task_incremental_benchmark(bm: CLScenario, reset_task_labels=False) -> CLSce with_task_labels(benchmark_from_datasets(**dataset_streams) - :param **dataset_streams: keys are stream names, values are list of datasets. + :param dataset_streams: keys are stream names, values are list of datasets. :param reset_task_labels: whether existing task labels should be ignored. If False (default) if any dataset has task labels the function will raise a ValueError. If `True`, it will reset task labels. 
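Note for reviewers: a short sketch of the `with_task_labels(benchmark_from_datasets(...))` pattern named in the corrected docstring above. The per-experience datasets are placeholders and the import locations are assumed; only the call shape comes from the docstring itself.

```python
from avalanche.benchmarks import benchmark_from_datasets
from avalanche.benchmarks.scenarios.task_aware import with_task_labels

# Keys are stream names, values are lists of datasets (one per experience).
# `train_exp0`, `train_exp1` and `test_exp0` are placeholder AvalancheDatasets.
dataset_streams = {"train": [train_exp0, train_exp1], "test": [test_exp0]}

bm = with_task_labels(benchmark_from_datasets(**dataset_streams))
for experience in bm.train_stream:
    print(experience.task_labels)  # task labels assigned to each experience
```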
diff --git a/avalanche/benchmarks/scenarios/validation_scenario.py b/avalanche/benchmarks/scenarios/validation_scenario.py index d3ad9e4ce..a80f5f146 100644 --- a/avalanche/benchmarks/scenarios/validation_scenario.py +++ b/avalanche/benchmarks/scenarios/validation_scenario.py @@ -70,7 +70,7 @@ def benchmark_with_validation_stream( # functools.partial is a more compact option # However, MyPy does not understand what a partial is -_- def random_validation_split_strategy_wrapper(data): - return split_validation_random(validation_size, shuffle, seed, data) + return split_validation_random(validation_size, shuffle, data, seed) split_strategy = random_validation_split_strategy_wrapper else: diff --git a/avalanche/benchmarks/utils/classification_dataset.py b/avalanche/benchmarks/utils/classification_dataset.py index ad6db47f6..c781da8ca 100644 --- a/avalanche/benchmarks/utils/classification_dataset.py +++ b/avalanche/benchmarks/utils/classification_dataset.py @@ -16,40 +16,7 @@ labels automatically. Concatenation and subsampling operations are optimized to be used frequently, as is common in replay strategies. """ - from functools import partial -import torch -from torch.utils.data.dataset import Subset, ConcatDataset, TensorDataset - -from avalanche.benchmarks.utils.utils import ( - TaskSet, - _count_unique, - find_common_transforms_group, - _init_task_labels, - _init_transform_groups, - _split_user_def_targets, - _split_user_def_task_label, - _traverse_supported_dataset, -) - -from avalanche.benchmarks.utils.data import AvalancheDataset -from avalanche.benchmarks.utils.transform_groups import ( - TransformGroupDef, - DefaultTransformGroups, - XTransform, - YTransform, -) -from avalanche.benchmarks.utils.data_attribute import DataAttribute -from avalanche.benchmarks.utils.dataset_utils import ( - SubSequence, -) -from avalanche.benchmarks.utils.flat_data import ConstantSequence -from avalanche.benchmarks.utils.dataset_definitions import ( - ISupportedClassificationDataset, - ITensorDataset, - IDatasetWithTargets, -) - from typing import ( List, Any, @@ -61,16 +28,46 @@ Dict, Tuple, Mapping, - overload, + overload, Self, ) +import torch +from torch.utils.data.dataset import Subset, ConcatDataset, TensorDataset + +from avalanche.benchmarks.utils.data import AvalancheDataset +from avalanche.benchmarks.utils.data_attribute import DataAttribute +from avalanche.benchmarks.utils.dataset_definitions import ( + ISupportedClassificationDataset, + ITensorDataset, + IDatasetWithTargets, +) +from avalanche.benchmarks.utils.dataset_utils import ( + SubSequence, +) +from avalanche.benchmarks.utils.flat_data import ConstantSequence +from avalanche.benchmarks.utils.transform_groups import ( + TransformGroupDef, + DefaultTransformGroups, + XTransform, + YTransform, +) +from avalanche.benchmarks.utils.utils import ( + TaskSet, + _count_unique, + find_common_transforms_group, + _init_task_labels, + _init_transform_groups, + _split_user_def_targets, + _split_user_def_task_label, + _traverse_supported_dataset, +) T_co = TypeVar("T_co", covariant=True) -TAvalancheDataset = TypeVar("TAvalancheDataset", bound="AvalancheDataset") +TAvalancheDataset = TypeVar("TAvalancheDataset", bound=AvalancheDataset) TTargetType = int TClassificationDataset = TypeVar( - "TClassificationDataset", bound="ClassificationDataset" + "TClassificationDataset", bound=IDatasetWithTargets ) @@ -116,8 +113,8 @@ def task_pattern_indices(self) -> Dict[int, Sequence[int]]: return self.targets_task_labels.val_to_idx # type: ignore @property - def 
task_set(self: TClassificationDataset) -> TaskSet[TClassificationDataset]: - """Returns the datasets's ``TaskSet``, which is a mapping TaskSet[Self]: + """Returns the dataset's ``TaskSet``, which is a mapping .""" return TaskSet(self) @@ -226,7 +223,7 @@ def _make_taskaware_classification_dataset( slicing and advanced indexing and it also contains useful fields as `targets`, which contains the pattern labels, and `targets_task_labels`, which contains the pattern task labels. The `task_set` field can be used to - obtain a the subset of patterns labeled with a given task label. + obtain a subset of patterns labeled with a given task label. This dataset can also be used to apply several advanced operations involving transformations. For instance, it allows the user to add and replace @@ -297,11 +294,11 @@ def _make_taskaware_classification_dataset( is_supervised = isinstance(dataset, TaskAwareSupervisedClassificationDataset) transform_gs = _init_transform_groups( - transform_groups, - transform, - target_transform, - initial_transform_group, - dataset, + transform_groups=transform_groups, + transform=transform, + target_transform=target_transform, + initial_transform_group=initial_transform_group, + dataset=dataset, ) targets_data: Optional[DataAttribute[TTargetType]] = _init_targets(dataset, targets) task_labels_data: Optional[DataAttribute[int]] = _init_task_labels( @@ -522,11 +519,11 @@ def _taskaware_classification_subset( ) transform_gs = _init_transform_groups( - transform_groups, - transform, - target_transform, - initial_transform_group, - dataset, + transform_groups=transform_groups, + transform=transform, + target_transform=target_transform, + initial_transform_group=initial_transform_group, + dataset=dataset, ) if initial_transform_group is not None and isinstance(dataset, AvalancheDataset): @@ -696,11 +693,11 @@ def _make_taskaware_tensor_classification_dataset( dataset = _TensorClassificationDataset(*tts) transform_gs = _init_transform_groups( - transform_groups, - transform, - target_transform, - initial_transform_group, - dataset, + transform_groups=transform_groups, + transform=transform, + target_transform=target_transform, + initial_transform_group=initial_transform_group, + dataset=dataset, ) targets_data = _init_targets(dataset, targets) task_labels_data = _init_task_labels(dataset, task_labels) @@ -896,12 +893,13 @@ def _concat_taskaware_classification_datasets( dds.append(dd) if len(dds) > 0: + dataset = dds[0] transform_groups_obj = _init_transform_groups( - transform_groups, - transform, - target_transform, - initial_transform_group, - dds[0], + transform_groups=transform_groups, + transform=transform, + target_transform=target_transform, + initial_transform_group=initial_transform_group, + dataset=dataset, ) else: transform_groups_obj = None @@ -1116,6 +1114,7 @@ def _as_taskaware_supervised_classification_dataset( __all__ = [ + "ClassificationDataset", "SupportedDataset", "TaskAwareClassificationDataset", "TaskAwareSupervisedClassificationDataset", diff --git a/avalanche/benchmarks/utils/data.py b/avalanche/benchmarks/utils/data.py index 9985e19fa..e2861e264 100644 --- a/avalanche/benchmarks/utils/data.py +++ b/avalanche/benchmarks/utils/data.py @@ -17,13 +17,7 @@ """ import copy import warnings -import numpy as np - -from torch.utils.data.dataloader import default_collate - -from avalanche.benchmarks.utils.dataset_definitions import IDataset -from .data_attribute import DataAttribute - +from collections import OrderedDict from typing import ( Dict, List, @@ -36,21 
+30,24 @@ overload, ) -from .flat_data import FlatData -from .transform_groups import TransformGroups, EmptyTransformGroups +import numpy as np from torch.utils.data import Dataset as TorchDataset -from collections import OrderedDict +from torch.utils.data.dataloader import default_collate +from typing_extensions import Self +from avalanche.benchmarks.utils.dataset_definitions import IDataset +from .data_attribute import DataAttribute +from .flat_data import FlatData +from .transform_groups import TransformGroups, EmptyTransformGroups T_co = TypeVar("T_co", covariant=True) -TAvalancheDataset = TypeVar("TAvalancheDataset", bound="AvalancheDataset") TDataWTransform = TypeVar("TDataWTransform", bound="_FlatDataWithTransform") class AvalancheDataset(IDataset[T_co]): """Avalanche Dataset. - Avlanche dataset are pytorch-compatible Datasets with some additional + Avalanche dataset are pytorch-compatible Datasets with some additional functionality such as: - management of transformation groups via :class:`AvalancheTransform` - support for sample attributes such as class targets and task labels @@ -79,16 +76,20 @@ class AvalancheDataset(IDataset[T_co]): switching to a different transformation group by calling the ``train()``, ``eval()`` or ``with_transforms` methods always returns a new dataset, - levaing the original one unchanged. + leaving the original one unchanged. - Ttransformation groups can be manipulated by removing, freezing, or + Transformation groups can be manipulated by removing, freezing, or replacing transformations. Each operation returns a new dataset, leaving the original one unchanged. """ def __init__( self, - datasets: Sequence[IDataset[T_co]], + datasets: Union[ + Sequence[IDataset[T_co]], + TorchDataset[T_co], + "AvalancheDataset[T_co]", + ], *, indices: Optional[List[int]] = None, data_attributes: Optional[List[DataAttribute]] = None, @@ -98,14 +99,17 @@ def __init__( ): """Creates a ``AvalancheDataset`` instance. - :param dataset: Original dataset. Beware that + :param datasets: Original dataset. Beware that AvalancheDataset will not overwrite transformations already applied by this dataset. :param transform_groups: Avalanche transform groups. """ - if issubclass(type(datasets), TorchDataset) or issubclass( - type(datasets), AvalancheDataset - ): + # TODO: Deprecate in favor of `make_avalanche_dataset`? + if isinstance(datasets, (TorchDataset, AvalancheDataset)): + warnings.warn( + f"'datasets' argument should be a list of datasets, " + f"not {type(datasets).__name__}" + ) datasets = [datasets] # type: ignore # NOTES on implementation: @@ -251,12 +255,10 @@ def __init__( def __len__(self) -> int: return len(self._flat_data) - def __add__(self: TAvalancheDataset, other: TAvalancheDataset) -> TAvalancheDataset: + def __add__(self, other: Self) -> Self: return self.concat(other) - def __radd__( - self: TAvalancheDataset, other: TAvalancheDataset - ) -> TAvalancheDataset: + def __radd__(self, other: Self) -> Self: return other.concat(self) @property @@ -264,7 +266,7 @@ def _datasets(self): """Only for backward compatibility of old unit tests. Do not use.""" return self._flat_data._datasets - def concat(self: TAvalancheDataset, other: TAvalancheDataset) -> TAvalancheDataset: + def concat(self, other: Self) -> Self: """Concatenate this dataset with other. :param other: Other dataset to concatenate. 
@@ -272,7 +274,7 @@ def concat(self: TAvalancheDataset, other: TAvalancheDataset) -> TAvalancheDatas """ return self.__class__([self, other]) - def subset(self: TAvalancheDataset, indices: Sequence[int]) -> TAvalancheDataset: + def subset(self, indices: Sequence[int]) -> Self: """Subset this dataset. :param indices: The indices to keep. @@ -288,14 +290,12 @@ def transform(self): "See the documentation for more info." ) - def update_data_attribute( - self: TAvalancheDataset, name: str, new_value - ) -> TAvalancheDataset: + def update_data_attribute(self, name: str, new_value) -> Self: """ Return a new dataset with the added or replaced data attribute. - If a object of type :class:`DataAttribute` is passed, then the data - attribute is setted as is. + If an object of type :class:`DataAttribute` is passed, then the data + attribute is set as is. Otherwise, if a raw value is passed, a new DataAttribute is created. If a DataAttribute with the same already exists, the use_in_getitem @@ -347,11 +347,9 @@ def __eq__(self, other: object): def __getitem__(self, exp_id: int) -> T_co: ... @overload - def __getitem__(self: TAvalancheDataset, exp_id: slice) -> TAvalancheDataset: ... + def __getitem__(self, exp_id: slice) -> Self: ... # type: ignore - def __getitem__( - self: TAvalancheDataset, idx: Union[int, slice] - ) -> Union[T_co, TAvalancheDataset]: + def __getitem__(self, idx: Union[int, slice]) -> Union[T_co, Self]: elem = self._flat_data[idx] for da in self._data_attributes.values(): if da.use_in_getitem: @@ -389,7 +387,7 @@ def eval(self): """ return self.with_transforms("eval") - def with_transforms(self: TAvalancheDataset, group_name: str) -> TAvalancheDataset: + def with_transforms(self, group_name: str) -> Self: """ Returns a new dataset with the transformations of a different group loaded. @@ -403,7 +401,7 @@ def with_transforms(self: TAvalancheDataset, group_name: str) -> TAvalancheDatas datacopy._flat_data = datacopy._flat_data.with_transforms(group_name) return datacopy - def freeze_transforms(self: TAvalancheDataset) -> TAvalancheDataset: + def freeze_transforms(self) -> Self: """Returns a new dataset with the transformation groups frozen.""" datacopy = self._shallow_clone_dataset() datacopy._flat_data = datacopy._flat_data.freeze_transforms() @@ -424,7 +422,7 @@ def replace_current_transform_group(self, transform): datacopy._flat_data = fdata.replace_current_transform_group(transform) return datacopy - def _shallow_clone_dataset(self: TAvalancheDataset) -> TAvalancheDataset: + def _shallow_clone_dataset(self) -> Self: """Clone dataset. This is a shallow copy, i.e. the data attributes are not copied. """ diff --git a/avalanche/benchmarks/utils/dataset_definitions.py b/avalanche/benchmarks/utils/dataset_definitions.py index de9fa05da..11187ca92 100644 --- a/avalanche/benchmarks/utils/dataset_definitions.py +++ b/avalanche/benchmarks/utils/dataset_definitions.py @@ -23,7 +23,7 @@ # # That is, accept ISupportedClassificationDataset as parameter to # functions/constructors (when possible), but always expose/return instances of -# ClassificationDataset to the, user (no matter what). The main difference is +# ClassificationDataset to the user (no matter what). The main difference is # that ClassificationDataset is a subclass of the PyTorch Dataset while # ISupportedClassificationDataset is just a Protocol. 
This will allow the user # to pass any custom dataset while receiving Dataset subclasses as outputs at @@ -111,11 +111,13 @@ class IClassificationDataset(IDatasetWithTargets[T_co, int], Protocol): protocol see :class:`ISupportedClassificationDataset`. """ - targets: Sequence[int] - """ - A sequence of ints describing the label of each pattern contained in the - dataset. - """ + @property + def targets(self) -> Sequence[int]: + """ + A sequence of ints describing the label of each pattern contained in the + dataset. + """ + ... class ClassificationDataset(IClassificationDataset[T_co], Dataset): @@ -126,12 +128,9 @@ class ClassificationDataset(IClassificationDataset[T_co], Dataset): The actual value of the targets field should be set by the child class. """ - def __init__(self): - self.targets = [] - """ - A sequence of ints describing the label of each pattern contained in the - dataset. - """ + @property + def targets(self) -> Sequence[int]: + return [] __all__ = [ diff --git a/avalanche/benchmarks/utils/detection_dataset.py b/avalanche/benchmarks/utils/detection_dataset.py index 6c5efb43f..e6d63468b 100644 --- a/avalanche/benchmarks/utils/detection_dataset.py +++ b/avalanche/benchmarks/utils/detection_dataset.py @@ -199,10 +199,10 @@ def make_detection_dataset( this dataset, but it can also be used in a completely standalone manner. This dataset applies input/target transformations, it supports - slicing and advanced indexing and it also contains useful fields as + slicing, advanced indexing, and it also contains useful fields as `targets`, which contains the pattern dictionaries, and `targets_task_labels`, which contains the pattern task labels. - The `task_set` field can be used to obtain a the subset of patterns + The `task_set` field can be used to obtain a subset of patterns labeled with a given task label. 
This dataset can also be used to apply several advanced operations involving @@ -273,11 +273,11 @@ def make_detection_dataset( is_supervised = isinstance(dataset, SupervisedDetectionDataset) transform_gs = _init_transform_groups( - transform_groups, - transform, - target_transform, - initial_transform_group, - dataset, + transform_groups=transform_groups, + transform=transform, + target_transform=target_transform, + initial_transform_group=initial_transform_group, + dataset=dataset, ) targets_data: Optional[DataAttribute[TTargetType]] = _init_targets(dataset, targets) task_labels_data: Optional[DataAttribute[int]] = _init_task_labels( @@ -509,11 +509,11 @@ def detection_subset( del targets transform_gs = _init_transform_groups( - transform_groups, - transform, - target_transform, - initial_transform_group, - dataset, + transform_groups=transform_groups, + transform=transform, + target_transform=target_transform, + initial_transform_group=initial_transform_group, + dataset=dataset, ) if initial_transform_group is not None and isinstance(dataset, AvalancheDataset): @@ -741,12 +741,13 @@ def concat_detection_datasets( ####################################### # TRANSFORMATION GROUPS ####################################### + dataset = dds[0] transform_groups_obj = _init_transform_groups( - transform_groups, - transform, - target_transform, - initial_transform_group, - dds[0], + transform_groups=transform_groups, + transform=transform, + target_transform=target_transform, + initial_transform_group=initial_transform_group, + dataset=dataset, ) # Find common "current_group" or use "train" diff --git a/avalanche/benchmarks/utils/flat_data.py b/avalanche/benchmarks/utils/flat_data.py index 02d3681cd..0d6542f90 100644 --- a/avalanche/benchmarks/utils/flat_data.py +++ b/avalanche/benchmarks/utils/flat_data.py @@ -186,10 +186,10 @@ class FlatData(IDataset[T_co], Sequence[T_co]): """FlatData is a dataset optimized for efficient repeated concatenation and subset operations. - The class combines concatentation and subsampling operations in a single + The class combines concatenation and subsampling operations in a single class. - Class for internal use only. Users shuold use `AvalancheDataset` for data + Class for internal use only. Users should use `AvalancheDataset` for data or `DataAttribute` for attributes such as class and task labels. *Notes for subclassing* diff --git a/avalanche/benchmarks/utils/transform_groups.py b/avalanche/benchmarks/utils/transform_groups.py index 0f9004df3..5293d4bb3 100644 --- a/avalanche/benchmarks/utils/transform_groups.py +++ b/avalanche/benchmarks/utils/transform_groups.py @@ -71,7 +71,7 @@ class TransformGroups: """Transformation groups for Avalanche datasets. TransformGroups supports preprocessing and augmentation pipelines for - Avalanche datasets. Transfomations are separated into groups (e.g. `train` + Avalanche datasets. Transformations are separated into groups (e.g. `train` transforms and `test` transforms), that can be easily switched using the `with_transform` method. """ diff --git a/avalanche/benchmarks/utils/utils.py b/avalanche/benchmarks/utils/utils.py index 17df10fdc..26e3797ef 100644 --- a/avalanche/benchmarks/utils/utils.py +++ b/avalanche/benchmarks/utils/utils.py @@ -11,6 +11,7 @@ """ Common benchmarks/environments utils. 
""" +import warnings from collections import OrderedDict, defaultdict, deque from typing import ( TYPE_CHECKING, @@ -28,7 +29,6 @@ Dict, SupportsInt, ) -import warnings import torch from torch import Tensor @@ -45,15 +45,15 @@ ) from .flat_data import ConstantSequence from .transform_groups import ( - TransformGroupDef, - TransformGroups, - XTransform, - YTransform, + TransformGroups, XTransform, YTransform, TransformGroupDef, ) if TYPE_CHECKING: - from .classification_dataset import TaskAwareClassificationDataset + # Avoid cyclic imports + from .classification_dataset import ClassificationDataset, TaskAwareClassificationDataset +Y = TypeVar("Y") +T = TypeVar("T") T_co = TypeVar("T_co", covariant=True) TAvalancheDataset = TypeVar("TAvalancheDataset", bound="AvalancheDataset") @@ -73,56 +73,6 @@ def tensor_as_list(sequence) -> List: return list(sequence) -def _indexes_grouped_by_classes( - targets: Sequence[int], - patterns_indexes: Union[None, Sequence[int]], - sort_indexes: bool = True, - sort_classes: bool = True, -) -> Union[List[int], None]: - result_per_class: Dict[int, List[int]] = OrderedDict() - result: List[int] = [] - - indexes_was_none = patterns_indexes is None - - if patterns_indexes is not None: - patterns_indexes = tensor_as_list(patterns_indexes) - else: - patterns_indexes = list(range(len(targets))) - - targets = tensor_as_list(targets) - - # Consider that result_per_class is an OrderedDict - # This means that, if sort_classes is True, the next for statement - # will initialize "result_per_class" in sorted order which in turn means - # that patterns will be ordered by ascending class ID. - classes = torch.unique(torch.as_tensor(targets), sorted=sort_classes).tolist() - - for class_id in classes: - result_per_class[class_id] = [] - - # Stores each pattern index in the appropriate class list - for idx in patterns_indexes: - result_per_class[targets[idx]].append(idx) - - # Concatenate all the pattern indexes - for class_id in classes: - if sort_indexes: - result_per_class[class_id].sort() - result.extend(result_per_class[class_id]) - - if result == patterns_indexes and indexes_was_none: - # Result is [0, 1, 2, ..., N] and patterns_indexes was originally None - # This means that the user tried to obtain a full Dataset - # (indexes_was_none) only ordered according to the sort_indexes and - # sort_classes parameters. However, sort_indexes+sort_classes returned - # the plain pattern sequence as it already is. So the original Dataset - # already satisfies the sort_indexes+sort_classes constraints. - # By returning None, we communicate that the Dataset can be taken as-is. 
- return None - - return result - - def grouped_and_ordered_indexes( targets: Sequence[int], patterns_indexes: Union[None, Sequence[int]], @@ -176,21 +126,25 @@ def grouped_and_ordered_indexes( def as_avalanche_dataset( dataset: ISupportedClassificationDataset[T_co], + **kwargs, ) -> AvalancheDataset: if isinstance(dataset, AvalancheDataset): return dataset - return AvalancheDataset([dataset]) + transform_groups = _init_transform_groups(**kwargs) + return AvalancheDataset([dataset], transform_groups=transform_groups) def as_classification_dataset( dataset: ISupportedClassificationDataset[T_co], - transform_groups: Optional[TransformGroups] = None, -) -> "TaskAwareClassificationDataset": - """Converts a dataset with a `targets` field into an Avalanche ClassificationDataset.""" + **kwargs, +) -> "ClassificationDataset": + """Converts a dataset with a `targets` field into a ClassificationDataset.""" + # Avoid cyclic imports from avalanche.benchmarks.utils.classification_dataset import ClassificationDataset if isinstance(dataset, ClassificationDataset): return dataset + transform_groups = _init_transform_groups(**kwargs) da = DataAttribute(dataset.targets, "targets") return ClassificationDataset( [dataset], transform_groups=transform_groups, data_attributes=[da] @@ -199,14 +153,67 @@ def as_classification_dataset( def as_taskaware_classification_dataset( dataset: ISupportedClassificationDataset[T_co], + **kwargs, ) -> "TaskAwareClassificationDataset": + # Avoid cyclic imports from avalanche.benchmarks.utils.classification_dataset import ( TaskAwareClassificationDataset, ) if isinstance(dataset, TaskAwareClassificationDataset): return dataset - return TaskAwareClassificationDataset([dataset]) + transform_groups = _init_transform_groups(**kwargs) + return TaskAwareClassificationDataset([dataset], transform_groups=transform_groups) + + +def _indexes_grouped_by_classes( + targets: Sequence[int], + patterns_indexes: Union[None, Sequence[int]], + sort_indexes: bool = True, + sort_classes: bool = True, +) -> Union[List[int], None]: + result_per_class: Dict[int, List[int]] = OrderedDict() + result: List[int] = [] + + indexes_was_none = patterns_indexes is None + + if patterns_indexes is not None: + patterns_indexes = tensor_as_list(patterns_indexes) + else: + patterns_indexes = list(range(len(targets))) + + targets = tensor_as_list(targets) + + # Consider that result_per_class is an OrderedDict + # This means that, if sort_classes is True, the next for statement + # will initialize "result_per_class" in sorted order which in turn means + # that patterns will be ordered by ascending class ID. + classes = torch.unique(torch.as_tensor(targets), sorted=sort_classes).tolist() + + for class_id in classes: + result_per_class[class_id] = [] + + # Stores each pattern index in the appropriate class list + for idx in patterns_indexes: + result_per_class[targets[idx]].append(idx) + + # Concatenate all the pattern indexes + for class_id in classes: + if sort_indexes: + result_per_class[class_id].sort() + result.extend(result_per_class[class_id]) + + if result == patterns_indexes and indexes_was_none: + # Result is [0, 1, 2, ..., N] and patterns_indexes was originally None + # This means that the user tried to obtain a full Dataset + # (indexes_was_none) only ordered according to the sort_indexes and + # sort_classes parameters. However, sort_indexes+sort_classes returned + # the plain pattern sequence as it already is. So the original Dataset + # already satisfies the sort_indexes+sort_classes constraints. 
+ # By returning None, we communicate that the Dataset can be taken as-is. + return None + + return result def _count_unique(*sequences: Sequence[SupportsInt]): @@ -268,10 +275,6 @@ def find_common_transforms_group( return initial_transform_group -Y = TypeVar("Y") -T = TypeVar("T") - - def _traverse_supported_dataset( dataset: Y, values_selector: Callable[[Y, Optional[List[int]]], Optional[Sequence[T]]], @@ -330,12 +333,7 @@ def _traverse_supported_dataset( datasets_len = [] recursion_result = [] - all_size = 0 - for c_dataset in dataset.datasets: - len_dataset = len(c_dataset) - datasets_len.append(len_dataset) - all_size += len_dataset - + all_size = len(dataset) for subset_idx in indices: dataset_idx, pattern_idx = find_list_from_index( subset_idx, datasets_len, all_size @@ -374,7 +372,7 @@ def _init_task_labels( Initializes the task label list (one for each pattern in the dataset). Precedence is given to the values contained in `task_labels` if passed. - Otherwisem the elements will be retrieved from the dataset itself by + Otherwise, the elements will be retrieved from the dataset itself by traversing it and looking at the `targets_task_labels` field. :param dataset: The dataset for which the task labels list must be @@ -456,17 +454,17 @@ def _select_task_labels( def _init_transform_groups( - transform_groups: Optional[Mapping[str, TransformGroupDef]], - transform: Optional[XTransform], - target_transform: Optional[YTransform], - initial_transform_group: Optional[str], - dataset, + transform_groups: Optional[Mapping[str, TransformGroupDef]] = None, + transform: Optional[XTransform] = None, + target_transform: Optional[YTransform] = None, + initial_transform_group: Optional[str] = None, + dataset: Optional[Any] = None, ) -> Optional[TransformGroups]: """ Initializes the transform groups for the given dataset. This internal utility is commonly used to manage the transformation - defintions coming from the user-facing API. The user may want to + definitions coming from the user-facing API. The user may want to define transformations in a more classic (and simple) way by passing a single `transform`, or in a more elaborate way by passing a dictionary of groups (`transform_groups`). @@ -655,13 +653,15 @@ class TaskSet(Mapping[int, TAvalancheDataset], Generic[TAvalancheDataset]): """ + data: TAvalancheDataset + def __init__(self, data: TAvalancheDataset): """Constructor. :param data: original data """ super().__init__() - self.data: TAvalancheDataset = data + self.data = data def __iter__(self) -> Iterator[int]: t_labels = self._get_task_labels_field() diff --git a/avalanche/checkpointing/checkpoint.py b/avalanche/checkpointing/checkpoint.py index fbf6edf39..14de6d029 100644 --- a/avalanche/checkpointing/checkpoint.py +++ b/avalanche/checkpointing/checkpoint.py @@ -1,6 +1,7 @@ import os.path -from pathlib import Path from copy import copy +from functools import partial +from pathlib import Path from typing import ( Any, Callable, @@ -14,12 +15,12 @@ Union, Collection, ) -from typing_extensions import TypeAlias import dill import torch -from functools import partial from packaging.version import parse +from typing_extensions import TypeAlias + from .checkpoint_internals import ( CHECKPOINT_MECHANISM_VERSION, _CheckpointLoadingContext, @@ -119,7 +120,7 @@ def maybe_load_checkpoint( The method returns the strategy with the state deserialized from the file and the index of the training experience to resume training. 
- If the file does not exists, the method returns the strategy unmodified + If the file does not exist, the method returns the strategy unmodified and the index 0. As a result, the method can be safely called even if no checkpoint has been previously created (e.g. during the first run). diff --git a/avalanche/core.py b/avalanche/core.py index e1ed5368b..c94cebb3b 100644 --- a/avalanche/core.py +++ b/avalanche/core.py @@ -166,7 +166,7 @@ class BasePlugin(Generic[Template], ABC): `StrategyCallbacks` provide two functions `before_{method}` and `after_{method}`, called before and after the method, respectively. - Therefore plugins can "inject" additional code by implementing callbacks. + Therefore, plugins can "inject" additional code by implementing callbacks. Each callback has a `strategy` argument that gives access to the state. In Avalanche, callbacks are used to implement continual strategies, metrics @@ -180,7 +180,7 @@ class BasePlugin(Generic[Template], ABC): def __init__(self): """ - Inizializes an instance of a supervised plugin. + Initializes an instance of a supervised plugin. """ super().__init__() @@ -229,7 +229,7 @@ class BaseSGDPlugin(BasePlugin[Template], ABC): def __init__(self): """ - Inizializes an instance of a base SGD plugin. + Initializes an instance of a base SGD plugin. """ super().__init__() @@ -296,13 +296,13 @@ def after_eval_iteration(self, strategy: Template, *args, **kwargs) -> Any: def before_train_dataset_adaptation( self, strategy: Template, *args, **kwargs ) -> Any: - """Called before `train_dataset_adapatation` by the `BaseTemplate`.""" + """Called before `train_dataset_adaptation` by the `BaseTemplate`.""" pass def after_train_dataset_adaptation( self, strategy: Template, *args, **kwargs ) -> Any: - """Called after `train_dataset_adapatation` by the `BaseTemplate`.""" + """Called after `train_dataset_adaptation` by the `BaseTemplate`.""" pass def before_eval_dataset_adaptation( @@ -324,7 +324,7 @@ class SupervisedPlugin(BaseSGDPlugin[Template], ABC): def __init__(self): """ - Inizializes an instance of a supervised plugin. + Initializes an instance of a supervised plugin. """ super().__init__() diff --git a/avalanche/evaluation/metrics/accuracy.py b/avalanche/evaluation/metrics/accuracy.py index e745bcf36..6c1ce0c18 100644 --- a/avalanche/evaluation/metrics/accuracy.py +++ b/avalanche/evaluation/metrics/accuracy.py @@ -38,7 +38,7 @@ class Accuracy(Metric[float]): def __init__(self): """Creates an instance of the standalone Accuracy metric. - By default this metric in its initial state will return an accuracy + By default, this metric in its initial state will return an accuracy value of 0. The metric can be updated by using the `update` method while the running accuracy can be retrieved using the `result` method. """ @@ -60,6 +60,9 @@ def update( :return: None. 
""" + # print('[Accuracy] update') + # print(' true_y', true_y.shape) # [256] + # print(' predicted_y', predicted_y.shape) # [256, 2, 128] true_y = torch.as_tensor(true_y) predicted_y = torch.as_tensor(predicted_y) @@ -75,6 +78,9 @@ def update( # Logits -> transform to labels true_y = torch.max(true_y, 1)[1] + # print(' true_y 2', true_y.shape) # [256] + # print(' predicted_y 2', predicted_y.shape) # [256, 128] + true_positives = float(torch.sum(torch.eq(predicted_y, true_y))) total_patterns = len(true_y) self._mean_accuracy.update(true_positives / total_patterns, total_patterns) @@ -218,6 +224,10 @@ def result(self) -> float: return self._metric.result() def update(self, strategy): + # print('[AccuracyPluginMetric] update]') + # print(' strategy:', strategy) + # print(' mb_output', strategy.mb_output.shape) + # print(' mb_y', strategy.mb_y.shape) self._metric.update(strategy.mb_output, strategy.mb_y) diff --git a/avalanche/models/dynamic_modules.py b/avalanche/models/dynamic_modules.py index 9a43cda54..f9007eebf 100644 --- a/avalanche/models/dynamic_modules.py +++ b/avalanche/models/dynamic_modules.py @@ -68,15 +68,15 @@ def __init__(self, auto_adapt=True): """ :param auto_adapt: If True, will be adapted in the recursive adaptation loop else, will be adapted by a module in charge - (i.e IncrementalClassifier inside MultiHeadClassifier) + (i.e. IncrementalClassifier inside MultiHeadClassifier) """ super().__init__() self._auto_adapt = auto_adapt def pre_adapt(self, agent, experience): """ - Calls self.adaptation recursively accross - the hierarchy of pytorch module childrens + Calls self.adaptation recursively across + the hierarchy of pytorch module children """ avalanche_model_adaptation(self, experience) @@ -120,7 +120,7 @@ class MultiTaskModule(DynamicModule): scenarios. The ``forward`` method accepts task labels, one for each sample in the mini-batch. - By default the ``forward`` method splits the mini-batch by task + By default, the ``forward`` method splits the mini-batch by task and calls ``forward_single_task``. Subclasses must implement ``forward_single_task`` or override `forward. If ``task_labels == None``, the output is computed in parallel for each task. @@ -142,7 +142,7 @@ def adaptation(self, experience: CLExperience): .. warning:: As a general rule, you should NOT use this method to train the - model. The dataset should be used only to check conditions which + model. The experience should be used only to check conditions which require the model's adaptation, such as the discovery of new classes or tasks. @@ -342,7 +342,7 @@ def __init__( # needs to create the first head because pytorch optimizers # fail when model.parameters() is empty. - # masking in IncrementalClassifier is unaware of task labels + # masking in IncrementalClassifier is unaware of task labels, # so we do masking here instead. first_head = IncrementalClassifier( self.in_features, diff --git a/avalanche/models/dynamic_optimizers.py b/avalanche/models/dynamic_optimizers.py index 10e487dab..98ed157f3 100644 --- a/avalanche/models/dynamic_optimizers.py +++ b/avalanche/models/dynamic_optimizers.py @@ -335,7 +335,7 @@ def update_optimizer( verbose=False, ): """Update the optimizer by adding new parameters, - removing removed parameters, and adding new parameters + removing obsolete parameters, and adding new parameters to the optimizer, for instance after model has been adapted to a new task. The state of the optimizer can also be reset, it will be reset for the modified parameters. 
@@ -351,7 +351,7 @@ def update_optimizer( currently optimized parameters. In most use cases, it will be `None in the first call and the return value of the last `update_optimizer` call for the subsequent calls. - :param reset_state: Whether to reset the optimizer's state (i.e momentum). + :param reset_state: Whether to reset the optimizer's state (i.e. momentum). Defaults to False. :param remove_params: Whether to remove parameters that were in the optimizer but are not found in new parameters. For safety reasons, @@ -369,7 +369,7 @@ def update_optimizer( ) = _map_optimized_params(optimizer, new_params, old_params=optimized_params) # Change reference to already existing parameters - # i.e growing IncrementalClassifier + # i.e. growing IncrementalClassifier for name, group_idx, param_idx in changed_parameters: group = optimizer.param_groups[group_idx] old_p = optimized_params[name] @@ -381,7 +381,7 @@ def update_optimizer( optimizer.state[new_p] = {} # Remove parameters that are not here anymore - # This should not happend in most use case + # This should not happen in most use cases if remove_params: for group_idx, idx_list in enumerate(not_found_in_parameters): for j in sorted(idx_list, key=lambda x: x, reverse=True): diff --git a/avalanche/training/losses.py b/avalanche/training/losses.py index 460b01aa2..4f6b61855 100644 --- a/avalanche/training/losses.py +++ b/avalanche/training/losses.py @@ -3,7 +3,6 @@ import numpy as np import torch import torch.nn.functional as F -from torch import nn from torch.nn import BCELoss from avalanche.training.plugins import SupervisedPlugin @@ -115,7 +114,10 @@ def forward(self, features, labels=None, mask=None): elif labels is not None: labels = labels.contiguous().view(-1, 1) if labels.shape[0] != batch_size: - raise ValueError("Num of labels does not match num of features") + raise ValueError( + f"Num of labels {labels.shape[0]} does not match " + f"num of features {batch_size}" + ) mask = torch.eq(labels, labels.T).float().to(device) else: mask = mask.float().to(device) @@ -170,7 +172,7 @@ class MaskedCrossEntropy(SupervisedPlugin): Masked Cross Entropy This criterion can be used for instance in Class Incremental - Learning Problems when no examplars are used + Learning Problems when no exemplars are used (i.e LwF in Class Incremental Learning would need to use mask="new"). """ diff --git a/avalanche/training/regularization.py b/avalanche/training/regularization.py index dc4ab310f..0c660bbfb 100644 --- a/avalanche/training/regularization.py +++ b/avalanche/training/regularization.py @@ -353,4 +353,5 @@ def __call__( "LearningWithoutForgetting", "ACECriterion", "AMLCriterion", + "cross_entropy_with_oh_targets", ] diff --git a/avalanche/training/supervised/joint_training.py b/avalanche/training/supervised/joint_training.py index 08294cfc0..e50e6b63a 100644 --- a/avalanche/training/supervised/joint_training.py +++ b/avalanche/training/supervised/joint_training.py @@ -121,7 +121,7 @@ def train( trains on all of them at the same time (a.k.a. offline training). :param experiences: single Experience or sequence. - :param eval_streams: list of streams for evaluation. + :param eval_streams: sequence of streams for evaluation. If None: use training experiences for evaluation. Use [] if you do not want to evaluate during training. 
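Note for reviewers: a hedged usage sketch matching the `train`/`eval_streams` wording fixed above. `model` and `bm` are placeholders (any `nn.Module` and any Avalanche benchmark); the constructor arguments follow the common strategy-template defaults and are not prescribed by this patch.

```python
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from avalanche.training.supervised import JointTraining

strategy = JointTraining(
    model=model,
    optimizer=SGD(model.parameters(), lr=0.01),
    criterion=CrossEntropyLoss(),
    train_mb_size=128,
    train_epochs=1,
    device="cpu",
)

# JointTraining consumes the whole training stream at once (offline training).
# Pass [] to skip evaluation during training, as the docstring above notes.
strategy.train(bm.train_stream, eval_streams=[])
results = strategy.eval(bm.test_stream)
```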
diff --git a/avalanche/training/supervised/strategy_wrappers.py b/avalanche/training/supervised/strategy_wrappers.py index b13091b64..2552688a1 100644 --- a/avalanche/training/supervised/strategy_wrappers.py +++ b/avalanche/training/supervised/strategy_wrappers.py @@ -56,7 +56,7 @@ class Naive(SupervisedTemplate): """Naive finetuning. The simplest (and least effective) Continual Learning strategy. Naive just - incrementally fine tunes a single model without employing any method + incrementally fine-tunes a single model without employing any method to contrast the catastrophic forgetting of previous knowledge. This strategy does not use task identities. diff --git a/avalanche/training/supervised/supervised_contrastive_replay.py b/avalanche/training/supervised/supervised_contrastive_replay.py index f4dd44a0a..44474eea4 100644 --- a/avalanche/training/supervised/supervised_contrastive_replay.py +++ b/avalanche/training/supervised/supervised_contrastive_replay.py @@ -1,7 +1,6 @@ from typing import Optional, Sequence import torch -from torch.nn import Module from torch.optim import Optimizer from torch.utils.data import DataLoader from torchvision.transforms import Compose, Lambda @@ -24,7 +23,7 @@ class SCR(SupervisedTemplate): embeddings produced by the encoder. Accuracy cannot be monitored during training (no NCM classifier). - During training, NCRLoss is monitored, while during eval + During training, SCRLoss is monitored, while during eval CrossEntropyLoss is monitored. The original paper uses an additional fine-tuning phase on the buffer @@ -37,9 +36,9 @@ def __init__( *, model: SCRModel, optimizer: Optimizer, - augmentations=Compose([Lambda(lambda el: el)]), + augmentations=Lambda(lambda el: el), mem_size: int = 100, - temperature: int = 0.1, + temperature: float = 0.1, train_mb_size: int = 1, batch_size_mem: int = 100, train_epochs: int = 1, @@ -112,10 +111,11 @@ def __init__( plugins = [self.replay_plugin] + plugins else: raise ValueError("`plugins` parameter needs to be a list.") + super().__init__( model=model, optimizer=optimizer, - criterion=SCRLoss(temperature=self.temperature), + # criterion=SCRLoss(temperature=self.temperature), train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, @@ -128,6 +128,9 @@ def __init__( def criterion(self): if self.is_training: + # print(self.train_loss) + # print('mb_output', self.mb_output.shape) # [384, 2, 128] + # print('mb_y', self.mb_y.shape) # [256] return self.train_loss(self.mb_output, self.mb_y) else: return self.eval_loss(self.mb_output, self.mb_y) @@ -138,9 +141,17 @@ def _before_forward(self, **kwargs): """ assert self.is_training super()._before_forward(**kwargs) - mb_x_augmented = self.augmentations(self.mbatch[0]) + # print('mbatch', len(self.mbatch), self.mbatch) + mb_x_augmented = self.augmentations(self.mb_x) + # print() + # print('before forward') + # print('x', self.mb_x.shape) # [256, 1, 28, 28] + # print('y', self.mb_y.shape) # [256] + # print('mb_x_augmented', mb_x_augmented.shape) # [512, 1, 28, 28] + # (batch_size*2, input_size) - self.mbatch[0] = torch.cat([self.mbatch[0], mb_x_augmented], dim=0) + self.mbatch[0] = torch.cat([self.mb_x, mb_x_augmented], dim=0) + # print('~x', self.mb_x.shape) # [768, 1, 28, 28] def _after_forward(self, **kwargs): """ @@ -151,10 +162,12 @@ def _after_forward(self, **kwargs): super()._after_forward(**kwargs) assert self.mb_output.size(0) % 2 == 0 original_batch_size = int(self.mb_output.size(0) / 2) + # print('[after forward] mb_output 1:', self.mb_output.shape) 
original_examples = self.mb_output[:original_batch_size] augmented_examples = self.mb_output[original_batch_size:] # (original_batch_size, 2, output_size) self.mb_output = torch.stack([original_examples, augmented_examples], dim=1) + # print('[after forward] mb_output 2:', self.mb_output.shape) def _after_training_exp(self, **kwargs): """Update NCM means""" diff --git a/avalanche/training/templates/base_sgd.py b/avalanche/training/templates/base_sgd.py index b51046edf..40871ff14 100644 --- a/avalanche/training/templates/base_sgd.py +++ b/avalanche/training/templates/base_sgd.py @@ -64,6 +64,10 @@ def __init__( *, model: Module, optimizer: Optimizer, + # TODO: Make optional in base classes as subclasses may choose to implement + # `def criterion()` that doesn't depend on `self._criterion` + # (which is set in __init__). Subclasses using `_criterion` in `criterion()` + # should then make the criterion kwarg mandatory criterion: CriterionType = CrossEntropyLoss(), train_mb_size: int = 1, train_epochs: int = 1, @@ -71,7 +75,7 @@ def __init__( device: Union[str, torch.device] = "cpu", plugins: Optional[Sequence[BasePlugin]] = None, evaluator: Union[ - EvaluationPlugin, Callable[[], EvaluationPlugin] + EvaluationPlugin, Callable[[], EvaluationPlugin], None, ] = default_evaluator, eval_every=-1, peval_mode="epoch", @@ -389,7 +393,7 @@ def _obtain_common_dataloader_parameters(self, **kwargs): implementation (super) to obtain a base dictionary of parameters. However, if a more deep change is needed in the data loading procedure, - it is better to overrride :meth:`make_train_dataloader` and/or + it is better to override :meth:`make_train_dataloader` and/or :meth:`make_eval_dataloader` directly. Note: the resulting dictionary does not include the collate function diff --git a/avalanche/training/templates/common_templates.py b/avalanche/training/templates/common_templates.py index 20ba0465a..744822a1b 100644 --- a/avalanche/training/templates/common_templates.py +++ b/avalanche/training/templates/common_templates.py @@ -46,7 +46,7 @@ class SupervisedTemplate( This strategy supports several continual learning scenarios: * class-incremental scenarios (no task labels) - * multi-task scenarios, where task labels are provided) + * multi-task scenarios, where task labels are provided * multi-incremental scenarios, where the same task may be revisited The exact scenario depends on the data stream and whether it provides @@ -92,7 +92,7 @@ def __init__( device: Union[str, torch.device] = "cpu", plugins: Optional[Sequence[BasePlugin]] = None, evaluator: Union[ - EvaluationPlugin, Callable[[], EvaluationPlugin] + EvaluationPlugin, Callable[[], EvaluationPlugin], None, ] = default_evaluator, eval_every=-1, peval_mode="epoch", diff --git a/avalanche/training/templates/observation_type/batch_observation.py b/avalanche/training/templates/observation_type/batch_observation.py index a663e2d42..1b6942eaa 100644 --- a/avalanche/training/templates/observation_type/batch_observation.py +++ b/avalanche/training/templates/observation_type/batch_observation.py @@ -55,11 +55,10 @@ def make_optimizer( along with their parameter group Warnings: - - The first time this function is called - for a given strategy it will reset the - optimizer to gather the (name, param) - correspondance of the optimized parameters - all of the model parameters will be put in the + - The first time this function is called for a given strategy, + it will reset the optimizer to gather the (name, param) + correspondence of the optimized parameters. 
+ All of the model parameters will be put in the optimizer, regardless of what parameters are initially put in the optimizer. diff --git a/avalanche/training/templates/strategy_mixin_protocol.py b/avalanche/training/templates/strategy_mixin_protocol.py index 4596c9d39..a2ec0e4e5 100644 --- a/avalanche/training/templates/strategy_mixin_protocol.py +++ b/avalanche/training/templates/strategy_mixin_protocol.py @@ -21,7 +21,7 @@ CriterionType: TypeAlias = Union[Module, Callable[[Tensor, Tensor], Tensor]] -class BaseStrategyProtocol(Generic[TExperienceType], Protocol[TExperienceType]): +class BaseStrategyProtocol(Protocol[TExperienceType]): model: Module device: torch.device @@ -36,7 +36,6 @@ class BaseStrategyProtocol(Generic[TExperienceType], Protocol[TExperienceType]): class SGDStrategyProtocol( - Generic[TSGDExperienceType, TMBInput, TMBOutput], BaseStrategyProtocol[TSGDExperienceType], Protocol[TSGDExperienceType, TMBInput, TMBOutput], ): @@ -47,6 +46,7 @@ class SGDStrategyProtocol( mbatch: Optional[TMBInput] mb_output: Optional[TMBOutput] + """Mini-batch output (typically the result of `self.forward()`)""" dataloader: Iterable[TMBInput] @@ -95,6 +95,7 @@ class SupervisedStrategyProtocol( SGDStrategyProtocol[TSGDExperienceType, TMBInput, TMBOutput], Protocol[TSGDExperienceType, TMBInput, TMBOutput], ): + # TODO: How does this differ from mbatch[0]? Converted to tensor? mb_x: Tensor mb_y: Tensor @@ -120,6 +121,7 @@ def _after_outer_update(self, **kwargs): ... __all__ = [ + "BaseStrategyProtocol", "SGDStrategyProtocol", "SupervisedStrategyProtocol", "MetaLearningStrategyProtocol", diff --git a/avalanche/training/utils.py b/avalanche/training/utils.py index 1427f4760..a3d6e9ca3 100644 --- a/avalanche/training/utils.py +++ b/avalanche/training/utils.py @@ -15,7 +15,8 @@ """ from collections import defaultdict -from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union +from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union, \ + TYPE_CHECKING import torch from torch import Tensor @@ -25,6 +26,9 @@ from avalanche.benchmarks import OnlineCLExperience from avalanche.models.batch_renorm import BatchRenorm2D +if TYPE_CHECKING: + from avalanche.training.templates.strategy_mixin_protocol import BaseStrategyProtocol + def _at_task_boundary(training_experience, before=True) -> bool: """ @@ -53,6 +57,8 @@ def _at_task_boundary(training_experience, before=True) -> bool: return True elif (not before) and training_experience.is_last_subexp: return True + else: + return False else: return True else: @@ -65,13 +71,11 @@ def cycle(loader): yield batch -def trigger_plugins(strategy, event, **kwargs): - """Call plugins on a specific callback - - :return: - """ +def trigger_plugins(strategy: "BaseStrategyProtocol", event: str, **kwargs): + """Call plugins on a specific callback""" for p in strategy.plugins: if hasattr(p, event): + # print('triggering plugin', p, event) getattr(p, event)(strategy, **kwargs) diff --git a/docs/gitbook/from-zero-to-hero-tutorial/03_benchmarks.md b/docs/gitbook/from-zero-to-hero-tutorial/03_benchmarks.md index fd0e674d0..f9c5f41bd 100644 --- a/docs/gitbook/from-zero-to-hero-tutorial/03_benchmarks.md +++ b/docs/gitbook/from-zero-to-hero-tutorial/03_benchmarks.md @@ -43,10 +43,10 @@ from avalanche.benchmarks.utils import as_classification_dataset, AvalancheDatas # Most datasets in Avalanche are automatically downloaded the first time you use them # and stored in a default location. 
You can change this folder by calling -# avalanche.benchmarks.utils.set_dataset_root(new_location) +# avalanche.benchmarks.datasets.dataset_utils.set_dataset_root(new_location) datadir = default_dataset_location('mnist') -# As we would simply do with any Pytorch dataset we can create the train and +# As we would simply do with any Pytorch dataset we can create the train and # test sets from it. We could use any of the above imported Datasets, but let's # just try to use the standard MNIST. train_MNIST = MNIST(datadir, train=True, download=True) @@ -65,14 +65,14 @@ eval_transforms = torchvision.transforms.Compose([ train_MNIST = as_classification_dataset( train_MNIST, transform_groups={ - 'train': train_transforms, + 'train': train_transforms, 'eval': eval_transforms } ) test_MNIST = as_classification_dataset( test_MNIST, transform_groups={ - 'train': train_transforms, + 'train': train_transforms, 'eval': eval_transforms } ) @@ -117,7 +117,7 @@ print(list(dsub.targets)) ## 🏛️ Classic Benchmarks Most benchmarks will provide two streams: the `train_stream` and `test_stream`. -Often, these are two parallel streams of the same length, where each experience is sampled from the same distribution (e.g. same set of classes). +Often, these are two parallel streams of the same length, where each experience is sampled from the same distribution (e.g. same set of classes). Some benchmarks may have a single test experience with the whole test dataset. Experiences provide all the information needed to update the model, such as the new batch of data, and they may be decorated with attributes that are helpful for training or logging purposes. @@ -248,7 +248,7 @@ print(f"Experience {exp.logging().current_experience}") #### Classification -classification benchmarks follow the `ClassesTimeline` protocol and provide attributes about the classes in the stream. +classification benchmarks follow the `ClassesTimeline` protocol and provide attributes about the classes in the stream. ```python @@ -322,7 +322,7 @@ for exp in online_train_stream: print(f"\tsize: {len(exp.dataset)}") # in a training loop, here you would train on the online_train_stream - # here you would test on bm.valid_stream or bm.test_stream + # here you would test on bm.valid_stream or bm.test_stream ``` This completes the "_Benchmark_" tutorial for the "_From Zero to Hero_" series. We hope you enjoyed it! diff --git a/docs/gitbook/from-zero-to-hero-tutorial/05_evaluation.md b/docs/gitbook/from-zero-to-hero-tutorial/05_evaluation.md index 305b29547..2326f23b1 100644 --- a/docs/gitbook/from-zero-to-hero-tutorial/05_evaluation.md +++ b/docs/gitbook/from-zero-to-hero-tutorial/05_evaluation.md @@ -23,7 +23,7 @@ Each metric comes with a standalone class and a set of plugin classes aimed at e #### Standalone metric -As an example, the standalone `Accuracy` class can be used to monitor the average accuracy over a stream of `` pairs. The class provides an `update` method to update the current average accuracy, a `result` method to print the current average accuracy and a `reset` method to set the current average accuracy to zero. The call to `result`does not change the metric state. +As an example, the standalone `Accuracy` class can be used to monitor the average accuracy over a stream of `` pairs. The class provides an `update` method to update the current average accuracy, a `result` method to print the current average accuracy and a `reset` method to set the current average accuracy to zero. The call to `result`does not change the metric state. 
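For reference, a minimal sketch of the standalone metric cycle described above, assuming `Accuracy` is importable from `avalanche.evaluation.metrics` and that `update` takes predictions first and targets second; treat the exact argument order as an assumption to check against the current API.

```python
# Minimal sketch (assumed API): standalone Accuracy over a stream of predictions.
import torch
from avalanche.evaluation.metrics import Accuracy  # assumed import path

acc_metric = Accuracy()

predicted_y = torch.tensor([1, 0]).float()
real_y = torch.tensor([1, 2]).long()

acc_metric.update(predicted_y, real_y)           # accumulate one mini-batch (argument order assumed)
print("Average Accuracy:", acc_metric.result())  # result() does not change the metric state
acc_metric.reset()                               # back to zero, ready for a new stream
```

The `TaskAwareAccuracy` snippet a few lines below shows the task-labelled variant of the same update/result/reset cycle.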
The `TaskAwareAccuracy` metric keeps separate accuracy counters for different task labels. As such, it requires the `task_labels` parameter, which specifies which task is associated with the current patterns. The metric returns a dictionary mapping task labels to accuracy values. @@ -71,7 +71,7 @@ print("Average Accuracy: ", acc) # output 0.5 for task 0 task_label = 1 predicted_y = torch.tensor([1,2]).float() acc_metric.update(real_y, predicted_y, task_label) -acc = acc_metric.result() +acc = acc_metric.result() print("Average Accuracy: ", acc) # output 0.75 for task 0 and 1.0 for task 1 task_label = 0 @@ -111,7 +111,7 @@ The **Evaluation Plugin** is the object in charge of configuring and controlling The Evaluation Plugin accepts as inputs the plugin metrics you want to track. In addition, you can add one or more loggers to print the metrics in different ways \(on file, on standard output, on Tensorboard...\). -It is also recommended to pass to the Evaluation Plugin the benchmark instance used in the experiment. This allows the plugin to check for consistency during metrics computation. For example, the Evaluation Plugin checks that the `strategy.eval` calls are performed on the same stream or sub-stream. Otherwise, same metric could refer to different portions of the stream. +It is also recommended to pass to the Evaluation Plugin the benchmark instance used in the experiment. This allows the plugin to check for consistency during metrics computation. For example, the Evaluation Plugin checks that the `strategy.eval` calls are performed on the same stream or sub-stream. Otherwise, same metric could refer to different portions of the stream. These checks can be configured to raise errors (stopping computation) or only warnings. @@ -134,7 +134,7 @@ model = SimpleMLP(num_classes=benchmark.n_classes) # DEFINE THE EVALUATION PLUGIN # The evaluation plugin manages the metrics computation. -# It takes as argument a list of metrics, collectes their results and returns +# It takes as argument a list of metrics, collects their results and returns # them to the strategy it is attached to. eval_plugin = EvaluationPlugin( @@ -256,8 +256,8 @@ class MyPluginMetric(PluginMetric[float]): task_labels = strategy.mb_task_id else: task_labels = task_labels[0] - - self._accuracy_metric.update(strategy.mb_output, strategy.mb_y, + + self._accuracy_metric.update(strategy.mb_output, strategy.mb_y, task_labels) def before_training_epoch(self, strategy: 'PluggableStrategy') -> None: @@ -271,8 +271,8 @@ class MyPluginMetric(PluginMetric[float]): Emit the result """ return self._package_result(strategy) - - + + def _package_result(self, strategy): """Taken from `GenericPluginMetric`, check that class out!""" metric_value = self.accuracy_metric.result() @@ -303,9 +303,9 @@ class MyPluginMetric(PluginMetric[float]): ## Accessing metric values -If you want to access all the metrics computed during training and evaluation, you have to make sure that `collect_all=True` is set when creating the `EvaluationPlugin` (default option is `True`). This option maintains an updated version of all metric results in the plugin, which can be retrieved by calling `evaluation_plugin.get_all_metrics()`. You can call this methods whenever you need the metrics. +If you want to access all the metrics computed during training and evaluation, you have to make sure that `collect_all=True` is set when creating the `EvaluationPlugin` (default option is `True`). 
This option maintains an updated version of all metric results in the plugin, which can be retrieved by calling `evaluation_plugin.get_all_metrics()`. You can call this methods whenever you need the metrics. -The result is a dictionary with full metric names as keys and a tuple of two lists as values. The first list stores all the `x` values recorded for that metric. Each `x` value represents the time step at which the corresponding metric value has been computed. The second list stores metric values associated to the corresponding `x` value. +The result is a dictionary with full metric names as keys and a tuple of two lists as values. The first list stores all the `x` values recorded for that metric. Each `x` value represents the time step at which the corresponding metric value has been computed. The second list stores metric values associated to the corresponding `x` value. ```python @@ -332,7 +332,7 @@ d = eval_plugin.get_all_metrics() d['Top1_Acc_Epoch/train_phase/train_stream/Task000'] ``` -Alternatively, the `train` and `eval` method of every `strategy` returns a dictionary storing, for each metric, the last value recorded for that metric. You can use these dictionaries to incrementally accumulate metrics. +Alternatively, the `train` and `eval` method of every `strategy` returns a dictionary storing, for each metric, the last value recorded for that metric. You can use these dictionaries to incrementally accumulate metrics. ```python diff --git a/docs/gitbook/from-zero-to-hero-tutorial/06_loggers.md b/docs/gitbook/from-zero-to-hero-tutorial/06_loggers.md index e1ca65f60..8b83a307a 100644 --- a/docs/gitbook/from-zero-to-hero-tutorial/06_loggers.md +++ b/docs/gitbook/from-zero-to-hero-tutorial/06_loggers.md @@ -28,7 +28,7 @@ _Avalanche_ at the moment supports four main Loggers: * **TensorboardLogger**: It logs all the metrics on [Tensorboard](https://www.tensorflow.org/tensorboard) in real-time. Perfect for real-time plotting. * **WandBLogger**: It leverages [Weights and Biases](https://wandb.ai/site) tools to log metrics and results on a dashboard. It requires a W&B account. -In order to keep track of when each metric value has been logged, we leverage two `global counters`, one for the training phase, one for the evaluation phase. +In order to keep track of when each metric value has been logged, we leverage two `global counters`, one for the training phase, one for the evaluation phase. You can see the `global counter` value reported in the x axis of the logged plots. Each `global counter` is an ever-increasing value which starts from 0 and it is increased by one each time a training/evaluation iteration is performed (i.e. after each training/evaluation minibatch). @@ -56,7 +56,7 @@ model = SimpleMLP(num_classes=benchmark.n_classes) # DEFINE THE EVALUATION PLUGIN and LOGGERS # The evaluation plugin manages the metrics computation. -# It takes as argument a list of metrics, collectes their results and returns +# It takes as argument a list of metrics, collects their results and returns # them to the strategy it is attached to. 
diff --git a/docs/gitbook/from-zero-to-hero-tutorial/07_putting-all-together.md b/docs/gitbook/from-zero-to-hero-tutorial/07_putting-all-together.md index f9b02ada2..7825a2807 100644 --- a/docs/gitbook/from-zero-to-hero-tutorial/07_putting-all-together.md +++ b/docs/gitbook/from-zero-to-hero-tutorial/07_putting-all-together.md @@ -33,7 +33,7 @@ model = SimpleMLP(num_classes=scenario.n_classes) # DEFINE THE EVALUATION PLUGIN and LOGGERS # The evaluation plugin manages the metrics computation. -# It takes as argument a list of metrics, collectes their results and returns +# It takes as argument a list of metrics, collects their results and returns # them to the strategy it is attached to. # log to Tensorboard diff --git a/docs/gitbook/from-zero-to-hero-tutorial/09_contribute-to-avalanche.md b/docs/gitbook/from-zero-to-hero-tutorial/09_contribute-to-avalanche.md index 9885b49f5..7edbaaecf 100644 --- a/docs/gitbook/from-zero-to-hero-tutorial/09_contribute-to-avalanche.md +++ b/docs/gitbook/from-zero-to-hero-tutorial/09_contribute-to-avalanche.md @@ -1,4 +1,4 @@ -]--- +--- description: How to Contribute Back to the Avalanche Community --- @@ -62,14 +62,14 @@ USE_GPU=False FAST_TEST=True python -m unittest discover tests -v **Contribute to the Avalanche documentation** -Apart from the code, you can also contribute to the Avalanche documentation 📚! We use [Jupyter notebooks](https://jupyter.org/) to write the documentation, so both code and text can be smoothly inserted, and, as you may have noticed, all our documentation can be run on [Google Colab](https://colab.research.google.com/notebooks/intro.ipynb)! +Apart from the code, you can also contribute to the Avalanche documentation 📚! We use [Jupyter notebooks](https://jupyter.org/) to write the documentation, so both code and text can be smoothly inserted, and, as you may have noticed, all our documentation can be run on [Google Colab](https://colab.research.google.com/notebooks/intro.ipynb)! To contribute to the documentation you need to follow the steps below: -1. The notebooks are contained in the folder `notebooks`. The folder structure is specular to the documentation, so do not create or delete any folder. -2. Detect the notebook that you want to edit and do all the modifications 📝 -3. Commit the changes and open a pull request (PR). -4. If your pull request will be accepted, your edited notebooks will be automatically converted and uploaded to the official Avalanche website 🎊! +1. The notebooks are contained in the folder `notebooks`. The folder structure is specular to the documentation, so do not create or delete any folder. +2. Detect the notebook that you want to edit and do all the modifications 📝 +3. Commit the changes and open a pull request (PR). +4. If your pull request will be accepted, your edited notebooks will be automatically converted and uploaded to the official Avalanche website 🎊! 
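The comment repeated in the tutorials above ("It takes as argument a list of metrics, collects their results and returns them to the strategy it is attached to") and the widened `evaluator: Union[EvaluationPlugin, Callable[[], EvaluationPlugin], None]` annotation are easiest to read together in a short sketch. The wiring below is an assumed minimal setup; the metric helpers and the `Naive` keywords mirror the tutorial snippets rather than a verified signature.

```python
# Sketch (assumed wiring): metrics and loggers go into an EvaluationPlugin,
# which is handed to a strategy through the `evaluator` keyword.
from torch.nn import CrossEntropyLoss
from torch.optim import SGD

from avalanche.evaluation.metrics import accuracy_metrics, loss_metrics
from avalanche.logging import InteractiveLogger
from avalanche.models import SimpleMLP
from avalanche.training.plugins import EvaluationPlugin
from avalanche.training.supervised import Naive

model = SimpleMLP(num_classes=10)

eval_plugin = EvaluationPlugin(
    accuracy_metrics(epoch=True, experience=True, stream=True),
    loss_metrics(minibatch=True, stream=True),
    loggers=[InteractiveLogger()],
)

strategy = Naive(
    model=model,
    optimizer=SGD(model.parameters(), lr=0.001),
    criterion=CrossEntropyLoss(),
    train_mb_size=32,
    train_epochs=1,
    eval_mb_size=32,
    evaluator=eval_plugin,  # an EvaluationPlugin, a factory callable, or None
)
```

Per the widened annotation, a factory callable (one fresh plugin per strategy) or `None` is also accepted in place of an `EvaluationPlugin` instance.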
diff --git a/docs/gitbook/getting-started/learn-avalanche-in-5-minutes.md b/docs/gitbook/getting-started/learn-avalanche-in-5-minutes.md index c4d222855..ac2589778 100644 --- a/docs/gitbook/getting-started/learn-avalanche-in-5-minutes.md +++ b/docs/gitbook/getting-started/learn-avalanche-in-5-minutes.md @@ -116,7 +116,7 @@ for experience in train_stream: print("Start of task ", experience.task_label) print('Classes in this task:', experience.classes_in_this_experience) - # The current Pytorch training set can be easily recovered through the + # The current Pytorch training set can be easily recovered through the # experience current_training_set = experience.dataset # ...as well as the task_label @@ -211,7 +211,7 @@ class MyStrategy(): self.criterion = criterion def train(self, experience): - # here you can implement your own training loop for each experience (i.e. + # here you can implement your own training loop for each experience (i.e. # batch or task). train_dataset = experience.dataset @@ -226,7 +226,7 @@ class MyStrategy(): pass def eval(self, experience): - # here you can implement your own eval loop for each experience (i.e. + # here you can implement your own eval loop for each experience (i.e. # batch or task). eval_dataset = experience.dataset @@ -277,8 +277,8 @@ Check out more details about what Avalanche can offer in this module following t The `evaluation` module is quite straightforward: it offers all the basic functionalities to evaluate and keep track of a continual learning experiment. -This is mostly done through the **Metrics** and the **Loggers**. The **Metrics** provide a set of classes which implements the main continual learning metrics like Accuracy, Forgetting, Memory Usage, Running Times, etc. -Metrics should be created via the utility functions (e.g. `accuracy_metrics`, `timing_metrics` and others) specifying in the arguments when those metrics should be computed (after each minibatch, epoch, experience etc...). +This is mostly done through the **Metrics** and the **Loggers**. The **Metrics** provide a set of classes which implements the main continual learning metrics like Accuracy, Forgetting, Memory Usage, Running Times, etc. +Metrics should be created via the utility functions (e.g. `accuracy_metrics`, `timing_metrics` and others) specifying in the arguments when those metrics should be computed (after each minibatch, epoch, experience etc...). The **Loggers** specify a way to report the metrics (e.g. with Tensorboard, on console or others). Loggers are created by instantiating the respective class. Metrics and loggers interact via the **Evaluation Plugin**: this is the main object responsible of tracking the experiment progress. Metrics and loggers are directly passed to the `EvaluationPlugin` instance. You will see the output of the loggers automatically during training and evaluation! Let's see how to put this together in few lines of code: @@ -299,7 +299,7 @@ eval_plugin = EvaluationPlugin( loss_metrics(minibatch=True, stream=True), # catastrophic forgetting after each evaluation # experience - forgetting_metrics(experience=True, stream=True), + forgetting_metrics(experience=True, stream=True), # add as many metrics as you like loggers=[InteractiveLogger(), TensorboardLogger()]) @@ -338,7 +338,7 @@ model = SimpleMLP(num_classes=benchmark.n_classes) # DEFINE THE EVALUATION PLUGIN and LOGGERS # The evaluation plugin manages the metrics computation. 
-# It takes as argument a list of metrics, collectes their results and returns +# It takes as argument a list of metrics, collects their results and returns # them to the strategy it is attached to. # log to Tensorboard diff --git a/examples/multihead.py b/examples/multihead.py index c0177418e..3cd76039a 100644 --- a/examples/multihead.py +++ b/examples/multihead.py @@ -10,7 +10,7 @@ ################################################################################ """ -This example trains a Multi-head model on Split MNIST with Elastich Weight +This example trains a Multi-head model on Split MNIST with Elastic Weight Consolidation. Each experience has a different task label, which is used at test time to select the appropriate head. """ diff --git a/notebooks/from-zero-to-hero-tutorial/02_models.ipynb b/notebooks/from-zero-to-hero-tutorial/02_models.ipynb index c7667f334..d92b84a4f 100644 --- a/notebooks/from-zero-to-hero-tutorial/02_models.ipynb +++ b/notebooks/from-zero-to-hero-tutorial/02_models.ipynb @@ -180,8 +180,8 @@ " def __init__(self, in_features, initial_out_features=2):\n", " super().__init__()\n", "\n", - " def adaptation(self, dataset):\n", - " super().adaptation(dataset)\n", + " def adaptation(self, experience):\n", + " super().adaptation(experience)\n", " # your adaptation goes here\n", "\n", " def forward_single_task(self, x, task_label):\n", diff --git a/notebooks/from-zero-to-hero-tutorial/03_benchmarks.ipynb b/notebooks/from-zero-to-hero-tutorial/03_benchmarks.ipynb index 863554d9d..ce91d36f4 100644 --- a/notebooks/from-zero-to-hero-tutorial/03_benchmarks.ipynb +++ b/notebooks/from-zero-to-hero-tutorial/03_benchmarks.ipynb @@ -85,7 +85,7 @@ "# avalanche.benchmarks.utils.set_dataset_root(new_location)\n", "datadir = default_dataset_location('mnist')\n", "\n", - "# As we would simply do with any Pytorch dataset we can create the train and \n", + "# As we would simply do with any Pytorch dataset we can create the train and\n", "# test sets from it. 
We could use any of the above imported Datasets, but let's\n", "# just try to use the standard MNIST.\n", "train_MNIST = MNIST(datadir, train=True, download=True)\n", @@ -104,14 +104,14 @@ "train_MNIST = as_classification_dataset(\n", " train_MNIST,\n", " transform_groups={\n", - " 'train': train_transforms, \n", + " 'train': train_transforms,\n", " 'eval': eval_transforms\n", " }\n", ")\n", "test_MNIST = as_classification_dataset(\n", " test_MNIST,\n", " transform_groups={\n", - " 'train': train_transforms, \n", + " 'train': train_transforms,\n", " 'eval': eval_transforms\n", " }\n", ")\n", @@ -213,10 +213,12 @@ "data: 12080 samples\n", "EID=4, classes=[4, 7], tasks=[4]\n", "data: 12107 samples\n", - "EID=0, classes=[5, 6], task=[4]\n", - "EID=1, classes=[1, 2], task=[4]\n", - "EID=2, classes=[0, 8], task=[4]\n", - "EID=3, classes=[9, 3], task=[4]\n", + "\n", + "--- Stream: test\n", + "EID=0, classes=[5, 6], task=[0]\n", + "EID=1, classes=[1, 2], task=[1]\n", + "EID=2, classes=[0, 8], task=[2]\n", + "EID=3, classes=[9, 3], task=[3]\n", "EID=4, classes=[4, 7], task=[4]\n" ] } @@ -246,8 +248,10 @@ " # the experience provides a dataset\n", " print(f\"data: {len(exp.dataset)} samples\")\n", "\n", + "print()\n", + "print(f'--- Stream: {bm.test_stream.name}')\n", "for exp in bm.test_stream:\n", - " print(f\"EID={exp.current_experience}, classes={exp.classes_in_this_experience}, task={tls}\")\n" + " print(f\"EID={exp.current_experience}, classes={exp.classes_in_this_experience}, task={exp.task_labels}\")\n" ] }, { @@ -12558,7 +12562,7 @@ " print(f\"\\tsize: {len(exp.dataset)}\")\n", "\n", " # in a training loop, here you would train on the online_train_stream\n", - " # here you would test on bm.valid_stream or bm.test_stream " + " # here you would test on bm.valid_stream or bm.test_stream" ] }, { diff --git a/notebooks/from-zero-to-hero-tutorial/05_evaluation.ipynb b/notebooks/from-zero-to-hero-tutorial/05_evaluation.ipynb index 35ed01a1e..d6cdfdb0a 100644 --- a/notebooks/from-zero-to-hero-tutorial/05_evaluation.ipynb +++ b/notebooks/from-zero-to-hero-tutorial/05_evaluation.ipynb @@ -566,7 +566,7 @@ "\n", "# DEFINE THE EVALUATION PLUGIN\n", "# The evaluation plugin manages the metrics computation.\n", - "# It takes as argument a list of metrics, collectes their results and returns\n", + "# It takes as argument a list of metrics, collects their results and returns\n", "# them to the strategy it is attached to.\n", "\n", "eval_plugin = EvaluationPlugin(\n", diff --git a/notebooks/from-zero-to-hero-tutorial/06_loggers.ipynb b/notebooks/from-zero-to-hero-tutorial/06_loggers.ipynb index 61e43fefa..7ac80ec76 100644 --- a/notebooks/from-zero-to-hero-tutorial/06_loggers.ipynb +++ b/notebooks/from-zero-to-hero-tutorial/06_loggers.ipynb @@ -86,7 +86,7 @@ "\n", "# DEFINE THE EVALUATION PLUGIN and LOGGERS\n", "# The evaluation plugin manages the metrics computation.\n", - "# It takes as argument a list of metrics, collectes their results and returns\n", + "# It takes as argument a list of metrics, collects their results and returns\n", "# them to the strategy it is attached to.\n", "\n", "\n", diff --git a/notebooks/from-zero-to-hero-tutorial/07_putting-all-together.ipynb b/notebooks/from-zero-to-hero-tutorial/07_putting-all-together.ipynb index 59a9492f1..97a42faec 100644 --- a/notebooks/from-zero-to-hero-tutorial/07_putting-all-together.ipynb +++ b/notebooks/from-zero-to-hero-tutorial/07_putting-all-together.ipynb @@ -429,7 +429,7 @@ "\n", "# DEFINE THE EVALUATION PLUGIN and LOGGERS\n", "# The evaluation 
plugin manages the metrics computation.\n", - "# It takes as argument a list of metrics, collectes their results and returns\n", + "# It takes as argument a list of metrics, collects their results and returns\n", "# them to the strategy it is attached to.\n", "\n", "# log to Tensorboard\n", diff --git a/notebooks/getting-started/learn-avalanche-in-5-minutes.ipynb b/notebooks/getting-started/learn-avalanche-in-5-minutes.ipynb index 41238bb3a..aa4d655a7 100644 --- a/notebooks/getting-started/learn-avalanche-in-5-minutes.ipynb +++ b/notebooks/getting-started/learn-avalanche-in-5-minutes.ipynb @@ -836,7 +836,7 @@ "\n", "# DEFINE THE EVALUATION PLUGIN and LOGGERS\n", "# The evaluation plugin manages the metrics computation.\n", - "# It takes as argument a list of metrics, collectes their results and returns \n", + "# It takes as argument a list of metrics, collects their results and returns \n", "# them to the strategy it is attached to.\n", "\n", "# log to Tensorboard\n", diff --git a/notebooks/how-tos/avalanchedataset/avalanche-datasets.ipynb b/notebooks/how-tos/avalanchedataset/avalanche-datasets.ipynb index d6aa4a99c..7d585c167 100644 --- a/notebooks/how-tos/avalanchedataset/avalanche-datasets.ipynb +++ b/notebooks/how-tos/avalanchedataset/avalanche-datasets.ipynb @@ -75,7 +75,7 @@ "# Create the Dataset\n", "torch_data = TensorDataset(x_data, y_data)\n", "\n", - "avl_data = AvalancheDataset(torch_data)" + "avl_data = AvalancheDataset([torch_data])" ] }, {
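As a quick check of the list-based constructor used in the notebook cell above, here is a minimal sketch, assuming `AvalancheDataset` is importable from `avalanche.benchmarks.utils` as in the earlier tutorial imports.

```python
# Minimal sketch: wrap a plain PyTorch dataset in an AvalancheDataset.
# The constructor receives a *list* of datasets, matching the updated notebook cell.
import torch
from torch.utils.data import TensorDataset

from avalanche.benchmarks.utils import AvalancheDataset  # assumed import path

x_data = torch.rand(32, 3)
y_data = torch.randint(0, 10, (32,))
torch_data = TensorDataset(x_data, y_data)

avl_data = AvalancheDataset([torch_data])

print(len(avl_data))  # 32
print(avl_data[0])    # (x, y) pair from the underlying TensorDataset
```

Passing the bare dataset (without the list) is the older calling convention; the list form is the one shown in the updated cell.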