From 188a45f9b8d35faef4952848e0cad85b05bb1c78 Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 4 Oct 2024 21:15:04 -0500
Subject: [PATCH 01/27] Update fabric.py

---
 src/lightning/fabric/fabric.py | 42 +++++++++++++++-------------------
 1 file changed, 19 insertions(+), 23 deletions(-)

diff --git a/src/lightning/fabric/fabric.py b/src/lightning/fabric/fabric.py
index 0ff5b04b30b0a..9555082ecc5e1 100644
--- a/src/lightning/fabric/fabric.py
+++ b/src/lightning/fabric/fabric.py
@@ -38,6 +38,7 @@
 from lightning_utilities.core.overrides import is_overridden
 from torch import Tensor
 from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LRScheduler
 from torch.utils.data import BatchSampler, DataLoader, DistributedSampler, RandomSampler, SequentialSampler

 import lightning.fabric
@@ -208,71 +209,66 @@ def run(self, *args: Any, **kwargs: Any) -> Any:
         """

-    def setup(
-        self,
-        module: nn.Module,
-        *optimizers: Optimizer,
-        move_to_device: bool = True,
-        _reapply_compile: bool = True,
-    ) -> Any:  # no specific return because the way we want our API to look does not play well with mypy
+    def setup(self, module: nn.Module, *optimizers: Optimizer, scheduler: Optional[LRScheduler] = None, move_to_device: bool = True, _reapply_compile: bool = True,) -> Any:  # no specific return because the way we want our API to look does not play well with mypy
         r"""Set up a model and its optimizers for accelerated training.
-
+
         Args:
             module: A :class:`torch.nn.Module` to set up
             *optimizers: The optimizer(s) to set up (no optimizers is also possible)
+            scheduler: The learning rate scheduler to set up (no learning rate scheduler is also possible)
             move_to_device: If set ``True`` (default), moves the model to the correct device. Set this to ``False``
                 and alternatively use :meth:`to_device` manually.
             _reapply_compile: If ``True`` (default), and the model was ``torch.compile``d before, the
                 corresponding :class:`~torch._dynamo.OptimizedModule` wrapper will be removed and reapplied with the
                 same settings after the model was set up by the strategy (e.g., after the model was wrapped by DDP,
                 FSDP etc.). Set it to ``False`` if compiling DDP/FSDP is causing issues.
-
+
         Returns:
             The tuple containing wrapped module and the optimizers, in the same order they were passed in.
- + """ self._validate_setup(module, optimizers) module, compile_kwargs = _unwrap_compiled(module) if _reapply_compile else (module, None) original_module = module - + module = self._precision.convert_module(module) - + if move_to_device: module = self._move_model_to_device(model=module, optimizers=list(optimizers)) - + # Let accelerator/plugin wrap and connect the models and optimizers if optimizers: - module, optimizers = self._strategy.setup_module_and_optimizers( # type: ignore[assignment] - module, list(optimizers) + module, optimizers, scheduler = self._strategy.setup_module_and_optimizers( # type: ignore[assignment] + module, list(optimizers), scheduler ) else: module = self._strategy.setup_module(module) - + if compile_kwargs is not None: module = _to_compiled(module, compile_kwargs) module = _FabricModule(module, self._strategy, original_module=original_module) - + # Update the _DeviceDtypeModuleMixin's device parameter # NOTE: for sharded strategies or manual device placement, there's no single root device _update_properties( module, device=self.device if move_to_device else next(module.parameters(), torch.tensor(0)).device ) - + optimizers = [_FabricOptimizer(optimizer, self._strategy, self._callbacks) for optimizer in optimizers] - + self._models_setup += 1 - + if hasattr(original_module, "_fabric"): # this is probably a LightningModule original_module._fabric = self original_module._fabric_optimizers = optimizers if original_module not in self._callbacks: self._callbacks.append(original_module) - + self.call("on_after_setup", fabric=self, module=module) - + if optimizers: # join both types in a tuple for API convenience - return (module, *optimizers) + return (module, *optimizers, scheduler) return module def setup_module( From baf5988cd6e8e31e57e2c1ecb652c0217153b96f Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Fri, 4 Oct 2024 21:17:58 -0500 Subject: [PATCH 02/27] Update deepspeed.py --- src/lightning/fabric/strategies/deepspeed.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py index 93a17f10c8998..666bee6bd9d27 100644 --- a/src/lightning/fabric/strategies/deepspeed.py +++ b/src/lightning/fabric/strategies/deepspeed.py @@ -25,6 +25,7 @@ from lightning_utilities.core.imports import RequirementCache from torch.nn import Module from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler from typing_extensions import override from lightning.fabric.accelerators import Accelerator, CUDAAccelerator @@ -310,27 +311,25 @@ def model(self) -> "DeepSpeedEngine": return self._deepspeed_engine @override - def setup_module_and_optimizers( - self, module: Module, optimizers: List[Optimizer] - ) -> Tuple["DeepSpeedEngine", List[Optimizer]]: - """Set up a model and multiple optimizers together. - + def setup_module_and_optimizers(self, module: Module, optimizers: List[Optimizer], scheduler: Optional[LRScheduler] = None) -> Tuple["DeepSpeedEngine", List[Optimizer], Optional[LRScheduler]]: + """Set up a model and multiple optimizers together along with an optional learning rate scheduler. + Currently, only a single optimizer is supported. - + Return: The model wrapped into a :class:`deepspeed.DeepSpeedEngine` and a list with a single deepspeed optimizer. - + """ if len(optimizers) != 1: raise ValueError( f"Currently only one optimizer is supported with DeepSpeed." f" Got {len(optimizers)} optimizers instead." 
             )
-
-        self._deepspeed_engine, optimizer = self._initialize_engine(module, optimizers[0])
+
+        self._deepspeed_engine, optimizer, scheduler = self._initialize_engine(module, optimizers[0], scheduler)
         self._set_deepspeed_activation_checkpointing()
-        return self._deepspeed_engine, [optimizer]
+        return self._deepspeed_engine, [optimizer], scheduler

     @override
     def setup_module(self, module: Module) -> "DeepSpeedEngine":

From 1f4c18e03a9fb970b6922eca9f9245cacd17b9a6 Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 4 Oct 2024 21:22:03 -0500
Subject: [PATCH 03/27] Update deepspeed.py

---
 src/lightning/fabric/strategies/deepspeed.py | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py
index 666bee6bd9d27..1ee846e079fda 100644
--- a/src/lightning/fabric/strategies/deepspeed.py
+++ b/src/lightning/fabric/strategies/deepspeed.py
@@ -25,7 +25,7 @@
 from lightning_utilities.core.imports import RequirementCache
 from torch.nn import Module
 from torch.optim import Optimizer
-from torch.optim.lr_scheduler import LRScheduler
+from torch.optim.lr_scheduler import _LRScheduler
 from typing_extensions import override

 from lightning.fabric.accelerators import Accelerator, CUDAAccelerator
@@ -311,7 +311,7 @@ def model(self) -> "DeepSpeedEngine":
         return self._deepspeed_engine

     @override
-    def setup_module_and_optimizers(self, module: Module, optimizers: List[Optimizer], scheduler: Optional[LRScheduler] = None) -> Tuple["DeepSpeedEngine", List[Optimizer], Optional[LRScheduler]]:
+    def setup_module_and_optimizers(self, module: Module, optimizers: List[Optimizer], scheduler: Optional[_LRScheduler] = None) -> Tuple["DeepSpeedEngine", List[Optimizer], Optional[_LRScheduler]]:
         """Set up a model and multiple optimizers together along with an optional learning rate scheduler.

         Currently, only a single optimizer is supported.
@@ -590,28 +590,25 @@ def register_strategies(cls, strategy_registry: _StrategyRegistry) -> None:
             offload_optimizer_device="nvme",
         )

-    def _initialize_engine(
-        self,
-        model: Module,
-        optimizer: Optional[Optimizer] = None,
-    ) -> Tuple["DeepSpeedEngine", Optimizer]:
+    def _initialize_engine(self, model: Module, optimizer: Optional[Optimizer] = None, scheduler: Optional[_LRScheduler] = None) -> Tuple["DeepSpeedEngine", Optimizer, Optional[_LRScheduler]]:
         """Initialize one model and one optimizer with an optional learning rate scheduler.
-
+
         This calls :func:`deepspeed.initialize` internally.
- + """ import deepspeed - + model_parameters = filter(lambda p: p.requires_grad, model.parameters()) - deepspeed_engine, deepspeed_optimizer, _, _ = deepspeed.initialize( + deepspeed_engine, deepspeed_optimizer, _, deepspeed_scheduler = deepspeed.initialize( args=argparse.Namespace(device_rank=self.root_device.index), config=self.config, model=model, model_parameters=model_parameters, optimizer=optimizer, + lr_scheduler=scheduler, dist_init_required=False, ) - return deepspeed_engine, deepspeed_optimizer + return deepspeed_engine, deepspeed_optimizer, deepspeed_scheduler @override def setup_environment(self) -> None: From 585e30274dafb5c76d2fe6053b466ae2850c67f4 Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Fri, 4 Oct 2024 21:22:58 -0500 Subject: [PATCH 04/27] Update fabric.py --- src/lightning/fabric/fabric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/fabric.py b/src/lightning/fabric/fabric.py index 9555082ecc5e1..96df6ec1a97fa 100644 --- a/src/lightning/fabric/fabric.py +++ b/src/lightning/fabric/fabric.py @@ -38,7 +38,7 @@ from lightning_utilities.core.overrides import is_overridden from torch import Tensor from torch.optim import Optimizer -from torch.optim.lr_scheduler import LRScheduler +from torch.optim.lr_scheduler import _LRScheduler from torch.utils.data import BatchSampler, DataLoader, DistributedSampler, RandomSampler, SequentialSampler import lightning.fabric @@ -209,7 +209,7 @@ def run(self, *args: Any, **kwargs: Any) -> Any: """ - def setup(self, module: nn.Module, *optimizers: Optimizer, scheduler: Optional[LRScheduler] = None, move_to_device: bool = True, _reapply_compile: bool = True,) -> Any: # no specific return because the way we want our API to look does not play well with mypy + def setup(self, module: nn.Module, *optimizers: Optimizer, scheduler: Optional[_LRScheduler] = None, move_to_device: bool = True, _reapply_compile: bool = True,) -> Any: # no specific return because the way we want our API to look does not play well with mypy r"""Set up a model and its optimizers for accelerated training. 

         Args:

From 045176120192e8f860d34be5d0b92b382abf2d39 Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 4 Oct 2024 21:25:44 -0500
Subject: [PATCH 05/27] Update fsdp.py

---
 src/lightning/fabric/strategies/fsdp.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py
index e7fdd29f6287f..74bfe56395020 100644
--- a/src/lightning/fabric/strategies/fsdp.py
+++ b/src/lightning/fabric/strategies/fsdp.py
@@ -39,6 +39,7 @@
 from torch import Tensor
 from torch.nn import Module
 from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
 from typing_extensions import TypeGuard, override

 from lightning.fabric.accelerators import Accelerator
@@ -267,7 +268,7 @@ def setup_environment(self) -> None:
     @override
     def setup_module_and_optimizers(
-        self, module: Module, optimizers: List[Optimizer]
+        self, module: Module, optimizers: List[Optimizer], scheduler: Optional[_LRScheduler] = None
     ) -> Tuple[Module, List[Optimizer]]:
         """Wraps the model into a :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel`
         module and sets `use_orig_params=True` to keep the reference to the original parameters in the optimizer."""

From a912aab54abcd28742616f924036b030bbc8bd43 Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 4 Oct 2024 21:27:43 -0500
Subject: [PATCH 06/27] Update strategy.py

---
 src/lightning/fabric/strategies/strategy.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lightning/fabric/strategies/strategy.py b/src/lightning/fabric/strategies/strategy.py
index 6bfed6a270b68..742fccdebc443 100644
--- a/src/lightning/fabric/strategies/strategy.py
+++ b/src/lightning/fabric/strategies/strategy.py
@@ -144,7 +144,7 @@ def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManag
         return stack

     def setup_module_and_optimizers(
-        self, module: Module, optimizers: List[Optimizer]
+        self, module: Module, optimizers: List[Optimizer], scheduler: Optional[_LRScheduler] = None
     ) -> Tuple[Module, List[Optimizer]]:
         """Set up a model and multiple optimizers together.
@@ -154,7 +154,7 @@ def setup_module_and_optimizers(
         """
         module = self.setup_module(module)
         optimizers = [self.setup_optimizer(optimizer) for optimizer in optimizers]
-        return module, optimizers
+        return module, optimizers, scheduler

     def setup_module(self, module: Module) -> Module:
         """Performs setup for the model, e.g., by wrapping it by another class."""

From d27d4a3c88e0a2866ac2062e7315877762e9ed69 Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 4 Oct 2024 21:28:15 -0500
Subject: [PATCH 07/27] Update strategy.py

---
 src/lightning/fabric/strategies/strategy.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/lightning/fabric/strategies/strategy.py b/src/lightning/fabric/strategies/strategy.py
index 742fccdebc443..440236be54901 100644
--- a/src/lightning/fabric/strategies/strategy.py
+++ b/src/lightning/fabric/strategies/strategy.py
@@ -20,6 +20,7 @@
 from torch import Tensor
 from torch.nn import Module
 from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader

 from lightning.fabric.accelerators import Accelerator

From 67089a10f90cb72c2b04fdf941db25f9db213033 Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 4 Oct 2024 21:29:19 -0500
Subject: [PATCH 08/27] Update xla_fsdp.py

---
 src/lightning/fabric/strategies/xla_fsdp.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/lightning/fabric/strategies/xla_fsdp.py b/src/lightning/fabric/strategies/xla_fsdp.py
index 6da693bafb1c8..7965f21549f09 100644
--- a/src/lightning/fabric/strategies/xla_fsdp.py
+++ b/src/lightning/fabric/strategies/xla_fsdp.py
@@ -21,6 +21,7 @@
 from torch import Tensor
 from torch.nn import Module
 from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader
 from typing_extensions import override

@@ -196,7 +197,7 @@ def setup_environment(self) -> None:
     @override
     def setup_module_and_optimizers(
-        self, module: Module, optimizers: List[Optimizer]
+        self, module: Module, optimizers: List[Optimizer], scheduler: Optional[_LRScheduler] = None
     ) -> Tuple[Module, List[Optimizer]]:
         """Returns NotImplementedError since for XLAFSDP optimizer setup must happen after module setup."""
         raise NotImplementedError(
             f"The `{type(self).__name__}` does not support the joint setup of module and optimizer(s)."

From 10258752b02c73af8aaa6fa36b6524a5847d5737 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 5 Oct 2024 02:35:46 +0000
Subject: [PATCH 09/27] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/lightning/fabric/fabric.py               | 35 ++++++++++++--------
 src/lightning/fabric/strategies/deepspeed.py | 22 +++++++-----
 2 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/src/lightning/fabric/fabric.py b/src/lightning/fabric/fabric.py
index 96df6ec1a97fa..6d0dc2dd4073f 100644
--- a/src/lightning/fabric/fabric.py
+++ b/src/lightning/fabric/fabric.py
@@ -209,9 +209,16 @@ def run(self, *args: Any, **kwargs: Any) -> Any:
         """

-    def setup(self, module: nn.Module, *optimizers: Optimizer, scheduler: Optional[_LRScheduler] = None, move_to_device: bool = True, _reapply_compile: bool = True,) -> Any:  # no specific return because the way we want our API to look does not play well with mypy
+    def setup(
+        self,
+        module: nn.Module,
+        *optimizers: Optimizer,
+        scheduler: Optional[_LRScheduler] = None,
+        move_to_device: bool = True,
+        _reapply_compile: bool = True,
+    ) -> Any:  # no specific return because the way we want our API to look does not play well with mypy
         r"""Set up a model and its optimizers for accelerated training.
-
+
         Args:
             module: A :class:`torch.nn.Module` to set up
             *optimizers: The optimizer(s) to set up (no optimizers is also possible)
@@ -222,20 +229,20 @@ def setup(self, module: nn.Module, *optimizers: Optimizer, scheduler: Optional[_
                 corresponding :class:`~torch._dynamo.OptimizedModule` wrapper will be removed and reapplied with the
                 same settings after the model was set up by the strategy (e.g., after the model was wrapped by DDP,
                 FSDP etc.). Set it to ``False`` if compiling DDP/FSDP is causing issues.
-
+
         Returns:
             The tuple containing wrapped module and the optimizers, in the same order they were passed in.
-
+
         """
         self._validate_setup(module, optimizers)
         module, compile_kwargs = _unwrap_compiled(module) if _reapply_compile else (module, None)
         original_module = module
-
+
         module = self._precision.convert_module(module)
-
+
         if move_to_device:
             module = self._move_model_to_device(model=module, optimizers=list(optimizers))
-
+
         # Let accelerator/plugin wrap and connect the models and optimizers
         if optimizers:
             module, optimizers, scheduler = self._strategy.setup_module_and_optimizers(  # type: ignore[assignment]
@@ -243,29 +250,29 @@ def setup(self, module: nn.Module, *optimizers: Optimizer, scheduler: Optional[_
             )
         else:
             module = self._strategy.setup_module(module)
-
+
         if compile_kwargs is not None:
             module = _to_compiled(module, compile_kwargs)
         module = _FabricModule(module, self._strategy, original_module=original_module)
-
+
         # Update the _DeviceDtypeModuleMixin's device parameter
         # NOTE: for sharded strategies or manual device placement, there's no single root device
         _update_properties(
             module, device=self.device if move_to_device else next(module.parameters(), torch.tensor(0)).device
         )
-
+
         optimizers = [_FabricOptimizer(optimizer, self._strategy, self._callbacks) for optimizer in optimizers]
-
+
         self._models_setup += 1
-
+
         if hasattr(original_module, "_fabric"):  # this is probably a LightningModule
             original_module._fabric = self
             original_module._fabric_optimizers = optimizers
             if original_module not in self._callbacks:
                 self._callbacks.append(original_module)
-
+
         self.call("on_after_setup", fabric=self, module=module)
-
+
         if optimizers:
             # join both types in a tuple for API convenience
             return (module, *optimizers, scheduler)
diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py
index 1ee846e079fda..eca16e9b9a4ef 100644
--- a/src/lightning/fabric/strategies/deepspeed.py
+++ b/src/lightning/fabric/strategies/deepspeed.py
@@ -311,22 +311,24 @@ def model(self) -> "DeepSpeedEngine":
         return self._deepspeed_engine

     @override
-    def setup_module_and_optimizers(self, module: Module, optimizers: List[Optimizer], scheduler: Optional[_LRScheduler] = None) -> Tuple["DeepSpeedEngine", List[Optimizer], Optional[_LRScheduler]]:
+    def setup_module_and_optimizers(
+        self, module: Module, optimizers: List[Optimizer], scheduler: Optional[_LRScheduler] = None
+    ) -> Tuple["DeepSpeedEngine", List[Optimizer], Optional[_LRScheduler]]:
         """Set up a model and multiple optimizers together along with an optional learning rate scheduler.
-
+
         Currently, only a single optimizer is supported.
-
+
         Return:
             The model wrapped into a :class:`deepspeed.DeepSpeedEngine` and a list with a single
             deepspeed optimizer.
-
+
         """
         if len(optimizers) != 1:
             raise ValueError(
                 f"Currently only one optimizer is supported with DeepSpeed."
                 f" Got {len(optimizers)} optimizers instead."
             )
-
+
         self._deepspeed_engine, optimizer, scheduler = self._initialize_engine(module, optimizers[0], scheduler)
         self._set_deepspeed_activation_checkpointing()
         return self._deepspeed_engine, [optimizer], scheduler
@@ -590,14 +592,16 @@ def register_strategies(cls, strategy_registry: _StrategyRegistry) -> None:
             offload_optimizer_device="nvme",
         )

-    def _initialize_engine(self, model: Module, optimizer: Optional[Optimizer] = None, scheduler: Optional[_LRScheduler] = None) -> Tuple["DeepSpeedEngine", Optimizer, Optional[_LRScheduler]]:
+    def _initialize_engine(
+        self, model: Module, optimizer: Optional[Optimizer] = None, scheduler: Optional[_LRScheduler] = None
+    ) -> Tuple["DeepSpeedEngine", Optimizer, Optional[_LRScheduler]]:
         """Initialize one model and one optimizer with an optional learning rate scheduler.
-
+
         This calls :func:`deepspeed.initialize` internally.
-
+
         """
         import deepspeed
-
+
         model_parameters = filter(lambda p: p.requires_grad, model.parameters())
         deepspeed_engine, deepspeed_optimizer, _, deepspeed_scheduler = deepspeed.initialize(
             args=argparse.Namespace(device_rank=self.root_device.index),
             config=self.config,
             model=model,
             model_parameters=model_parameters,
             optimizer=optimizer,

From 9b45b9920a9dd8bdd056e9a153342743980df044 Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 4 Oct 2024 22:21:00 -0500
Subject: [PATCH 10/27] Update fsdp.py

---
 src/lightning/fabric/strategies/fsdp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py
index 74bfe56395020..6efc372db627b 100644
--- a/src/lightning/fabric/strategies/fsdp.py
+++ b/src/lightning/fabric/strategies/fsdp.py
@@ -269,7 +269,7 @@ def setup_environment(self) -> None:
     @override
     def setup_module_and_optimizers(
         self, module: Module, optimizers: List[Optimizer], scheduler: Optional[_LRScheduler] = None
-    ) -> Tuple[Module, List[Optimizer]]:
+    ) -> Tuple[Module, List[Optimizer], Optional[_LRScheduler]]:
         """Wraps the model into a :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel`
         module and sets `use_orig_params=True` to keep the reference to the original parameters in the optimizer."""
         use_orig_params = self._fsdp_kwargs.get("use_orig_params")
@@ -281,7 +281,7 @@ def setup_module_and_optimizers(
                 " call `setup_optimizer`."
             )
         module = self.setup_module(module)
-        return module, optimizers
+        return module, optimizers, scheduler

     @override
     def setup_module(self, module: Module) -> Module:

From a7a58351c8b4bd3b2fac05cacd889caa65aea24b Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 4 Oct 2024 22:21:55 -0500
Subject: [PATCH 11/27] Update strategy.py

---
 src/lightning/fabric/strategies/strategy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lightning/fabric/strategies/strategy.py b/src/lightning/fabric/strategies/strategy.py
index 440236be54901..96e856e68e5af 100644
--- a/src/lightning/fabric/strategies/strategy.py
+++ b/src/lightning/fabric/strategies/strategy.py
@@ -146,7 +146,7 @@ def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManag
     def setup_module_and_optimizers(
         self, module: Module, optimizers: List[Optimizer], scheduler: Optional[_LRScheduler] = None
-    ) -> Tuple[Module, List[Optimizer]]:
+    ) -> Tuple[Module, List[Optimizer], Optional[_LRScheduler]]:
         """Set up a model and multiple optimizers together.

         The returned objects are expected to be in the same order they were passed in. The default implementation will

From 3ece31c9d0de815fcd68db83e444849337371d7a Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 4 Oct 2024 22:22:19 -0500
Subject: [PATCH 12/27] Update xla_fsdp.py

---
 src/lightning/fabric/strategies/xla_fsdp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lightning/fabric/strategies/xla_fsdp.py b/src/lightning/fabric/strategies/xla_fsdp.py
index 7965f21549f09..b2236aedab43f 100644
--- a/src/lightning/fabric/strategies/xla_fsdp.py
+++ b/src/lightning/fabric/strategies/xla_fsdp.py
@@ -198,7 +198,7 @@ def setup_environment(self) -> None:
     @override
     def setup_module_and_optimizers(
         self, module: Module, optimizers: List[Optimizer], scheduler: Optional[_LRScheduler] = None
-    ) -> Tuple[Module, List[Optimizer]]:
+    ) -> Tuple[Module, List[Optimizer], Optional[_LRScheduler]]:
         """Returns NotImplementedError since for XLAFSDP optimizer setup must happen after module setup."""
         raise NotImplementedError(
             f"The `{type(self).__name__}` does not support the joint setup of module and optimizer(s)."

From e48acd297c3d1e42cbbdcf2daf9825327dead0cd Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 4 Oct 2024 22:28:42 -0500
Subject: [PATCH 13/27] Update deepspeed.py

---
 src/lightning/fabric/strategies/deepspeed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py
index eca16e9b9a4ef..e0d9d0091657f 100644
--- a/src/lightning/fabric/strategies/deepspeed.py
+++ b/src/lightning/fabric/strategies/deepspeed.py
@@ -340,7 +340,7 @@ def setup_module(self, module: Module) -> "DeepSpeedEngine":
         For training, see :meth:`setup_module_and_optimizers`.

         """
-        self._deepspeed_engine, _ = self._initialize_engine(module)
+        self._deepspeed_engine, _, _ = self._initialize_engine(module)
         return self._deepspeed_engine

     @override

From f13516d302419b00206239fd31e8b01b879cb554 Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Mon, 28 Oct 2024 11:25:34 -0500
Subject: [PATCH 14/27] Update seed.py

---
 src/lightning/fabric/utilities/seed.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/lightning/fabric/utilities/seed.py b/src/lightning/fabric/utilities/seed.py
index a2d627828a77e..28e988705e179 100644
--- a/src/lightning/fabric/utilities/seed.py
+++ b/src/lightning/fabric/utilities/seed.py
@@ -91,8 +91,9 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None:
     """
     # implementation notes: https://github.com/pytorch/pytorch/issues/5059#issuecomment-817392562
+    assert "PL_GLOBAL_SEED" in os.environ, "`seed_everything(seed, workers=True)` must be called before training to use this function."
     global_rank = rank if rank is not None else rank_zero_only.rank
-    process_seed = torch.initial_seed()
+    process_seed = int(os.environ["PL_GLOBAL_SEED"])
     # back out the base seed so we can use all the bits
     base_seed = process_seed - worker_id
     log.debug(

From 80b4a6df2a413800d67df0c37fc32567a7aae25c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 28 Oct 2024 16:25:55 +0000
Subject: [PATCH 15/27] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/lightning/fabric/utilities/seed.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/lightning/fabric/utilities/seed.py b/src/lightning/fabric/utilities/seed.py
index 28e988705e179..9bacd7aecf554 100644
--- a/src/lightning/fabric/utilities/seed.py
+++ b/src/lightning/fabric/utilities/seed.py
@@ -91,7 +91,9 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None:
     """
     # implementation notes: https://github.com/pytorch/pytorch/issues/5059#issuecomment-817392562
-    assert "PL_GLOBAL_SEED" in os.environ, "`seed_everything(seed, workers=True)` must be called before training to use this function."
+    assert (
+        "PL_GLOBAL_SEED" in os.environ
+    ), "`seed_everything(seed, workers=True)` must be called before training to use this function."
     global_rank = rank if rank is not None else rank_zero_only.rank
     process_seed = int(os.environ["PL_GLOBAL_SEED"])
     # back out the base seed so we can use all the bits

From 2cab7e21235afc92cbd8cc014ba831871faee25d Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Mon, 28 Oct 2024 11:51:15 -0500
Subject: [PATCH 16/27] Update seed.py

---
 src/lightning/fabric/utilities/seed.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/lightning/fabric/utilities/seed.py b/src/lightning/fabric/utilities/seed.py
index 9bacd7aecf554..77ee65ca555ac 100644
--- a/src/lightning/fabric/utilities/seed.py
+++ b/src/lightning/fabric/utilities/seed.py
@@ -91,11 +91,12 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None:
     """
     # implementation notes: https://github.com/pytorch/pytorch/issues/5059#issuecomment-817392562
-    assert (
-        "PL_GLOBAL_SEED" in os.environ
-    ), "`seed_everything(seed, workers=True)` must be called before training to use this function."
     global_rank = rank if rank is not None else rank_zero_only.rank
-    process_seed = int(os.environ["PL_GLOBAL_SEED"])
+    env_seed = os.environ.get("PL_GLOBAL_SEED", None)
+    if env_seed is None:
+        env_seed = "0"
+        rank_zero_warn(f"No seed found, worker seed set to {env_seed}")
+    process_seed = int(env_seed)
     # back out the base seed so we can use all the bits
     base_seed = process_seed - worker_id
     log.debug(

From e9127f4aac6b05753aab047133825a19a4d2b3f9 Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Mon, 28 Oct 2024 16:27:51 -0500
Subject: [PATCH 17/27] Update seed.py

---
 src/lightning/fabric/utilities/seed.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/lightning/fabric/utilities/seed.py b/src/lightning/fabric/utilities/seed.py
index 77ee65ca555ac..d62f1b060f828 100644
--- a/src/lightning/fabric/utilities/seed.py
+++ b/src/lightning/fabric/utilities/seed.py
@@ -108,7 +108,10 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None:
     if _NUMPY_AVAILABLE:
         import numpy as np

-        np.random.seed(seed_sequence[3] & 0xFFFFFFFF)  # numpy takes 32-bit seed only
+        ss = np.random.SeedSequence([base_seed, worker_id, global_rank])
+        np_rng_seed = ss.generate_state(4)
+
+        np.random.seed(np_rng_seed)


 def _generate_seed_sequence(base_seed: int, worker_id: int, global_rank: int, count: int) -> List[int]:

From c127458ee9b74f4c03c3bc612151ebb67a4e4a9d Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Mon, 28 Oct 2024 16:30:45 -0500
Subject: [PATCH 18/27] Update seed.py

---
 src/lightning/fabric/utilities/seed.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/lightning/fabric/utilities/seed.py b/src/lightning/fabric/utilities/seed.py
index d62f1b060f828..202f3e931e43a 100644
--- a/src/lightning/fabric/utilities/seed.py
+++ b/src/lightning/fabric/utilities/seed.py
@@ -92,11 +92,7 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None:
     """
     # implementation notes: https://github.com/pytorch/pytorch/issues/5059#issuecomment-817392562
     global_rank = rank if rank is not None else rank_zero_only.rank
-    env_seed = os.environ.get("PL_GLOBAL_SEED", None)
-    if env_seed is None:
-        env_seed = "0"
-        rank_zero_warn(f"No seed found, worker seed set to {env_seed}")
-    process_seed = int(env_seed)
+    process_seed = torch.initial_seed()
     # back out the base seed so we can use all the bits
     base_seed = process_seed - worker_id
     log.debug(

From 25e8d48f046cea17249bafefa4f18c1336d9efab Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 Nov 2024 10:33:47 +0000
Subject: [PATCH 19/27] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/lightning/fabric/strategies/deepspeed.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py
index 6c5492572616c..0742cdf500c8f 100644
--- a/src/lightning/fabric/strategies/deepspeed.py
+++ b/src/lightning/fabric/strategies/deepspeed.py
@@ -315,8 +315,8 @@ def model(self) -> "DeepSpeedEngine":
     def setup_module_and_optimizers(
         self, module: Module, optimizers: list[Optimizer], scheduler: Optional[_LRScheduler] = None
     ) -> Tuple["DeepSpeedEngine", list[Optimizer], Optional[_LRScheduler]]:
-        """Set up a model and multiple optimizers together, along with an optional learning rate scheduler.
-        Currently, only a single optimizer is supported.
+ """Set up a model and multiple optimizers together, along with an optional learning rate scheduler. Currently, + only a single optimizer is supported. Return: The model wrapped into a :class:`deepspeed.DeepSpeedEngine` and a list with a single From 737162d6f3bc7369a6ca95309f6ed22be557d34c Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Mon, 25 Nov 2024 11:35:00 +0100 Subject: [PATCH 20/27] Update src/lightning/fabric/strategies/deepspeed.py --- src/lightning/fabric/strategies/deepspeed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py index 0742cdf500c8f..d695d492d7e86 100644 --- a/src/lightning/fabric/strategies/deepspeed.py +++ b/src/lightning/fabric/strategies/deepspeed.py @@ -314,7 +314,7 @@ def model(self) -> "DeepSpeedEngine": @override def setup_module_and_optimizers( self, module: Module, optimizers: list[Optimizer], scheduler: Optional[_LRScheduler] = None - ) -> Tuple["DeepSpeedEngine", list[Optimizer], Optional[_LRScheduler]]: + ) -> tuple["DeepSpeedEngine", list[Optimizer], Optional[_LRScheduler]]: """Set up a model and multiple optimizers together, along with an optional learning rate scheduler. Currently, only a single optimizer is supported. From 2d347d0bbbccc507ad0ea5fd8fbe3cfdeaa16920 Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Mon, 25 Nov 2024 11:35:06 +0100 Subject: [PATCH 21/27] Update src/lightning/fabric/strategies/fsdp.py --- src/lightning/fabric/strategies/fsdp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 731a309b92222..b2f548c49056d 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -263,7 +263,7 @@ def setup_environment(self) -> None: @override def setup_module_and_optimizers( self, module: Module, optimizers: list[Optimizer], scheduler: Optional[_LRScheduler] = None - ) -> Tuple[Module, list[Optimizer], Optional[_LRScheduler]]: + ) -> tuple[Module, list[Optimizer], Optional[_LRScheduler]]: """Wraps the model into a :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel` module and sets `use_orig_params=True` to keep the reference to the original parameters in the optimizer.""" use_orig_params = self._fsdp_kwargs.get("use_orig_params") From 5d227ff4809ac6e97a1a959fcb8c1e7d853c217b Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Mon, 25 Nov 2024 11:35:10 +0100 Subject: [PATCH 22/27] Update src/lightning/fabric/strategies/strategy.py --- src/lightning/fabric/strategies/strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/strategies/strategy.py b/src/lightning/fabric/strategies/strategy.py index 3b3801a3e86fa..1788ffc757809 100644 --- a/src/lightning/fabric/strategies/strategy.py +++ b/src/lightning/fabric/strategies/strategy.py @@ -147,7 +147,7 @@ def module_init_context(self, empty_init: Optional[bool] = None) -> AbstractCont def setup_module_and_optimizers( self, module: Module, optimizers: list[Optimizer], scheduler: Optional[_LRScheduler] = None - ) -> Tuple[Module, list[Optimizer], Optional[_LRScheduler]]: + ) -> tuple[Module, list[Optimizer], Optional[_LRScheduler]]: """Set up a model and multiple optimizers together. The returned objects are expected to be in the same order they were passed in. 
The default implementation will From f94efa710862d769ecaa2cd411a3905183e7f7b0 Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Mon, 25 Nov 2024 11:35:16 +0100 Subject: [PATCH 23/27] Update src/lightning/fabric/strategies/xla_fsdp.py --- src/lightning/fabric/strategies/xla_fsdp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/strategies/xla_fsdp.py b/src/lightning/fabric/strategies/xla_fsdp.py index aec490a89e1b0..41c63dd01f620 100644 --- a/src/lightning/fabric/strategies/xla_fsdp.py +++ b/src/lightning/fabric/strategies/xla_fsdp.py @@ -198,7 +198,7 @@ def setup_environment(self) -> None: @override def setup_module_and_optimizers( self, module: Module, optimizers: list[Optimizer], scheduler: Optional[_LRScheduler] = None - ) -> Tuple[Module, list[Optimizer], Optional[_LRScheduler]]: + ) -> tuple[Module, list[Optimizer], Optional[_LRScheduler]]: """Returns NotImplementedError since for XLAFSDP optimizer setup must happen after module setup.""" raise NotImplementedError( f"The `{type(self).__name__}` does not support the joint setup of module and optimizer(s)." From 56464ed9da9480085baa86122c56987a6e34d5bc Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Thu, 9 Jan 2025 13:21:22 -0600 Subject: [PATCH 24/27] Update deepspeed.py --- src/lightning/fabric/strategies/deepspeed.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py index 37fa57e98201b..4b31c7a42fd64 100644 --- a/src/lightning/fabric/strategies/deepspeed.py +++ b/src/lightning/fabric/strategies/deepspeed.py @@ -318,13 +318,13 @@ def model(self) -> "DeepSpeedEngine": @override def setup_module_and_optimizers( self, module: Module, optimizers: list[Optimizer], scheduler: Optional[_LRScheduler] = None - ) -> tuple["DeepSpeedEngine", list[Optimizer], Optional[_LRScheduler]]: + ) -> tuple["DeepSpeedEngine", list[Optimizer], Any]: """Set up a model and multiple optimizers together, along with an optional learning rate scheduler. Currently, only a single optimizer is supported. Return: - The model wrapped into a :class:`deepspeed.DeepSpeedEngine` and a list with a single - deepspeed optimizer. + The model wrapped into a :class:`deepspeed.DeepSpeedEngine`, a list with a single + deepspeed optimizer, and an optional learning rate scheduler. """ if len(optimizers) != 1: @@ -597,7 +597,7 @@ def register_strategies(cls, strategy_registry: _StrategyRegistry) -> None: def _initialize_engine( self, model: Module, optimizer: Optional[Optimizer] = None, scheduler: Optional[_LRScheduler] = None - ) -> tuple["DeepSpeedEngine", Optimizer, Optional[_LRScheduler]]: + ) -> tuple["DeepSpeedEngine", Optimizer, Any]: """Initialize one model and one optimizer with an optional learning rate scheduler. This calls ``deepspeed.initialize`` internally. From e09941c55f5fdbb2786df0652baa1706653be8f5 Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Thu, 9 Jan 2025 13:27:13 -0600 Subject: [PATCH 25/27] Update fabric.py --- src/lightning/fabric/fabric.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/fabric.py b/src/lightning/fabric/fabric.py index 3aee8a4691ec6..96e73ea6ffb8e 100644 --- a/src/lightning/fabric/fabric.py +++ b/src/lightning/fabric/fabric.py @@ -225,7 +225,8 @@ def setup( FSDP etc.). Set it to ``False`` if compiling DDP/FSDP is causing issues. Returns: - The tuple containing wrapped module and the optimizers, in the same order they were passed in. 
+            The tuple containing wrapped module, optimizers, and an optional learning rate scheduler,
+            in the same order they were passed in.

         """
         self._validate_setup(module, optimizers)
@@ -269,7 +270,7 @@ def setup(
         if optimizers:
             # join both types in a tuple for API convenience
-            return (module, *optimizers, scheduler)
+            return (module, *optimizers, scheduler) if scheduler is not None else (module, *optimizers)
         return module

     def setup_module(

From 3709f1d5eb12ba435d86f69d357955e97ecef7e1 Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 10 Jan 2025 11:09:34 -0600
Subject: [PATCH 26/27] Update fabric_methods.rst

---
 docs/source-fabric/api/fabric_methods.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/source-fabric/api/fabric_methods.rst b/docs/source-fabric/api/fabric_methods.rst
index 87b22578c1202..fb03e0f61f6bd 100644
--- a/docs/source-fabric/api/fabric_methods.rst
+++ b/docs/source-fabric/api/fabric_methods.rst
@@ -40,6 +40,7 @@ Moves the model and optimizer to the correct device automatically.

     model = nn.Linear(32, 64)
     optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
+    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.3, total_iters=10)

     # Set up model and optimizer for accelerated training
     model, optimizer = fabric.setup(model, optimizer)
@@ -47,6 +48,9 @@ Moves the model and optimizer to the correct device automatically.
     # If you don't want Fabric to set the device
     model, optimizer = fabric.setup(model, optimizer, move_to_device=False)

+    # If you want to additionally register a learning rate scheduler with compatible strategies such as DeepSpeed
+    model, optimizer, scheduler = fabric.setup(model, optimizer, scheduler)
+
 The setup method also prepares the model for the selected precision choice so that operations during ``forward()``
 get cast automatically. Advanced users should read :doc:`the notes on models wrapped by Fabric <../api/wrappers>`.

From 13195a259ddd01ee83f0d5841f0fb4ed9c996c76 Mon Sep 17 00:00:00 2001
From: Alex Morehead
Date: Fri, 10 Jan 2025 11:12:14 -0600
Subject: [PATCH 27/27] Update wrappers.rst

---
 docs/source-fabric/api/wrappers.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-fabric/api/wrappers.rst b/docs/source-fabric/api/wrappers.rst
index e87874eb08666..8b20e1906072e 100644
--- a/docs/source-fabric/api/wrappers.rst
+++ b/docs/source-fabric/api/wrappers.rst
@@ -124,7 +124,7 @@ If you were to run this model in Fabric with multiple devices (DDP or FSDP), you
     # OK: Calling the model directly
     output = model(torch.randn(10))

-    # OK: Calling the model's forward (equivalent to the abvoe)
+    # OK: Calling the model's forward (equivalent to the above)
     output = model.forward(torch.randn(10))

     # ERROR: Calling another method that calls forward indirectly
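
Taken together, these patches let ``Fabric.setup`` accept an optional ``scheduler`` keyword, thread it through ``Strategy.setup_module_and_optimizers`` (DeepSpeed forwards it to ``deepspeed.initialize``, FSDP and the base strategy return it unchanged, XLA-FSDP still raises), and return it alongside the wrapped module and optimizers when one is passed. The following is a minimal usage sketch, assuming a ``lightning`` build that contains this branch; the model, optimizer, scheduler, and loop below are illustrative placeholders and are not part of the patches themselves:

    import torch
    from torch import nn
    from lightning.fabric import Fabric

    # Any strategy accepts the keyword; only scheduler-aware strategies such as
    # DeepSpeed actually register the scheduler with their engine.
    fabric = Fabric(accelerator="auto", devices=1)
    fabric.launch()

    model = nn.Linear(32, 64)  # placeholder model
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

    # With the patched signature `setup(module, *optimizers, scheduler=None, ...)`,
    # passing a scheduler makes setup() return it as the last element of the tuple.
    model, optimizer, scheduler = fabric.setup(model, optimizer, scheduler=scheduler)

    for step in range(100):
        optimizer.zero_grad()
        loss = model(torch.randn(8, 32)).sum()
        fabric.backward(loss)
        optimizer.step()
        scheduler.step()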