From a12b4f28b5504c47f592a03fd01faabd64020659 Mon Sep 17 00:00:00 2001 From: Cathal OBrien Date: Thu, 21 Nov 2024 12:41:03 +0000 Subject: [PATCH 1/8] reduce decoder mem usage declare an empty accum tensor outside the for loop. the old way of having out and out1 results in two copies of the array which results in more memory use. at 9km this added 6gb to peak mem usage --- src/anemoi/models/layers/block.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/anemoi/models/layers/block.py b/src/anemoi/models/layers/block.py index 60446d6c..0d7df22f 100644 --- a/src/anemoi/models/layers/block.py +++ b/src/anemoi/models/layers/block.py @@ -512,8 +512,9 @@ def forward( edge_attr_list, edge_index_list = sort_edges_1hop_chunks( num_nodes=size, edge_attr=edges, edge_index=edge_index, num_chunks=num_chunks ) + out=torch.zeros((x[1].shape[0], self.num_heads, self.out_channels_conv), device=x[1].device) for i in range(num_chunks): - out1 = self.conv( + out = self.conv( query=query, key=key, value=value, @@ -521,9 +522,6 @@ def forward( edge_index=edge_index_list[i], size=size, ) - if i == 0: - out = torch.zeros_like(out1, device=out1.device) - out = out + out1 else: out = self.conv(query=query, key=key, value=value, edge_attr=edges, edge_index=edge_index, size=size) From f7ce0937171a581dda473b3cdfc7c6e37e9f0b35 Mon Sep 17 00:00:00 2001 From: Cathal OBrien Date: Thu, 21 Nov 2024 12:42:54 +0000 Subject: [PATCH 2/8] typo --- src/anemoi/models/layers/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anemoi/models/layers/block.py b/src/anemoi/models/layers/block.py index 0d7df22f..c21ff19c 100644 --- a/src/anemoi/models/layers/block.py +++ b/src/anemoi/models/layers/block.py @@ -514,7 +514,7 @@ def forward( ) out=torch.zeros((x[1].shape[0], self.num_heads, self.out_channels_conv), device=x[1].device) for i in range(num_chunks): - out = self.conv( + out += self.conv( query=query, key=key, value=value, From 
a81ab4e74249e97b3a180767591a4e9779fe457e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:59:41 +0000 Subject: [PATCH 3/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/anemoi/models/layers/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anemoi/models/layers/block.py b/src/anemoi/models/layers/block.py index c21ff19c..72e487d2 100644 --- a/src/anemoi/models/layers/block.py +++ b/src/anemoi/models/layers/block.py @@ -512,7 +512,7 @@ def forward( edge_attr_list, edge_index_list = sort_edges_1hop_chunks( num_nodes=size, edge_attr=edges, edge_index=edge_index, num_chunks=num_chunks ) - out=torch.zeros((x[1].shape[0], self.num_heads, self.out_channels_conv), device=x[1].device) + out = torch.zeros((x[1].shape[0], self.num_heads, self.out_channels_conv), device=x[1].device) for i in range(num_chunks): out += self.conv( query=query, From 7a86cf35fe3cb6e5ceb9072947f9c51d4a4cf3b0 Mon Sep 17 00:00:00 2001 From: Cathal OBrien Date: Thu, 28 Nov 2024 16:02:35 +0000 Subject: [PATCH 4/8] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 07ee5709..3715757e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ Keep it human-readable, your future self will thank you! 
- Update copyright notice - Fix `__version__` import in init - Fix missing copyrights [#71](https://github.com/ecmwf/anemoi-models/pull/71) +- Reduced memory usage when using chunking in the mapper [#84](https://github.com/ecmwf/anemoi-models/pull/84) ### Removed From 0bd386b8dbbc98d840cc643d1544cb6808ff0065 Mon Sep 17 00:00:00 2001 From: Baudouin Raoult Date: Fri, 13 Dec 2024 10:07:32 +0000 Subject: [PATCH 5/8] add support for storing numpy arrays in the checkpoints --- src/anemoi/models/interface/__init__.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/anemoi/models/interface/__init__.py b/src/anemoi/models/interface/__init__.py index 25b7852a..cd8fffb2 100644 --- a/src/anemoi/models/interface/__init__.py +++ b/src/anemoi/models/interface/__init__.py @@ -37,6 +37,8 @@ class AnemoiModelInterface(torch.nn.Module): Statistics for the data. metadata : dict Metadata for the model. + supporting_arrays : dict + Numpy arrays to store in the checkpoint. data_indices : dict Indices for the data. 
pre_processors : Processors @@ -48,7 +50,14 @@ class AnemoiModelInterface(torch.nn.Module): """ def __init__( - self, *, config: DotDict, graph_data: HeteroData, statistics: dict, data_indices: dict, metadata: dict + self, + *, + config: DotDict, + graph_data: HeteroData, + statistics: dict, + data_indices: dict, + metadata: dict, + supporting_arrays: dict = {}, ) -> None: super().__init__() self.config = config @@ -57,6 +66,7 @@ def __init__( self.graph_data = graph_data self.statistics = statistics self.metadata = metadata + self.supporting_arrays = supporting_arrays self.data_indices = data_indices self._build_model() From f3b2ec1b833dad72dcbbc561981aa456f1ee6b38 Mon Sep 17 00:00:00 2001 From: Baudouin Raoult Date: Fri, 13 Dec 2024 10:24:40 +0000 Subject: [PATCH 6/8] add support for storing numpy arrays in the checkpoints --- src/anemoi/models/interface/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/anemoi/models/interface/__init__.py b/src/anemoi/models/interface/__init__.py index cd8fffb2..261dec29 100644 --- a/src/anemoi/models/interface/__init__.py +++ b/src/anemoi/models/interface/__init__.py @@ -57,7 +57,7 @@ def __init__( statistics: dict, data_indices: dict, metadata: dict, - supporting_arrays: dict = {}, + supporting_arrays: dict = None, ) -> None: super().__init__() self.config = config @@ -66,7 +66,7 @@ def __init__( self.graph_data = graph_data self.statistics = statistics self.metadata = metadata - self.supporting_arrays = supporting_arrays + self.supporting_arrays = supporting_arrays if supporting_arrays is not None else {} self.data_indices = data_indices self._build_model() From 9d73c27131ab5c72f72046a613c79a3aeb4006ad Mon Sep 17 00:00:00 2001 From: Mario Santa Cruz <48736305+JPXKQX@users.noreply.github.com> Date: Fri, 13 Dec 2024 12:09:15 +0000 Subject: [PATCH 7/8] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6134e233..088c9e94 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ Keep it human-readable, your future self will thank you! - Added dynamic NaN masking for the imputer class with two new classes: DynamicInputImputer, DynamicConstantImputer [#89](https://github.com/ecmwf/anemoi-models/pull/89) - New `NamedNodesAttributes` class to handle node attributes in a more flexible way [#64](https://github.com/ecmwf/anemoi-models/pull/64) - Contributors file [#69](https://github.com/ecmwf/anemoi-models/pull/69) +- Added `supporting_arrays` argument, which contains arrays to store in checkpoints. [#97](https://github.com/ecmwf/anemoi-models/pull/97) ### Changed - Bugfixes for CI From fcdd6a774f7c95d53f978b7a8aed68e7dc2373fb Mon Sep 17 00:00:00 2001 From: Mario Santa Cruz <48736305+JPXKQX@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:58:28 +0100 Subject: [PATCH 8/8] fix: update 0.4.0 changelog release (#93) * fix: update 0.4.0 changelog release * fix: update PR84 --- CHANGELOG.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6134e233..c5e61284 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,13 +8,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Please add your functional changes to the appropriate section in the PR. Keep it human-readable, your future self will thank you! 
-## [Unreleased](https://github.com/ecmwf/anemoi-models/compare/0.3.0...HEAD) - -- Add synchronisation workflow +## [Unreleased](https://github.com/ecmwf/anemoi-models/compare/0.4.0...HEAD) ### Added - New AnemoiModelEncProcDecHierarchical class available in models [#37](https://github.com/ecmwf/anemoi-models/pull/37) +- Mask NaN values in training loss function [#56](https://github.com/ecmwf/anemoi-models/pull/56) +- Added dynamic NaN masking for the imputer class with two new classes: DynamicInputImputer, DynamicConstantImputer [#89](https://github.com/ecmwf/anemoi-models/pull/89) +- Reduced memory usage when using chunking in the mapper [#84](https://github.com/ecmwf/anemoi-models/pull/84) + +## [0.4.0](https://github.com/ecmwf/anemoi-models/compare/0.3.0...0.4.0) - Improvements to Model Design + +### Added + +- Add synchronisation workflow [#60](https://github.com/ecmwf/anemoi-models/pull/60) - Add anemoi-transform link to documentation - Codeowners file - Pygrep precommit hooks @@ -23,8 +30,6 @@ Keep it human-readable, your future self will thank you! - configurabilty of the dropout probability in the the MultiHeadSelfAttention module - Variable Bounding as configurable model layers [#13](https://github.com/ecmwf/anemoi-models/issues/13) - GraphTransformerMapperBlock chunking to reduce memory usage during inference [#46](https://github.com/ecmwf/anemoi-models/pull/46) -- Mask NaN values in training loss function [#271](https://github.com/ecmwf-lab/aifs-mono/issues/271) -- Added dynamic NaN masking for the imputer class with two new classes: DynamicInputImputer, DynamicConstantImputer [#89](https://github.com/ecmwf/anemoi-models/pull/89) - New `NamedNodesAttributes` class to handle node attributes in a more flexible way [#64](https://github.com/ecmwf/anemoi-models/pull/64) - Contributors file [#69](https://github.com/ecmwf/anemoi-models/pull/69) @@ -38,7 +43,6 @@ Keep it human-readable, your future self will thank you! 
- Update copyright notice - Fix `__version__` import in init - Fix missing copyrights [#71](https://github.com/ecmwf/anemoi-models/pull/71) -- Reduced memory usage when using chunking in the mapper [#84](https://github.com/ecmwf/anemoi-models/pull/84) ### Removed