From 26a23aaef4ba31f0a975ea19312c06ad923f5798 Mon Sep 17 00:00:00 2001 From: Jusong Yu Date: Fri, 22 Dec 2023 20:00:06 +0100 Subject: [PATCH] `PseudoPotentialData`: Base hash only on file content and element (#168) By default, the hash for data nodes is computed from a set of attributes that includes the version of `aiida-core` as well as the plugin package that provides the data plugin, if applicable. This means that the hash of a data node changes merely if the version of `aiida-core` or the plugin package changes, even if its content is identical. This choice was made in `aiida-core` to err on the safe side. However, for pseudopotentials, this is unnecessary and actually reduces the effectiveness of caching. If a calculation is run with pseudos that has already run before, but the version of `aiida-pseudo` is different with which the pseudo was installed, the calculation will not be cached but run again. The `PseudoPotentialData` now customizes the caching class, ensuring the hash is calculated just based on the element and the file content. This should uniquely define a pseudopotential and so should be safe enough to base the hash upon. Co-authored-by: Sebastiaan Huber --- src/aiida_pseudo/data/pseudo/pseudo.py | 13 +++++++++++++ tests/data/pseudo/test_pseudo.py | 25 +++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/src/aiida_pseudo/data/pseudo/pseudo.py b/src/aiida_pseudo/data/pseudo/pseudo.py index 7719f1c..4c37f0d 100644 --- a/src/aiida_pseudo/data/pseudo/pseudo.py +++ b/src/aiida_pseudo/data/pseudo/pseudo.py @@ -1,4 +1,6 @@ """Base class for data types representing pseudo potentials.""" +from __future__ import annotations + import io import pathlib import typing @@ -7,16 +9,27 @@ from aiida.common.constants import elements from aiida.common.exceptions import StoringNotAllowed from aiida.common.files import md5_from_filelike +from aiida.orm.nodes.caching import NodeCaching __all__ = ('PseudoPotentialData',) +class PseudoPotentialDataCaching(NodeCaching): + """Class to define caching behavior of ``PseudoPotentialData`` nodes.""" + + def _get_objects_to_hash(self) -> list: + """Return a list of objects which should be included in the node hash.""" + return [self._node.element, self._node.md5] + + class PseudoPotentialData(plugins.DataFactory('core.singlefile')): """Base class for data types representing pseudo potentials.""" _key_element = 'element' _key_md5 = 'md5' + _CLS_NODE_CACHING = PseudoPotentialDataCaching + @classmethod def get_or_create( cls, source: typing.Union[str, pathlib.Path, typing.BinaryIO], filename: typing.Optional[str] = None diff --git a/tests/data/pseudo/test_pseudo.py b/tests/data/pseudo/test_pseudo.py index db0e992..e003769 100644 --- a/tests/data/pseudo/test_pseudo.py +++ b/tests/data/pseudo/test_pseudo.py @@ -209,3 +209,28 @@ def test_get_or_create(get_pseudo_potential_data): assert isinstance(different_class, PseudoPotentialData) assert not different_class.is_stored assert different_class.uuid != original.uuid + + +@pytest.mark.parametrize( + 'stream, filename, element, are_equal', + ( + (b'content', 'filename.pseudo', 'Ar', True), + (b'content', 'filename.different', 'Ar', True), # The filename should not affect the hash + (b'contenta', 'filename.pseudo', 'Ar', False), # Different content should mean a different hash + (b'content', 'filename.pseudo', 'Kr', False), # A different element should mean a different hash + ), +) +def test_hash(stream, filename, element, are_equal): + """Test the behavior of the hash of ``PseudoPotentialData`` nodes. + + The hash should only be based on the contents of the file and the element. + """ + left = PseudoPotentialData(io.BytesIO(b'content'), filename='filename.pseudo') + left.element = 'Ar' + left.store() + + right = PseudoPotentialData(io.BytesIO(stream), filename=filename) + right.element = element + right.store() + + assert (left.base.caching.get_hash() == right.base.caching.get_hash()) is are_equal