From 4bd9b79a5621ebce1cf6401cdc349a991c6eaaf5 Mon Sep 17 00:00:00 2001 From: Oskar Weser Date: Tue, 28 Jan 2025 15:52:36 -0500 Subject: [PATCH 01/11] Update src/quemb/molbe/chemfrag.py typo found by Minsik Co-authored-by: Minsik --- src/quemb/molbe/chemfrag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py index 3496434e..580f937a 100644 --- a/src/quemb/molbe/chemfrag.py +++ b/src/quemb/molbe/chemfrag.py @@ -249,7 +249,7 @@ def cleanup_if_subset( contain_others[i_center] |= contain_others[j_center] del contain_others[j_center] - # We know that the first element of motifs is the cetner, which should + # We know that the first element of motifs is the center, which should # stay at the first position. The rest of the motifs should be sorted. # We also remove the swallowed centers, i.e. only origins are left. cleaned_fragments = { From 68c80d47f2407677bdb6cf3380588f48726fc912 Mon Sep 17 00:00:00 2001 From: Oskar Weser Date: Tue, 28 Jan 2025 16:13:58 -0500 Subject: [PATCH 02/11] consistently renamed site -> motif and adressed Minsik's comment --- src/quemb/molbe/chemfrag.py | 20 ++++++------- tests/test_chemfrag.py | 56 ++++++++++++++++++------------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py index 3496434e..3b060bb6 100644 --- a/src/quemb/molbe/chemfrag.py +++ b/src/quemb/molbe/chemfrag.py @@ -77,13 +77,13 @@ class ConnectivityData: """Data structure to store the connectivity data of a molecule.""" #: The connectivity graph of the molecule. - bonds: Final[dict[AtomIdx, OrderedSet[AtomIdx]]] + bonds_atoms: Final[dict[AtomIdx, OrderedSet[AtomIdx]]] #: The heavy atoms/motifs in the molecule. If hydrogens are not treated differently #: then every hydrogen is also a motif on its own. motifs: Final[OrderedSet[MotifIdx]] #: The connectivity graph solely of the motifs, # i.e. of the heavy atoms when ignoring the hydrogen atoms. - motif_bonds: Final[dict[MotifIdx, OrderedSet[MotifIdx]]] + bonds_motifs: Final[dict[MotifIdx, OrderedSet[MotifIdx]]] #: The hydrogen atoms in the molecule. If hydrogens are not treated differently, #: then this is an empty set. H_atoms: Final[OrderedSet[AtomIdx]] @@ -113,23 +113,23 @@ def from_cartesian(cls, m: Cartesian, treat_H_different: bool = True) -> Self: with cc.constants.RestoreElementData(): # temporarily increase van der Waals radius by 15 % cc.constants.elements.loc[:, "atomic_radius_cc"] *= 1.15 - bonds = {k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items()} + bonds_atoms = {k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items()} if treat_H_different: motifs = OrderedSet(m.loc[m.atom != "H", :].index) else: motifs = OrderedSet(m.index) - site_bonds = {site: bonds[site] & motifs for site in motifs} + bonds_motif = {motif: bonds_atoms[motif] & motifs for motif in motifs} H_atoms = OrderedSet(m.index).difference(motifs) - H_per_motif = {i_site: bonds[i_site] & H_atoms for i_site in motifs} + H_per_motif = {motif: bonds_atoms[motif] & H_atoms for motif in motifs} atoms_per_motif = { - i_site: merge_seqs([i_site], H_atoms) - for i_site, H_atoms in H_per_motif.items() + motif: merge_seqs([motif], H_atoms) + for motif, H_atoms in H_per_motif.items() } return cls( - bonds, + bonds_atoms, motifs, - site_bonds, + bonds_motif, H_atoms, H_per_motif, atoms_per_motif, @@ -156,7 +156,7 @@ def get_BE_fragment(self, i_center: MotifIdx, n_BE: int) -> OrderedSet[MotifIdx] result = OrderedSet({i_center}) new = result.copy() for _ in range(n_BE - 1): - new = merge_seqs(*(self.motif_bonds[i] for i in new)).difference(result) + new = merge_seqs(*(self.bonds_motifs[i] for i in new)).difference(result) if not new: break result = result.union(new) diff --git a/tests/test_chemfrag.py b/tests/test_chemfrag.py index fbbb39e2..40c06cf7 100644 --- a/tests/test_chemfrag.py +++ b/tests/test_chemfrag.py @@ -16,7 +16,7 @@ def test_connectivity_data(): conn_data = ConnectivityData.from_cartesian(m) expected = ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1, 3, 5, 7]), 1: OrderedSet([0, 2, 4, 6]), 2: OrderedSet([1]), @@ -45,7 +45,7 @@ def test_connectivity_data(): 25: OrderedSet([18]), }, motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1, 7]), 1: OrderedSet([0, 6]), 6: OrderedSet([1, 12]), @@ -89,7 +89,7 @@ def test_connectivity_data(): ) resorted_expected = ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1, 8, 9, 10]), 1: OrderedSet([0, 2, 11, 12]), 2: OrderedSet([1, 3, 13, 14]), @@ -118,7 +118,7 @@ def test_connectivity_data(): 25: OrderedSet([7]), }, motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1]), 1: OrderedSet([0, 2]), 2: OrderedSet([1, 3]), @@ -389,7 +389,7 @@ def test_fragmented_molecule(): OrderedSet([19]), ], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1, 3, 5, 7]), 1: OrderedSet([0, 2, 4, 6]), 2: OrderedSet([1]), @@ -418,7 +418,7 @@ def test_fragmented_molecule(): 25: OrderedSet([18]), }, motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1, 7]), 1: OrderedSet([0, 6]), 6: OrderedSet([1, 12]), @@ -497,7 +497,7 @@ def test_fragmented_molecule(): OrderedSet([13]), ], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1, 3, 5, 7]), 1: OrderedSet([0, 2, 4, 6]), 2: OrderedSet([1]), @@ -526,7 +526,7 @@ def test_fragmented_molecule(): 25: OrderedSet([18]), }, motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1, 7]), 1: OrderedSet([0, 6]), 6: OrderedSet([1, 12]), @@ -595,7 +595,7 @@ def test_fragmented_molecule(): OrderedSet([7]), ], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1, 3, 5, 7]), 1: OrderedSet([0, 2, 4, 6]), 2: OrderedSet([1]), @@ -624,7 +624,7 @@ def test_fragmented_molecule(): 25: OrderedSet([18]), }, motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1, 7]), 1: OrderedSet([0, 6]), 6: OrderedSet([1, 12]), @@ -724,7 +724,7 @@ def test_fragmented_molecule(): edge_per_frag=[OrderedSet([1, 6, 12]), OrderedSet([0, 7, 13])], origin_per_frag=[OrderedSet([0]), OrderedSet([1])], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1, 3, 5, 7]), 1: OrderedSet([0, 2, 4, 6]), 2: OrderedSet([1]), @@ -753,7 +753,7 @@ def test_fragmented_molecule(): 25: OrderedSet([18]), }, motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1, 7]), 1: OrderedSet([0, 6]), 6: OrderedSet([1, 12]), @@ -828,7 +828,7 @@ def test_fragmented_molecule(): edge_per_frag=[OrderedSet()], origin_per_frag=[OrderedSet([0])], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1, 3, 5, 7]), 1: OrderedSet([0, 2, 4, 6]), 2: OrderedSet([1]), @@ -857,7 +857,7 @@ def test_fragmented_molecule(): 25: OrderedSet([18]), }, motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1, 7]), 1: OrderedSet([0, 6]), 6: OrderedSet([1, 12]), @@ -932,7 +932,7 @@ def test_fragmented_molecule(): edge_per_frag=[OrderedSet()], origin_per_frag=[OrderedSet([0])], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1, 3, 5, 7]), 1: OrderedSet([0, 2, 4, 6]), 2: OrderedSet([1]), @@ -961,7 +961,7 @@ def test_fragmented_molecule(): 25: OrderedSet([18]), }, motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1, 7]), 1: OrderedSet([0, 6]), 6: OrderedSet([1, 12]), @@ -1036,7 +1036,7 @@ def test_fragmented_molecule(): edge_per_frag=[OrderedSet()], origin_per_frag=[OrderedSet([0])], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1, 3, 5, 7]), 1: OrderedSet([0, 2, 4, 6]), 2: OrderedSet([1]), @@ -1065,7 +1065,7 @@ def test_fragmented_molecule(): 25: OrderedSet([18]), }, motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1, 7]), 1: OrderedSet([0, 6]), 6: OrderedSet([1, 12]), @@ -1177,7 +1177,7 @@ def test_hydrogen_chain(): OrderedSet([7]), ], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1]), 1: OrderedSet([0, 2]), 2: OrderedSet([1, 3]), @@ -1188,7 +1188,7 @@ def test_hydrogen_chain(): 7: OrderedSet([6]), }, motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1]), 1: OrderedSet([0, 2]), 2: OrderedSet([1, 3]), @@ -1265,7 +1265,7 @@ def test_hydrogen_chain(): OrderedSet([6]), ], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1]), 1: OrderedSet([0, 2]), 2: OrderedSet([1, 3]), @@ -1276,7 +1276,7 @@ def test_hydrogen_chain(): 7: OrderedSet([6]), }, motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1]), 1: OrderedSet([0, 2]), 2: OrderedSet([1, 3]), @@ -1343,7 +1343,7 @@ def test_hydrogen_chain(): OrderedSet([5]), ], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1]), 1: OrderedSet([0, 2]), 2: OrderedSet([1, 3]), @@ -1354,7 +1354,7 @@ def test_hydrogen_chain(): 7: OrderedSet([6]), }, motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1]), 1: OrderedSet([0, 2]), 2: OrderedSet([1, 3]), @@ -1402,7 +1402,7 @@ def test_hydrogen_chain(): edge_per_frag=[OrderedSet([4, 5, 6]), OrderedSet([1, 2, 3])], origin_per_frag=[OrderedSet([3]), OrderedSet([4])], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1]), 1: OrderedSet([0, 2]), 2: OrderedSet([1, 3]), @@ -1413,7 +1413,7 @@ def test_hydrogen_chain(): 7: OrderedSet([6]), }, motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1]), 1: OrderedSet([0, 2]), 2: OrderedSet([1, 3]), @@ -1455,7 +1455,7 @@ def test_hydrogen_chain(): edge_per_frag=[OrderedSet()], origin_per_frag=[OrderedSet([3])], conn_data=ConnectivityData( - bonds={ + bonds_atoms={ 0: OrderedSet([1]), 1: OrderedSet([0, 2]), 2: OrderedSet([1, 3]), @@ -1466,7 +1466,7 @@ def test_hydrogen_chain(): 7: OrderedSet([6]), }, motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]), - motif_bonds={ + bonds_motifs={ 0: OrderedSet([1]), 1: OrderedSet([0, 2]), 2: OrderedSet([1, 3]), From b3039d7534b750eecec803ce86fd3f15a8614105 Mon Sep 17 00:00:00 2001 From: Oskar Weser Date: Tue, 28 Jan 2025 16:16:02 -0500 Subject: [PATCH 03/11] put missing comment why ruamel is ignored --- mypy.ini | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mypy.ini b/mypy.ini index ffe1df1e..704204de 100644 --- a/mypy.ini +++ b/mypy.ini @@ -44,5 +44,10 @@ [mypy-chemcoord.*] ignore_missing_imports = True +; We have to als ignore ruamel. +; It's a bug in either mypy or pyyaml. +; https://stackoverflow.com/questions/52189217/use-mypy-with-ruamel-yaml +; https://github.com/python/mypy/issues/7276 +; https://sourceforge.net/p/ruamel-yaml/tickets/328/ [mypy-ruamel.*] ignore_missing_imports = True \ No newline at end of file From fbba6bcd9d521caa0b018ae402003fd77f51078e Mon Sep 17 00:00:00 2001 From: Oskar Weser Date: Tue, 28 Jan 2025 18:04:05 -0500 Subject: [PATCH 04/11] allow more fine-grained control over van der waals radii --- src/quemb/molbe/chemfrag.py | 76 +++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 12 deletions(-) diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py index 3b060bb6..e2fc22e1 100644 --- a/src/quemb/molbe/chemfrag.py +++ b/src/quemb/molbe/chemfrag.py @@ -1,8 +1,9 @@ from collections import defaultdict -from collections.abc import Sequence -from typing import Final, NewType, cast +from collections.abc import Mapping, Sequence +from typing import Callable, Final, NewType, cast import chemcoord as cc +import numpy as np from attr import define from chemcoord import Cartesian from ordered_set import OrderedSet @@ -77,32 +78,61 @@ class ConnectivityData: """Data structure to store the connectivity data of a molecule.""" #: The connectivity graph of the molecule. - bonds_atoms: Final[dict[AtomIdx, OrderedSet[AtomIdx]]] + bonds_atoms: Final[Mapping[AtomIdx, OrderedSet[AtomIdx]]] #: The heavy atoms/motifs in the molecule. If hydrogens are not treated differently #: then every hydrogen is also a motif on its own. motifs: Final[OrderedSet[MotifIdx]] #: The connectivity graph solely of the motifs, # i.e. of the heavy atoms when ignoring the hydrogen atoms. - bonds_motifs: Final[dict[MotifIdx, OrderedSet[MotifIdx]]] + bonds_motifs: Final[Mapping[MotifIdx, OrderedSet[MotifIdx]]] #: The hydrogen atoms in the molecule. If hydrogens are not treated differently, #: then this is an empty set. H_atoms: Final[OrderedSet[AtomIdx]] #: The hydrogen atoms per motif. If hydrogens are not treated differently, #: then the values of the dictionary are empty sets. - H_per_motif: Final[dict[MotifIdx, OrderedSet[AtomIdx]]] + H_per_motif: Final[Mapping[MotifIdx, OrderedSet[AtomIdx]]] #: All atoms per motif. Lists the motif/heavy atom first. - atoms_per_motif: Final[dict[MotifIdx, OrderedSet[AtomIdx]]] + atoms_per_motif: Final[Mapping[MotifIdx, OrderedSet[AtomIdx]]] #: Do we treat hydrogens differently? treat_H_different: Final[bool] = True @classmethod - def from_cartesian(cls, m: Cartesian, treat_H_different: bool = True) -> Self: + def from_cartesian( + cls, + m: Cartesian, + in_bonds_atoms: Mapping[int, OrderedSet[int]] | None = None, + in_vdW_radius: float + | Callable[[float], float] + | Mapping[str, float] + | None = None, + treat_H_different: bool = True, + ) -> Self: """Create a :class:`ConnectivityData` from a :class:`chemcoord.Cartesian`. Parameters ---------- m : The Cartesian object to extract the connectivity data from. + bonds_atoms : + Can be used to specify the connectivity graph of the molecule. + Has exactly the same format as the output of + :meth:`chemcoord.Cartesian.get_bonds`, + which is called internally if this argument is not specified. + Allows it to manually change the connectivity by modifying the output of + :meth:`chemcoord.Cartesian.get_bonds`. + The keyword is mutually exclusive with :python:`vdW_radius`. + vdW_radius : + If :python:`bonds_atoms` is :class:`None`, then the connectivity graph is + determined by the van der Waals radius of the atoms. + It is possible to pass: + + * a single float which is used for all atoms, + * a callable which is applied to all radii + and can be used to e.g. scale via :python:`lambda r: r * 1.1`, + * a dictionary which maps the element symbol to the van der Waals radius, + to change the radius of individual elements, e.g. :python:`{"C": 1.5}`. + + The keyword is mutually exclusive with :python:`bonds_atoms`. treat_H_different : If True, we treat hydrogen atoms differently from heavy atoms. """ @@ -110,16 +140,38 @@ def from_cartesian(cls, m: Cartesian, treat_H_different: bool = True) -> Self: raise ValueError("We assume 0-indexed data for the rest of the code.") m = m.sort_index() - with cc.constants.RestoreElementData(): - # temporarily increase van der Waals radius by 15 % - cc.constants.elements.loc[:, "atomic_radius_cc"] *= 1.15 - bonds_atoms = {k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items()} + if in_bonds_atoms is not None and in_vdW_radius is not None: + raise ValueError("Cannot specify both in_bonds_atoms and in_vdW_radius.") + + if in_bonds_atoms is not None: + bonds_atoms = cast(Mapping[AtomIdx, OrderedSet[AtomIdx]], in_bonds_atoms) + else: + with cc.constants.RestoreElementData(): + # `used_vdW_r` is a view, not a copy !!! + used_vdW_r = cc.constants.elements.loc[:, "atomic_radius_cc"] + if isinstance(in_vdW_radius, float): + used_vdW_r[:] = used_vdW_r.map(lambda _: in_vdW_radius) + if callable(in_vdW_radius): + used_vdW_r[:] = used_vdW_r.map(in_vdW_radius) + elif isinstance(in_vdW_radius, Mapping): + used_vdW_r.update(in_vdW_radius) # type: ignore[arg-type] + else: + # To avoid false-negatives we set all vdW radii to + # at least 0.55 Å + # or 20 % larger than the tabulated value. + used_vdW_r[:] = np.maximum(0.55, used_vdW_r * 1.20) + bonds_atoms = { + k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items() + } if treat_H_different: motifs = OrderedSet(m.loc[m.atom != "H", :].index) else: motifs = OrderedSet(m.index) - bonds_motif = {motif: bonds_atoms[motif] & motifs for motif in motifs} + + bonds_motif: Mapping[MotifIdx, OrderedSet[MotifIdx]] = { + motif: motifs & bonds_atoms[motif] for motif in motifs + } H_atoms = OrderedSet(m.index).difference(motifs) H_per_motif = {motif: bonds_atoms[motif] & H_atoms for motif in motifs} atoms_per_motif = { From cc27f59d8d526246aa89d29a30293106c9c123c1 Mon Sep 17 00:00:00 2001 From: Oskar Weser Date: Tue, 28 Jan 2025 18:28:51 -0500 Subject: [PATCH 05/11] add easy manipulation of van der waals radii --- src/quemb/molbe/chemfrag.py | 77 ++++++++++++++++++++++++++++--------- tests/test_chemfrag.py | 19 +++++++++ 2 files changed, 77 insertions(+), 19 deletions(-) diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py index e2fc22e1..b5f5e477 100644 --- a/src/quemb/molbe/chemfrag.py +++ b/src/quemb/molbe/chemfrag.py @@ -1,11 +1,13 @@ from collections import defaultdict from collections.abc import Mapping, Sequence -from typing import Callable, Final, NewType, cast +from numbers import Number +from typing import Callable, Final, NewType, TypeAlias, cast, overload import chemcoord as cc import numpy as np from attr import define from chemcoord import Cartesian +from chemcoord.constants import elements from ordered_set import OrderedSet from pyscf.gto import Mole from typing_extensions import Self @@ -96,15 +98,45 @@ class ConnectivityData: #: Do we treat hydrogens differently? treat_H_different: Final[bool] = True + # make it explicit via overloads that some arguments are mutually exclusive + InVdWRadius: TypeAlias = Number | Callable[[Number], Number] | Mapping[str, Number] + + @overload @classmethod def from_cartesian( cls, m: Cartesian, - in_bonds_atoms: Mapping[int, OrderedSet[int]] | None = None, - in_vdW_radius: float - | Callable[[float], float] - | Mapping[str, float] - | None = None, + *, + treat_H_different: bool = True, + ) -> Self: ... + + @overload + @classmethod + def from_cartesian( + cls, + m: Cartesian, + *, + in_vdW_radius: InVdWRadius, + treat_H_different: bool = True, + ) -> Self: ... + + @overload + @classmethod + def from_cartesian( + cls, + m: Cartesian, + *, + in_bonds_atoms: Mapping[int, set[int]], + treat_H_different: bool = True, + ) -> Self: ... + + @classmethod + def from_cartesian( + cls, + m: Cartesian, + *, + in_bonds_atoms: Mapping[int, set[int]] | None = None, + in_vdW_radius: InVdWRadius | None = None, treat_H_different: bool = True, ) -> Self: """Create a :class:`ConnectivityData` from a :class:`chemcoord.Cartesian`. @@ -113,7 +145,7 @@ def from_cartesian( ---------- m : The Cartesian object to extract the connectivity data from. - bonds_atoms : + in_bonds_atoms : Mapping[int, OrderedSet[int]] Can be used to specify the connectivity graph of the molecule. Has exactly the same format as the output of :meth:`chemcoord.Cartesian.get_bonds`, @@ -121,12 +153,12 @@ def from_cartesian( Allows it to manually change the connectivity by modifying the output of :meth:`chemcoord.Cartesian.get_bonds`. The keyword is mutually exclusive with :python:`vdW_radius`. - vdW_radius : + in_vdW_radius : Number | Callable[[Number], Number] | Mapping[str, Number] If :python:`bonds_atoms` is :class:`None`, then the connectivity graph is determined by the van der Waals radius of the atoms. It is possible to pass: - * a single float which is used for all atoms, + * a single Number which is used for all atoms, * a callable which is applied to all radii and can be used to e.g. scale via :python:`lambda r: r * 1.1`, * a dictionary which maps the element symbol to the van der Waals radius, @@ -144,22 +176,28 @@ def from_cartesian( raise ValueError("Cannot specify both in_bonds_atoms and in_vdW_radius.") if in_bonds_atoms is not None: - bonds_atoms = cast(Mapping[AtomIdx, OrderedSet[AtomIdx]], in_bonds_atoms) + bonds_atoms = { + AtomIdx(k): OrderedSet([AtomIdx(j) for j in sorted(v)]) + for k, v in m.get_bonds().items() + } else: with cc.constants.RestoreElementData(): - # `used_vdW_r` is a view, not a copy !!! - used_vdW_r = cc.constants.elements.loc[:, "atomic_radius_cc"] - if isinstance(in_vdW_radius, float): - used_vdW_r[:] = used_vdW_r.map(lambda _: in_vdW_radius) - if callable(in_vdW_radius): - used_vdW_r[:] = used_vdW_r.map(in_vdW_radius) + used_vdW_r = elements.loc[:, "atomic_radius_cc"] + if isinstance(in_vdW_radius, Number): + elements.loc[:, "atomic_radius_cc"] = used_vdW_r.map( + lambda _: in_vdW_radius + ) + elif callable(in_vdW_radius): + elements.loc[:, "atomic_radius_cc"] = used_vdW_r.map(in_vdW_radius) elif isinstance(in_vdW_radius, Mapping): - used_vdW_r.update(in_vdW_radius) # type: ignore[arg-type] + elements.loc[:, "atomic_radius_cc"].update(in_vdW_radius) # type: ignore[arg-type] else: # To avoid false-negatives we set all vdW radii to # at least 0.55 Å # or 20 % larger than the tabulated value. - used_vdW_r[:] = np.maximum(0.55, used_vdW_r * 1.20) + elements.loc[:, "atomic_radius_cc"] = np.maximum( + 0.55, used_vdW_r * 1.20 + ) bonds_atoms = { k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items() } @@ -393,7 +431,8 @@ def from_cartesian( If True, we treat hydrogen atoms differently from heavy atoms. """ return cls.from_motifs( - ConnectivityData.from_cartesian(mol, treat_H_different), n_BE + ConnectivityData.from_cartesian(mol, treat_H_different=treat_H_different), + n_BE, ) @classmethod diff --git a/tests/test_chemfrag.py b/tests/test_chemfrag.py index 40c06cf7..ce5fa57c 100644 --- a/tests/test_chemfrag.py +++ b/tests/test_chemfrag.py @@ -1523,3 +1523,22 @@ def test_agreement_with_autogen(): # For the rest of the atoms the order can be different, # so we assert set equality assert set(chem_fragment) == set(auto_fragment) + + +def test_manipulation_of_vdW(): + m = Cartesian.read_xyz("data/octane.xyz") + + conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=100) + for atom, connected in conn_data.bonds_atoms.items(): + # check if everything is connected to everything + assert {atom} | connected == set(m.index) + + conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=lambda r: r * 100) + for atom, connected in conn_data.bonds_atoms.items(): + # check if everything is connected to everything + assert {atom} | connected == set(m.index) + + conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius={"C": 100}) + for i_carbon in m.loc[m.atom == "C"].index: + # check if carbons are connected to everything + assert {i_carbon} | conn_data.bonds_atoms[i_carbon] == set(m.index) From fa035cdabd95d6b7c0c302f19a9b0b954d54734e Mon Sep 17 00:00:00 2001 From: Oskar Weser Date: Tue, 28 Jan 2025 19:00:10 -0500 Subject: [PATCH 06/11] finished van der Waals modification --- src/quemb/molbe/chemfrag.py | 74 ++++++++++++++++++------------------- tests/test_chemfrag.py | 2 +- 2 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py index 43c0763b..a5508102 100644 --- a/src/quemb/molbe/chemfrag.py +++ b/src/quemb/molbe/chemfrag.py @@ -1,7 +1,7 @@ from collections import defaultdict from collections.abc import Mapping, Sequence from numbers import Number -from typing import Callable, Final, NewType, TypeAlias, cast, overload +from typing import Callable, Final, NewType, TypeAlias, cast import chemcoord as cc import numpy as np @@ -75,6 +75,10 @@ def merge_seqs(*seqs: Sequence[T]) -> OrderedSet[T]: return OrderedSet().union(*seqs) # type: ignore[arg-type] +# The following can be passed van der Waals radius alternative. +InVdWRadius: TypeAlias = Number | Callable[[Number], Number] | Mapping[str, Number] + + @define(frozen=True) class ConnectivityData: """Data structure to store the connectivity data of a molecule.""" @@ -98,38 +102,6 @@ class ConnectivityData: #: Do we treat hydrogens differently? treat_H_different: Final[bool] = True - # make it explicit via overloads that some arguments are mutually exclusive - InVdWRadius: TypeAlias = Number | Callable[[Number], Number] | Mapping[str, Number] - - @overload - @classmethod - def from_cartesian( - cls, - m: Cartesian, - *, - treat_H_different: bool = True, - ) -> Self: ... - - @overload - @classmethod - def from_cartesian( - cls, - m: Cartesian, - *, - in_vdW_radius: InVdWRadius, - treat_H_different: bool = True, - ) -> Self: ... - - @overload - @classmethod - def from_cartesian( - cls, - m: Cartesian, - *, - in_bonds_atoms: Mapping[int, set[int]], - treat_H_different: bool = True, - ) -> Self: ... - @classmethod def from_cartesian( cls, @@ -189,8 +161,9 @@ def from_cartesian( ) elif callable(in_vdW_radius): elements.loc[:, "atomic_radius_cc"] = used_vdW_r.map(in_vdW_radius) + elif isinstance(in_vdW_radius, Mapping): - elements.loc[:, "atomic_radius_cc"].update(in_vdW_radius) # type: ignore[arg-type] + elements.loc[:, "atomic_radius_cc"].update(in_vdW_radius) else: # To avoid false-negatives we set all vdW radii to # at least 0.55 Å @@ -417,7 +390,13 @@ def get_edges(i_origin: OriginIdx) -> OrderedSet[EdgeIdx]: @classmethod def from_cartesian( - cls, mol: Cartesian, n_BE: int, treat_H_different: bool = True + cls, + mol: Cartesian, + n_BE: int, + *, + treat_H_different: bool = True, + in_bonds_atoms: Mapping[int, set[int]] | None = None, + in_vdW_radius: InVdWRadius | None = None, ) -> Self: """Construct a :class:`FragmentedStructure` from a :class:`chemcoord.Cartesian`. @@ -431,12 +410,25 @@ def from_cartesian( If True, we treat hydrogen atoms differently from heavy atoms. """ return cls.from_motifs( - ConnectivityData.from_cartesian(mol, treat_H_different=treat_H_different), + ConnectivityData.from_cartesian( + mol, + treat_H_different=treat_H_different, + in_bonds_atoms=in_bonds_atoms, + in_vdW_radius=in_vdW_radius, + ), n_BE, ) @classmethod - def from_Mol(cls, mol: Mole, n_BE: int, treat_H_different: bool = True) -> Self: + def from_Mol( + cls, + mol: Mole, + n_BE: int, + *, + treat_H_different: bool = True, + in_bonds_atoms: Mapping[int, set[int]] | None = None, + in_vdW_radius: InVdWRadius | None = None, + ) -> Self: """Construct a :class:`FragmentedStructure` from a :class:`pyscf.gto.mole.Mole`. Parameters @@ -448,4 +440,10 @@ def from_Mol(cls, mol: Mole, n_BE: int, treat_H_different: bool = True) -> Self: treat_H_different : If True, we treat hydrogen atoms differently from heavy atoms. """ - return cls.from_cartesian(Cartesian.from_pyscf(mol), n_BE, treat_H_different) + return cls.from_cartesian( + Cartesian.from_pyscf(mol), + n_BE, + treat_H_different=treat_H_different, + in_bonds_atoms=in_bonds_atoms, + in_vdW_radius=in_vdW_radius, + ) diff --git a/tests/test_chemfrag.py b/tests/test_chemfrag.py index ce5fa57c..8b5e6377 100644 --- a/tests/test_chemfrag.py +++ b/tests/test_chemfrag.py @@ -1525,7 +1525,7 @@ def test_agreement_with_autogen(): assert set(chem_fragment) == set(auto_fragment) -def test_manipulation_of_vdW(): +def test_conn_data_manipulation_of_vdW(): m = Cartesian.read_xyz("data/octane.xyz") conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=100) From e79c8caf9a4b67aa26a9e06a9328d36497614182 Mon Sep 17 00:00:00 2001 From: Oskar Weser Date: Tue, 28 Jan 2025 19:10:13 -0500 Subject: [PATCH 07/11] ensure that H are not shared among motifs --- src/quemb/molbe/chemfrag.py | 15 +++++++++++++++ tests/test_chemfrag.py | 19 ++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py index a5508102..3cb8ff60 100644 --- a/src/quemb/molbe/chemfrag.py +++ b/src/quemb/molbe/chemfrag.py @@ -189,6 +189,21 @@ def from_cartesian( motif: merge_seqs([motif], H_atoms) for motif, H_atoms in H_per_motif.items() } + + def motifs_share_H() -> bool: + for i_motif, i_H_atoms in H_per_motif.items(): + for j_motif, j_H_atoms in H_per_motif.items(): + if i_motif == j_motif: + continue + if i_H_atoms & j_H_atoms: + return True + return False + + if treat_H_different and motifs_share_H(): + raise ValueError( + "Cannot treat hydrogens differently if motifs share hydrogens." + ) + return cls( bonds_atoms, motifs, diff --git a/tests/test_chemfrag.py b/tests/test_chemfrag.py index 8b5e6377..2af778b3 100644 --- a/tests/test_chemfrag.py +++ b/tests/test_chemfrag.py @@ -1,3 +1,4 @@ +import pytest from chemcoord import Cartesian from ordered_set import OrderedSet from pyscf.gto import M @@ -1528,17 +1529,29 @@ def test_agreement_with_autogen(): def test_conn_data_manipulation_of_vdW(): m = Cartesian.read_xyz("data/octane.xyz") - conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=100) + # if hydrogens are shared among motifs we cannot treat H differently + with pytest.raises(ValueError): + conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=100) + conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=lambda r: r * 100) + conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius={"C": 100}) + + conn_data = ConnectivityData.from_cartesian( + m, in_vdW_radius=100, treat_H_different=False + ) for atom, connected in conn_data.bonds_atoms.items(): # check if everything is connected to everything assert {atom} | connected == set(m.index) - conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=lambda r: r * 100) + conn_data = ConnectivityData.from_cartesian( + m, in_vdW_radius=lambda r: r * 100, treat_H_different=False + ) for atom, connected in conn_data.bonds_atoms.items(): # check if everything is connected to everything assert {atom} | connected == set(m.index) - conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius={"C": 100}) + conn_data = ConnectivityData.from_cartesian( + m, in_vdW_radius={"C": 100}, treat_H_different=False + ) for i_carbon in m.loc[m.atom == "C"].index: # check if carbons are connected to everything assert {i_carbon} | conn_data.bonds_atoms[i_carbon] == set(m.index) From 5bff0b999d27e08af43953de9a9f0bdb55cddb45 Mon Sep 17 00:00:00 2001 From: Oskar Weser Date: Tue, 28 Jan 2025 19:11:42 -0500 Subject: [PATCH 08/11] ignore missing link to Number --- docs/source/nitpick-exceptions | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/nitpick-exceptions b/docs/source/nitpick-exceptions index 31aa8f13..777822d9 100644 --- a/docs/source/nitpick-exceptions +++ b/docs/source/nitpick-exceptions @@ -1,4 +1,5 @@ py:class optional py:class numpy.float64 py:class OrderedSet -py:class ordered_set.OrderedSet \ No newline at end of file +py:class ordered_set.OrderedSet +py:class Number \ No newline at end of file From 33a729b2bbdd3a6def69cd5164103e1d1dc08341 Mon Sep 17 00:00:00 2001 From: Oskar Weser Date: Tue, 28 Jan 2025 19:19:00 -0500 Subject: [PATCH 09/11] fix doc --- src/quemb/molbe/chemfrag.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py index 3cb8ff60..73494dc3 100644 --- a/src/quemb/molbe/chemfrag.py +++ b/src/quemb/molbe/chemfrag.py @@ -124,9 +124,9 @@ def from_cartesian( which is called internally if this argument is not specified. Allows it to manually change the connectivity by modifying the output of :meth:`chemcoord.Cartesian.get_bonds`. - The keyword is mutually exclusive with :python:`vdW_radius`. + The keyword is mutually exclusive with :python:`in_vdW_radius`. in_vdW_radius : Number | Callable[[Number], Number] | Mapping[str, Number] - If :python:`bonds_atoms` is :class:`None`, then the connectivity graph is + If :python:`in_bonds_atoms` is :class:`None`, then the connectivity graph is determined by the van der Waals radius of the atoms. It is possible to pass: @@ -136,7 +136,7 @@ def from_cartesian( * a dictionary which maps the element symbol to the van der Waals radius, to change the radius of individual elements, e.g. :python:`{"C": 1.5}`. - The keyword is mutually exclusive with :python:`bonds_atoms`. + The keyword is mutually exclusive with :python:`in_bonds_atoms`. treat_H_different : If True, we treat hydrogen atoms differently from heavy atoms. """ From 6128157dbac530ad283bd0d7b304b681064300c9 Mon Sep 17 00:00:00 2001 From: Oskar Weser Date: Tue, 28 Jan 2025 19:29:26 -0500 Subject: [PATCH 10/11] fixed bug --- src/quemb/molbe/chemfrag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py index 73494dc3..3d432e2d 100644 --- a/src/quemb/molbe/chemfrag.py +++ b/src/quemb/molbe/chemfrag.py @@ -150,7 +150,7 @@ def from_cartesian( if in_bonds_atoms is not None: bonds_atoms = { AtomIdx(k): OrderedSet([AtomIdx(j) for j in sorted(v)]) - for k, v in m.get_bonds().items() + for k, v in in_bonds_atoms.items() } else: with cc.constants.RestoreElementData(): From f823e134f2597bce43abb6bb6623c81699a9f0b2 Mon Sep 17 00:00:00 2001 From: Oskar Weser Date: Tue, 28 Jan 2025 19:30:57 -0500 Subject: [PATCH 11/11] finished --- src/quemb/molbe/chemfrag.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py index 3d432e2d..a4dc2bac 100644 --- a/src/quemb/molbe/chemfrag.py +++ b/src/quemb/molbe/chemfrag.py @@ -164,13 +164,17 @@ def from_cartesian( elif isinstance(in_vdW_radius, Mapping): elements.loc[:, "atomic_radius_cc"].update(in_vdW_radius) - else: + elif in_vdW_radius is None: # To avoid false-negatives we set all vdW radii to # at least 0.55 Å # or 20 % larger than the tabulated value. elements.loc[:, "atomic_radius_cc"] = np.maximum( 0.55, used_vdW_r * 1.20 ) + else: + raise TypeError( + f"Invalid type {type(in_vdW_radius)} for in_vdW_radius." + ) bonds_atoms = { k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items() }