From 4bd9b79a5621ebce1cf6401cdc349a991c6eaaf5 Mon Sep 17 00:00:00 2001
From: Oskar Weser <oskar.weser@gmail.com>
Date: Tue, 28 Jan 2025 15:52:36 -0500
Subject: [PATCH 01/11] Update src/quemb/molbe/chemfrag.py

typo found by Minsik

Co-authored-by: Minsik <mscho527@gmail.com>
---
 src/quemb/molbe/chemfrag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py
index 3496434e..580f937a 100644
--- a/src/quemb/molbe/chemfrag.py
+++ b/src/quemb/molbe/chemfrag.py
@@ -249,7 +249,7 @@ def cleanup_if_subset(
                     contain_others[i_center] |= contain_others[j_center]
                     del contain_others[j_center]
 
-    # We know that the first element of motifs is the cetner, which should
+    # We know that the first element of motifs is the center, which should
     # stay at the first position. The rest of the motifs should be sorted.
     # We also remove the swallowed centers, i.e. only origins are left.
     cleaned_fragments = {

From 68c80d47f2407677bdb6cf3380588f48726fc912 Mon Sep 17 00:00:00 2001
From: Oskar Weser <oskar.weser@gmail.com>
Date: Tue, 28 Jan 2025 16:13:58 -0500
Subject: [PATCH 02/11] consistently renamed site -> motif and adressed
 Minsik's comment

---
 src/quemb/molbe/chemfrag.py | 20 ++++++-------
 tests/test_chemfrag.py      | 56 ++++++++++++++++++-------------------
 2 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py
index 3496434e..3b060bb6 100644
--- a/src/quemb/molbe/chemfrag.py
+++ b/src/quemb/molbe/chemfrag.py
@@ -77,13 +77,13 @@ class ConnectivityData:
     """Data structure to store the connectivity data of a molecule."""
 
     #: The connectivity graph of the molecule.
-    bonds: Final[dict[AtomIdx, OrderedSet[AtomIdx]]]
+    bonds_atoms: Final[dict[AtomIdx, OrderedSet[AtomIdx]]]
     #: The heavy atoms/motifs in the molecule. If hydrogens are not treated differently
     #: then every hydrogen is also a motif on its own.
     motifs: Final[OrderedSet[MotifIdx]]
     #: The connectivity graph solely of the motifs,
     # i.e. of the heavy atoms when ignoring the hydrogen atoms.
-    motif_bonds: Final[dict[MotifIdx, OrderedSet[MotifIdx]]]
+    bonds_motifs: Final[dict[MotifIdx, OrderedSet[MotifIdx]]]
     #: The hydrogen atoms in the molecule. If hydrogens are not treated differently,
     #: then this is an empty set.
     H_atoms: Final[OrderedSet[AtomIdx]]
@@ -113,23 +113,23 @@ def from_cartesian(cls, m: Cartesian, treat_H_different: bool = True) -> Self:
         with cc.constants.RestoreElementData():
             # temporarily increase van der Waals radius by 15 %
             cc.constants.elements.loc[:, "atomic_radius_cc"] *= 1.15
-            bonds = {k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items()}
+            bonds_atoms = {k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items()}
 
         if treat_H_different:
             motifs = OrderedSet(m.loc[m.atom != "H", :].index)
         else:
             motifs = OrderedSet(m.index)
-        site_bonds = {site: bonds[site] & motifs for site in motifs}
+        bonds_motif = {motif: bonds_atoms[motif] & motifs for motif in motifs}
         H_atoms = OrderedSet(m.index).difference(motifs)
-        H_per_motif = {i_site: bonds[i_site] & H_atoms for i_site in motifs}
+        H_per_motif = {motif: bonds_atoms[motif] & H_atoms for motif in motifs}
         atoms_per_motif = {
-            i_site: merge_seqs([i_site], H_atoms)
-            for i_site, H_atoms in H_per_motif.items()
+            motif: merge_seqs([motif], H_atoms)
+            for motif, H_atoms in H_per_motif.items()
         }
         return cls(
-            bonds,
+            bonds_atoms,
             motifs,
-            site_bonds,
+            bonds_motif,
             H_atoms,
             H_per_motif,
             atoms_per_motif,
@@ -156,7 +156,7 @@ def get_BE_fragment(self, i_center: MotifIdx, n_BE: int) -> OrderedSet[MotifIdx]
         result = OrderedSet({i_center})
         new = result.copy()
         for _ in range(n_BE - 1):
-            new = merge_seqs(*(self.motif_bonds[i] for i in new)).difference(result)
+            new = merge_seqs(*(self.bonds_motifs[i] for i in new)).difference(result)
             if not new:
                 break
             result = result.union(new)
diff --git a/tests/test_chemfrag.py b/tests/test_chemfrag.py
index fbbb39e2..40c06cf7 100644
--- a/tests/test_chemfrag.py
+++ b/tests/test_chemfrag.py
@@ -16,7 +16,7 @@ def test_connectivity_data():
 
     conn_data = ConnectivityData.from_cartesian(m)
     expected = ConnectivityData(
-        bonds={
+        bonds_atoms={
             0: OrderedSet([1, 3, 5, 7]),
             1: OrderedSet([0, 2, 4, 6]),
             2: OrderedSet([1]),
@@ -45,7 +45,7 @@ def test_connectivity_data():
             25: OrderedSet([18]),
         },
         motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]),
-        motif_bonds={
+        bonds_motifs={
             0: OrderedSet([1, 7]),
             1: OrderedSet([0, 6]),
             6: OrderedSet([1, 12]),
@@ -89,7 +89,7 @@ def test_connectivity_data():
     )
 
     resorted_expected = ConnectivityData(
-        bonds={
+        bonds_atoms={
             0: OrderedSet([1, 8, 9, 10]),
             1: OrderedSet([0, 2, 11, 12]),
             2: OrderedSet([1, 3, 13, 14]),
@@ -118,7 +118,7 @@ def test_connectivity_data():
             25: OrderedSet([7]),
         },
         motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]),
-        motif_bonds={
+        bonds_motifs={
             0: OrderedSet([1]),
             1: OrderedSet([0, 2]),
             2: OrderedSet([1, 3]),
@@ -389,7 +389,7 @@ def test_fragmented_molecule():
                 OrderedSet([19]),
             ],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1, 3, 5, 7]),
                     1: OrderedSet([0, 2, 4, 6]),
                     2: OrderedSet([1]),
@@ -418,7 +418,7 @@ def test_fragmented_molecule():
                     25: OrderedSet([18]),
                 },
                 motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1, 7]),
                     1: OrderedSet([0, 6]),
                     6: OrderedSet([1, 12]),
@@ -497,7 +497,7 @@ def test_fragmented_molecule():
                 OrderedSet([13]),
             ],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1, 3, 5, 7]),
                     1: OrderedSet([0, 2, 4, 6]),
                     2: OrderedSet([1]),
@@ -526,7 +526,7 @@ def test_fragmented_molecule():
                     25: OrderedSet([18]),
                 },
                 motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1, 7]),
                     1: OrderedSet([0, 6]),
                     6: OrderedSet([1, 12]),
@@ -595,7 +595,7 @@ def test_fragmented_molecule():
                 OrderedSet([7]),
             ],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1, 3, 5, 7]),
                     1: OrderedSet([0, 2, 4, 6]),
                     2: OrderedSet([1]),
@@ -624,7 +624,7 @@ def test_fragmented_molecule():
                     25: OrderedSet([18]),
                 },
                 motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1, 7]),
                     1: OrderedSet([0, 6]),
                     6: OrderedSet([1, 12]),
@@ -724,7 +724,7 @@ def test_fragmented_molecule():
             edge_per_frag=[OrderedSet([1, 6, 12]), OrderedSet([0, 7, 13])],
             origin_per_frag=[OrderedSet([0]), OrderedSet([1])],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1, 3, 5, 7]),
                     1: OrderedSet([0, 2, 4, 6]),
                     2: OrderedSet([1]),
@@ -753,7 +753,7 @@ def test_fragmented_molecule():
                     25: OrderedSet([18]),
                 },
                 motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1, 7]),
                     1: OrderedSet([0, 6]),
                     6: OrderedSet([1, 12]),
@@ -828,7 +828,7 @@ def test_fragmented_molecule():
             edge_per_frag=[OrderedSet()],
             origin_per_frag=[OrderedSet([0])],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1, 3, 5, 7]),
                     1: OrderedSet([0, 2, 4, 6]),
                     2: OrderedSet([1]),
@@ -857,7 +857,7 @@ def test_fragmented_molecule():
                     25: OrderedSet([18]),
                 },
                 motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1, 7]),
                     1: OrderedSet([0, 6]),
                     6: OrderedSet([1, 12]),
@@ -932,7 +932,7 @@ def test_fragmented_molecule():
             edge_per_frag=[OrderedSet()],
             origin_per_frag=[OrderedSet([0])],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1, 3, 5, 7]),
                     1: OrderedSet([0, 2, 4, 6]),
                     2: OrderedSet([1]),
@@ -961,7 +961,7 @@ def test_fragmented_molecule():
                     25: OrderedSet([18]),
                 },
                 motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1, 7]),
                     1: OrderedSet([0, 6]),
                     6: OrderedSet([1, 12]),
@@ -1036,7 +1036,7 @@ def test_fragmented_molecule():
             edge_per_frag=[OrderedSet()],
             origin_per_frag=[OrderedSet([0])],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1, 3, 5, 7]),
                     1: OrderedSet([0, 2, 4, 6]),
                     2: OrderedSet([1]),
@@ -1065,7 +1065,7 @@ def test_fragmented_molecule():
                     25: OrderedSet([18]),
                 },
                 motifs=OrderedSet([0, 1, 6, 7, 12, 13, 18, 19]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1, 7]),
                     1: OrderedSet([0, 6]),
                     6: OrderedSet([1, 12]),
@@ -1177,7 +1177,7 @@ def test_hydrogen_chain():
                 OrderedSet([7]),
             ],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1]),
                     1: OrderedSet([0, 2]),
                     2: OrderedSet([1, 3]),
@@ -1188,7 +1188,7 @@ def test_hydrogen_chain():
                     7: OrderedSet([6]),
                 },
                 motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1]),
                     1: OrderedSet([0, 2]),
                     2: OrderedSet([1, 3]),
@@ -1265,7 +1265,7 @@ def test_hydrogen_chain():
                 OrderedSet([6]),
             ],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1]),
                     1: OrderedSet([0, 2]),
                     2: OrderedSet([1, 3]),
@@ -1276,7 +1276,7 @@ def test_hydrogen_chain():
                     7: OrderedSet([6]),
                 },
                 motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1]),
                     1: OrderedSet([0, 2]),
                     2: OrderedSet([1, 3]),
@@ -1343,7 +1343,7 @@ def test_hydrogen_chain():
                 OrderedSet([5]),
             ],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1]),
                     1: OrderedSet([0, 2]),
                     2: OrderedSet([1, 3]),
@@ -1354,7 +1354,7 @@ def test_hydrogen_chain():
                     7: OrderedSet([6]),
                 },
                 motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1]),
                     1: OrderedSet([0, 2]),
                     2: OrderedSet([1, 3]),
@@ -1402,7 +1402,7 @@ def test_hydrogen_chain():
             edge_per_frag=[OrderedSet([4, 5, 6]), OrderedSet([1, 2, 3])],
             origin_per_frag=[OrderedSet([3]), OrderedSet([4])],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1]),
                     1: OrderedSet([0, 2]),
                     2: OrderedSet([1, 3]),
@@ -1413,7 +1413,7 @@ def test_hydrogen_chain():
                     7: OrderedSet([6]),
                 },
                 motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1]),
                     1: OrderedSet([0, 2]),
                     2: OrderedSet([1, 3]),
@@ -1455,7 +1455,7 @@ def test_hydrogen_chain():
             edge_per_frag=[OrderedSet()],
             origin_per_frag=[OrderedSet([3])],
             conn_data=ConnectivityData(
-                bonds={
+                bonds_atoms={
                     0: OrderedSet([1]),
                     1: OrderedSet([0, 2]),
                     2: OrderedSet([1, 3]),
@@ -1466,7 +1466,7 @@ def test_hydrogen_chain():
                     7: OrderedSet([6]),
                 },
                 motifs=OrderedSet([0, 1, 2, 3, 4, 5, 6, 7]),
-                motif_bonds={
+                bonds_motifs={
                     0: OrderedSet([1]),
                     1: OrderedSet([0, 2]),
                     2: OrderedSet([1, 3]),

From b3039d7534b750eecec803ce86fd3f15a8614105 Mon Sep 17 00:00:00 2001
From: Oskar Weser <oskar.weser@gmail.com>
Date: Tue, 28 Jan 2025 16:16:02 -0500
Subject: [PATCH 03/11] put missing comment why ruamel is ignored

---
 mypy.ini | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/mypy.ini b/mypy.ini
index ffe1df1e..704204de 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -44,5 +44,10 @@
 [mypy-chemcoord.*]
     ignore_missing_imports = True
 
+; We have to als ignore ruamel.
+; It's a bug in either mypy or pyyaml.
+; https://stackoverflow.com/questions/52189217/use-mypy-with-ruamel-yaml
+; https://github.com/python/mypy/issues/7276
+; https://sourceforge.net/p/ruamel-yaml/tickets/328/
 [mypy-ruamel.*]
     ignore_missing_imports = True
\ No newline at end of file

From fbba6bcd9d521caa0b018ae402003fd77f51078e Mon Sep 17 00:00:00 2001
From: Oskar Weser <oskar.weser@gmail.com>
Date: Tue, 28 Jan 2025 18:04:05 -0500
Subject: [PATCH 04/11] allow more fine-grained control over van der waals
 radii

---
 src/quemb/molbe/chemfrag.py | 76 +++++++++++++++++++++++++++++++------
 1 file changed, 64 insertions(+), 12 deletions(-)

diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py
index 3b060bb6..e2fc22e1 100644
--- a/src/quemb/molbe/chemfrag.py
+++ b/src/quemb/molbe/chemfrag.py
@@ -1,8 +1,9 @@
 from collections import defaultdict
-from collections.abc import Sequence
-from typing import Final, NewType, cast
+from collections.abc import Mapping, Sequence
+from typing import Callable, Final, NewType, cast
 
 import chemcoord as cc
+import numpy as np
 from attr import define
 from chemcoord import Cartesian
 from ordered_set import OrderedSet
@@ -77,32 +78,61 @@ class ConnectivityData:
     """Data structure to store the connectivity data of a molecule."""
 
     #: The connectivity graph of the molecule.
-    bonds_atoms: Final[dict[AtomIdx, OrderedSet[AtomIdx]]]
+    bonds_atoms: Final[Mapping[AtomIdx, OrderedSet[AtomIdx]]]
     #: The heavy atoms/motifs in the molecule. If hydrogens are not treated differently
     #: then every hydrogen is also a motif on its own.
     motifs: Final[OrderedSet[MotifIdx]]
     #: The connectivity graph solely of the motifs,
     # i.e. of the heavy atoms when ignoring the hydrogen atoms.
-    bonds_motifs: Final[dict[MotifIdx, OrderedSet[MotifIdx]]]
+    bonds_motifs: Final[Mapping[MotifIdx, OrderedSet[MotifIdx]]]
     #: The hydrogen atoms in the molecule. If hydrogens are not treated differently,
     #: then this is an empty set.
     H_atoms: Final[OrderedSet[AtomIdx]]
     #: The hydrogen atoms per motif. If hydrogens are not treated differently,
     #: then the values of the dictionary are empty sets.
-    H_per_motif: Final[dict[MotifIdx, OrderedSet[AtomIdx]]]
+    H_per_motif: Final[Mapping[MotifIdx, OrderedSet[AtomIdx]]]
     #: All atoms per motif. Lists the motif/heavy atom first.
-    atoms_per_motif: Final[dict[MotifIdx, OrderedSet[AtomIdx]]]
+    atoms_per_motif: Final[Mapping[MotifIdx, OrderedSet[AtomIdx]]]
     #: Do we treat hydrogens differently?
     treat_H_different: Final[bool] = True
 
     @classmethod
-    def from_cartesian(cls, m: Cartesian, treat_H_different: bool = True) -> Self:
+    def from_cartesian(
+        cls,
+        m: Cartesian,
+        in_bonds_atoms: Mapping[int, OrderedSet[int]] | None = None,
+        in_vdW_radius: float
+        | Callable[[float], float]
+        | Mapping[str, float]
+        | None = None,
+        treat_H_different: bool = True,
+    ) -> Self:
         """Create a :class:`ConnectivityData` from a :class:`chemcoord.Cartesian`.
 
         Parameters
         ----------
         m :
             The Cartesian object to extract the connectivity data from.
+        bonds_atoms :
+            Can be used to specify the connectivity graph of the molecule.
+            Has exactly the same format as the output of
+            :meth:`chemcoord.Cartesian.get_bonds`,
+            which is called internally if this argument is not specified.
+            Allows it to manually change the connectivity by modifying the output of
+            :meth:`chemcoord.Cartesian.get_bonds`.
+            The keyword is mutually exclusive with :python:`vdW_radius`.
+        vdW_radius :
+            If :python:`bonds_atoms` is :class:`None`, then the connectivity graph is
+            determined by the van der Waals radius of the atoms.
+            It is possible to pass:
+
+            * a single float which is used for all atoms,
+            * a callable which is applied to all radii
+              and can be used to e.g. scale via :python:`lambda r: r * 1.1`,
+            * a dictionary which maps the element symbol to the van der Waals radius,
+              to change the radius of individual elements, e.g. :python:`{"C": 1.5}`.
+
+            The keyword is mutually exclusive with :python:`bonds_atoms`.
         treat_H_different :
             If True, we treat hydrogen atoms differently from heavy atoms.
         """
@@ -110,16 +140,38 @@ def from_cartesian(cls, m: Cartesian, treat_H_different: bool = True) -> Self:
             raise ValueError("We assume 0-indexed data for the rest of the code.")
         m = m.sort_index()
 
-        with cc.constants.RestoreElementData():
-            # temporarily increase van der Waals radius by 15 %
-            cc.constants.elements.loc[:, "atomic_radius_cc"] *= 1.15
-            bonds_atoms = {k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items()}
+        if in_bonds_atoms is not None and in_vdW_radius is not None:
+            raise ValueError("Cannot specify both in_bonds_atoms and in_vdW_radius.")
+
+        if in_bonds_atoms is not None:
+            bonds_atoms = cast(Mapping[AtomIdx, OrderedSet[AtomIdx]], in_bonds_atoms)
+        else:
+            with cc.constants.RestoreElementData():
+                # `used_vdW_r` is a view, not a copy !!!
+                used_vdW_r = cc.constants.elements.loc[:, "atomic_radius_cc"]
+                if isinstance(in_vdW_radius, float):
+                    used_vdW_r[:] = used_vdW_r.map(lambda _: in_vdW_radius)
+                if callable(in_vdW_radius):
+                    used_vdW_r[:] = used_vdW_r.map(in_vdW_radius)
+                elif isinstance(in_vdW_radius, Mapping):
+                    used_vdW_r.update(in_vdW_radius)  # type: ignore[arg-type]
+                else:
+                    # To avoid false-negatives we set all vdW radii to
+                    # at least 0.55 Å
+                    # or 20 % larger than the tabulated value.
+                    used_vdW_r[:] = np.maximum(0.55, used_vdW_r * 1.20)
+                bonds_atoms = {
+                    k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items()
+                }
 
         if treat_H_different:
             motifs = OrderedSet(m.loc[m.atom != "H", :].index)
         else:
             motifs = OrderedSet(m.index)
-        bonds_motif = {motif: bonds_atoms[motif] & motifs for motif in motifs}
+
+        bonds_motif: Mapping[MotifIdx, OrderedSet[MotifIdx]] = {
+            motif: motifs & bonds_atoms[motif] for motif in motifs
+        }
         H_atoms = OrderedSet(m.index).difference(motifs)
         H_per_motif = {motif: bonds_atoms[motif] & H_atoms for motif in motifs}
         atoms_per_motif = {

From cc27f59d8d526246aa89d29a30293106c9c123c1 Mon Sep 17 00:00:00 2001
From: Oskar Weser <oskar.weser@gmail.com>
Date: Tue, 28 Jan 2025 18:28:51 -0500
Subject: [PATCH 05/11] add easy manipulation of van der waals radii

---
 src/quemb/molbe/chemfrag.py | 77 ++++++++++++++++++++++++++++---------
 tests/test_chemfrag.py      | 19 +++++++++
 2 files changed, 77 insertions(+), 19 deletions(-)

diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py
index e2fc22e1..b5f5e477 100644
--- a/src/quemb/molbe/chemfrag.py
+++ b/src/quemb/molbe/chemfrag.py
@@ -1,11 +1,13 @@
 from collections import defaultdict
 from collections.abc import Mapping, Sequence
-from typing import Callable, Final, NewType, cast
+from numbers import Number
+from typing import Callable, Final, NewType, TypeAlias, cast, overload
 
 import chemcoord as cc
 import numpy as np
 from attr import define
 from chemcoord import Cartesian
+from chemcoord.constants import elements
 from ordered_set import OrderedSet
 from pyscf.gto import Mole
 from typing_extensions import Self
@@ -96,15 +98,45 @@ class ConnectivityData:
     #: Do we treat hydrogens differently?
     treat_H_different: Final[bool] = True
 
+    # make it explicit via overloads that some arguments are mutually exclusive
+    InVdWRadius: TypeAlias = Number | Callable[[Number], Number] | Mapping[str, Number]
+
+    @overload
     @classmethod
     def from_cartesian(
         cls,
         m: Cartesian,
-        in_bonds_atoms: Mapping[int, OrderedSet[int]] | None = None,
-        in_vdW_radius: float
-        | Callable[[float], float]
-        | Mapping[str, float]
-        | None = None,
+        *,
+        treat_H_different: bool = True,
+    ) -> Self: ...
+
+    @overload
+    @classmethod
+    def from_cartesian(
+        cls,
+        m: Cartesian,
+        *,
+        in_vdW_radius: InVdWRadius,
+        treat_H_different: bool = True,
+    ) -> Self: ...
+
+    @overload
+    @classmethod
+    def from_cartesian(
+        cls,
+        m: Cartesian,
+        *,
+        in_bonds_atoms: Mapping[int, set[int]],
+        treat_H_different: bool = True,
+    ) -> Self: ...
+
+    @classmethod
+    def from_cartesian(
+        cls,
+        m: Cartesian,
+        *,
+        in_bonds_atoms: Mapping[int, set[int]] | None = None,
+        in_vdW_radius: InVdWRadius | None = None,
         treat_H_different: bool = True,
     ) -> Self:
         """Create a :class:`ConnectivityData` from a :class:`chemcoord.Cartesian`.
@@ -113,7 +145,7 @@ def from_cartesian(
         ----------
         m :
             The Cartesian object to extract the connectivity data from.
-        bonds_atoms :
+        in_bonds_atoms : Mapping[int, OrderedSet[int]]
             Can be used to specify the connectivity graph of the molecule.
             Has exactly the same format as the output of
             :meth:`chemcoord.Cartesian.get_bonds`,
@@ -121,12 +153,12 @@ def from_cartesian(
             Allows it to manually change the connectivity by modifying the output of
             :meth:`chemcoord.Cartesian.get_bonds`.
             The keyword is mutually exclusive with :python:`vdW_radius`.
-        vdW_radius :
+        in_vdW_radius : Number | Callable[[Number], Number] | Mapping[str, Number]
             If :python:`bonds_atoms` is :class:`None`, then the connectivity graph is
             determined by the van der Waals radius of the atoms.
             It is possible to pass:
 
-            * a single float which is used for all atoms,
+            * a single Number which is used for all atoms,
             * a callable which is applied to all radii
               and can be used to e.g. scale via :python:`lambda r: r * 1.1`,
             * a dictionary which maps the element symbol to the van der Waals radius,
@@ -144,22 +176,28 @@ def from_cartesian(
             raise ValueError("Cannot specify both in_bonds_atoms and in_vdW_radius.")
 
         if in_bonds_atoms is not None:
-            bonds_atoms = cast(Mapping[AtomIdx, OrderedSet[AtomIdx]], in_bonds_atoms)
+            bonds_atoms = {
+                AtomIdx(k): OrderedSet([AtomIdx(j) for j in sorted(v)])
+                for k, v in m.get_bonds().items()
+            }
         else:
             with cc.constants.RestoreElementData():
-                # `used_vdW_r` is a view, not a copy !!!
-                used_vdW_r = cc.constants.elements.loc[:, "atomic_radius_cc"]
-                if isinstance(in_vdW_radius, float):
-                    used_vdW_r[:] = used_vdW_r.map(lambda _: in_vdW_radius)
-                if callable(in_vdW_radius):
-                    used_vdW_r[:] = used_vdW_r.map(in_vdW_radius)
+                used_vdW_r = elements.loc[:, "atomic_radius_cc"]
+                if isinstance(in_vdW_radius, Number):
+                    elements.loc[:, "atomic_radius_cc"] = used_vdW_r.map(
+                        lambda _: in_vdW_radius
+                    )
+                elif callable(in_vdW_radius):
+                    elements.loc[:, "atomic_radius_cc"] = used_vdW_r.map(in_vdW_radius)
                 elif isinstance(in_vdW_radius, Mapping):
-                    used_vdW_r.update(in_vdW_radius)  # type: ignore[arg-type]
+                    elements.loc[:, "atomic_radius_cc"].update(in_vdW_radius)  # type: ignore[arg-type]
                 else:
                     # To avoid false-negatives we set all vdW radii to
                     # at least 0.55 Å
                     # or 20 % larger than the tabulated value.
-                    used_vdW_r[:] = np.maximum(0.55, used_vdW_r * 1.20)
+                    elements.loc[:, "atomic_radius_cc"] = np.maximum(
+                        0.55, used_vdW_r * 1.20
+                    )
                 bonds_atoms = {
                     k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items()
                 }
@@ -393,7 +431,8 @@ def from_cartesian(
             If True, we treat hydrogen atoms differently from heavy atoms.
         """
         return cls.from_motifs(
-            ConnectivityData.from_cartesian(mol, treat_H_different), n_BE
+            ConnectivityData.from_cartesian(mol, treat_H_different=treat_H_different),
+            n_BE,
         )
 
     @classmethod
diff --git a/tests/test_chemfrag.py b/tests/test_chemfrag.py
index 40c06cf7..ce5fa57c 100644
--- a/tests/test_chemfrag.py
+++ b/tests/test_chemfrag.py
@@ -1523,3 +1523,22 @@ def test_agreement_with_autogen():
             # For the rest of the atoms the order can be different,
             # so we assert set equality
             assert set(chem_fragment) == set(auto_fragment)
+
+
+def test_manipulation_of_vdW():
+    m = Cartesian.read_xyz("data/octane.xyz")
+
+    conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=100)
+    for atom, connected in conn_data.bonds_atoms.items():
+        # check if everything is connected to everything
+        assert {atom} | connected == set(m.index)
+
+    conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=lambda r: r * 100)
+    for atom, connected in conn_data.bonds_atoms.items():
+        # check if everything is connected to everything
+        assert {atom} | connected == set(m.index)
+
+    conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius={"C": 100})
+    for i_carbon in m.loc[m.atom == "C"].index:
+        # check if carbons are connected to everything
+        assert {i_carbon} | conn_data.bonds_atoms[i_carbon] == set(m.index)

From fa035cdabd95d6b7c0c302f19a9b0b954d54734e Mon Sep 17 00:00:00 2001
From: Oskar Weser <oskar.weser@gmail.com>
Date: Tue, 28 Jan 2025 19:00:10 -0500
Subject: [PATCH 06/11] finished van der Waals modification

---
 src/quemb/molbe/chemfrag.py | 74 ++++++++++++++++++-------------------
 tests/test_chemfrag.py      |  2 +-
 2 files changed, 37 insertions(+), 39 deletions(-)

diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py
index 43c0763b..a5508102 100644
--- a/src/quemb/molbe/chemfrag.py
+++ b/src/quemb/molbe/chemfrag.py
@@ -1,7 +1,7 @@
 from collections import defaultdict
 from collections.abc import Mapping, Sequence
 from numbers import Number
-from typing import Callable, Final, NewType, TypeAlias, cast, overload
+from typing import Callable, Final, NewType, TypeAlias, cast
 
 import chemcoord as cc
 import numpy as np
@@ -75,6 +75,10 @@ def merge_seqs(*seqs: Sequence[T]) -> OrderedSet[T]:
     return OrderedSet().union(*seqs)  # type: ignore[arg-type]
 
 
+# The following can be passed van der Waals radius alternative.
+InVdWRadius: TypeAlias = Number | Callable[[Number], Number] | Mapping[str, Number]
+
+
 @define(frozen=True)
 class ConnectivityData:
     """Data structure to store the connectivity data of a molecule."""
@@ -98,38 +102,6 @@ class ConnectivityData:
     #: Do we treat hydrogens differently?
     treat_H_different: Final[bool] = True
 
-    # make it explicit via overloads that some arguments are mutually exclusive
-    InVdWRadius: TypeAlias = Number | Callable[[Number], Number] | Mapping[str, Number]
-
-    @overload
-    @classmethod
-    def from_cartesian(
-        cls,
-        m: Cartesian,
-        *,
-        treat_H_different: bool = True,
-    ) -> Self: ...
-
-    @overload
-    @classmethod
-    def from_cartesian(
-        cls,
-        m: Cartesian,
-        *,
-        in_vdW_radius: InVdWRadius,
-        treat_H_different: bool = True,
-    ) -> Self: ...
-
-    @overload
-    @classmethod
-    def from_cartesian(
-        cls,
-        m: Cartesian,
-        *,
-        in_bonds_atoms: Mapping[int, set[int]],
-        treat_H_different: bool = True,
-    ) -> Self: ...
-
     @classmethod
     def from_cartesian(
         cls,
@@ -189,8 +161,9 @@ def from_cartesian(
                     )
                 elif callable(in_vdW_radius):
                     elements.loc[:, "atomic_radius_cc"] = used_vdW_r.map(in_vdW_radius)
+
                 elif isinstance(in_vdW_radius, Mapping):
-                    elements.loc[:, "atomic_radius_cc"].update(in_vdW_radius)  # type: ignore[arg-type]
+                    elements.loc[:, "atomic_radius_cc"].update(in_vdW_radius)
                 else:
                     # To avoid false-negatives we set all vdW radii to
                     # at least 0.55 Å
@@ -417,7 +390,13 @@ def get_edges(i_origin: OriginIdx) -> OrderedSet[EdgeIdx]:
 
     @classmethod
     def from_cartesian(
-        cls, mol: Cartesian, n_BE: int, treat_H_different: bool = True
+        cls,
+        mol: Cartesian,
+        n_BE: int,
+        *,
+        treat_H_different: bool = True,
+        in_bonds_atoms: Mapping[int, set[int]] | None = None,
+        in_vdW_radius: InVdWRadius | None = None,
     ) -> Self:
         """Construct a :class:`FragmentedStructure` from a :class:`chemcoord.Cartesian`.
 
@@ -431,12 +410,25 @@ def from_cartesian(
             If True, we treat hydrogen atoms differently from heavy atoms.
         """
         return cls.from_motifs(
-            ConnectivityData.from_cartesian(mol, treat_H_different=treat_H_different),
+            ConnectivityData.from_cartesian(
+                mol,
+                treat_H_different=treat_H_different,
+                in_bonds_atoms=in_bonds_atoms,
+                in_vdW_radius=in_vdW_radius,
+            ),
             n_BE,
         )
 
     @classmethod
-    def from_Mol(cls, mol: Mole, n_BE: int, treat_H_different: bool = True) -> Self:
+    def from_Mol(
+        cls,
+        mol: Mole,
+        n_BE: int,
+        *,
+        treat_H_different: bool = True,
+        in_bonds_atoms: Mapping[int, set[int]] | None = None,
+        in_vdW_radius: InVdWRadius | None = None,
+    ) -> Self:
         """Construct a :class:`FragmentedStructure` from a :class:`pyscf.gto.mole.Mole`.
 
         Parameters
@@ -448,4 +440,10 @@ def from_Mol(cls, mol: Mole, n_BE: int, treat_H_different: bool = True) -> Self:
         treat_H_different :
             If True, we treat hydrogen atoms differently from heavy atoms.
         """
-        return cls.from_cartesian(Cartesian.from_pyscf(mol), n_BE, treat_H_different)
+        return cls.from_cartesian(
+            Cartesian.from_pyscf(mol),
+            n_BE,
+            treat_H_different=treat_H_different,
+            in_bonds_atoms=in_bonds_atoms,
+            in_vdW_radius=in_vdW_radius,
+        )
diff --git a/tests/test_chemfrag.py b/tests/test_chemfrag.py
index ce5fa57c..8b5e6377 100644
--- a/tests/test_chemfrag.py
+++ b/tests/test_chemfrag.py
@@ -1525,7 +1525,7 @@ def test_agreement_with_autogen():
             assert set(chem_fragment) == set(auto_fragment)
 
 
-def test_manipulation_of_vdW():
+def test_conn_data_manipulation_of_vdW():
     m = Cartesian.read_xyz("data/octane.xyz")
 
     conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=100)

From e79c8caf9a4b67aa26a9e06a9328d36497614182 Mon Sep 17 00:00:00 2001
From: Oskar Weser <oskar.weser@gmail.com>
Date: Tue, 28 Jan 2025 19:10:13 -0500
Subject: [PATCH 07/11] ensure that H are not shared among motifs

---
 src/quemb/molbe/chemfrag.py | 15 +++++++++++++++
 tests/test_chemfrag.py      | 19 ++++++++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py
index a5508102..3cb8ff60 100644
--- a/src/quemb/molbe/chemfrag.py
+++ b/src/quemb/molbe/chemfrag.py
@@ -189,6 +189,21 @@ def from_cartesian(
             motif: merge_seqs([motif], H_atoms)
             for motif, H_atoms in H_per_motif.items()
         }
+
+        def motifs_share_H() -> bool:
+            for i_motif, i_H_atoms in H_per_motif.items():
+                for j_motif, j_H_atoms in H_per_motif.items():
+                    if i_motif == j_motif:
+                        continue
+                    if i_H_atoms & j_H_atoms:
+                        return True
+            return False
+
+        if treat_H_different and motifs_share_H():
+            raise ValueError(
+                "Cannot treat hydrogens differently if motifs share hydrogens."
+            )
+
         return cls(
             bonds_atoms,
             motifs,
diff --git a/tests/test_chemfrag.py b/tests/test_chemfrag.py
index 8b5e6377..2af778b3 100644
--- a/tests/test_chemfrag.py
+++ b/tests/test_chemfrag.py
@@ -1,3 +1,4 @@
+import pytest
 from chemcoord import Cartesian
 from ordered_set import OrderedSet
 from pyscf.gto import M
@@ -1528,17 +1529,29 @@ def test_agreement_with_autogen():
 def test_conn_data_manipulation_of_vdW():
     m = Cartesian.read_xyz("data/octane.xyz")
 
-    conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=100)
+    # if hydrogens are shared among motifs we cannot treat H differently
+    with pytest.raises(ValueError):
+        conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=100)
+        conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=lambda r: r * 100)
+        conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius={"C": 100})
+
+    conn_data = ConnectivityData.from_cartesian(
+        m, in_vdW_radius=100, treat_H_different=False
+    )
     for atom, connected in conn_data.bonds_atoms.items():
         # check if everything is connected to everything
         assert {atom} | connected == set(m.index)
 
-    conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius=lambda r: r * 100)
+    conn_data = ConnectivityData.from_cartesian(
+        m, in_vdW_radius=lambda r: r * 100, treat_H_different=False
+    )
     for atom, connected in conn_data.bonds_atoms.items():
         # check if everything is connected to everything
         assert {atom} | connected == set(m.index)
 
-    conn_data = ConnectivityData.from_cartesian(m, in_vdW_radius={"C": 100})
+    conn_data = ConnectivityData.from_cartesian(
+        m, in_vdW_radius={"C": 100}, treat_H_different=False
+    )
     for i_carbon in m.loc[m.atom == "C"].index:
         # check if carbons are connected to everything
         assert {i_carbon} | conn_data.bonds_atoms[i_carbon] == set(m.index)

From 5bff0b999d27e08af43953de9a9f0bdb55cddb45 Mon Sep 17 00:00:00 2001
From: Oskar Weser <oskar.weser@gmail.com>
Date: Tue, 28 Jan 2025 19:11:42 -0500
Subject: [PATCH 08/11] ignore missing link to Number

---
 docs/source/nitpick-exceptions | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/nitpick-exceptions b/docs/source/nitpick-exceptions
index 31aa8f13..777822d9 100644
--- a/docs/source/nitpick-exceptions
+++ b/docs/source/nitpick-exceptions
@@ -1,4 +1,5 @@
 py:class optional
 py:class numpy.float64
 py:class OrderedSet
-py:class ordered_set.OrderedSet
\ No newline at end of file
+py:class ordered_set.OrderedSet
+py:class Number
\ No newline at end of file

From 33a729b2bbdd3a6def69cd5164103e1d1dc08341 Mon Sep 17 00:00:00 2001
From: Oskar Weser <oskar.weser@gmail.com>
Date: Tue, 28 Jan 2025 19:19:00 -0500
Subject: [PATCH 09/11] fix doc

---
 src/quemb/molbe/chemfrag.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py
index 3cb8ff60..73494dc3 100644
--- a/src/quemb/molbe/chemfrag.py
+++ b/src/quemb/molbe/chemfrag.py
@@ -124,9 +124,9 @@ def from_cartesian(
             which is called internally if this argument is not specified.
             Allows it to manually change the connectivity by modifying the output of
             :meth:`chemcoord.Cartesian.get_bonds`.
-            The keyword is mutually exclusive with :python:`vdW_radius`.
+            The keyword is mutually exclusive with :python:`in_vdW_radius`.
         in_vdW_radius : Number | Callable[[Number], Number] | Mapping[str, Number]
-            If :python:`bonds_atoms` is :class:`None`, then the connectivity graph is
+            If :python:`in_bonds_atoms` is :class:`None`, then the connectivity graph is
             determined by the van der Waals radius of the atoms.
             It is possible to pass:
 
@@ -136,7 +136,7 @@ def from_cartesian(
             * a dictionary which maps the element symbol to the van der Waals radius,
               to change the radius of individual elements, e.g. :python:`{"C": 1.5}`.
 
-            The keyword is mutually exclusive with :python:`bonds_atoms`.
+            The keyword is mutually exclusive with :python:`in_bonds_atoms`.
         treat_H_different :
             If True, we treat hydrogen atoms differently from heavy atoms.
         """

From 6128157dbac530ad283bd0d7b304b681064300c9 Mon Sep 17 00:00:00 2001
From: Oskar Weser <oskar.weser@gmail.com>
Date: Tue, 28 Jan 2025 19:29:26 -0500
Subject: [PATCH 10/11] fixed bug

---
 src/quemb/molbe/chemfrag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py
index 73494dc3..3d432e2d 100644
--- a/src/quemb/molbe/chemfrag.py
+++ b/src/quemb/molbe/chemfrag.py
@@ -150,7 +150,7 @@ def from_cartesian(
         if in_bonds_atoms is not None:
             bonds_atoms = {
                 AtomIdx(k): OrderedSet([AtomIdx(j) for j in sorted(v)])
-                for k, v in m.get_bonds().items()
+                for k, v in in_bonds_atoms.items()
             }
         else:
             with cc.constants.RestoreElementData():

From f823e134f2597bce43abb6bb6623c81699a9f0b2 Mon Sep 17 00:00:00 2001
From: Oskar Weser <oskar.weser@gmail.com>
Date: Tue, 28 Jan 2025 19:30:57 -0500
Subject: [PATCH 11/11] finished

---
 src/quemb/molbe/chemfrag.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/quemb/molbe/chemfrag.py b/src/quemb/molbe/chemfrag.py
index 3d432e2d..a4dc2bac 100644
--- a/src/quemb/molbe/chemfrag.py
+++ b/src/quemb/molbe/chemfrag.py
@@ -164,13 +164,17 @@ def from_cartesian(
 
                 elif isinstance(in_vdW_radius, Mapping):
                     elements.loc[:, "atomic_radius_cc"].update(in_vdW_radius)
-                else:
+                elif in_vdW_radius is None:
                     # To avoid false-negatives we set all vdW radii to
                     # at least 0.55 Å
                     # or 20 % larger than the tabulated value.
                     elements.loc[:, "atomic_radius_cc"] = np.maximum(
                         0.55, used_vdW_r * 1.20
                     )
+                else:
+                    raise TypeError(
+                        f"Invalid type {type(in_vdW_radius)} for in_vdW_radius."
+                    )
                 bonds_atoms = {
                     k: OrderedSet(sorted(v)) for k, v in m.get_bonds().items()
                 }