kjappelbaum · XiaoqZhang · Aug 14, 2024 · Aug 14, 2024 · kjappelbaum · Aug 18, 2024
diff --git a/src/mofdscribe/featurizers/topology/_tda_helpers.py b/src/mofdscribe/featurizers/topology/_tda_helpers.py
@@ -11,7 +11,7 @@
 
 from mofdscribe.featurizers.utils import flat
 from mofdscribe.featurizers.utils.aggregators import MA_ARRAY_AGGREGATORS
-from mofdscribe.featurizers.utils.substructures import filter_element
+from mofdscribe.featurizers.utils.substructures import filter_element_for_ph
 
 
 # @np_cache
@@ -104,8 +104,8 @@ def make_supercell(
     xyz_periodic_copies = []
     element_copies = []
 
-    # xyz_periodic_copies.append(coords)
-    # element_copies.append(np.array(elements).reshape(-1,1))
+    xyz_periodic_copies.append(coords)
+    element_copies.append(np.array(elements).reshape(-1,1))
     min_range = -3  # we aren't going in the minimum direction too much, so can make this small
     max_range = 20  # make this large enough, but can modify if wanting an even larger cell
 
@@ -228,7 +228,7 @@ def get_images(
 # ToDo: only do this for all if we want
 def get_persistent_images_for_structure(
     structure: Structure,
-    elements: List[List[str]],
+    elements: List[str],
     compute_for_all_elements: bool = True,
     min_size: int = 20,
     spread: float = 0.2,
@@ -245,7 +245,7 @@ def get_persistent_images_for_structure(
 
     Args:
         structure (Structure): input structure
-        elements (List[List[str]]): list of elements to compute for
+        elements (List[str]): list of element groups to compute for
         compute_for_all_elements (bool): compute for all elements
         min_size (int): minimum size of the cell for construction of persistent images
         spread (float): spread of kernel for construction
@@ -273,9 +273,9 @@ def get_persistent_images_for_structure(
     specs = []
     for mb, mp in zip(max_b, max_p):
         specs.append({"minBD": 0, "maxB": mb, "maxP": mp})
-    for element in elements:
+    for elements_group in elements:
         try:
-            filtered_structure = filter_element(structure, element)
+            filtered_structure = filter_element_for_ph(structure, elements_group)
             coords, _weights, _elements = _coords_for_structure(
                 filtered_structure,
                 min_size=min_size,
@@ -294,7 +294,7 @@ def get_persistent_images_for_structure(
                 dimensions=(0, 1, 2),
             )
         except Exception:
-            logger.exception(f"Error computing persistent images for {element}")
+            logger.exception(f"Error computing persistent images for {elements_group}")
             images = {}
             for dim in [0, 1, 2]:
                 im = np.zeros((pixels[0], pixels[1]))
@@ -304,8 +304,8 @@ def get_persistent_images_for_structure(
             persistent_dia[:] = np.nan
 
         # ToDo: make sure that we have the correct length
-        element_images["image"][element] = images
-        element_images["array"][element] = persistent_dia
+        element_images["image"][elements_group] = images
+        element_images["array"][elements_group] = persistent_dia
 
     if compute_for_all_elements:
         try:
@@ -391,7 +391,7 @@ def get_diagrams_for_structure(
     nan_array[:] = np.nan
     for element in elements:
         try:
-            filtered_structure = filter_element(structure, element)
+            filtered_structure = filter_element_for_ph(structure, element)
             coords, weights, _elements = _coords_for_structure(
                 filtered_structure,
                 min_size=min_size,
@@ -442,7 +442,7 @@ def get_persistence_image_limits_for_structure(
     limits = defaultdict(list)
     for element in elements:
         try:
-            filtered_structure = filter_element(structure, element)
+            filtered_structure = filter_element_for_ph(structure, element)
 
             coords, weights, _elements = _coords_for_structure(
                 filtered_structure,

diff --git a/src/mofdscribe/featurizers/topology/ph_hist.py b/src/mofdscribe/featurizers/topology/ph_hist.py
@@ -34,14 +34,14 @@ class PHHist(MOFBaseFeaturizer):
 
     def __init__(
         self,
-        atom_types: Tuple[str] = (
+        atom_types: List[str] = [
             "C-H-N-O",
             "F-Cl-Br-I",
             "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V"
             "-Ag-Nd-U-Ba-Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-"
             "Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-"
             "Th-Np-Lu-Rh-Pu",
-        ),
+        ],
         compute_for_all_elements: bool = True,
         dimensions: Tuple[int] = (1, 2),
         min_size: int = 20,
@@ -57,12 +57,12 @@ def __init__(
         """Initialize the PHStats object.
 
         Args:
-            atom_types (tuple): Atoms that are used to create substructures
+            atom_types (list): Atoms that are used to create substructures
                 for which the persistent homology statistics are computed.
-                Defaults to ( "C-H-N-O", "F-Cl-Br-I",
+                Defaults to [ "C-H-N-O", "F-Cl-Br-I",
                 "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V-Ag-Nd-U-Ba-Ce-K-Ga-
                 Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-
-                Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ).
+                Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ].
             compute_for_all_elements (bool): Compute descriptor for original structure with all atoms.
                 Defaults to True.
             dimensions (Tuple[int]): Dimensions of topological features to consider.
@@ -89,9 +89,10 @@ def __init__(
         """
         atom_types = [] if atom_types is None else atom_types
         self.elements = atom_types
-        self.atom_types = (
-            list(atom_types) + ["all"] if compute_for_all_elements else list(atom_types)
-        )
+        if compute_for_all_elements:
+            self.atom_types = atom_types + ["all"]
+        else:
+            self.atom_types = atom_types
         self.compute_for_all_elements = compute_for_all_elements
         self.dimensions = dimensions
         self.min_size = min_size

diff --git a/src/mofdscribe/featurizers/topology/ph_image.py b/src/mofdscribe/featurizers/topology/ph_image.py
@@ -53,14 +53,14 @@ class PHImage(MOFBaseFeaturizer):
 
     def __init__(
         self,
-        atom_types: Optional[Tuple[str]] = (
+        atom_types: Optional[List[str]] = [
             "C-H-N-O",
             "F-Cl-Br-I",
             "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V"
             "-Ag-Nd-U-Ba-Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-"
             "Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-"
-            "Th-Np-Lu-Rh-Pu",
-        ),
+            "Th-Np-Lu-Rh-Pu"
+        ],
         dimensions: Tuple[int] = (0, 1, 2),
         compute_for_all_elements: bool = True,
         min_size: int = 20,
@@ -78,14 +78,14 @@ def __init__(
         """Construct a PHImage object.
 
         Args:
-            atom_types (Tuple[str], optional): Atoms that are used to create
+            atom_types (List[str], optional): Atoms that are used to create
                 substructures that are analysed using persistent homology.
-                If multiple atom types separated by hash are provided, e.g.
+                If multiple atom types separated by hyphen are provided, e.g.
                 "C-H-N-O", then the substructure consists of all atoms of type
-                C, H, N, or O. Defaults to ( "C-H-N-O", "F-Cl-Br-I",
+                C, H, N, or O. Defaults to [ "C-H-N-O", "F-Cl-Br-I",
                 "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V-Ag-Nd-U-Ba-
                 Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-Ho-Re-Be-Rb-La-Sn-Cs-Pb-
-                Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu").
+                Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu"].
             dimensions (Tuple[int]): Dimensions of topological
                 features to consider for persistence images. Defaults to (0, 1, 2).
             compute_for_all_elements (bool): If true, compute
@@ -181,14 +181,14 @@ def get_birth_persistance_death_from_pixel(self, dimension: int, x: int, y: int)
 
     def _get_feature_labels(self) -> List[str]:
         labels = []
-        _elements = list(self.atom_types)
+        _elements = list(self.atom_types)  # copy, so "all" never leaks into self.atom_types
         if self.compute_for_all_elements:
             _elements.append("all")
-        for element in _elements:
+        for elements_group in _elements:
             for dim in self.dimensions:
                 for pixel_a in range(self.image_size[0]):
                     for pixel_b in range(self.image_size[1]):
-                        labels.append(f"phimage_{element}_{dim}_{pixel_a}_{pixel_b}")
+                        labels.append(f"phimage_{elements_group}_{dim}_{pixel_a}_{pixel_b}")
 
         return labels
 
@@ -293,7 +293,7 @@ def _featurize(
             alpha_weighting=self.alpha_weight,
         )
         features = []
-        elements = list(self.atom_types)
+        elements = self.atom_types
         if self.compute_for_all_elements:
             elements.append("all")
         for element in elements:

diff --git a/src/mofdscribe/featurizers/topology/ph_stats.py b/src/mofdscribe/featurizers/topology/ph_stats.py
@@ -34,14 +34,14 @@ class PHStats(MOFBaseFeaturizer):
 
     def __init__(
         self,
-        atom_types: Tuple[str] = (
+        atom_types: List[str] = [
             "C-H-N-O",
             "F-Cl-Br-I",
             "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V"
             "-Ag-Nd-U-Ba-Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-"
             "Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-"
             "Th-Np-Lu-Rh-Pu",
-        ),
+        ],
         compute_for_all_elements: bool = True,
         dimensions: Tuple[int] = (1, 2),
         min_size: int = 20,
@@ -54,12 +54,12 @@ def __init__(
         """Initialize the PHStats object.
 
         Args:
-            atom_types (tuple): Atoms that are used to create substructures
+            atom_types (list): Atoms that are used to create substructures
                 for which the persistent homology statistics are computed.
-                Defaults to ( "C-H-N-O", "F-Cl-Br-I",
+                Defaults to [ "C-H-N-O", "F-Cl-Br-I",
                 "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V-Ag-Nd-U-Ba-Ce-K-Ga-
                 Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-
-                Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ).
+                Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ].
             compute_for_all_elements (bool): Compute descriptor for original structure with all atoms.
                 Defaults to True.
             dimensions (Tuple[int]): Dimensions of topological features to consider.
@@ -81,9 +81,10 @@ def __init__(
         """
         atom_types = [] if atom_types is None else atom_types
         self.elements = atom_types
-        self.atom_types = (
-            list(atom_types) + ["all"] if compute_for_all_elements else list(atom_types)
-        )
+        if compute_for_all_elements:
+            self.atom_types = atom_types + ["all"]
+        else:
+            self.atom_types = atom_types
         self.compute_for_all_elements = compute_for_all_elements
         self.dimensions = dimensions
         self.min_size = min_size

diff --git a/src/mofdscribe/featurizers/topology/ph_vect.py b/src/mofdscribe/featurizers/topology/ph_vect.py
@@ -140,14 +140,14 @@ class PHVect(MOFBaseFeaturizer):
 
     def __init__(
         self,
-        atom_types: Tuple[str] = (
+        atom_types: List[str] = [
             "C-H-N-O",
             "F-Cl-Br-I",
             "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V"
             "-Ag-Nd-U-Ba-Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-"
             "Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-"
             "Th-Np-Lu-Rh-Pu",
-        ),
+        ],
         compute_for_all_elements: bool = True,
         dimensions: Tuple[int] = (1, 2),
         min_size: int = 20,
@@ -165,14 +165,14 @@ def __init__(
         """Construct a PHVect instance.
 
         Args:
-            atom_types (tuple): Atoms that are used to create substructures
+            atom_types (list): Atoms that are used to create substructures
                 that are analysed using persistent homology.
-                If multiple atom types separated by hash are provided,
+                If multiple atom types separated by hyphen are provided,
                 e.g. "C-H-N-O", then the substructure consists of all atoms of type C, H, N, or O.
-                Defaults to ( "C-H-N-O", "F-Cl-Br-I",
+                Defaults to [ "C-H-N-O", "F-Cl-Br-I",
                 "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V-Ag-Nd-U-Ba-Ce-K-Ga-
                 Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-
-                Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ).
+                Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ].
             compute_for_all_elements (bool): If true, compute persistence images
                 for full structure (i.e. with all elements). If false, it will only do it
                 for the substructures specified with `atom_types`. Defaults to True.
@@ -209,9 +209,10 @@ def __init__(
         """
         atom_types = [] if atom_types is None else atom_types
         self.elements = atom_types
-        self.atom_types = (
-            list(atom_types) + ["all"] if compute_for_all_elements else list(atom_types)
-        )
+        if compute_for_all_elements:
+            self.atom_types = atom_types + ["all"]
+        else:
+            self.atom_types = atom_types
         self.compute_for_all_elements = compute_for_all_elements
         self.min_size = min_size
         self.dimensions = dimensions

diff --git a/src/mofdscribe/featurizers/utils/substructures.py b/src/mofdscribe/featurizers/utils/substructures.py
@@ -35,6 +35,39 @@ def filter_element(
         return Structure.from_sites(keep_sites)
     else:  # input is molecule or IMolecule
         return Molecule.from_sites(keep_sites)
+
+
+def filter_element_for_ph(
+    structure: Union[Structure, IStructure, Molecule, IMolecule], elements: str
+) -> Union[Structure, Molecule, None]:
+    """Keep only the sites whose element belongs to a group of elements.
+
+    Args:
+        structure (Union[Structure, IStructure, Molecule, IMolecule]): input structure
+        elements (str): a single element symbol, or several symbols joined
+            by "-" (e.g. "C-H-N-O"); sites of any listed element are kept
+
+    Returns:
+        Union[Structure, Molecule, None]: a Structure if the input is a
+            Structure or IStructure, otherwise a Molecule, built from the
+            kept sites; None if no site matches
+    """
+    # str.split returns [elements] when there is no "-", so a single symbol
+    # needs no special case. A set makes the per-site membership test O(1).
+    symbols = set(elements.split("-"))
+    keep_sites = [
+        site
+        for site in structure.sites
+        if site.specie.symbol in symbols
+    ]
+    # Return None rather than an empty structure when nothing matches.
+    if not keep_sites:
+        return None
+
+    if isinstance(structure, (Structure, IStructure)):
+        return Structure.from_sites(keep_sites)
+    # input is Molecule or IMolecule
+    return Molecule.from_sites(keep_sites)
 
 
 def elements_in_structure(structure: Structure) -> List[str]: