diff --git a/.deepsource.toml b/.deepsource.toml index 02f9532d..0ac63d33 100644 --- a/.deepsource.toml +++ b/.deepsource.toml @@ -1,16 +1,7 @@ version = 1 -test_patterns = ["tests/**"] - -exclude_patterns = [ - "docs/", - "dev/", - "binder/" -] - [[analyzers]] name = "python" -enabled = true [analyzers.meta] - runtime_version = "3.x.x" + runtime_version = "3.x.x" \ No newline at end of file diff --git a/src/mofdscribe/featurizers/topology/_tda_helpers.py b/src/mofdscribe/featurizers/topology/_tda_helpers.py index b2d58744..95f2f7b6 100644 --- a/src/mofdscribe/featurizers/topology/_tda_helpers.py +++ b/src/mofdscribe/featurizers/topology/_tda_helpers.py @@ -11,7 +11,7 @@ from mofdscribe.featurizers.utils import flat from mofdscribe.featurizers.utils.aggregators import MA_ARRAY_AGGREGATORS -from mofdscribe.featurizers.utils.substructures import filter_element +from mofdscribe.featurizers.utils.substructures import filter_element_for_ph # @np_cache @@ -104,8 +104,8 @@ def make_supercell( xyz_periodic_copies = [] element_copies = [] - # xyz_periodic_copies.append(coords) - # element_copies.append(np.array(elements).reshape(-1,1)) + xyz_periodic_copies.append(coords) + element_copies.append(np.array(elements).reshape(-1,1)) min_range = -3 # we aren't going in the minimum direction too much, so can make this small max_range = 20 # make this large enough, but can modify if wanting an even larger cell @@ -228,7 +228,7 @@ def get_images( # ToDo: only do this for all if we want def get_persistent_images_for_structure( structure: Structure, - elements: List[List[str]], + elements: List[str], compute_for_all_elements: bool = True, min_size: int = 20, spread: float = 0.2, @@ -245,7 +245,7 @@ def get_persistent_images_for_structure( Args: structure (Structure): input structure - elements (List[List[str]]): list of elements to compute for + elements (List[str]): list of element groups to compute for compute_for_all_elements (bool): compute for all elements min_size (int): 
minimum size of the cell for construction of persistent images spread (float): spread of kernel for construction @@ -273,9 +273,9 @@ def get_persistent_images_for_structure( specs = [] for mb, mp in zip(max_b, max_p): specs.append({"minBD": 0, "maxB": mb, "maxP": mp}) - for element in elements: + for elements_group in elements: try: - filtered_structure = filter_element(structure, element) + filtered_structure = filter_element_for_ph(structure, elements_group) coords, _weights, _elements = _coords_for_structure( filtered_structure, min_size=min_size, @@ -294,7 +294,7 @@ def get_persistent_images_for_structure( dimensions=(0, 1, 2), ) except Exception: - logger.exception(f"Error computing persistent images for {element}") + logger.exception(f"Error computing persistent images for {elements_group}") images = {} for dim in [0, 1, 2]: im = np.zeros((pixels[0], pixels[1])) @@ -304,8 +304,8 @@ def get_persistent_images_for_structure( persistent_dia[:] = np.nan # ToDo: make sure that we have the correct length - element_images["image"][element] = images - element_images["array"][element] = persistent_dia + element_images["image"][elements_group] = images + element_images["array"][elements_group] = persistent_dia if compute_for_all_elements: try: @@ -391,7 +391,7 @@ def get_diagrams_for_structure( nan_array[:] = np.nan for element in elements: try: - filtered_structure = filter_element(structure, element) + filtered_structure = filter_element_for_ph(structure, element) coords, weights, _elements = _coords_for_structure( filtered_structure, min_size=min_size, @@ -442,7 +442,7 @@ def get_persistence_image_limits_for_structure( limits = defaultdict(list) for element in elements: try: - filtered_structure = filter_element(structure, element) + filtered_structure = filter_element_for_ph(structure, element) coords, weights, _elements = _coords_for_structure( filtered_structure, diff --git a/src/mofdscribe/featurizers/topology/ph_hist.py 
b/src/mofdscribe/featurizers/topology/ph_hist.py index 4a814ef8..69f657a4 100644 --- a/src/mofdscribe/featurizers/topology/ph_hist.py +++ b/src/mofdscribe/featurizers/topology/ph_hist.py @@ -34,14 +34,14 @@ class PHHist(MOFBaseFeaturizer): def __init__( self, - atom_types: Tuple[str] = ( + atom_types: List[str] = [ "C-H-N-O", "F-Cl-Br-I", "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V" "-Ag-Nd-U-Ba-Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-" "Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-" "Th-Np-Lu-Rh-Pu", - ), + ], compute_for_all_elements: bool = True, dimensions: Tuple[int] = (1, 2), min_size: int = 20, @@ -57,12 +57,12 @@ def __init__( """Initialize the PHStats object. Args: - atom_types (tuple): Atoms that are used to create substructures + atom_types (list): Atoms that are used to create substructures for which the persistent homology statistics are computed. - Defaults to ( "C-H-N-O", "F-Cl-Br-I", + Defaults to [ "C-H-N-O", "F-Cl-Br-I", "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V-Ag-Nd-U-Ba-Ce-K-Ga- Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti- - Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ). + Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ]. compute_for_all_elements (bool): Compute descriptor for original structure with all atoms. Defaults to True. dimensions (Tuple[int]): Dimensions of topological features to consider. 
@@ -89,9 +89,10 @@ def __init__( """ atom_types = [] if atom_types is None else atom_types self.elements = atom_types - self.atom_types = ( - list(atom_types) + ["all"] if compute_for_all_elements else list(atom_types) - ) + if compute_for_all_elements: + self.atom_types = list(atom_types) + ["all"] + else: + self.atom_types = list(atom_types) self.compute_for_all_elements = compute_for_all_elements self.dimensions = dimensions self.min_size = min_size diff --git a/src/mofdscribe/featurizers/topology/ph_image.py b/src/mofdscribe/featurizers/topology/ph_image.py index cb637ff8..805f36e4 100644 --- a/src/mofdscribe/featurizers/topology/ph_image.py +++ b/src/mofdscribe/featurizers/topology/ph_image.py @@ -53,14 +53,14 @@ class PHImage(MOFBaseFeaturizer): def __init__( self, - atom_types: Optional[Tuple[str]] = ( + atom_types: Optional[List[str]] = [ "C-H-N-O", "F-Cl-Br-I", "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V" "-Ag-Nd-U-Ba-Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-" "Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-" - "Th-Np-Lu-Rh-Pu", - ), + "Th-Np-Lu-Rh-Pu" + ], dimensions: Tuple[int] = (0, 1, 2), compute_for_all_elements: bool = True, min_size: int = 20, @@ -78,14 +78,14 @@ def __init__( """Construct a PHImage object. Args: - atom_types (Tuple[str], optional): Atoms that are used to create + atom_types (List[str], optional): Atoms that are used to create substructures that are analysed using persistent homology. If multiple atom types separated by hash are provided, e.g. "C-H-N-O", then the substructure consists of all atoms of type - C, H, N, or O. Defaults to ( "C-H-N-O", "F-Cl-Br-I", + C, H, N, or O. Defaults to [ "C-H-N-O", "F-Cl-Br-I", "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V-Ag-Nd-U-Ba- Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-Ho-Re-Be-Rb-La-Sn-Cs-Pb- - Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu"). + Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu"].
dimensions (Tuple[int]): Dimensions of topological features to consider for persistence images. Defaults to (0, 1, 2). compute_for_all_elements (bool): If true, compute @@ -181,14 +181,14 @@ def get_birth_persistance_death_from_pixel(self, dimension: int, x: int, y: int) def _get_feature_labels(self) -> List[str]: labels = [] - _elements = list(self.atom_types) + _elements = list(self.atom_types) if self.compute_for_all_elements: _elements.append("all") - for element in _elements: + for elements_group in _elements: for dim in self.dimensions: for pixel_a in range(self.image_size[0]): for pixel_b in range(self.image_size[1]): - labels.append(f"phimage_{element}_{dim}_{pixel_a}_{pixel_b}") + labels.append(f"phimage_{elements_group}_{dim}_{pixel_a}_{pixel_b}") return labels @@ -293,7 +293,7 @@ def _featurize( alpha_weighting=self.alpha_weight, ) features = [] - elements = list(self.atom_types) + elements = list(self.atom_types) if self.compute_for_all_elements: elements.append("all") for element in elements: diff --git a/src/mofdscribe/featurizers/topology/ph_stats.py b/src/mofdscribe/featurizers/topology/ph_stats.py index 841a264d..0e794880 100644 --- a/src/mofdscribe/featurizers/topology/ph_stats.py +++ b/src/mofdscribe/featurizers/topology/ph_stats.py @@ -34,14 +34,14 @@ class PHStats(MOFBaseFeaturizer): def __init__( self, - atom_types: Tuple[str] = ( + atom_types: List[str] = [ "C-H-N-O", "F-Cl-Br-I", "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V" "-Ag-Nd-U-Ba-Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-" "Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-" "Th-Np-Lu-Rh-Pu", - ), + ], compute_for_all_elements: bool = True, dimensions: Tuple[int] = (1, 2), min_size: int = 20, @@ -54,12 +54,12 @@ def __init__( """Initialize the PHStats object. Args: - atom_types (tuple): Atoms that are used to create substructures + atom_types (list): Atoms that are used to create substructures for which the persistent homology statistics are computed.
- Defaults to ( "C-H-N-O", "F-Cl-Br-I", + Defaults to [ "C-H-N-O", "F-Cl-Br-I", "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V-Ag-Nd-U-Ba-Ce-K-Ga- Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti- - Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ). + Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ]. compute_for_all_elements (bool): Compute descriptor for original structure with all atoms. Defaults to True. dimensions (Tuple[int]): Dimensions of topological features to consider. @@ -81,9 +81,10 @@ def __init__( """ atom_types = [] if atom_types is None else atom_types self.elements = atom_types - self.atom_types = ( - list(atom_types) + ["all"] if compute_for_all_elements else list(atom_types) - ) + if compute_for_all_elements: + self.atom_types = list(atom_types) + ["all"] + else: + self.atom_types = list(atom_types) self.compute_for_all_elements = compute_for_all_elements self.dimensions = dimensions self.min_size = min_size diff --git a/src/mofdscribe/featurizers/topology/ph_vect.py b/src/mofdscribe/featurizers/topology/ph_vect.py index 9bc21475..8463d01d 100644 --- a/src/mofdscribe/featurizers/topology/ph_vect.py +++ b/src/mofdscribe/featurizers/topology/ph_vect.py @@ -140,14 +140,14 @@ class PHVect(MOFBaseFeaturizer): def __init__( self, - atom_types: Tuple[str] = ( + atom_types: List[str] = [ "C-H-N-O", "F-Cl-Br-I", "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V" "-Ag-Nd-U-Ba-Ce-K-Ga-Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-" "Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti-Hf-Ir-Nb-Pd-Hg-" "Th-Np-Lu-Rh-Pu", - ), + ], compute_for_all_elements: bool = True, dimensions: Tuple[int] = (1, 2), min_size: int = 20, @@ -165,14 +165,14 @@ def __init__( """Construct a PHVect instance. Args: - atom_types (tuple): Atoms that are used to create substructures + atom_types (list): Atoms that are used to create substructures that are analysed using persistent homology. If multiple atom types separated by hash are provided, e.g.
"C-H-N-O", then the substructure consists of all atoms of type C, H, N, or O. - Defaults to ( "C-H-N-O", "F-Cl-Br-I", + Defaults to [ "C-H-N-O", "F-Cl-Br-I", "Cu-Mn-Ni-Mo-Fe-Pt-Zn-Ca-Er-Au-Cd-Co-Gd-Na-Sm-Eu-Tb-V-Ag-Nd-U-Ba-Ce-K-Ga- Cr-Al-Li-Sc-Ru-In-Mg-Zr-Dy-W-Yb-Y-Ho-Re-Be-Rb-La-Sn-Cs-Pb-Pr-Bi-Tm-Sr-Ti- - Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ). + Hf-Ir-Nb-Pd-Hg-Th-Np-Lu-Rh-Pu", ]. compute_for_all_elements (bool): If true, compute persistence images for full structure (i.e. with all elements). If false, it will only do it for the substructures specified with `atom_types`. Defaults to True. @@ -209,9 +209,10 @@ def __init__( """ atom_types = [] if atom_types is None else atom_types self.elements = atom_types - self.atom_types = ( - list(atom_types) + ["all"] if compute_for_all_elements else list(atom_types) - ) + if compute_for_all_elements: + self.atom_types = list(atom_types) + ["all"] + else: + self.atom_types = list(atom_types) self.compute_for_all_elements = compute_for_all_elements self.min_size = min_size self.dimensions = dimensions diff --git a/src/mofdscribe/featurizers/utils/substructures.py b/src/mofdscribe/featurizers/utils/substructures.py index 337f5f2d..2b6ba50d 100644 --- a/src/mofdscribe/featurizers/utils/substructures.py +++ b/src/mofdscribe/featurizers/utils/substructures.py @@ -35,6 +35,39 @@ def filter_element( return Structure.from_sites(keep_sites) else: # input is molecule or IMolecule return Molecule.from_sites(keep_sites) + + +def filter_element_for_ph( + structure: Union[Structure, IStructure, Molecule, IMolecule], elements: str +) -> Union[Structure, Molecule, None]: + """Filter a structure by a hyphen-separated group of elements.
+ + Args: + structure (Union[Structure, IStructure, Molecule, IMolecule]): input structure + elements (str): element symbol or hyphen-separated group of symbols, e.g. "C-H-N-O" + + Returns: + filtered_structure (Union[Structure, Molecule, None]): structure or molecule containing only the matching sites, or None if no site matches + """ + elements_ = [] + elements_group = (elements,) + for atom_type in elements_group: + if "-" in atom_type: + elements_.extend(atom_type.split("-")) + else: + elements_.append(atom_type) + keep_sites = [] + for site in structure.sites: + if site.specie.symbol in elements_: + keep_sites.append(site) + if len(keep_sites) == 0: + return None + + input_is_structure = isinstance(structure, (Structure, IStructure)) + if input_is_structure: + return Structure.from_sites(keep_sites) + else: # input is molecule or IMolecule + return Molecule.from_sites(keep_sites) def elements_in_structure(structure: Structure) -> List[str]: