update: progress in rewriting

Exabyte-io · Oct 25, 2024 · 864a7cc · 864a7cc
1 parent 7183e4c
commit 864a7cc
Show file tree

Hide file tree

Showing 10 changed files with 288 additions and 30 deletions.
diff --git a/src/py/mat3ra/standata/__init__.py b/src/py/mat3ra/standata/__init__.py
@@ -1,30 +0,0 @@
-import re
-from typing import Dict, List
-
-
-class Standata:
-
-    # Override in children
-    data: dict = {"filesMapByName": []}
-
-    @classmethod
-    def files_map_by_name(cls) -> Dict[str, dict]:
-        return cls.data["filesMapByName"]
-
-    @classmethod
-    def get_as_list(cls):
-        return list(cls.files_map_by_name().values())
-
-    @classmethod
-    def get_by_name(cls, name: str) -> List[dict]:
-        """Returns entity by name.
-
-        Args:
-            name: Name of the entity.
-        """
-        matching_entities = []
-        for key, entity in cls.files_map_by_name().items():
-            regex = re.compile(name, re.IGNORECASE)
-            if re.match(regex, key):
-                matching_entities.append(entity)
-        return matching_entities

diff --git a/src/py/mat3ra/standata/base.py b/src/py/mat3ra/standata/base.py
@@ -0,0 +1,156 @@
+import re
+from typing import Dict, List, Optional
+
+import pandas as pd
+from pydantic import BaseModel
+
+
+class StandataEntity(BaseModel):
+    filename: str  # entity file name; the sample data also uses it as the key in `filesMapByName`
+    categories: List[str]  # bare category tags such as "quantum-mechanical", without a category-type prefix
+
+
+class StandataConfig(BaseModel):
+    categories: Dict[str, List[str]] = {}
+    entities: List[StandataEntity] = []
+
+    def get_categories_as_list(self, separator: str = "/") -> List[str]:
+        """
+        Flattens the `categories` mapping into a list of '<category_type><separator><tag>' strings.
+
+        Args:
+            separator: Separation character used to join category type and tag.
+
+        Example::
+
+            StandataConfig(categories={"size": ["S", "M", "L"]}).get_categories_as_list()
+            # returns ["size/S", "size/M", "size/L"]
+
+        Entries keep the iteration order of `self.categories` and of each tag list.
+        """
+        return [f"{ctype}{separator}{tag}" for ctype, tag_list in self.categories.items() for tag in tag_list]
+
+    def convert_tags_to_categories_list(self, *tags: str) -> List[str]:
+        """
+        Converts simple tags to '<category_type>/<tag>' format.
+
+        Args:
+            *tags: Category tags for the entity.
+
+        Note:
+            A tag may belong to several category types ('semiconductor' is both an 'electrical_conductivity' and a
+            'type'); every occurrence is returned. Tags that are not declared in `categories` yield nothing.
+        """
+        # Use the flattened '<type>/<tag>' strings: the bare `categories` keys have no "/" to split on.
+        return [cf for cf in self.get_categories_as_list() if cf.split("/", 1)[1] in tags]
+
+    def get_filenames_by_categories(self, *categories: str) -> List[str]:
+        """
+        Returns filenames of the entities tagged with at least one of the given categories.
+
+        Args:
+            *categories: Categories in '<category_type>/<tag>' format; bare entity tags are matched as well.
+        """
+        # NOTE(review): matches ANY category; the previous docstring promised "all" - confirm the intent.
+        filenames = []
+        for entity in self.entities:
+            own = set(entity.categories)  # entities store bare tags, so qualify them before comparing
+            own |= {f"{kind}/{tag}" for kind, members in self.categories.items() for tag in members if tag in own}
+            if own.intersection(categories):
+                filenames.append(entity.filename)
+        return filenames
+
+    def get_filenames_by_regex(self, regex: str) -> List[str]:
+        """
+        Collects the filenames of all entities whose filename matches `regex`.
+
+        The pattern is applied with `re.match`, so it is anchored at the start of the filename and is
+        case-sensitive unless the pattern itself says otherwise.
+
+        Args:
+            regex: Regular expression for the entity query.
+        """
+        candidates = (entity.filename for entity in self.entities)
+        return [name for name in candidates if re.match(regex, name)]
+
+    @property
+    def __lookup_table(self) -> pd.DataFrame:
+        """
+        Builds a 0/1 lookup table relating filenames to categories.
+
+        Rows (DataFrame.index) are the entity filenames and columns are the categories in <category_type>/<tag>
+        format. A cell holds 1 when the entity in that row is tagged with the category of that column and 0
+        otherwise.
+        """
+        filenames = [entity.filename for entity in self.entities]
+        # Start from an all-zero table and flip the cell of every (filename, category) pair that applies.
+        table = pd.DataFrame(
+            0,
+            index=filenames,
+            columns=self.get_categories_as_list(),
+        )
+        for entity in self.entities:
+            # Entity tags are bare, so they are first expanded to the column names they correspond to.
+            for column in self.convert_tags_to_categories_list(*entity.categories):
+                table.loc[entity.filename, column] = 1
+        return table
+
+
+class StandataFilesMapByName(Dict[str, dict]):
+
+    def get_objects_by_filenames(self, filenames: List[str]) -> List[dict]:
+        """
+        Returns the stored objects whose key is one of `filenames`, in the mapping's own iteration order.
+
+        Args:
+            filenames: Filenames of the entities; names that are not keys of the mapping are ignored.
+
+        Returns:
+            The matching objects; an empty list when nothing matches.
+        """
+        wanted = set(filenames)  # built once so each key is an O(1) membership test instead of a list scan
+        return [entity for key, entity in self.items() if key in wanted]
+
+
+class StandataData(BaseModel):  # bundles the filename -> object map with its category config
+    class Config:
+        arbitrary_types_allowed = True  # presumably required for the StandataFilesMapByName field - TODO confirm
+
+    filesMapByName: Optional[StandataFilesMapByName] = StandataFilesMapByName()
+    standataConfig: Optional[StandataConfig] = StandataConfig()
+
+    def __init__(self, /, **kwargs):
+        super().__init__(**kwargs)  # the two assignments below then re-wrap the raw kwargs in the wrapper types
+        self.filesMapByName = StandataFilesMapByName(kwargs.get("filesMapByName", {}))
+        self.standataConfig = StandataConfig(**kwargs.get("standataConfig", {}))  # value must be a mapping (`**`)
+
+
+class Standata(BaseModel):
+    # Override in children
+    data: StandataData = StandataData()  # NOTE(review): read as `cls.data` below - verify pydantic allows that
+
+    @classmethod
+    def get_as_list(cls) -> List[dict]:  # all stored entity objects, in the order of the filename map
+        return list(cls.data.filesMapByName.values())
+
+    @classmethod
+    def get_by_name(cls, name: str) -> List[dict]:
+        """
+        Returns the entities whose filename matches a regular expression.
+
+        Args:
+            name: Regular expression passed on to `StandataConfig.get_filenames_by_regex`.
+        """
+        matching_filenames = cls.data.standataConfig.get_filenames_by_regex(name)
+        return cls.data.filesMapByName.get_objects_by_filenames(matching_filenames)
+
+    def get_by_categories(self, *tags: str) -> List[dict]:
+        """
+        Returns the entities that `StandataConfig.get_filenames_by_categories` reports for the given tags.
+
+        Args:
+            *tags: Bare category tags; they are expanded with `convert_tags_to_categories_list` before the lookup.
+        """
+        categories = self.data.standataConfig.convert_tags_to_categories_list(*tags)
+        matching_filenames = self.data.standataConfig.get_filenames_by_categories(*categories)
+        return self.data.filesMapByName.get_objects_by_filenames(matching_filenames)
diff --git a/src/py/mat3ra/standata/build/__init__.py b/src/py/mat3ra/standata/build/__init__.py
diff --git a/src/py/mat3ra/standata/build/builder.py b/src/py/mat3ra/standata/build/builder.py
@@ -0,0 +1,75 @@
+import json
+import yaml
+from pathlib import Path
+from typing import Dict, List, Optional, TypedDict, Union
+
+
+from ..base import StandataConfig
+
+EntityItem = TypedDict("EntityItem", {"filename": str, "categories": List[str]})
+
+EntityConfig = TypedDict("EntityConfig", {"categories": Dict[str, List[str]], "entities": List[EntityItem]})
+
+
+class StandataBuilder:
+    """The Standata class associates the entity data files with categories and allows for tag-based queries.
+
+    Attributes:
+        entity_dir: Path to the folder containing entity data files.
+    """
+
+    def __init__(self, entity_config: EntityConfig, entity_dir: Union[str, Path]):
+        """Stores the resolved entity directory.
+
+        Args:
+            entity_config: The contents of the entity config file (`categories.yml`); currently not used here.
+            entity_dir: The path to the directory containing all entities.
+        """
+        self.entity_dir: Path = Path(entity_dir).resolve()
+
+    @classmethod
+    def build_from_file(cls, entity_config_path: Union[Path, str]) -> "StandataConfig":
+        """Builds a `StandataConfig` from the entity config file (categories.yml).
+
+        Args:
+            entity_config_path: The path to the entity config file `categories.yml`.
+
+        Note:
+            The entity config file is assumed to sit in the same directory as all entity files.
+        """
+        config_file = Path(entity_config_path)
+        loaded = StandataBuilder.load_config(config_file)
+        return StandataConfig(**loaded)
+
+    @staticmethod
+    def load_config(entity_config_path: Path) -> EntityConfig:
+        """Loads entity config from file (Yaml); an empty or unparsable file yields an empty config.
+
+        Args:
+            entity_config_path: The path to the entity config file `categories.yml`.
+        """
+        entity_config: EntityConfig = {"categories": {}, "entities": []}
+        try:
+            with open(entity_config_path.resolve(), "r") as stream:
+                entity_config = yaml.safe_load(stream) or entity_config  # None for an empty file
+        except yaml.YAMLError as e:
+            print(e)
+        return entity_config
+
+    @staticmethod
+    def load_entity(filepath: Path) -> Optional[dict]:
+        """Reads one entity data file (JSON); returns None (and prints why) if it is missing or invalid.
+
+        Args:
+            filepath: Path to entity data file (JSON).
+        """
+        # Checked up front so a missing file gets a readable message instead of a FileNotFoundError.
+        if not filepath.resolve().exists():
+            print(f"Could not find entity file: {filepath.resolve()}")
+            return None
+        try:
+            with open(filepath.resolve(), "r") as f:
+                return json.load(f)
+        except json.JSONDecodeError as e:
+            print(e)
+        return None
diff --git a/src/py/mat3ra/standata/build/cli.py b/src/py/mat3ra/standata/build/cli.py
@@ -0,0 +1,39 @@
+from pathlib import Path
+from typing import Optional
+
+import typer
+from .builder import StandataBuilder
+
+
+def main(
+    yaml_config: str = typer.Argument(..., help="Location of entity config file."),
+    destination: Optional[str] = typer.Option(None, "--destination", "-d", help="Where to place symlink directory."),
+):
+    """Create by_category/<category_type>/<tag>/ symlinks for every entity listed in the config file.
+
+    Links are placed next to the config file unless --destination names an existing directory.
+    """
+    config_path = Path(yaml_config)
+    standata_config = StandataBuilder.build_from_file(config_path)
+    save_dir = config_path.parent
+    if destination and Path(destination).resolve().exists():
+        save_dir = Path(destination)
+    categories_root = save_dir / "by_category"
+
+    for entity in standata_config.entities:
+        # Absolute target: a relative one would be resolved against the link's own directory and dangle.
+        entity_path = (config_path.parent / entity.filename).resolve()
+        for category in standata_config.convert_tags_to_categories_list(*entity.categories):
+            category_dir = categories_root / category
+            category_dir.mkdir(parents=True, exist_ok=True)
+            linked_entity = category_dir / entity.filename
+            if not (linked_entity.is_symlink() or linked_entity.exists()):  # exists() is False for dangling links
+                linked_entity.symlink_to(entity_path)
+
+
+def typer_app():  # zero-argument entry point, also used by the `__main__` guard below
+    typer.run(main)  # hands `main` to Typer, which runs it as the command-line application
+
+
+if __name__ == "__main__":
+    typer_app()
diff --git a/src/py/mat3ra/standata/data/applications.py b/src/py/mat3ra/standata/data/applications.py
@@ -0,0 +1,3 @@
+import json
+
+applications_data = json.loads(r'''{"standataConfig":{"categories":{"model":["atomistic","macroscopic","mesoscopic","quantum-mechanical","statistical"],"language_type":["scripting","compiled"],"purpose":["command_line_interface","interactive_computing_environment","programming_language"]},"entities":[{"filename":"espresso_gnu_540.json","categories":["quantum-mechanical"]},{"filename":"python_386.json","categories":["scripting","programming_language"]}]},"filesMapByName":{"espresso_gnu_540.json":{"name":"espresso","shortName":"qe","summary":"Quantum Espresso","version":"5.4.0","build":"GNU","hasAdvancedComputeOptions":true,"isLicensed":false},"python_386.json":{"name":"python","shortName":"py","summary":"Python Script","version":"3.8.6","build":"Default","hasAdvancedComputeOptions":false,"isLicensed":false}}}''')
diff --git a/src/py/mat3ra/standata/data/materials.py b/src/py/mat3ra/standata/data/materials.py
diff --git a/src/py/mat3ra/standata/data/properties.py b/src/py/mat3ra/standata/data/properties.py
@@ -0,0 +1,3 @@
+import json
+
+properties_data = json.loads(r'''{"standataConfig":{"categories":{"type":["mechanical","electronic","magnetic","thermal","optical","dynamic","transport","radiological","acoustic","manufacturing","chemical","structural","surface"],"property_class":["meta-property","proto-property"],"value_type":["scalar","vector","matrix","tensor_rank3","tensor_rank4","non-scalar"],"measurement":["angle-resolved-photoemission-spectroscopy","atomic-force-microscopy","x-ray-diffraction","x-ray-fluorescence","transmission-electron-microscopy","scanning-electron-microscopy","thermogravimetric-analysis"],"application":["espresso","vasp","python","shell"]},"entities":[{"filename":"valence_band_offset.json","categories":["scalar","electronic","angle-resolved-photoemission-spectroscopy"]},{"filename":"band_structure.json","categories":["matrix","electronic"]}]},"filesMapByName":{"valence_band_offset.json":{"name":"valence_band_offset","units":"eV","value":0.28},"band_structure.json":{"name":"band_structure","spin":[0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5],"xAxis":{"label":"kpoints","units":"crystal"},"xDataArray":[[0,0,0],[0.28867514,0.20412412,-0.49999997],[0,-0.61237246,0],[0.28867514,-0.40824834,-0.49999997],[-0.57735028,0.20412421,0],[-0.57735028,-0.40824824,0]],"yAxis":{"label":"energy","units":"eV"},"yDataSeries":[[-5.5990059,-3.30219959,-3.30220019,-1.51073812,-3.30221054,-1.51074222],[6.26931638,-0.66503974,-0.6650363,-1.51072293,-0.66501391,-1.5107195],[6.26931998,5.06084876,5.06084821,3.41069883,5.06085301,3.41069761],[6.26934533,5.0608702,5.06086954,3.41070722,5.06085524,3.41071003],[8.71135349,7.69496909,7.69496137,6.91957625,7.69495606,6.91957636],[8.71135587,9.49274379,9.49273868,6.91958498,9.49273487,6.91958424],[8.71135838,9.49275618,9.49275401,16.14829919,9.49273798,16.14830113],[9.41550185,13.89571002,13.89571914,16.1483028,13.89571883,16.14830247]]}}}''')
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		import json

		applications_data = json.loads(r'''{"standataConfig":{"categories":{"model":["atomistic","macroscopic","mesoscopic","quantum-mechanical","statistical"],"language_type":["scripting","compiled"],"purpose":["command_line_interface","interactive_computing_environment","programming_language"]},"entities":[{"filename":"espresso_gnu_540.json","categories":["quantum-mechanical"]},{"filename":"python_386.json","categories":["scripting","programming_language"]}]},"filesMapByName":{"espresso_gnu_540.json":{"name":"espresso","shortName":"qe","summary":"Quantum Espresso","version":"5.4.0","build":"GNU","hasAdvancedComputeOptions":true,"isLicensed":false},"python_386.json":{"name":"python","shortName":"py","summary":"Python Script","version":"3.8.6","build":"Default","hasAdvancedComputeOptions":false,"isLicensed":false}}}''')
Check failure on line 3 in src/py/mat3ra/standata/data/applications.py View workflow job for this annotation GitHub Actions / run-py-linter (3.8.6) Ruff (E501) `src/py/mat3ra/standata/data/applications.py:3:121: E501 Line too long (823 > 120 characters)`
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		import json

		properties_data = json.loads(r'''{"standataConfig":{"categories":{"type":["mechanical","electronic","magnetic","thermal","optical","dynamic","transport","radiological","acoustic","manufacturing","chemical","structural","surface"],"property_class":["meta-property","proto-property"],"value_type":["scalar","vector","matrix","tensor_rank3","tensor_rank4","non-scalar"],"measurement":["angle-resolved-photoemission-spectroscopy","atomic-force-microscopy","x-ray-diffraction","x-ray-fluorescence","transmission-electron-microscopy","scanning-electron-microscopy","thermogravimetric-analysis"],"application":["espresso","vasp","python","shell"]},"entities":[{"filename":"valence_band_offset.json","categories":["scalar","electronic","angle-resolved-photoemission-spectroscopy"]},{"filename":"band_structure.json","categories":["matrix","electronic"]}]},"filesMapByName":{"valence_band_offset.json":{"name":"valence_band_offset","units":"eV","value":0.28},"band_structure.json":{"name":"band_structure","spin":[0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5],"xAxis":{"label":"kpoints","units":"crystal"},"xDataArray":[[0,0,0],[0.28867514,0.20412412,-0.49999997],[0,-0.61237246,0],[0.28867514,-0.40824834,-0.49999997],[-0.57735028,0.20412421,0],[-0.57735028,-0.40824824,0]],"yAxis":{"label":"energy","units":"eV"},"yDataSeries":[[-5.5990059,-3.30219959,-3.30220019,-1.51073812,-3.30221054,-1.51074222],[6.26931638,-0.66503974,-0.6650363,-1.51072293,-0.66501391,-1.5107195],[6.26931998,5.06084876,5.06084821,3.41069883,5.06085301,3.41069761],[6.26934533,5.0608702,5.06086954,3.41070722,5.06085524,3.41071003],[8.71135349,7.69496909,7.69496137,6.91957625,7.69495606,6.91957636],[8.71135587,9.49274379,9.49273868,6.91958498,9.49273487,6.91958424],[8.71135838,9.49275618,9.49275401,16.14829919,9.49273798,16.14830113],[9.41550185,13.89571002,13.89571914,16.1483028,13.89571883,16.14830247]]}}}''')
Check failure on line 3 in src/py/mat3ra/standata/data/properties.py View workflow job for this annotation GitHub Actions / run-py-linter (3.8.6) Ruff (E501) `src/py/mat3ra/standata/data/properties.py:3:121: E501 Line too long (1872 > 120 characters)`