Skip to content

Commit

Permalink
update: progress in rewriting
Browse files Browse the repository at this point in the history
  • Loading branch information
VsevolodX committed Oct 25, 2024
1 parent 7183e4c commit 864a7cc
Show file tree
Hide file tree
Showing 10 changed files with 288 additions and 30 deletions.
30 changes: 0 additions & 30 deletions src/py/mat3ra/standata/__init__.py
Original file line number Diff line number Diff line change
@@ -1,30 +0,0 @@
import re
from typing import Dict, List


class Standata:
    """Base container exposing bundled entity data through name-based lookups.

    Subclasses override ``data`` with a dictionary whose ``"filesMapByName"`` entry maps each file name to the
    entity stored under that name.
    """

    # Override in children.
    # The default is an empty mapping (not a list), because the accessors below call ``.values()`` and
    # ``.items()`` on it; with a list they would fail on the base class.
    data: dict = {"filesMapByName": {}}

    @classmethod
    def files_map_by_name(cls) -> Dict[str, dict]:
        """Returns the mapping of file name to entity."""
        return cls.data["filesMapByName"]

    @classmethod
    def get_as_list(cls):
        """Returns all entities as a list, in the order of the underlying mapping."""
        return list(cls.files_map_by_name().values())

    @classmethod
    def get_by_name(cls, name: str) -> List[dict]:
        """Returns entities whose file name matches the given pattern.

        Args:
            name: Regular expression for the entity name. It is matched case-insensitively and anchored at the
                start of each file name.
        """
        # Compile once: the pattern does not change between iterations.
        regex = re.compile(name, re.IGNORECASE)
        return [entity for key, entity in cls.files_map_by_name().items() if regex.match(key)]
156 changes: 156 additions & 0 deletions src/py/mat3ra/standata/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import re
from typing import ClassVar, Dict, List, Optional

import pandas as pd
from pydantic import BaseModel


class StandataEntity(BaseModel):
    """Pairs an entity data file with the category tags assigned to it."""

    # Name of the file holding the entity data, e.g. "python_386.json".
    filename: str
    # Bare tags (without the category type prefix) assigned to the entity.
    categories: List[str]


class StandataConfig(BaseModel):
    """Category definitions and entity-to-tag assignments of an entity config.

    Attributes:
        categories: Maps each category type to the tags defined for it, e.g. ``{"size": ["S", "M", "L"]}``.
        entities: The entity files together with the bare tags assigned to each of them.
    """

    categories: Dict[str, List[str]] = {}
    entities: List[StandataEntity] = []

    def get_categories_as_list(self, separator: str = "/") -> List[str]:
        """
        Flattens the categories dictionary to a list of '<category_type><separator><tag>' strings.

        Args:
            separator: Separation character used to join category type and tag.

        Example::

            StandataConfig(categories={"size": ["S", "M", "L"]}).get_categories_as_list()
            # returns ["size/S", "size/M", "size/L"]
        """
        return [f"{key}{separator}{tag}" for key, tags in self.categories.items() for tag in tags]

    def convert_tags_to_categories_list(self, *tags: str) -> List[str]:
        """
        Converts simple tags to '<category_type>/<tag>' format.

        Args:
            *tags: Category tags for the entity.

        Returns:
            Every '<category_type>/<tag>' combination defined in ``categories`` whose tag is one of ``tags``,
            in the order of ``categories``. Tags that are not defined in any category are dropped.

        Note:
            Some tags belong to several categories simultaneously, for instance 'semiconductor' is associated with
            'electrical_conductivity' and 'type'. This function returns all occurrences of a tag as
            '<category_type>/<tag>'.
        """
        requested = set(tags)
        # Walk the (type, tag) pairs directly instead of splitting flattened names, so the result does not
        # depend on whether a tag itself contains the separator.
        return [
            f"{category_type}/{tag}"
            for category_type, known_tags in self.categories.items()
            for tag in known_tags
            if tag in requested
        ]

    def get_filenames_by_categories(self, *categories: str) -> List[str]:
        """
        Returns filenames of the entities associated with at least one of the given categories.

        Args:
            *categories: Categories for the entity query, either in '<category_type>/<tag>' format (as returned by
                ``convert_tags_to_categories_list``) or as bare tags.
        """
        if not categories:
            return []
        requested = set(categories)
        filenames = []
        for entity in self.entities:
            # Entities store bare tags; add their qualified forms so that '<category_type>/<tag>' queries match.
            entity_categories = set(entity.categories)
            entity_categories.update(self.convert_tags_to_categories_list(*entity.categories))
            if not requested.isdisjoint(entity_categories):
                filenames.append(entity.filename)
        return filenames

    def get_filenames_by_regex(self, regex: str) -> List[str]:
        """
        Returns filenames that match the regular expression.

        Args:
            regex: Regular expression for the entity query. It is matched from the start of each filename.
        """
        pattern = re.compile(regex)
        return [entity.filename for entity in self.entities if pattern.match(entity.filename)]

    @property
    def __lookup_table(self) -> pd.DataFrame:
        """
        Creates lookup table for filenames and associated categories.

        For the lookup table category tags are first converted to the <category_type>/<tag> format, which represent the
        columns of the lookup table. The filenames represent the rows of the lookup table (DataFrame.index). The values
        in the table are either 0 or 1 depending on whether a filename is associated with a certain category (1) or
        not (0).
        """
        df = pd.DataFrame(
            0,
            columns=self.get_categories_as_list(),
            index=[entity.filename for entity in self.entities],
        )
        for entity in self.entities:
            for category in self.convert_tags_to_categories_list(*entity.categories):
                df.loc[entity.filename, category] = 1
        return df


class StandataFilesMapByName(Dict[str, dict]):
    """Dictionary of entity objects keyed by the name of the file they are stored under."""

    def get_objects_by_filenames(self, filenames: List[str]) -> List[dict]:
        """
        Collects the entities stored under the given filenames.

        The result follows the order of this mapping, not the order of ``filenames``; filenames without an entry
        are ignored.

        Args:
            filenames: Filenames of the entities.
        """
        return [stored for name, stored in self.items() if name in filenames]


class StandataData(BaseModel):
    """Bundles the entity objects with the config that categorizes them.

    Attributes:
        filesMapByName: Entity objects keyed by filename.
        standataConfig: Category definitions and entity-to-tag assignments.
    """

    class Config:
        # StandataFilesMapByName is a dict subclass rather than a pydantic model.
        arbitrary_types_allowed = True

    filesMapByName: Optional[StandataFilesMapByName] = StandataFilesMapByName()
    standataConfig: Optional[StandataConfig] = StandataConfig()

    def __init__(self, /, **kwargs):
        """
        Args:
            **kwargs: Field values. ``filesMapByName`` may be a plain dict or a ``StandataFilesMapByName``;
                ``standataConfig`` may be a dict of ``StandataConfig`` fields or a ``StandataConfig``. A missing
                or ``None`` value yields an empty map / config.
        """
        # Coerce up front so that plain dicts, ready-made instances and None are all accepted; unpacking a
        # StandataConfig instance or None with ``**`` would raise.
        files_map = kwargs.get("filesMapByName")
        if not isinstance(files_map, StandataFilesMapByName):
            files_map = StandataFilesMapByName(files_map or {})
        config = kwargs.get("standataConfig")
        if not isinstance(config, StandataConfig):
            config = StandataConfig(**(config or {}))
        kwargs["filesMapByName"] = files_map
        kwargs["standataConfig"] = config
        super().__init__(**kwargs)
        # Re-assign after validation to guarantee the attributes hold the coerced types.
        self.filesMapByName = files_map
        self.standataConfig = config


class Standata(BaseModel):
    """Base class for querying bundled entity data by name or by category tags.

    Subclasses override ``data`` with the ``StandataData`` of the entity kind they expose.
    """

    # Override in children.
    # Declared as ClassVar so that pydantic keeps it as a class attribute instead of turning it into a model
    # field; the classmethods below read it through ``cls.data``.
    data: ClassVar[StandataData] = StandataData()

    @classmethod
    def get_as_list(cls) -> List[dict]:
        """Returns all entity objects as a list."""
        return list(cls.data.filesMapByName.values())

    @classmethod
    def get_by_name(cls, name: str) -> List[dict]:
        """
        Returns entities by name regex.

        Args:
            name: Regular expression matched from the start of the entity filenames listed in the config.
        """
        matching_filenames = cls.data.standataConfig.get_filenames_by_regex(name)
        return cls.data.filesMapByName.get_objects_by_filenames(matching_filenames)

    @classmethod
    def get_by_categories(cls, *tags: str) -> List[dict]:
        """
        Finds entities associated with at least one of the specified category tags.

        Args:
            *tags: Category tags for the entity query.
        """
        config = cls.data.standataConfig
        categories = config.convert_tags_to_categories_list(*tags)
        matching_filenames = config.get_filenames_by_categories(*categories)
        return cls.data.filesMapByName.get_objects_by_filenames(matching_filenames)
Empty file.
75 changes: 75 additions & 0 deletions src/py/mat3ra/standata/build/builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import json
import yaml
from pathlib import Path
from typing import Dict, List, Optional, TypedDict, Union


from ..base import StandataConfig

class EntityItem(TypedDict):
    """One entry of the ``entities`` list in the entity config."""

    filename: str
    categories: List[str]


class EntityConfig(TypedDict):
    """Contents of the entity config file: category definitions plus the entity list."""

    categories: Dict[str, List[str]]
    entities: List[EntityItem]


class StandataBuilder:
    """Loads entity config (YAML) and entity data (JSON) files and builds a ``StandataConfig`` from the config.

    Attributes:
        entity_dir: Path to the folder containing entity data files.
    """

    def __init__(self, entity_config: EntityConfig, entity_dir: Union[str, Path]):
        """Stores the resolved entity directory.

        Args:
            entity_config: The contents of the entity config file (`categories.yml`).
                NOTE(review): accepted for the interface but not used by this constructor.
            entity_dir: The path to the directory containing all entities.
        """
        self.entity_dir: Path = Path(entity_dir).resolve()

    @classmethod
    def build_from_file(cls, entity_config_path: Union[Path, str]) -> "StandataConfig":
        """Creates a StandataConfig instance from entity config file (categories.yml).

        Args:
            entity_config_path: The path to the entity config file `categories.yml`.

        Note:
            Here, we assume that the entity config file is located in the same directory as all entity files.
        """
        # Go through ``cls`` so that subclasses overriding ``load_config`` are honoured.
        config = cls.load_config(Path(entity_config_path))
        return StandataConfig(**config)

    @staticmethod
    def load_config(entity_config_path: Path) -> EntityConfig:
        """Loads entity config from file (Yaml).

        Args:
            entity_config_path: The path to the entity config file `categories.yml`.

        Returns:
            The parsed config. An empty config (no categories, no entities) is returned when the file is empty
            or cannot be parsed as YAML; parse errors are printed.
        """
        entity_config: EntityConfig = {"categories": {}, "entities": []}
        try:
            with open(entity_config_path.resolve(), "r", encoding="utf-8") as stream:
                loaded = yaml.safe_load(stream)
        except yaml.YAMLError as e:
            print(e)
        else:
            # An empty YAML document parses to None; keep the empty default in that case.
            if loaded is not None:
                entity_config = loaded
        return entity_config

    @staticmethod
    def load_entity(filepath: Path) -> Optional[dict]:
        """Loads entity data from file (JSON).

        Args:
            filepath: Path to entity data file (JSON).

        Returns:
            The parsed entity, or None when the file does not exist or is not valid JSON (a message is printed
            in both cases).
        """
        resolved = filepath.resolve()
        if not resolved.exists():
            print(f"Could not find entity file: {resolved}")
            return None
        entity = None
        try:
            with open(resolved, "r", encoding="utf-8") as f:
                entity = json.load(f)
        except json.JSONDecodeError as e:
            print(e)
        return entity
39 changes: 39 additions & 0 deletions src/py/mat3ra/standata/build/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from pathlib import Path
from typing import Optional

import typer
from .builder import StandataBuilder


def main(
    yaml_config: str = typer.Argument(..., help="Location of entity config file."),
    # The first positional argument of typer.Option is the default value; the option names follow it.
    destination: Optional[str] = typer.Option(
        None, "--destination", "-d", help="Where to place symlink directory."
    ),
):
    """Creates a ``by_category/<category_type>/<tag>/`` tree of symlinks to the entity files.

    Args:
        yaml_config: Location of the entity config file; the entity files are expected next to it.
        destination: Existing directory in which to create ``by_category``. When omitted or not existing,
            the directory of the config file is used.
    """
    config_path = Path(yaml_config)
    entity_path_parent = config_path.parent

    standata_config = StandataBuilder.build_from_file(config_path)

    save_dir = entity_path_parent
    if destination and Path(destination).resolve().exists():
        save_dir = Path(destination)
    categories_root = save_dir / "by_category"

    for entity in standata_config.entities:
        categories = standata_config.convert_tags_to_categories_list(*entity.categories)
        # Link to the absolute path: a relative target would be interpreted relative to the link's own
        # directory and dangle.
        entity_path = (entity_path_parent / entity.filename).resolve()

        for category in categories:
            category_dir = categories_root / category
            category_dir.mkdir(parents=True, exist_ok=True)
            linked_entity = category_dir / entity.filename
            # exists() follows links and is False for a dangling one, so check is_symlink() as well.
            if not (linked_entity.exists() or linked_entity.is_symlink()):
                linked_entity.symlink_to(entity_path)


def typer_app():
    """Runs ``main`` as a Typer command-line application."""
    typer.run(main)


if __name__ == "__main__":
    # Allow running this module directly as a script.
    typer_app()
3 changes: 3 additions & 0 deletions src/py/mat3ra/standata/data/applications.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import json

# Bundled applications data: the category config ("standataConfig") and the entity objects keyed by filename
# ("filesMapByName"), stored as a single JSON literal. The linter reports E501 for this line, hence the noqa.
applications_data = json.loads(r'''{"standataConfig":{"categories":{"model":["atomistic","macroscopic","mesoscopic","quantum-mechanical","statistical"],"language_type":["scripting","compiled"],"purpose":["command_line_interface","interactive_computing_environment","programming_language"]},"entities":[{"filename":"espresso_gnu_540.json","categories":["quantum-mechanical"]},{"filename":"python_386.json","categories":["scripting","programming_language"]}]},"filesMapByName":{"espresso_gnu_540.json":{"name":"espresso","shortName":"qe","summary":"Quantum Espresso","version":"5.4.0","build":"GNU","hasAdvancedComputeOptions":true,"isLicensed":false},"python_386.json":{"name":"python","shortName":"py","summary":"Python Script","version":"3.8.6","build":"Default","hasAdvancedComputeOptions":false,"isLicensed":false}}}''')  # noqa: E501

Check failure on line 3 in src/py/mat3ra/standata/data/applications.py

View workflow job for this annotation

GitHub Actions / run-py-linter (3.8.6)

Ruff (E501)

src/py/mat3ra/standata/data/applications.py:3:121: E501 Line too long (823 > 120 characters)
3 changes: 3 additions & 0 deletions src/py/mat3ra/standata/data/materials.py

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions src/py/mat3ra/standata/data/properties.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import json

# Bundled properties data: the category config ("standataConfig") and the entity objects keyed by filename
# ("filesMapByName"), stored as a single JSON literal. The linter reports E501 for this line, hence the noqa.
properties_data = json.loads(r'''{"standataConfig":{"categories":{"type":["mechanical","electronic","magnetic","thermal","optical","dynamic","transport","radiological","acoustic","manufacturing","chemical","structural","surface"],"property_class":["meta-property","proto-property"],"value_type":["scalar","vector","matrix","tensor_rank3","tensor_rank4","non-scalar"],"measurement":["angle-resolved-photoemission-spectroscopy","atomic-force-microscopy","x-ray-diffraction","x-ray-fluorescence","transmission-electron-microscopy","scanning-electron-microscopy","thermogravimetric-analysis"],"application":["espresso","vasp","python","shell"]},"entities":[{"filename":"valence_band_offset.json","categories":["scalar","electronic","angle-resolved-photoemission-spectroscopy"]},{"filename":"band_structure.json","categories":["matrix","electronic"]}]},"filesMapByName":{"valence_band_offset.json":{"name":"valence_band_offset","units":"eV","value":0.28},"band_structure.json":{"name":"band_structure","spin":[0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5],"xAxis":{"label":"kpoints","units":"crystal"},"xDataArray":[[0,0,0],[0.28867514,0.20412412,-0.49999997],[0,-0.61237246,0],[0.28867514,-0.40824834,-0.49999997],[-0.57735028,0.20412421,0],[-0.57735028,-0.40824824,0]],"yAxis":{"label":"energy","units":"eV"},"yDataSeries":[[-5.5990059,-3.30219959,-3.30220019,-1.51073812,-3.30221054,-1.51074222],[6.26931638,-0.66503974,-0.6650363,-1.51072293,-0.66501391,-1.5107195],[6.26931998,5.06084876,5.06084821,3.41069883,5.06085301,3.41069761],[6.26934533,5.0608702,5.06086954,3.41070722,5.06085524,3.41071003],[8.71135349,7.69496909,7.69496137,6.91957625,7.69495606,6.91957636],[8.71135587,9.49274379,9.49273868,6.91958498,9.49273487,6.91958424],[8.71135838,9.49275618,9.49275401,16.14829919,9.49273798,16.14830113],[9.41550185,13.89571002,13.89571914,16.1483028,13.89571883,16.14830247]]}}}''')  # noqa: E501

Check failure on line 3 in src/py/mat3ra/standata/data/properties.py

View workflow job for this annotation

GitHub Actions / run-py-linter (3.8.6)

Ruff (E501)

src/py/mat3ra/standata/data/properties.py:3:121: E501 Line too long (1872 > 120 characters)
Loading

0 comments on commit 864a7cc

Please sign in to comment.