-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
288 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +0,0 @@ | ||
import re | ||
from typing import Dict, List | ||
|
||
|
||
class Standata:
    """Base container for named entity data with regex-based lookup by name.

    Subclasses override ``data`` with a dictionary whose ``"filesMapByName"`` entry maps entity names to entity
    dictionaries.
    """

    # Override in children. The default must be a mapping (not a list), because the accessors below call
    # ``.values()`` and ``.items()`` on it.
    data: dict = {"filesMapByName": {}}

    @classmethod
    def files_map_by_name(cls) -> Dict[str, dict]:
        """Returns the name-to-entity mapping stored under ``data["filesMapByName"]``."""
        return cls.data["filesMapByName"]

    @classmethod
    def get_as_list(cls) -> List[dict]:
        """Returns all entities as a list, in the iteration order of the mapping."""
        return list(cls.files_map_by_name().values())

    @classmethod
    def get_by_name(cls, name: str) -> List[dict]:
        """Returns entities whose name matches the given pattern.

        Args:
            name: Regular expression for the entity name. It is matched case-insensitively and anchored at the
                start of each key (``re.match`` semantics).
        """
        # Compile once instead of once per entity.
        pattern = re.compile(name, re.IGNORECASE)
        return [entity for key, entity in cls.files_map_by_name().items() if pattern.match(key)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
import re | ||
from typing import Dict, List, Optional | ||
|
||
import pandas as pd | ||
from pydantic import BaseModel | ||
|
||
|
||
class StandataEntity(BaseModel):
    """One entity entry of the config: a filename together with the category tags listed for it."""

    # Filename of the entity (used as the lookup key elsewhere in this module).
    filename: str
    # Category tags attached to this entity.
    categories: List[str]
|
||
|
||
class StandataConfig(BaseModel):
    """Category definitions and entity list, with helpers to query entity filenames by category or by regex."""

    # Maps a category type (e.g. "size") to the tags that belong to it (e.g. ["S", "M", "L"]).
    categories: Dict[str, List[str]] = {}
    # Entities, each with a filename and its bare category tags.
    entities: List[StandataEntity] = []

    def get_categories_as_list(self, separator: str = "/") -> List[str]:
        """
        Flattens the categories dictionary to a list of '<category_type><separator><tag>' strings.

        Args:
            separator: Separation character used to join category type and tag.

        Example::

            StandataConfig(categories={"size": ["S", "M", "L"]}).get_categories_as_list()
            # returns ["size/S", "size/M", "size/L"]
        """
        return [
            f"{category_type}{separator}{tag}"
            for category_type, type_tags in self.categories.items()
            for tag in type_tags
        ]

    def convert_tags_to_categories_list(self, *tags: str) -> List[str]:
        """
        Converts simple tags to '<category_type>/<tag>' format.

        Args:
            *tags: Category tags for the entity.

        Note:
            Some tags belong to several categories simultaneously, for instance 'semiconductor' is associated with
            'electrical_conductivity' and 'type'. This function returns all occurrences of a tag as
            '<category_type>/<tag>'.
        """
        wanted = set(tags)
        # Walk the (type -> tags) mapping directly rather than splitting joined strings: the dictionary keys are
        # bare category types without a "/" and tags themselves are not guaranteed to be free of "/".
        return [
            f"{category_type}/{tag}"
            for category_type, type_tags in self.categories.items()
            for tag in type_tags
            if tag in wanted
        ]

    def get_filenames_by_categories(self, *categories: str) -> List[str]:
        """
        Returns filenames of entities that match at least one of the given categories.

        Args:
            *categories: Categories for the entity query, either as bare tags (as stored on the entities) or in
                '<category_type>/<tag>' format (as returned by `convert_tags_to_categories_list`).

        Note:
            NOTE(review): earlier documentation described this as matching *all* categories, while the
            implementation has always used "any". The "any" behaviour is kept here; confirm which is intended.
        """
        if not categories:
            return []
        requested = set(categories)
        filenames = []
        for entity in self.entities:
            # Entities store bare tags, so also derive their '<category_type>/<tag>' forms before comparing;
            # otherwise converted categories could never match.
            entity_categories = set(entity.categories)
            entity_categories.update(self.convert_tags_to_categories_list(*entity.categories))
            if not requested.isdisjoint(entity_categories):
                filenames.append(entity.filename)
        return filenames

    def get_filenames_by_regex(self, regex: str) -> List[str]:
        """
        Returns filenames that match the regular expression.

        Args:
            regex: Regular expression for the entity query. Matching is anchored at the start of the filename
                (``re.match`` semantics).
        """
        pattern = re.compile(regex)
        return [entity.filename for entity in self.entities if pattern.match(entity.filename)]

    @property
    def __lookup_table(self) -> pd.DataFrame:
        """
        Creates lookup table for filenames and associated categories.

        For the lookup table category tags are first converted to the <category_type>/<tag> format, which represent
        the columns of the lookup table. The filenames represent the rows of the lookup table (DataFrame.index). The
        values in the table are either 0 or 1 depending on whether a filename is associated with a certain
        category (1) or not (0).
        """
        df = pd.DataFrame(
            0,
            columns=self.get_categories_as_list(),
            index=[entity.filename for entity in self.entities],
        )
        for entity in self.entities:
            for category in self.convert_tags_to_categories_list(*entity.categories):
                df.loc[entity.filename, category] = 1
        return df
|
||
|
||
class StandataFilesMapByName(Dict[str, dict]):
    """Dictionary of entity objects keyed by filename."""

    def get_objects_by_filenames(self, filenames: List[str]) -> List[dict]:
        """
        Collects the stored objects whose key is among the requested filenames.

        Args:
            filenames: Filenames of the entities.

        Returns:
            The matching objects in the iteration order of this mapping (not in the order of `filenames`).
        """
        return [stored for filename, stored in self.items() if filename in filenames]
|
||
|
||
class StandataData(BaseModel):
    """Bundles the filename-to-object map with the category/entity config."""

    class Config:
        # StandataFilesMapByName is a plain dict subclass, not a pydantic model.
        arbitrary_types_allowed = True

    filesMapByName: Optional[StandataFilesMapByName] = StandataFilesMapByName()
    standataConfig: Optional[StandataConfig] = StandataConfig()

    def __init__(self, /, **kwargs):
        """Builds the data object and re-wraps both fields in their dedicated types.

        Args:
            **kwargs: May contain ``filesMapByName`` (a mapping of filename to object) and ``standataConfig``
                (either a mapping of ``StandataConfig`` fields or an existing ``StandataConfig`` instance).
                Missing or ``None`` values fall back to empty containers.
        """
        super().__init__(**kwargs)
        # `or {}` covers an explicit None, which the Optional annotations allow but dict()/** cannot consume.
        self.filesMapByName = StandataFilesMapByName(kwargs.get("filesMapByName") or {})
        config = kwargs.get("standataConfig")
        if not isinstance(config, StandataConfig):
            # Only mappings can be unpacked with **; an existing instance is used as is.
            config = StandataConfig(**(config or {}))
        self.standataConfig = config
|
||
|
||
class Standata(BaseModel):
    """Query interface over a `StandataData` object: list all entities, or look them up by name or category tags.

    NOTE(review): `get_as_list` and `get_by_name` are classmethods that read `cls.data`, while `data` is declared
    as a pydantic field. Verify that the field is reachable as a class attribute with the pydantic version in use.
    """

    # Override in children
    data: StandataData = StandataData()

    @classmethod
    def get_as_list(cls):
        """Returns every object stored in `data.filesMapByName` as a list."""
        stored_objects = cls.data.filesMapByName.values()
        return list(stored_objects)

    @classmethod
    def get_by_name(cls, name: str) -> List[dict]:
        """
        Returns the objects whose filename is selected by `StandataConfig.get_filenames_by_regex`.

        Args:
            name: Regular expression for the entity filename.
        """
        selected = cls.data.standataConfig.get_filenames_by_regex(name)
        return cls.data.filesMapByName.get_objects_by_filenames(selected)

    def get_by_categories(self, *tags: str) -> List[dict]:
        """
        Returns the objects selected by the given category tags.

        The tags are first converted with `StandataConfig.convert_tags_to_categories_list`; the result is passed to
        `StandataConfig.get_filenames_by_categories`, and the resulting filenames are looked up in
        `data.filesMapByName`.

        Args:
            *tags: Category tags for the entity query.
        """
        config = self.data.standataConfig
        as_categories = config.convert_tags_to_categories_list(*tags)
        selected = self.data.standataConfig.get_filenames_by_categories(*as_categories)
        return self.data.filesMapByName.get_objects_by_filenames(selected)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import json | ||
import yaml | ||
from pathlib import Path | ||
from typing import Dict, List, Optional, TypedDict, Union | ||
|
||
|
||
from ..base import StandataConfig | ||
|
||
class EntityItem(TypedDict):
    """One entry of the ``entities`` list in the entity config."""

    filename: str
    categories: List[str]


class EntityConfig(TypedDict):
    """Shape of the entity config: category definitions plus the list of entities."""

    categories: Dict[str, List[str]]
    entities: List[EntityItem]
|
||
|
||
class StandataBuilder:
    """Loads the entity config file (YAML) and entity data files (JSON) used to build a `StandataConfig`.

    Attributes:
        entity_dir: Path to the folder containing entity data files.
    """

    def __init__(self, entity_config: "EntityConfig", entity_dir: Union[str, Path]):
        """Stores the resolved entity directory.

        Args:
            entity_config: The contents of the entity config file (`categories.yml`). Accepted for interface
                compatibility; it is not stored or used by this constructor.
            entity_dir: The path to the directory containing all entities.
        """
        self.entity_dir: Path = Path(entity_dir).resolve()

    @classmethod
    def build_from_file(cls, entity_config_path: Union[Path, str]) -> "StandataConfig":
        """Creates a `StandataConfig` instance from the entity config file (categories.yml).

        Args:
            entity_config_path: The path to the entity config file `categories.yml`.

        Note:
            Here, we assume that the entity config file is located in the same directory as all entity files.
        """
        # Go through `cls` so that subclasses overriding `load_config` are honoured.
        config = cls.load_config(Path(entity_config_path))
        return StandataConfig(**config)

    @staticmethod
    def load_config(entity_config_path: Path) -> "EntityConfig":
        """Loads entity config from file (Yaml).

        Args:
            entity_config_path: The path to the entity config file `categories.yml`.

        Returns:
            The parsed config. An empty config (no categories, no entities) is returned when the file cannot be
            parsed as YAML (the error is printed) or when the file contains no document.
        """
        entity_config: EntityConfig = {"categories": {}, "entities": []}
        try:
            with open(entity_config_path.resolve(), "r", encoding="utf-8") as stream:
                loaded = yaml.safe_load(stream)
        except yaml.YAMLError as e:
            print(e)
        else:
            # safe_load yields None for an empty file; keep the empty default so callers can still unpack it.
            if loaded is not None:
                entity_config = loaded
        return entity_config

    @staticmethod
    def load_entity(filepath: Path) -> Optional[dict]:
        """Loads entity config from file (JSON).

        Args:
            filepath: Path to entity data file (JSON).

        Returns:
            The parsed JSON content, or None when the file does not exist or is not valid JSON (a message is
            printed in both cases).
        """
        resolved = filepath.resolve()
        if not resolved.exists():
            print(f"Could not find entity file: {resolved}")
            return None
        try:
            # JSON is UTF-8 by specification; do not depend on the platform default encoding.
            with open(resolved, "r", encoding="utf-8") as f:
                return json.load(f)
        except json.JSONDecodeError as e:
            print(e)
            return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from pathlib import Path | ||
from typing import Optional | ||
|
||
import typer | ||
from .builder import StandataBuilder | ||
|
||
|
||
def main(
    yaml_config: str = typer.Argument(..., help="Location of entity config file."),
    # The first positional argument of typer.Option is the default value, so it must be given explicitly
    # before the flag names.
    destination: Optional[str] = typer.Option(None, "--destination", "-d", help="Where to place symlink directory."),
):
    """Creates a ``by_category/<category_type>/<tag>/`` tree of symlinks to the entity files.

    Args:
        yaml_config: Location of the entity config file. Entity files are looked up in the same directory.
        destination: Existing directory in which to create ``by_category``. When omitted, or when the given
            directory does not exist, the directory of the config file is used instead.
    """
    config_path = Path(yaml_config)
    # Absolute directory of the entity files: a relative symlink target would be interpreted relative to the
    # directory containing the link, not relative to the current working directory.
    entity_dir = config_path.parent.resolve()

    standata_config = StandataBuilder.build_from_file(config_path)

    save_dir = config_path.parent
    if destination and Path(destination).resolve().exists():
        save_dir = Path(destination)
    categories_root = save_dir / "by_category"

    for entity in standata_config.entities:
        categories = standata_config.convert_tags_to_categories_list(*entity.categories)
        entity_path = entity_dir / entity.filename

        for category in categories:
            category_dir = categories_root / category
            category_dir.mkdir(parents=True, exist_ok=True)
            linked_entity = category_dir / entity.filename
            # exists() follows symlinks and reports False for a dangling link, which would make symlink_to
            # fail with FileExistsError; check for the link itself as well.
            if not (linked_entity.is_symlink() or linked_entity.exists()):
                linked_entity.symlink_to(entity_path)
|
||
|
||
def typer_app():
    """Runs `main` as a Typer command-line application."""
    typer.run(main)


# Entry point when this module is executed directly.
if __name__ == "__main__":
    typer_app()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import json | ||
|
||
applications_data = json.loads(r'''{"standataConfig":{"categories":{"model":["atomistic","macroscopic","mesoscopic","quantum-mechanical","statistical"],"language_type":["scripting","compiled"],"purpose":["command_line_interface","interactive_computing_environment","programming_language"]},"entities":[{"filename":"espresso_gnu_540.json","categories":["quantum-mechanical"]},{"filename":"python_386.json","categories":["scripting","programming_language"]}]},"filesMapByName":{"espresso_gnu_540.json":{"name":"espresso","shortName":"qe","summary":"Quantum Espresso","version":"5.4.0","build":"GNU","hasAdvancedComputeOptions":true,"isLicensed":false},"python_386.json":{"name":"python","shortName":"py","summary":"Python Script","version":"3.8.6","build":"Default","hasAdvancedComputeOptions":false,"isLicensed":false}}}''') | ||
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import json | ||
|
||
properties_data = json.loads(r'''{"standataConfig":{"categories":{"type":["mechanical","electronic","magnetic","thermal","optical","dynamic","transport","radiological","acoustic","manufacturing","chemical","structural","surface"],"property_class":["meta-property","proto-property"],"value_type":["scalar","vector","matrix","tensor_rank3","tensor_rank4","non-scalar"],"measurement":["angle-resolved-photoemission-spectroscopy","atomic-force-microscopy","x-ray-diffraction","x-ray-fluorescence","transmission-electron-microscopy","scanning-electron-microscopy","thermogravimetric-analysis"],"application":["espresso","vasp","python","shell"]},"entities":[{"filename":"valence_band_offset.json","categories":["scalar","electronic","angle-resolved-photoemission-spectroscopy"]},{"filename":"band_structure.json","categories":["matrix","electronic"]}]},"filesMapByName":{"valence_band_offset.json":{"name":"valence_band_offset","units":"eV","value":0.28},"band_structure.json":{"name":"band_structure","spin":[0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5],"xAxis":{"label":"kpoints","units":"crystal"},"xDataArray":[[0,0,0],[0.28867514,0.20412412,-0.49999997],[0,-0.61237246,0],[0.28867514,-0.40824834,-0.49999997],[-0.57735028,0.20412421,0],[-0.57735028,-0.40824824,0]],"yAxis":{"label":"energy","units":"eV"},"yDataSeries":[[-5.5990059,-3.30219959,-3.30220019,-1.51073812,-3.30221054,-1.51074222],[6.26931638,-0.66503974,-0.6650363,-1.51072293,-0.66501391,-1.5107195],[6.26931998,5.06084876,5.06084821,3.41069883,5.06085301,3.41069761],[6.26934533,5.0608702,5.06086954,3.41070722,5.06085524,3.41071003],[8.71135349,7.69496909,7.69496137,6.91957625,7.69495606,6.91957636],[8.71135587,9.49274379,9.49273868,6.91958498,9.49273487,6.91958424],[8.71135838,9.49275618,9.49275401,16.14829919,9.49273798,16.14830113],[9.41550185,13.89571002,13.89571914,16.1483028,13.89571883,16.14830247]]}}}''') | ||
Oops, something went wrong.