-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
676b76e
commit 83af5ee
Showing
8 changed files
with
1,333 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
""" | ||
database_loader.py | ||
Utility for loading data into a database | ||
""" | ||
|
||
|
||
import logging | ||
from ruamel.yaml import YAML | ||
from sqlalchemy import Engine, text | ||
from sqlalchemy.orm import Session | ||
|
||
from arxiv.util.dict_io import from_file_to_dict | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
class DatabaseLoader:
    """
    Read json/yaml data and load it into a database.

    The data is a mapping whose top-level key is the table name, and the
    value is an iterable of rows; each row is a mapping of column name to
    column value.
    """
    engine: Engine

    def __init__(self, engine: Engine):
        """
        Args:
            engine: SQLAlchemy engine the loading session is bound to
        """
        self.engine = engine

    def load_data(self, data: dict) -> None:
        """
        Insert every row of every table found in ``data``.

        Args:
            data: mapping of table name -> iterable of row mappings
                (column name -> value)
        Returns: None
        Raises:
            Any exception raised while executing an INSERT is logged
            together with the statement and row, then re-raised.
        """
        with Session(self.engine) as session:
            for table_name, rows in data.items():
                for row in rows:
                    # NOTE(review): table and column names are interpolated
                    # into the SQL text as-is, so the input must come from a
                    # trusted source. Only the values are bound parameters.
                    col_names = ", ".join(row)
                    col_placeholders = ", ".join(f":{col}" for col in row)
                    sql_statement = f"INSERT INTO {table_name} ({col_names}) VALUES ({col_placeholders})"
                    try:
                        session.execute(text(sql_statement), row)
                    except Exception:
                        logger.error("Statement %s data: %r", sql_statement, row)
                        raise
            # NOTE(review): a single commit after all inserts succeed;
            # confirm this matches the intended transaction scope.
            session.commit()

    def load_data_from_files(self, filenames: list[str]) -> None:
        """
        Load each file in order; the format is derived from the file name.

        Args:
            filenames: paths handed to from_file_to_dict one at a time
        Returns: None
        """
        for filename in filenames:
            self.load_data(from_file_to_dict(filename))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
""" | ||
Dictionary object <--> file I/O | ||
""" | ||
|
||
import os.path | ||
import typing | ||
from collections import OrderedDict | ||
from typing import TextIO | ||
|
||
import json | ||
from ruamel.yaml import YAML, MappingNode, ScalarNode | ||
from ruamel.yaml.representer import RoundTripRepresenter | ||
|
||
# | ||
# Custom ruamel.yaml representers for str and OrderedDict values | ||
# | ||
def _repr_str(dumper: RoundTripRepresenter, data: str) -> ScalarNode:
    """
    Represent a string object for yaml dumping.

    A string that contains a newline is emitted with the literal block
    style ('|'); any other string is emitted without an explicit style.

    Args:
        dumper: ruamel.yaml round trip representer
        data: the string to represent
    Returns:
        ScalarNode
    """
    str_tag = 'tag:yaml.org,2002:str'
    is_multiline = '\n' in data
    if not is_multiline:
        return dumper.represent_scalar(str_tag, data)
    return dumper.represent_scalar(str_tag, data, style='|')
|
||
|
||
def _repr_ordered_dict(dumper: RoundTripRepresenter, data: OrderedDict) -> MappingNode:
    """
    Represent an OrderedDict for yaml dumping.

    The OrderedDict is copied into a plain dict and emitted under the
    standard yaml map tag.

    Args:
        dumper: ruamel.yaml round trip representer
        data: OrderedDict instance
    Returns:
        MappingNode: mapped representation
    """
    plain_mapping = dict(data)
    return dumper.represent_mapping('tag:yaml.org,2002:map', plain_mapping)
|
||
|
||
def from_yaml_to_dict(filename: str) -> dict:
    """
    Read a YAML file and return its parsed content.

    Args:
        filename: path of the YAML file, read as UTF-8
    Returns: whatever ruamel.yaml's YAML().load produces for the document
    """
    parser = YAML()
    with open(filename, encoding='utf-8') as stream:
        return parser.load(stream)
|
||
|
||
def from_json_to_dict(filename: str) -> dict:
    """
    Read a JSON file and return its parsed content.

    Args:
        filename: path of the JSON file, read as UTF-8
    Returns: the object json.load produces for the document's top-level value
    """
    with open(filename, encoding='utf-8') as stream:
        parsed = json.load(stream)
    return parsed
|
||
|
||
def from_dict_to_yaml(data: typing.Union[dict, list], output: TextIO) -> None:
    """
    Dump a dict or list as YAML text.

    Registers the str and OrderedDict representers on a fresh YAML
    instance before dumping.

    Args:
        data: dict or list
        output: text stream that receives the YAML
    Returns: None
    """
    dumper = YAML()
    custom_representers = (
        (str, _repr_str),
        (OrderedDict, _repr_ordered_dict),
    )
    for py_type, represent in custom_representers:
        dumper.representer.add_representer(py_type, represent)
    dumper.dump(data, output)
|
||
|
||
def from_dict_to_json(data: typing.Union[dict, list], output: TextIO) -> None:
    """
    Dump a dict or list as JSON text, indented by 4 spaces.

    Args:
        data: dict or list (same inputs as from_dict_to_yaml)
        output: text stream that receives the JSON
    Returns: None
    """
    json.dump(data, output, indent=4)
|
||
|
||
def from_file_to_dict(filename: str) -> dict: | ||
""" | ||
Derive file format from filename and read/return dict object | ||
Args: | ||
filename: | ||
Returns: doct object | ||
""" | ||
(name, ext) = os.path.splitext(filename) | ||
match ext.lower(): | ||
case ".yaml": | ||
return from_yaml_to_dict(filename) | ||
case ".yml": | ||
return from_yaml_to_dict(filename) | ||
case ".json": | ||
return from_json_to_dict(filename) | ||
case _: | ||
raise ValueError(f"Unsupported file format: {filename}") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
[ | ||
{ | ||
"archive_id": "astro-ph", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "cond-mat", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "cs", | ||
"group_id": "cs" | ||
}, | ||
{ | ||
"archive_id": "econ", | ||
"group_id": "econ" | ||
}, | ||
{ | ||
"archive_id": "eess", | ||
"group_id": "eess" | ||
}, | ||
{ | ||
"archive_id": "gr-qc", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "hep-ex", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "hep-lat", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "hep-ph", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "hep-th", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "math", | ||
"group_id": "math" | ||
}, | ||
{ | ||
"archive_id": "math-ph", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "nlin", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "nucl-ex", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "nucl-th", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "physics", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "q-bio", | ||
"group_id": "q-bio" | ||
}, | ||
{ | ||
"archive_id": "q-fin", | ||
"group_id": "q-fin" | ||
}, | ||
{ | ||
"archive_id": "quant-ph", | ||
"group_id": "physics" | ||
}, | ||
{ | ||
"archive_id": "stat", | ||
"group_id": "stat" | ||
}, | ||
{ | ||
"archive_id": "test", | ||
"group_id": "test" | ||
} | ||
] |
Oops, something went wrong.