Skip to content

Commit

Permalink
arXiv db test data and its loader.
Browse files Browse the repository at this point in the history
  • Loading branch information
ntai-arxiv committed Jan 3, 2025
1 parent 676b76e commit 83af5ee
Show file tree
Hide file tree
Showing 8 changed files with 1,333 additions and 5 deletions.
15 changes: 11 additions & 4 deletions arxiv/auth/legacy/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Helpers and Flask application integration."""
import json
from typing import List, Any
from typing import List, Any, Optional
from datetime import datetime
from pytz import timezone, UTC
import logging
Expand All @@ -14,7 +14,8 @@
from ..auth import scopes
from .. import domain
from ...db import Session, Base, session_factory
from ...db.models import TapirUser, TapirPolicyClass, Category, Archive, Group, EndorsementDomain
from ...db.models import TapirUser, TapirPolicyClass, Category, Archive, Group, EndorsementDomain, CategoryDef, \
ArchiveGroup, License

EASTERN = timezone('US/Eastern')
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -49,7 +50,7 @@ def create_arxiv_db_schema(engine: Engine) -> None:
Base.metadata.create_all(engine)


def bootstrap_arxiv_db(engine: Engine) -> None:
def bootstrap_arxiv_db(engine: Engine, test_data_dir: Optional[str] = None) -> None:
"""Create all tables in the da."""

with ORMSession(engine) as session:
Expand All @@ -61,12 +62,18 @@ def bootstrap_arxiv_db(engine: Engine) -> None:
session.add(TapirPolicyClass(**datum))
session.commit()

test_data_dir = os.path.join(arxiv_base_dir, "development", "test-data")
# When this is loaded from an installed library, the data files may be elsewhere
if test_data_dir is None:
test_data_dir = os.path.join(arxiv_base_dir, "development", "test-data")

for data_class, data_file in [
(Group, "arXiv_groups.json"),
(Archive, "arXiv_archives.json"),
(EndorsementDomain, "arXiv_endorsement_domains.json"),
(Category, "arXiv_categories.json"),
(CategoryDef, "arXiv_category_def.json"),
(ArchiveGroup, "arXiv_archive_group.json"),
(License, "arXiv_licenses.json"),
]:
with ORMSession(engine) as session:
with open(os.path.join(test_data_dir, data_file), encoding="utf-8") as dfd:
Expand Down
44 changes: 44 additions & 0 deletions arxiv/util/database_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
database_loader.py
Loading database utility
"""


import logging
from ruamel.yaml import YAML
from sqlalchemy import Engine, text
from sqlalchemy.orm import Session

from arxiv.util.dict_io import from_file_to_dict

logger = logging.getLogger(__name__)

class DatabaseLoader:
    """
    Read json/yaml file and load to database.

    The top-level key is the table name, and the value is a list of rows.
    Each row is a mapping of column name to column value, and is inserted
    into the table with one INSERT statement.
    """
    engine: Engine

    def __init__(self, engine: Engine):
        """
        Args:
            engine: SQLAlchemy engine of the database to load into
        """
        self.engine = engine

    def load_data(self, data: dict) -> None:
        """
        Insert every row of every table found in data.

        All inserts run in one session that is committed once at the end.
        If any insert fails, the statement and its data are logged and the
        exception is re-raised before anything is committed.

        Args:
            data: mapping of table name to a list of row mappings
        Returns: None
        """
        with Session(self.engine) as session:
            for table_name, rows in data.items():
                # A table key with no value (e.g. an empty YAML entry) is None;
                # treat it as a table with no rows instead of failing.
                for row in rows or []:
                    columns = list(row)
                    col_names = ", ".join(columns)  # Extract column names
                    col_placeholders = ", ".join(f":{col}" for col in columns)  # Create placeholders
                    # NOTE: table and column names cannot be bound parameters, so
                    # they are interpolated as-is. Only load trusted data files.
                    sql_statement = f"INSERT INTO {table_name} ({col_names}) VALUES ({col_placeholders})"
                    try:
                        session.execute(text(sql_statement), row)
                    except Exception:
                        logger.error("Statement %s data: %r", sql_statement, row)
                        raise
            session.commit()

    def load_data_from_files(self, filenames: list[str]) -> None:
        """
        Load each file in order; the format is derived from the file name.

        Args:
            filenames: paths of the json/yaml data files
        Returns: None
        """
        for filename in filenames:
            self.load_data(from_file_to_dict(filename))
121 changes: 121 additions & 0 deletions arxiv/util/dict_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""
Dictionary object <--> file I/O
"""

import os.path
import typing
from collections import OrderedDict
from typing import TextIO

import json
from ruamel.yaml import YAML, MappingNode, ScalarNode
from ruamel.yaml.representer import RoundTripRepresenter

#
# ruamel.yaml to represent the OrderedDict correctly
#
def _repr_str(dumper: RoundTripRepresenter, data: str) -> ScalarNode:
    """
    Represent a string object for yaml dumping.

    A string containing a newline is represented with the literal block
    style ('|'); any other string is represented without an explicit style.

    Args:
        dumper: ruamel.yaml round trip representer
        data: the string to represent
    Returns:
        ScalarNode: the node built by the dumper
    """
    str_tag = 'tag:yaml.org,2002:str'
    # Only pass a style when the text is multi-line.
    extra = {'style': '|'} if '\n' in data else {}
    return dumper.represent_scalar(str_tag, data, **extra)


def _repr_ordered_dict(dumper: RoundTripRepresenter, data: OrderedDict) -> MappingNode:
    """
    Represent an OrderedDict for yaml dumping as a plain yaml map.

    Args:
        dumper: ruamel.yaml round trip representer
        data: OrderedDict instance
    Returns:
        MappingNode: the mapping node built by the dumper
    """
    map_tag = 'tag:yaml.org,2002:map'
    # Hand the dumper a plain dict copy of the ordered mapping.
    plain = dict(data)
    return dumper.represent_mapping(map_tag, plain)


def from_yaml_to_dict(filename: str) -> dict:
    """
    Read a YAML file and return its content.

    Args:
        filename: path of the YAML file, read as UTF-8
    Returns: the object loaded from the file
    """
    with open(filename, encoding='utf-8') as stream:
        parser = YAML()
        loaded = parser.load(stream)
    return loaded


def from_json_to_dict(filename: str) -> dict:
    """
    Read a JSON file and return its content.

    Args:
        filename: path of the JSON file, read as UTF-8
    Returns: the object decoded from the file
    """
    with open(filename, encoding='utf-8') as stream:
        raw_text = stream.read()
    return json.loads(raw_text)


def from_dict_to_yaml(data: typing.Union[dict, list], output: TextIO) -> None:
    """
    Write a dict or list to a text stream as YAML.

    The str and OrderedDict representers of this module are registered on
    the dumper before the data is written.

    Args:
        data: dict or list
        output: text stream to write to
    Returns: None
    """
    emitter = YAML()
    custom_representers = (
        (str, _repr_str),
        (OrderedDict, _repr_ordered_dict),
    )
    for py_type, represent in custom_representers:
        emitter.representer.add_representer(py_type, represent)
    emitter.dump(data, output)


def from_dict_to_json(data: dict, output: TextIO) -> None:
    """
    Write an object to a text stream as JSON, indented by 4 spaces.

    Args:
        data: the object to serialize
        output: text stream to write to
    Returns: None
    """
    json.dump(obj=data, fp=output, indent=4)


def from_file_to_dict(filename: str) -> dict:
    """
    Pick the reader from the file extension and return the loaded object.

    ".yaml" and ".yml" are read as YAML and ".json" as JSON; the extension
    is matched case-insensitively.

    Args:
        filename: path of the file to read
    Returns: the object loaded from the file
    Raises:
        ValueError: the extension is not one of the supported formats
    """
    suffix = os.path.splitext(filename)[1].lower()
    if suffix in (".yaml", ".yml"):
        return from_yaml_to_dict(filename)
    if suffix == ".json":
        return from_json_to_dict(filename)
    raise ValueError(f"Unsupported file format: {filename}")
86 changes: 86 additions & 0 deletions development/test-data/arXiv_archive_group.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
[
{
"archive_id": "astro-ph",
"group_id": "physics"
},
{
"archive_id": "cond-mat",
"group_id": "physics"
},
{
"archive_id": "cs",
"group_id": "cs"
},
{
"archive_id": "econ",
"group_id": "econ"
},
{
"archive_id": "eess",
"group_id": "eess"
},
{
"archive_id": "gr-qc",
"group_id": "physics"
},
{
"archive_id": "hep-ex",
"group_id": "physics"
},
{
"archive_id": "hep-lat",
"group_id": "physics"
},
{
"archive_id": "hep-ph",
"group_id": "physics"
},
{
"archive_id": "hep-th",
"group_id": "physics"
},
{
"archive_id": "math",
"group_id": "math"
},
{
"archive_id": "math-ph",
"group_id": "physics"
},
{
"archive_id": "nlin",
"group_id": "physics"
},
{
"archive_id": "nucl-ex",
"group_id": "physics"
},
{
"archive_id": "nucl-th",
"group_id": "physics"
},
{
"archive_id": "physics",
"group_id": "physics"
},
{
"archive_id": "q-bio",
"group_id": "q-bio"
},
{
"archive_id": "q-fin",
"group_id": "q-fin"
},
{
"archive_id": "quant-ph",
"group_id": "physics"
},
{
"archive_id": "stat",
"group_id": "stat"
},
{
"archive_id": "test",
"group_id": "test"
}
]
Loading

0 comments on commit 83af5ee

Please sign in to comment.