arXiv db test data and its loader.

arXiv · Jan 3, 2025 · 83af5ee · 83af5ee
1 parent 676b76e
commit 83af5ee
Show file tree

Hide file tree

Showing 8 changed files with 1,333 additions and 5 deletions.
diff --git a/arxiv/auth/legacy/util.py b/arxiv/auth/legacy/util.py
@@ -1,6 +1,6 @@
 """Helpers and Flask application integration."""
 import json
-from typing import List, Any
+from typing import List, Any, Optional
 from datetime import datetime
 from pytz import timezone, UTC
 import logging
@@ -14,7 +14,8 @@
 from ..auth import scopes
 from .. import domain
 from ...db import Session, Base, session_factory
-from ...db.models import TapirUser, TapirPolicyClass, Category, Archive, Group, EndorsementDomain
+from ...db.models import TapirUser, TapirPolicyClass, Category, Archive, Group, EndorsementDomain, CategoryDef, \
+    ArchiveGroup, License
 
 EASTERN = timezone('US/Eastern')
 logger = logging.getLogger(__name__)
@@ -49,7 +50,7 @@ def create_arxiv_db_schema(engine: Engine) -> None:
     Base.metadata.create_all(engine)
 
 
-def bootstrap_arxiv_db(engine: Engine) -> None:
+def bootstrap_arxiv_db(engine: Engine, test_data_dir: Optional[str] = None) -> None:
     """Create all tables in the da."""
 
     with ORMSession(engine) as session:
@@ -61,12 +62,18 @@ def bootstrap_arxiv_db(engine: Engine) -> None:
             session.add(TapirPolicyClass(**datum))
         session.commit()
 
-    test_data_dir = os.path.join(arxiv_base_dir, "development", "test-data")
+    # In case you are loading the files from a library, the data files may be elsewhere
+    if test_data_dir is None:
+        test_data_dir = os.path.join(arxiv_base_dir, "development", "test-data")
+
     for data_class, data_file in [
         (Group, "arXiv_groups.json"),
         (Archive, "arXiv_archives.json"),
         (EndorsementDomain, "arXiv_endorsement_domains.json"),
         (Category, "arXiv_categories.json"),
+        (CategoryDef, "arXiv_category_def.json"),
+        (ArchiveGroup, "arXiv_archive_group.json"),
+        (License, "arXiv_licenses.json"),
     ]:
         with ORMSession(engine) as session:
             with open(os.path.join(test_data_dir, data_file), encoding="utf-8") as dfd:

diff --git a/arxiv/util/database_loader.py b/arxiv/util/database_loader.py
@@ -0,0 +1,44 @@
+"""
+database_loader.py
+  Utility for loading data into the database
+"""
+
+
+import logging
+from ruamel.yaml import YAML
+from sqlalchemy import Engine, text
+from sqlalchemy.orm import Session
+
+from arxiv.util.dict_io import from_file_to_dict
+
+logger = logging.getLogger(__name__)
+
+class DatabaseLoader:
+    """
+    Read json/yaml file and load to database.
+
+    The top-level key is the table name, and the value is a list of rows, each a dict of column name to value.
+    """
+    engine: Engine
+
+    def __init__(self, engine: Engine):
+        self.engine = engine
+
+    def load_data(self, data: dict) -> None:
+        with Session(self.engine) as session:
+            for table_name, rows in data.items():
+                for row in rows:
+                    col_names = ", ".join(row.keys())  # Extract column names
+                    col_placeholders = ", ".join([f":{col}" for col in row.keys()])  # Create placeholders
+                    sql_statement = f"INSERT INTO {table_name} ({col_names}) VALUES ({col_placeholders})"
+                    try:
+                        session.execute(text(sql_statement), row)
+                    except Exception as exc:
+                        logger.error(f"Statement {sql_statement} data: {row!r}")
+                        raise
+            session.commit()
+
+
+    def load_data_from_files(self, filenames: [str]) -> None:
+        for filename in filenames:
+            self.load_data(from_file_to_dict(filename))
diff --git a/arxiv/util/dict_io.py b/arxiv/util/dict_io.py
@@ -0,0 +1,121 @@
+"""
+Dictionary object <--> file I/O
+"""
+
+import os.path
+import typing
+from collections import OrderedDict
+from typing import TextIO
+
+import json
+from ruamel.yaml import YAML, MappingNode, ScalarNode
+from ruamel.yaml.representer import RoundTripRepresenter
+
+#
+# Custom representers so that ruamel.yaml dumps str and OrderedDict correctly
+#
+def _repr_str(dumper: RoundTripRepresenter, data: str) -> ScalarNode:
+    """
+    "Print" string object for yaml dumping
+
+    Args:
+        dumper: ruamel.yaml round trip I/O
+        data: any string data
+
+    Returns:
+        ScalarNode
+    """
+    if '\n' in data:
+        return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
+    return dumper.represent_scalar('tag:yaml.org,2002:str', data)
+
+
+def _repr_ordered_dict(dumper: RoundTripRepresenter, data: OrderedDict) -> MappingNode:
+    """
+    "print" ordered dict for yaml dumping
+    Args:
+        dumper: ruamel.yaml round trip I/O
+        data: OrderedDict instance
+
+    Returns:
+        MappingNode: mapped representation
+    """
+    return dumper.represent_mapping('tag:yaml.org,2002:map', dict(data))
+
+
+def from_yaml_to_dict(filename: str) -> dict:
+    """
+    YAML to dict object
+    Args:
+        filename: path of the YAML file to read
+
+    Returns: dict object
+
+    """
+    with open(filename, encoding='utf-8') as yamlfile:
+        yaml = YAML()
+        return yaml.load(yamlfile)
+
+
+def from_json_to_dict(filename: str) -> dict:
+    """
+    JSON to dict object
+
+    Args:
+        filename: path of the JSON file to read
+
+    Returns: dict object
+
+    """
+    with open(filename, encoding='utf-8') as jsonfile:
+        return json.load(jsonfile)
+
+
+def from_dict_to_yaml(data: typing.Union[dict, list], output: TextIO) -> None:
+    """
+    dict or list object to YAML text
+    Args:
+        data: dict or list
+        output: text stream the YAML is written to
+
+    Returns: None
+
+    """
+    yaml = YAML()
+    yaml.representer.add_representer(str, _repr_str)
+    yaml.representer.add_representer(OrderedDict, _repr_ordered_dict)
+    yaml.dump(data, output)
+
+
+def from_dict_to_json(data: dict, output: TextIO) -> None:
+    """
+    dict object to JSON text
+    Args:
+        data: dict to serialize
+        output: text stream the JSON is written to
+
+    Returns: None
+
+    """
+    json.dump(data, output, indent=4)
+
+
+def from_file_to_dict(filename: str) -> dict:
+    """
+    Derive file format from filename and read/return dict object
+    Args:
+        filename: path of a .yaml, .yml or .json file (extension match is case-insensitive)
+
+    Returns: dict object
+
+    """
+    (name, ext) = os.path.splitext(filename)
+    match ext.lower():
+        case ".yaml":
+            return from_yaml_to_dict(filename)
+        case ".yml":
+            return from_yaml_to_dict(filename)
+        case ".json":
+            return from_json_to_dict(filename)
+        case _:
+            raise ValueError(f"Unsupported file format: {filename}")
diff --git a/development/test-data/arXiv_archive_group.json b/development/test-data/arXiv_archive_group.json
@@ -0,0 +1,86 @@
+[
+  {
+    "archive_id": "astro-ph",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "cond-mat",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "cs",
+    "group_id": "cs"
+  },
+  {
+    "archive_id": "econ",
+    "group_id": "econ"
+  },
+  {
+    "archive_id": "eess",
+    "group_id": "eess"
+  },
+  {
+    "archive_id": "gr-qc",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "hep-ex",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "hep-lat",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "hep-ph",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "hep-th",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "math",
+    "group_id": "math"
+  },
+  {
+    "archive_id": "math-ph",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "nlin",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "nucl-ex",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "nucl-th",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "physics",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "q-bio",
+    "group_id": "q-bio"
+  },
+  {
+    "archive_id": "q-fin",
+    "group_id": "q-fin"
+  },
+  {
+    "archive_id": "quant-ph",
+    "group_id": "physics"
+  },
+  {
+    "archive_id": "stat",
+    "group_id": "stat"
+  },
+  {
+    "archive_id": "test",
+    "group_id": "test"
+  }
+]