Skip to content

Commit

Permalink
Merge pull request #6 from arXiv/get_record
Browse files Browse the repository at this point in the history
Get record
  • Loading branch information
kyokukou authored Dec 16, 2024
2 parents 146418f + 4efd021 commit 140b75f
Show file tree
Hide file tree
Showing 22 changed files with 766 additions and 41 deletions.
14 changes: 9 additions & 5 deletions oaipmh/data/oai_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,26 @@
"oai_dc":MetadataFormat(
prefix="oai_dc",
schema="http://www.openarchives.org/OAI/2.0/oai_dc.xsd",
namespace="http://www.openarchives.org/OAI/2.0/oai_dc/"
namespace="http://www.openarchives.org/OAI/2.0/oai_dc/",
all_versions=True
),
"arXiv":MetadataFormat(
prefix="arXiv",
schema="http://arxiv.org/OAI/arXiv.xsd",
namespace="http://arxiv.org/OAI/arXiv/"
namespace="http://arxiv.org/OAI/arXiv/",
all_versions=False
),
"arXivOld":MetadataFormat(
prefix="arXiv",
prefix="arXivOld",
schema="http://arxiv.org/OAI/arXivOld.xsd",
namespace="http://arxiv.org/OAI/arXivOld/"
namespace="http://arxiv.org/OAI/arXivOld/",
all_versions=False
),
"arXivRaw":MetadataFormat(
prefix="arXivRaw",
schema="http://arxiv.org/OAI/arXivRaw.xsd",
namespace="http://arxiv.org/OAI/arXivRaw/"
namespace="http://arxiv.org/OAI/arXivRaw/",
all_versions=True
),
}

Expand Down
3 changes: 2 additions & 1 deletion oaipmh/data/oai_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ class OAIVerbs:
LIST_SETS = "ListSets"

class MetadataFormat:
    """Plain value holder describing one metadata format.

    The constructor copies its four arguments onto the instance unchanged.
    """

    def __init__(self, prefix: str, schema: str, namespace: str, all_versions: bool):
        # prefix / schema / namespace are stored verbatim as given.
        self.prefix, self.schema, self.namespace = prefix, schema, namespace
        # True when a record in this format is built from every version row
        # of a paper, False when only the current row is used.
        self.all_versions = all_versions

6 changes: 3 additions & 3 deletions oaipmh/processors/create_set_list.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
from datetime import datetime, timezone
from typing import Dict, Any, Union
from typing import Dict, Any, Union, List

from flask import render_template

from arxiv.taxonomy.category import Group, Archive, Category
from arxiv.taxonomy.definitions import ARCHIVES, GROUPS, CATEGORIES_ACTIVE
from arxiv.taxonomy.definitions import ARCHIVES, GROUPS, CATEGORIES_ACTIVE, CATEGORIES

from oaipmh.data.oai_properties import OAIParams
from oaipmh.serializers.output_formats import Response

def produce_set_list(query_data: Dict[OAIParams, Any]) -> Response:
def display_set_structure(query_data: Dict[OAIParams, Any]) -> Response:
"""create the set structure of a repository"""
groups = {key: grp for key,
grp in GROUPS.items()
Expand Down
23 changes: 23 additions & 0 deletions oaipmh/processors/db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import Optional, List

from arxiv.db import Session
from arxiv.db.models import Metadata
from arxiv.identifier import Identifier


def get_record_data_current(arxiv_id: Identifier) -> Optional[Metadata]:
    """Return the metadata row flagged as current for one paper.

    Selects ``Metadata`` rows whose ``paper_id`` equals ``arxiv_id.id`` and
    whose ``is_current`` flag equals 1, and returns the first match, or
    ``None`` when nothing matches.
    """
    rows_for_paper = Session.query(Metadata).filter(Metadata.paper_id == arxiv_id.id)
    current_rows = rows_for_paper.filter(Metadata.is_current == 1)
    return current_rows.first()

def get_record_data_all(arxiv_id: Identifier) -> Optional[List[Metadata]]:
    """Return every metadata row (all versions) stored for one paper.

    Rows are selected by ``paper_id == arxiv_id.id`` and returned in ascending
    version order. Without an explicit ORDER BY the database may hand rows
    back in any order, which would leak into anything that lists versions.

    The result is a list; it is empty when the paper has no rows.
    """
    return (
        Session.query(Metadata)
        .filter(Metadata.paper_id == arxiv_id.id)
        .order_by(Metadata.version)
        .all()
    )
44 changes: 44 additions & 0 deletions oaipmh/processors/get_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from typing import Dict
from datetime import datetime, timezone

from flask import render_template

from arxiv.identifier import Identifier

from oaipmh.processors.db import get_record_data_all, get_record_data_current
from oaipmh.data.oai_errors import OAINonexistentID
from oaipmh.data.oai_properties import OAIParams, MetadataFormat
from oaipmh.serializers.create_records import arXivOldRecord, arXivRawRecord, arXivRecord, dcRecord
from oaipmh.serializers.output_formats import Response

def do_get_record(arxiv_id: Identifier, format: MetadataFormat, query_data: Dict[OAIParams, str]) -> Response:
    """Build the GetRecord response for one paper in one metadata format.

    Formats flagged ``all_versions`` are built from every metadata row of the
    paper (``oai_dc`` and, otherwise, arXivRaw); the remaining formats are
    built from the current row only (``arXivOld`` and, otherwise, arXiv).
    The resulting record is rendered with the ``get_record.xml`` template.

    Raises:
        OAINonexistentID: when no metadata is found for ``arxiv_id``.
    """
    if format.all_versions:
        all_rows = get_record_data_all(arxiv_id)
        if not all_rows:
            raise OAINonexistentID("Nothing found for this ID", query_params=query_data)
        # only two all-version formats exist: oai_dc, else arXivRaw
        build = dcRecord if format.prefix == "oai_dc" else arXivRawRecord
        record = build(all_rows)
    else:
        current_row = get_record_data_current(arxiv_id)
        if current_row is None:
            raise OAINonexistentID("Nothing found for this ID", query_params=query_data)
        # only two current-version formats exist: arXivOld, else arXiv
        build = arXivOldRecord if format.prefix == "arXivOld" else arXivRecord
        record = build(current_row)

    body = render_template(
        "get_record.xml",
        response_date=datetime.now(timezone.utc),
        query_params=query_data,
        record=record,
        format=format.prefix,
    )
    return body, 200, {"Content-Type": "application/xml"}

5 changes: 2 additions & 3 deletions oaipmh/requests/data_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from oaipmh.data.oai_config import SUPPORTED_METADATA_FORMATS, EARLIEST_DATE
from oaipmh.data.oai_errors import OAIBadArgument, OAIBadFormat, OAIBadResumptionToken
from oaipmh.data.oai_properties import OAIParams, OAIVerbs
from oaipmh.processors.get_record import do_get_record
from oaipmh.processors.resume import ResToken
from oaipmh.serializers.output_formats import Response
from oaipmh.requests.param_processing import process_identifier
Expand All @@ -32,10 +33,8 @@ def get_record(params: Dict[str, str]) -> Response:
raise OAIBadFormat(reason="Did not recognize requested format", query_params=query_data)
meta_type=SUPPORTED_METADATA_FORMATS[meta_type_str]
query_data[OAIParams.META_PREFIX]=meta_type_str
return do_get_record(arxiv_id, meta_type, query_data)

#TODO paramters done, do rest of function

return "<a>b</a>", 200, {}

def list_data(params: Dict[str, str], just_ids: bool)-> Response:
"""runs both list queries. just_ids true for list identifiers, false for list records"""
Expand Down
24 changes: 11 additions & 13 deletions oaipmh/requests/info_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from oaipmh.data.oai_errors import OAIBadArgument
from oaipmh.data.oai_properties import OAIParams, OAIVerbs
from oaipmh.serializers.output_formats import Response
from oaipmh.processors.create_set_list import produce_set_list
from oaipmh.processors.create_set_list import display_set_structure
from oaipmh.requests.param_processing import process_identifier

def identify(params: Dict[str, str]) -> Response:
Expand Down Expand Up @@ -39,20 +39,18 @@ def list_metadata_formats(params: Dict[str, str]) -> Response:
identifier_str=params[OAIParams.ID]
arxiv_id=process_identifier(identifier_str)
query_data[OAIParams.ID]=identifier_str

#TODO get formats for an item
return "<a>b</a>", 200, {}

#all formats are available for all items so we dont actually care about looking it up
else: #give formats repository supports
if given_params != {OAIParams.VERB}:
raise OAIBadArgument(f"Only allowed parameters are {', '.join(str(param) for param in expected_params)}")
response=render_template("metaformats.xml",
response_date=datetime.now(timezone.utc),
query_params=query_data,
formats=oai_config.SUPPORTED_METADATA_FORMATS
)
headers={"Content-Type":"application/xml"}
return response, 200, headers

response=render_template("metaformats.xml",
response_date=datetime.now(timezone.utc),
query_params=query_data,
formats=oai_config.SUPPORTED_METADATA_FORMATS
)
headers={"Content-Type":"application/xml"}
return response, 200, headers

def list_sets(params: Dict[str, str]) -> Response:
"""used to retrieve the set structure of a repository"""
Expand All @@ -65,5 +63,5 @@ def list_sets(params: Dict[str, str]) -> Response:
else:
if given_params != {OAIParams.VERB}:
raise OAIBadArgument(f"No other parameters allowed")
return produce_set_list(query_data)
return display_set_structure(query_data)

10 changes: 7 additions & 3 deletions oaipmh/requests/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from oaipmh.requests.info_queries import identify, list_metadata_formats, list_sets
from oaipmh.requests.data_queries import get_record, list_data
from oaipmh.serializers.output_formats import Response
from oaipmh.data.oai_errors import OAIBadVerb
from oaipmh.data.oai_errors import OAIBadVerb, OAIBadArgument
from oaipmh.data.oai_properties import OAIVerbs
from oaipmh.serializers.output_formats import Response

Expand All @@ -17,8 +17,12 @@ def oai() -> Response:
this defines what the client is asking for as per the OAI standard
further verification of parameters is done with the handlers for individual verbs
"""
#TODO duplicate params dont create errors, technically not to spec
params: Dict[str, str] = request.args.to_dict() if request.method == 'GET' else request.form.to_dict()
param_source=request.args if request.method == 'GET' else request.form
for _, values in param_source.lists():
if len(values) > 1:
raise OAIBadArgument("Duplicate parameters not allowed")
params: Dict[str, str] = param_source.to_dict()

verb = params.get("verb", "")
match verb:
case OAIVerbs.GET_RECORD:
Expand Down
Empty file removed oaipmh/serializers/config_srl.py
Empty file.
107 changes: 107 additions & 0 deletions oaipmh/serializers/create_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from typing import List, Optional
from datetime import datetime

from arxiv.authors import parse_author_affil
from arxiv.db.models import Metadata
from arxiv.document.version import VersionEntry
from arxiv.taxonomy.category import Category
from arxiv.taxonomy.definitions import CATEGORIES

from oaipmh.processors.create_set_list import make_set_str

class Header:
    """Header data of a record: identifier, datestamp and set strings.

    ``id`` is stored with the ``oai:arXiv.org:`` prefix, ``date`` is stored
    as given, and ``sets`` holds ``make_set_str(cat)`` for each category in
    ``cats``, in the same order.
    """

    def __init__(self, id: str, date: datetime, cats: List[Category]) -> None:
        self.id = f"oai:arXiv.org:{id}"
        self.date = date
        self.sets = [make_set_str(cat) for cat in cats]

    def __eq__(self, other: object) -> bool:
        """Headers are equal when id, date and sets all match."""
        if not isinstance(other, Header):
            # Let Python try the reflected comparison; ``header == other``
            # still evaluates to False for unrelated types.
            return NotImplemented
        return (
            self.id == other.id and
            self.date == other.date and
            self.sets == other.sets
        )

class Record:
    """Base record shared by every metadata format.

    Resolves the whitespace-separated ``abs_categories`` names through
    ``CATEGORIES``, builds the ``Header`` and keeps the metadata row.
    """

    def __init__(self, current_meta: Metadata):
        raw_categories = current_meta.abs_categories
        self.categories: List[Category] = (
            [CATEGORIES[name] for name in raw_categories.split()]
            if raw_categories
            else []
        )

        # datestamp: the update time when there is one, else the creation time
        stamp = current_meta.updated or current_meta.created
        self.header = Header(current_meta.paper_id, stamp, self.categories)
        self.current_meta = current_meta

#specialized record classes for the different supported metadata types
class arXivRecord(Record):
    """Record for the arXiv format: the base record plus parsed authors."""

    def __init__(self, current_meta: Metadata):
        super().__init__(current_meta)
        raw_authors = current_meta.authors
        self.authors = parse_author_affil(raw_authors)

class arXivRawRecord(Record):
    """Record for the arXivRaw format: one ``VersionEntry`` per metadata row.

    The base ``Record`` fields (header, categories, current metadata) are
    taken from the row flagged ``is_current``; when several rows carry the
    flag, the last one in ``metadata`` is used.

    Raises:
        ValueError: when no row in ``metadata`` is flagged as current, since
            the record would otherwise be left without header or categories.
    """

    # source format -> flag letter reported for it ('' means no letter)
    _FORMAT_FLAGS = {
        'pdftex': 'D',
        'tex': '',
        'pdf': '',
        'withdrawn': 'I',
        'html': 'H',
        'ps': 'P',
        'docx': 'X',
    }
    # flags passed through from source_flags
    # not shown: 1, D (duplicates pdftex format sometimes)
    _SHOWN_FLAGS = ('A', 'S')

    def __init__(self, metadata: List[Metadata]):
        self.versions: List[VersionEntry] = []
        current: Optional[Metadata] = None
        for version in metadata:
            entry = VersionEntry(
                version=version.version,
                raw='',
                submitted_date=version.created,
                # whole kilobytes; 0 when the size is missing or zero
                size_kilobytes=version.source_size // 1024 if version.source_size else 0,
                source_flag=self._process_source_format(version.source_format, version.source_flags),
                is_current=version.is_current,
                source_format=version.source_format,
            )
            self.versions.append(entry)
            if version.is_current:
                current = version

        if current is None:
            raise ValueError("No current version found in metadata")
        super().__init__(current)

    @staticmethod
    def _process_source_format(format: Optional[str], source_flags: Optional[str]) -> Optional[str]:
        """Combine stored source flags and source format into one flag string.

        OAI expects the source information as flags, for both our flag data
        and our source type data. The shown flags ('A', 'S') found in
        ``source_flags`` come first, in that order, followed by the letter
        mapped from ``format`` (nothing for unmapped formats).

        Returns the combined string, or ``None`` when it is empty.
        """
        shown = ""
        if source_flags:
            shown = "".join(
                flag for flag in arXivRawRecord._SHOWN_FLAGS if flag in source_flags
            )
        combined = shown + arXivRawRecord._FORMAT_FLAGS.get(format, "")
        return combined or None

class dcRecord(Record):
    """Record for the oai_dc format, built from all versions of a paper.

    The base ``Record`` fields, ``current_version_date`` and ``authors`` come
    from the row flagged ``is_current`` (the last such row if there are
    several). ``initial_date`` is the creation date of the row whose version
    is 1 and is only set when such a row is present.

    Raises:
        ValueError: when no row in ``metadata`` is flagged as current, since
            the record would otherwise be left without header or categories.
    """

    def __init__(self, metadata: List[Metadata]):
        current: Optional[Metadata] = None
        for version in metadata:
            if version.is_current:
                current = version
            if version.version == 1:
                self.initial_date = version.created

        if current is None:
            raise ValueError("No current version found in metadata")
        super().__init__(current)
        self.current_version_date = current.created
        self.authors = parse_author_affil(current.authors)

    def deduplicate_cat_names(self) -> List[str]:
        """Return the categories' full names without repeats, first-seen order."""
        # dict keys keep insertion order, giving an ordered de-duplication
        return list(dict.fromkeys(cat.full_name for cat in self.categories))

class arXivOldRecord(Record):
    """Record for the arXivOld format.

    Carries no data beyond what the base ``Record`` builds, so the
    constructor is inherited from ``Record`` unchanged.
    """
Empty file removed oaipmh/serializers/database_srl.py
Empty file.
12 changes: 12 additions & 0 deletions oaipmh/templates/get_record.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{% extends "base.xml" %}
{% import 'record_formats.xml' as formats %}
{# GetRecord response: fills the request_element and interior_xml blocks of
   base.xml. Expects `query_params`, `record` and `format` in the context.
   NOTE(review): `macros` is not imported in this file; presumably base.xml
   makes it available - confirm. #}

{% block request_element %}
{{ macros.request_element(query_params) }}
{% endblock %}

{% block interior_xml %}
<GetRecord>
{{formats.create_record(record, format)}}
</GetRecord>
{% endblock %}
Empty file removed oaipmh/templates/headers.html
Empty file.
12 changes: 11 additions & 1 deletion oaipmh/templates/macros.xml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
{% macro request_element( attributes={}) %}
<request {% for key, value in attributes.items() %}{{ key }}="{{ value }}" {% endfor %}>{{ url_for("general.oai", _external=True) }}</request>
<request{% for key, value in attributes.items() %} {{ key }}="{{ value }}"{% endfor %}>{{ url_for("general.oai", _external=True) }}</request>
{% endmacro %}

{# Renders a <header> element from a header object: its `id` as the
   identifier, its `date` formatted as YYYY-MM-DD as the datestamp, and one
   <setSpec> per entry in `sets`. #}
{% macro header(header) %}
<header>
<identifier>{{header.id}}</identifier>
<datestamp>{{ header.date.strftime('%Y-%m-%d') }}</datestamp>
{% for set in header.sets %}
<setSpec>{{set}}</setSpec>
{% endfor %}
</header>
{% endmacro %}
Loading

0 comments on commit 140b75f

Please sign in to comment.