Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get record #6

Merged
merged 6 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions oaipmh/data/oai_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,26 @@
"oai_dc":MetadataFormat(
prefix="oai_dc",
schema="http://www.openarchives.org/OAI/2.0/oai_dc.xsd",
namespace="http://www.openarchives.org/OAI/2.0/oai_dc/"
namespace="http://www.openarchives.org/OAI/2.0/oai_dc/",
all_versions=True
),
"arXiv":MetadataFormat(
prefix="arXiv",
schema="http://arxiv.org/OAI/arXiv.xsd",
namespace="http://arxiv.org/OAI/arXiv/"
namespace="http://arxiv.org/OAI/arXiv/",
all_versions=False
),
"arXivOld":MetadataFormat(
prefix="arXiv",
prefix="arXivOld",
schema="http://arxiv.org/OAI/arXivOld.xsd",
namespace="http://arxiv.org/OAI/arXivOld/"
namespace="http://arxiv.org/OAI/arXivOld/",
all_versions=False
),
"arXivRaw":MetadataFormat(
prefix="arXivRaw",
schema="http://arxiv.org/OAI/arXivRaw.xsd",
namespace="http://arxiv.org/OAI/arXivRaw/"
namespace="http://arxiv.org/OAI/arXivRaw/",
all_versions=True
),
}

Expand Down
3 changes: 2 additions & 1 deletion oaipmh/data/oai_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ class OAIVerbs:
LIST_SETS = "ListSets"

class MetadataFormat:
    """Description of one metadata format the repository can serve.

    The values are stored unchanged as attributes of the same name.

    Args:
        prefix: metadata prefix that identifies the format (e.g. "oai_dc").
        schema: URL of the XML schema for the format.
        namespace: XML namespace URI of the format.
        all_versions: whether a record in this format needs the metadata of
            every version of a paper (True) or only the current one (False).
    """

    def __init__(self, prefix: str, schema: str, namespace: str, all_versions: bool):
        self.prefix = prefix
        self.schema = schema
        self.namespace = namespace
        self.all_versions = all_versions

6 changes: 3 additions & 3 deletions oaipmh/processors/create_set_list.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
from datetime import datetime, timezone
from typing import Dict, Any, Union
from typing import Dict, Any, Union, List

from flask import render_template

from arxiv.taxonomy.category import Group, Archive, Category
from arxiv.taxonomy.definitions import ARCHIVES, GROUPS, CATEGORIES_ACTIVE
from arxiv.taxonomy.definitions import ARCHIVES, GROUPS, CATEGORIES_ACTIVE, CATEGORIES

from oaipmh.data.oai_properties import OAIParams
from oaipmh.serializers.output_formats import Response

def produce_set_list(query_data: Dict[OAIParams, Any]) -> Response:
def display_set_structure(query_data: Dict[OAIParams, Any]) -> Response:
"""create the set structure of a repository"""
groups = {key: grp for key,
grp in GROUPS.items()
Expand Down
23 changes: 23 additions & 0 deletions oaipmh/processors/db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import Optional, List

from arxiv.db import Session
from arxiv.db.models import Metadata
from arxiv.identifier import Identifier


def get_record_data_current(arxiv_id: Identifier) -> Optional[Metadata]:
    """Look up the metadata row marked as current for one paper.

    Returns the first matching row, or None when nothing matches.
    """
    current_rows = Session.query(Metadata).filter(
        Metadata.paper_id == arxiv_id.id,
        Metadata.is_current == 1,
    )
    return current_rows.first()

def get_record_data_all(arxiv_id: Identifier) -> List[Metadata]:
    """Fetch the metadata rows of every version of a specific paper.

    Rows are returned in ascending version order so that callers which build
    per-version output get a deterministic sequence instead of whatever order
    the database happens to produce.

    Returns:
        A list of metadata rows; the list is empty when the paper is unknown.
    """
    return (
        Session.query(Metadata)
        .filter(Metadata.paper_id == arxiv_id.id)
        .order_by(Metadata.version)
        .all()
    )
44 changes: 44 additions & 0 deletions oaipmh/processors/get_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from typing import Dict
from datetime import datetime, timezone

from flask import render_template

from arxiv.identifier import Identifier

from oaipmh.processors.db import get_record_data_all, get_record_data_current
from oaipmh.data.oai_errors import OAINonexistentID
from oaipmh.data.oai_properties import OAIParams, MetadataFormat
from oaipmh.serializers.create_records import arXivOldRecord, arXivRawRecord, arXivRecord, dcRecord
from oaipmh.serializers.output_formats import Response

def do_get_record(arxiv_id: Identifier, format: MetadataFormat, query_data: Dict[OAIParams, str]) -> Response:
    """Build the GetRecord response for one paper in one metadata format.

    Loads either every version or only the current version of the paper's
    metadata (depending on ``format.all_versions``), wraps it in the record
    class matching ``format.prefix`` and renders the ``get_record.xml``
    template.

    Raises:
        OAINonexistentID: when no metadata is found for the identifier.
    """
    if format.all_versions:
        versions = get_record_data_all(arxiv_id)
        if not versions:
            raise OAINonexistentID("Nothing found for this ID", query_params=query_data)
        # the formats built from all versions are oai_dc and arXivRaw
        build_record = dcRecord if format.prefix == "oai_dc" else arXivRawRecord
        record = build_record(versions)
    else:
        current = get_record_data_current(arxiv_id)
        if current is None:
            raise OAINonexistentID("Nothing found for this ID", query_params=query_data)
        # the formats built from the current version only are arXivOld and arXiv
        build_record = arXivOldRecord if format.prefix == "arXivOld" else arXivRecord
        record = build_record(current)

    body = render_template(
        "get_record.xml",
        response_date=datetime.now(timezone.utc),
        query_params=query_data,
        record=record,
        format=format.prefix,
    )
    return body, 200, {"Content-Type": "application/xml"}

5 changes: 2 additions & 3 deletions oaipmh/requests/data_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from oaipmh.data.oai_config import SUPPORTED_METADATA_FORMATS, EARLIEST_DATE
from oaipmh.data.oai_errors import OAIBadArgument, OAIBadFormat, OAIBadResumptionToken
from oaipmh.data.oai_properties import OAIParams, OAIVerbs
from oaipmh.processors.get_record import do_get_record
from oaipmh.processors.resume import ResToken
from oaipmh.serializers.output_formats import Response
from oaipmh.requests.param_processing import process_identifier
Expand All @@ -32,10 +33,8 @@ def get_record(params: Dict[str, str]) -> Response:
raise OAIBadFormat(reason="Did not recognize requested format", query_params=query_data)
meta_type=SUPPORTED_METADATA_FORMATS[meta_type_str]
query_data[OAIParams.META_PREFIX]=meta_type_str
return do_get_record(arxiv_id, meta_type, query_data)

#TODO parameters done, do rest of function

return "<a>b</a>", 200, {}

def list_data(params: Dict[str, str], just_ids: bool)-> Response:
"""runs both list queries. just_ids true for list identifiers, false for list records"""
Expand Down
24 changes: 11 additions & 13 deletions oaipmh/requests/info_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from oaipmh.data.oai_errors import OAIBadArgument
from oaipmh.data.oai_properties import OAIParams, OAIVerbs
from oaipmh.serializers.output_formats import Response
from oaipmh.processors.create_set_list import produce_set_list
from oaipmh.processors.create_set_list import display_set_structure
from oaipmh.requests.param_processing import process_identifier

def identify(params: Dict[str, str]) -> Response:
Expand Down Expand Up @@ -39,20 +39,18 @@ def list_metadata_formats(params: Dict[str, str]) -> Response:
identifier_str=params[OAIParams.ID]
arxiv_id=process_identifier(identifier_str)
query_data[OAIParams.ID]=identifier_str

#TODO get formats for an item
return "<a>b</a>", 200, {}

#all formats are available for all items, so we don't need to look the item up
else: #give formats repository supports
if given_params != {OAIParams.VERB}:
raise OAIBadArgument(f"Only allowed parameters are {', '.join(str(param) for param in expected_params)}")
response=render_template("metaformats.xml",
response_date=datetime.now(timezone.utc),
query_params=query_data,
formats=oai_config.SUPPORTED_METADATA_FORMATS
)
headers={"Content-Type":"application/xml"}
return response, 200, headers

response=render_template("metaformats.xml",
response_date=datetime.now(timezone.utc),
query_params=query_data,
formats=oai_config.SUPPORTED_METADATA_FORMATS
)
headers={"Content-Type":"application/xml"}
return response, 200, headers

def list_sets(params: Dict[str, str]) -> Response:
"""used to retrieve the set structure of a repository"""
Expand All @@ -65,5 +63,5 @@ def list_sets(params: Dict[str, str]) -> Response:
else:
if given_params != {OAIParams.VERB}:
raise OAIBadArgument(f"No other parameters allowed")
return produce_set_list(query_data)
return display_set_structure(query_data)

10 changes: 7 additions & 3 deletions oaipmh/requests/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from oaipmh.requests.info_queries import identify, list_metadata_formats, list_sets
from oaipmh.requests.data_queries import get_record, list_data
from oaipmh.serializers.output_formats import Response
from oaipmh.data.oai_errors import OAIBadVerb
from oaipmh.data.oai_errors import OAIBadVerb, OAIBadArgument
from oaipmh.data.oai_properties import OAIVerbs
from oaipmh.serializers.output_formats import Response

Expand All @@ -18,8 +18,12 @@ def oai() -> Response:
this defines what the client is asking for as per the OAI standard
further verification of parameters is done with the handlers for individual verbs
"""
#TODO duplicate params don't create errors, technically not to spec
params: Dict[str, str] = request.args.to_dict() if request.method == 'GET' else request.form.to_dict()
param_source=request.args if request.method == 'GET' else request.form
for _, values in param_source.lists():
if len(values) > 1:
raise OAIBadArgument("Duplicate parameters not allowed")
params: Dict[str, str] = param_source.to_dict()

verb = params.get("verb", "")
match verb:
case OAIVerbs.GET_RECORD:
Expand Down
Empty file removed oaipmh/serializers/config_srl.py
Empty file.
107 changes: 107 additions & 0 deletions oaipmh/serializers/create_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from typing import List, Optional
from datetime import datetime

from arxiv.authors import parse_author_affil
from arxiv.db.models import Metadata
from arxiv.document.version import VersionEntry
from arxiv.taxonomy.category import Category
from arxiv.taxonomy.definitions import CATEGORIES

from oaipmh.processors.create_set_list import make_set_str

class Header:
    """Header data of a record: OAI identifier, datestamp and set strings."""

    def __init__(self, id: str, date: datetime, cats: List[Category]) -> None:
        # the paper id is prefixed to form the OAI identifier
        self.id = "oai:arXiv.org:{}".format(id)
        self.date = date
        self.sets = [make_set_str(category) for category in cats]

    def __eq__(self, other: object) -> bool:
        """Headers are equal when id, date and sets all match."""
        if isinstance(other, Header):
            return (self.id, self.date, self.sets) == (other.id, other.date, other.sets)
        return False

class Record:  # base record class
    """Data shared by every record format: categories, header and source metadata."""

    def __init__(self, current_meta: Metadata):
        listed = current_meta.abs_categories
        # abs_categories is a whitespace-separated list of category ids
        self.categories: List[Category] = (
            [CATEGORIES[cat_id] for cat_id in listed.split()] if listed else []
        )

        # prefer the update time, fall back to the creation time
        stamp = current_meta.updated or current_meta.created
        self.header = Header(current_meta.paper_id, stamp, self.categories)
        self.current_meta = current_meta

#specialized record classes for the different supported metadata types
class arXivRecord(Record):
    """Record for the arXiv format: the base record plus parsed authors."""

    def __init__(self, current_meta: Metadata):
        super().__init__(current_meta)
        parsed_authors = parse_author_affil(current_meta.authors)
        self.authors = parsed_authors

class arXivRawRecord(Record):
    """Record for the arXivRaw format: the base record plus one entry per version."""

    def __init__(self, metadata: List[Metadata]):
        """Build a version entry for every row and base data from the current row.

        Args:
            metadata: metadata rows of all versions of one paper.
        """
        self.versions: List[VersionEntry] = []
        for version in metadata:
            source_size = version.source_size
            entry = VersionEntry(
                version=version.version,
                raw='',
                submitted_date=version.created,
                # a kilobyte is counted as 1024 bytes (requested in review);
                # a missing or zero size is reported as 0
                size_kilobytes=source_size // 1024 if source_size else 0,
                source_flag=self._process_source_format(version.source_format, version.source_flags),
                is_current=version.is_current,
                source_format=version.source_format,
            )
            self.versions.append(entry)
            if version.is_current:
                # header, categories and current_meta come from the current version
                super().__init__(version)

    @staticmethod
    def _process_source_format(format: Optional[str], source_flags: Optional[str]) -> Optional[str]:
        """Combine stored source flags and the source format into one flag string.

        OAI expects the source information to be in the form of flags for
        both our flag data and our source type data.

        Returns:
            The shown flags found in ``source_flags`` followed by the flag
            mapped from ``format``, or None when that string is empty.
        """
        format_map = {
            'pdftex': 'D',
            'tex': '',
            'pdf': '',
            'withdrawn': 'I',
            'html': 'H',
            'ps': 'P',
            'docx': 'X',
        }
        shown_flags = ['A', 'S']  # not shown: 1, D (duplicates pdftex format sometimes)

        result = ""
        if source_flags:
            result = "".join(flag for flag in shown_flags if flag in source_flags)
        # unknown or missing formats contribute no flag
        result += format_map.get(format, "")

        return result or None

class dcRecord(Record):
    """Record for the oai_dc format, built from all versions of a paper."""

    def __init__(self, metadata: List[Metadata]):
        for entry in metadata:
            if entry.is_current:
                # base data, date and authors are taken from the current version
                super().__init__(entry)
                self.current_version_date = entry.created
                self.authors = parse_author_affil(entry.authors)

            if entry.version == 1:
                # the first version supplies the initial date
                self.initial_date = entry.created

    def deduplicate_cat_names(self) -> List[str]:
        """Return the categories' full names without repeats, in first-seen order."""
        # dict keys keep insertion order and drop repeated names
        unique_names = dict.fromkeys(category.full_name for category in self.categories)
        return list(unique_names)

class arXivOldRecord(Record):
    """Record for the arXivOld format.

    Carries no extra data, so construction is inherited unchanged from Record.
    """
Empty file removed oaipmh/serializers/database_srl.py
Empty file.
12 changes: 12 additions & 0 deletions oaipmh/templates/get_record.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{% extends "base.xml" %}
{% import 'record_formats.xml' as formats %}
{# GetRecord response: overrides the request_element block with the query parameters and the interior_xml block with a single record, rendered by formats.create_record for the given format, wrapped in <GetRecord>. #}

{% block request_element %}
{{ macros.request_element(query_params) }}
{% endblock %}

{% block interior_xml %}
<GetRecord>
{{formats.create_record(record, format)}}
</GetRecord>
{% endblock %}
Empty file removed oaipmh/templates/headers.html
Empty file.
12 changes: 11 additions & 1 deletion oaipmh/templates/macros.xml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
{# Renders the <request> element: one XML attribute per key/value pair in `attributes` (each preceded by a single space, so no trailing space is left before ">"), with the external URL of the general.oai endpoint as the element text. #}
{% macro request_element( attributes={}) %}
<request{% for key, value in attributes.items() %} {{ key }}="{{ value }}"{% endfor %}>{{ url_for("general.oai", _external=True) }}</request>
{% endmacro %}

{# Renders a record <header>: the identifier from header.id, the datestamp as header.date formatted YYYY-MM-DD, and one <setSpec> per entry in header.sets. #}
{% macro header(header) %}
<header>
<identifier>{{header.id}}</identifier>
<datestamp>{{ header.date.strftime('%Y-%m-%d') }}</datestamp>
{% for set in header.sets %}
<setSpec>{{set}}</setSpec>
{% endfor %}
</header>
{% endmacro %}
Loading
Loading