Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Oai organization #2

Merged
merged 29 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
cf588d5
set up tweaks
kyokukou Nov 4, 2024
1a9b1c2
created layout with empty files
kyokukou Nov 4, 2024
f5c8b48
sort request into handlers based on verb
kyokukou Nov 5, 2024
7b68c94
render a base template
kyokukou Nov 5, 2024
a66d6ad
create basic request element
kyokukou Nov 5, 2024
abd284a
added OAI errors, handler
kyokukou Nov 6, 2024
7261955
refactor using custom response object
kyokukou Nov 6, 2024
5a7266b
parameter check for get record
kyokukou Nov 6, 2024
64a1cdb
small reorganize
kyokukou Nov 6, 2024
3fab0a6
check correct params for identify
kyokukou Nov 6, 2024
e70fb26
check parameters for list_identifiers
kyokukou Nov 7, 2024
266a75f
turn verbs and params into classes
kyokukou Nov 7, 2024
b470b9e
check parameters given for list_metadata_formats
kyokukou Nov 7, 2024
2a5635a
checking params for list records
kyokukou Nov 7, 2024
58ad316
check correct params present for list sets
kyokukou Nov 7, 2024
babdcf7
draft of creating set list
kyokukou Nov 7, 2024
1745fa4
validating identifiers
kyokukou Nov 7, 2024
321fe23
use error for nonexistent id
kyokukou Nov 7, 2024
1194c28
collect query data for verbs
kyokukou Nov 7, 2024
e7dad0a
errors contain valid query parameters and more info on error causes t…
kyokukou Nov 8, 2024
0098d64
refactor shared get_list code
kyokukou Nov 8, 2024
48a5d8b
display metadata formats page
kyokukou Nov 11, 2024
456c2f5
datetime checking
kyokukou Nov 11, 2024
9412b91
validate set parameters
kyokukou Nov 12, 2024
ebbccb9
helper function to turn arxiv categories into set strings
kyokukou Nov 12, 2024
55591b9
use group:archive:cat_suffix structure for sets
kyokukou Nov 12, 2024
a43f658
update tests, no resumption token for list_sets
kyokukou Nov 12, 2024
cb40319
moved verb sorting into route
kyokukou Nov 13, 2024
798408a
minor refactoring
kyokukou Nov 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ poetry install

run:
python main.py
Then check that the app is running at http://localhost:8080/oai

tests:

Expand Down
2 changes: 1 addition & 1 deletion oaipmh/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class Settings(arxiv_base.Settings):
SQLALCHEMY_MAX_OVERFLOW: Optional[int] = 0
SQLALCHEMY_POOL_SIZE: Optional[int] = 10

APPLICATION_ROOT: Optional[str] = None
FLASKS3_BUCKET_NAME: str = "some_bucket" #TODO needed to use url for for some reason?

def check(self) -> None:
"""A check and fix up of a settings object."""
Expand Down
28 changes: 28 additions & 0 deletions oaipmh/data/oai_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from datetime import datetime, timezone
from oaipmh.data.oai_properties import MetadataFormat

#TODO do we want to change this
# Default lower bound used when a list request gives no "from" date.
EARLIEST_DATE = datetime(2007, 5, 23, 0, 0, tzinfo=timezone.utc)

# Metadata formats this repository can disseminate, keyed by the value a
# client passes as metadataPrefix. Each entry's prefix must equal its key,
# otherwise two formats would be advertised under the same prefix.
SUPPORTED_METADATA_FORMATS = {
    "oai_dc": MetadataFormat(
        prefix="oai_dc",
        schema="http://www.openarchives.org/OAI/2.0/oai_dc.xsd",
        namespace="http://www.openarchives.org/OAI/2.0/oai_dc/",
    ),
    "arXiv": MetadataFormat(
        prefix="arXiv",
        schema="http://arxiv.org/OAI/arXiv.xsd",
        namespace="http://arxiv.org/OAI/arXiv/",
    ),
    "arXivOld": MetadataFormat(
        # was "arXiv": copy-paste slip that duplicated the arXiv prefix
        prefix="arXivOld",
        schema="http://arxiv.org/OAI/arXivOld.xsd",
        namespace="http://arxiv.org/OAI/arXivOld/",
    ),
    "arXivRaw": MetadataFormat(
        prefix="arXivRaw",
        schema="http://arxiv.org/OAI/arXivRaw.xsd",
        namespace="http://arxiv.org/OAI/arXivRaw/",
    ),
}
67 changes: 67 additions & 0 deletions oaipmh/data/oai_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from typing import Dict, Optional

from oaipmh.data.oai_properties import OAIParams

class OAIException(Exception):
    """General class for all OAI defined errors.

    Subclasses provide ``code`` and ``description`` as class attributes;
    ``reason`` and ``query_params`` are stored per instance.

    Args:
        reason: optional explanation specific to this occurrence.
        query_params: request parameters to report alongside the error, or
            None when no request attributes should be included.
    """
    code: str
    description: str
    # Keys are the plain-string parameter names held by OAIParams (e.g. "verb").
    query_params: Optional[Dict[str, str]]
    reason: Optional[str]

    def __init__(self, reason: Optional[str] = None,
                 query_params: Optional[Dict[str, str]] = None):
        # Always hand a message to Exception so str(e) and tracebacks are
        # informative even when reason is passed by keyword or omitted.
        super().__init__(reason or getattr(self, "description", ""))
        self.query_params = query_params
        self.reason = reason


class OAIBadArgument(OAIException):
    """Illegal, missing or repeated request arguments."""
    code = "badArgument"
    description = "The request includes illegal arguments, is missing required arguments, includes a repeated argument, or values for arguments have an illegal syntax."
    query_params = None  # dont include attributes

    def __init__(self, reason: Optional[str] = None):
        super().__init__(reason, query_params=None)


class OAIBadResumptionToken(OAIException):
    """Invalid or expired resumption token; a reason is mandatory."""
    code = "badResumptionToken"
    description = "The value of the resumptionToken argument is invalid or expired."

    def __init__(self, reason: str,
                 query_params: Optional[Dict[str, str]] = None):
        super().__init__(reason, query_params)


class OAIBadVerb(OAIException):
    """Missing, repeated or unknown verb argument."""
    code = "badVerb"
    description = "Value of the verb argument is not a legal OAI-PMH verb, the verb argument is missing, or the verb argument is repeated."
    query_params = None  # dont include attributes

    def __init__(self, reason: Optional[str] = None):
        super().__init__(reason, query_params=None)


class OAIBadFormat(OAIException):
    """Requested metadataPrefix is not supported."""
    code = "cannotDisseminateFormat"
    description = "The metadata format identified by the value given for the metadataPrefix argument is not supported by the item or by the repository."


class OAINonexistentID(OAIException):
    """Identifier is unknown or illegal."""
    code = "idDoesNotExist"
    description = "The value of the identifier argument is unknown or illegal in this repository."


class OAINoRecordsMatch(OAIException):
    """The query parameters select an empty list."""
    code = "noRecordsMatch"
    description = "The combination of the values of the from, until, set and metadataPrefix arguments results in an empty list."


class OAINoMetadataFormats(OAIException):
    """No metadata formats are available for the item."""
    code = "noMetadataFormats"
    description = "There are no metadata formats available for the specified item."


class OAINoSetHierarchy(OAIException):
    """Repository does not support sets."""
    # should not be triggered for arXiv implementation
    code = "noSetHierarchy"
    description = "The repository does not support sets. This exception should not be true for the arXiv implementation."
22 changes: 22 additions & 0 deletions oaipmh/data/oai_properties.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
class OAIParams:
    """Names of the query parameters accepted in a request, as plain strings."""

    # always present
    VERB = "verb"

    # record selection
    ID = "identifier"
    META_PREFIX = "metadataPrefix"

    # list filtering
    SET = "set"
    FROM = "from"
    UNTIL = "until"

    # continuation of a previous list request
    RES_TOKEN = "resumptionToken"

class OAIVerbs:
    """Values accepted for the verb parameter, as plain strings."""

    # single-item and repository-level verbs
    GET_RECORD = "GetRecord"
    IDENTIFY = "Identify"

    # list verbs
    LIST_RECORDS = "ListRecords"
    LIST_IDS = "ListIdentifiers"
    LIST_META_FORMATS = "ListMetadataFormats"
    LIST_SETS = "ListSets"

class MetadataFormat:
    """Plain holder for one metadata format: its prefix, schema URL and namespace URL."""

    def __init__(self, prefix: str, schema: str, namespace: str):
        # store the three descriptors exactly as given
        self.prefix, self.schema, self.namespace = prefix, schema, namespace
21 changes: 19 additions & 2 deletions oaipmh/factory.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import logging
from datetime import datetime, timezone

from flask import Flask
from flask import Flask, render_template
from flask_s3 import FlaskS3
from flask.logging import default_handler
from werkzeug.exceptions import HTTPException

from arxiv.base import Base
from arxiv.db import config_query_timing, configure_db

from oaipmh.data.oai_errors import OAIException
from oaipmh.config import Settings
from oaipmh import routes
from oaipmh.requests import routes

s3 = FlaskS3()

Expand All @@ -33,6 +36,20 @@ def create_web_app(**kwargs) -> Flask: # type: ignore
app.register_blueprint(routes.blueprint)
s3.init_app(app)

@app.errorhandler(OAIException)
def handle_oai_error(e):
    """Render an OAIException through the errors.xml template.

    The response is sent with HTTP status 200 and an XML content type; the
    error itself is passed to the template as ``error``.
    """
    body = render_template(
        "errors.xml",
        response_date=datetime.now(timezone.utc),
        error=e,
    )
    return body, 200, {"Content-Type": "application/xml"}

#TODO make this actually trigger
@app.errorhandler(HTTPException)
def handle_http_error(e):
    """Return an HTTPException's description as the body, with its status code."""
    # Use the logging module rather than print so the message goes through
    # the application's configured handlers instead of raw stdout.
    logging.getLogger(__name__).warning("main error handler ran! (%s)", e.code)
    return e.description, e.code, {}

app.jinja_env.trim_blocks = True
app.jinja_env.lstrip_blocks = True
if not app.jinja_env.globals:
Expand Down
41 changes: 41 additions & 0 deletions oaipmh/processors/create_set_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from datetime import datetime, timezone
from typing import Dict, Any, Union

from flask import render_template

from arxiv.taxonomy.category import Group, Archive, Category
from arxiv.taxonomy.definitions import ARCHIVES, GROUPS, CATEGORIES_ACTIVE

from oaipmh.data.oai_properties import OAIParams
from oaipmh.serializers.output_formats import Response

def produce_set_list(query_data: Dict[OAIParams, Any]) -> Response:
    """Render the repository's set structure with the setSpec.xml template.

    Only active, non-test groups and active archives outside the test group
    are passed to the template, together with all active categories and the
    make_set_str helper (exposed to the template as ``to_set``).
    """
    active_groups = {}
    for group_id, group in GROUPS.items():
        if group.is_active and not group.is_test:
            active_groups[group_id] = group

    active_archives = {}
    for archive_id, archive in ARCHIVES.items():
        if archive.is_active and archive.in_group != "grp_test":
            active_archives[archive_id] = archive

    template_context = {
        "response_date": datetime.now(timezone.utc),
        "query_data": query_data,
        "groups": active_groups,
        "archives": active_archives,
        "categories": CATEGORIES_ACTIVE,
        "to_set": make_set_str,
    }
    rendered = render_template("setSpec.xml", **template_context)
    return rendered, 200, {}

def make_set_str(item: Union[Group, Archive, Category]) -> str:
    """Convert an arXiv taxonomy item into an OAI set string.

    The result has the shape group[:archive[:category_suffix]]; the ``grp_``
    prefix is removed from group ids.

    Args:
        item: a Group, Archive or Category.

    Returns:
        The colon-separated set string for the item.

    Raises:
        TypeError: if item is none of the supported taxonomy types (the
            original fell through and returned None despite the str return type).
    """
    if isinstance(item, Group):
        return item.id.removeprefix("grp_")
    if isinstance(item, Archive):
        group_part = item.in_group.removeprefix("grp_")
        return f"{group_part}:{item.id}"
    if isinstance(item, Category):
        archive = item.get_archive()
        group_part = archive.in_group.removeprefix("grp_")
        # category ids use "." between archive and suffix; sets use ":"
        return f"{group_part}:{item.id.replace('.',':')}"
    raise TypeError(f"Cannot build a set string from {type(item).__name__}")
Empty file added oaipmh/processors/db.py
Empty file.
Empty file added oaipmh/processors/queries.py
Empty file.
Empty file added oaipmh/processors/resume.py
Empty file.
134 changes: 134 additions & 0 deletions oaipmh/requests/data_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
from typing import Dict, Union
import re
from datetime import datetime, timezone

from arxiv.taxonomy.definitions import GROUPS, ARCHIVES_ACTIVE, CATEGORIES_ACTIVE
from arxiv.taxonomy.category import Group, Archive, Category

from oaipmh.data.oai_config import SUPPORTED_METADATA_FORMATS, EARLIEST_DATE
from oaipmh.data.oai_errors import OAIBadArgument, OAIBadFormat
from oaipmh.data.oai_properties import OAIParams, OAIVerbs
from oaipmh.serializers.output_formats import Response
from oaipmh.requests.param_processing import process_identifier

# Shape of a day-granularity datestamp: four, two and two digits joined by hyphens (YYYY-MM-DD).
DATE_REGEX = r"\d{4}-\d{2}-\d{2}"

def get_record(params: Dict[str, str]) -> Response:
    """Validate a GetRecord request for one record in one metadata format.

    Args:
        params: query parameters of the request, keyed by parameter name.

    Returns:
        A (body, status, headers) tuple. The body is still a placeholder
        until the record lookup is implemented.

    Raises:
        OAIBadArgument: if the parameter names are not exactly identifier,
            metadataPrefix and verb.
        OAIBadFormat: if the metadataPrefix is not a supported format.
    """
    query_data: Dict[OAIParams, str] = {OAIParams.VERB: OAIVerbs.GET_RECORD}

    # GetRecord takes exactly these arguments: nothing missing, nothing extra.
    expected_params = {OAIParams.ID, OAIParams.META_PREFIX, OAIParams.VERB}
    if set(params.keys()) != expected_params:
        # sorted() keeps the message stable; set iteration order can differ between runs
        raise OAIBadArgument(f"Parameters provided did not match expected. Expected: {', '.join(sorted(expected_params))}")

    identifier_str = params[OAIParams.ID]
    # NOTE(review): process_identifier presumably raises for bad identifiers - confirm
    arxiv_id = process_identifier(identifier_str)
    query_data[OAIParams.ID] = identifier_str

    meta_type_str = params[OAIParams.META_PREFIX]
    if meta_type_str not in SUPPORTED_METADATA_FORMATS:
        raise OAIBadFormat(reason="Did not recognize requested format", query_params=query_data)
    meta_type = SUPPORTED_METADATA_FORMATS[meta_type_str]

    #TODO paramters done, do rest of function (arxiv_id and meta_type are not used yet)

    return "<a>b</a>", 200, {}

def _parse_datestamp(value: str, param_name: str) -> datetime:
    """Turn a YYYY-MM-DD string into a timezone-aware datetime at UTC midnight.

    Args:
        value: raw parameter value from the request.
        param_name: parameter name, used to build the error message.

    Raises:
        OAIBadArgument: if value is not a zero-padded, real calendar date.
    """
    # strptime alone is too permissive (it accepts un-padded fields such as
    # 2024-3-2), so the exact zero-padded shape is enforced first.
    if not re.fullmatch(DATE_REGEX, value):
        raise OAIBadArgument(f"{param_name} date format must be YYYY-MM-DD")
    try:
        parsed = datetime.strptime(value, "%Y-%m-%d")
    except ValueError as err:  # right shape but not a real date, e.g. 2024-13-40
        raise OAIBadArgument(f"{param_name} date format must be YYYY-MM-DD") from err
    return parsed.replace(tzinfo=timezone.utc)


def list_data(params: Dict[str, str], just_ids: bool)-> Response:
    """Validate a ListIdentifiers or ListRecords request.

    Args:
        params: query parameters of the request, keyed by parameter name.
        just_ids: True for ListIdentifiers, False for ListRecords.

    Returns:
        A (body, status, headers) tuple. The body is still a placeholder
        until the query itself is implemented.

    Raises:
        OAIBadArgument: for a resumption token combined with other
            parameters, a missing metadataPrefix, an unknown parameter, a
            malformed date, or an unknown set.
        OAIBadFormat: if the metadataPrefix is not a supported format.
    """
    # report the verb that was actually requested, not always ListIdentifiers
    verb = OAIVerbs.LIST_IDS if just_ids else OAIVerbs.LIST_RECORDS
    query_data: Dict[OAIParams, str] = {OAIParams.VERB: verb}

    #parameter processing
    given_params = set(params.keys())
    if OAIParams.RES_TOKEN in given_params: #using resumption token
        if given_params != {OAIParams.RES_TOKEN, OAIParams.VERB}: #resumption token is exclusive
            raise OAIBadArgument(f"No other paramters allowed with {OAIParams.RES_TOKEN}")
        token = params[OAIParams.RES_TOKEN]
        #TODO token processing and validation

    else: #using request parameters
        #correct parameters present
        if OAIParams.META_PREFIX not in given_params:
            raise OAIBadArgument(f"{OAIParams.META_PREFIX} required.")
        allowed_params = {OAIParams.VERB, OAIParams.META_PREFIX, OAIParams.FROM, OAIParams.UNTIL, OAIParams.SET}
        if given_params - allowed_params: #no extra keys allowed
            # sorted() keeps the message stable; set iteration order can differ between runs
            raise OAIBadArgument(f"Unallowed parameter. Allowed parameters: {', '.join(sorted(allowed_params))}")

        #metadata
        meta_type_str = params[OAIParams.META_PREFIX]
        if meta_type_str not in SUPPORTED_METADATA_FORMATS:
            raise OAIBadFormat(reason="Did not recognize requested format", query_params=query_data)
        meta_type = SUPPORTED_METADATA_FORMATS[meta_type_str]
        query_data[OAIParams.META_PREFIX] = meta_type_str

        #dates: an absent or empty value falls back to the default bound
        from_str = params.get(OAIParams.FROM)
        if from_str:
            start_date = _parse_datestamp(from_str, OAIParams.FROM)
            query_data[OAIParams.FROM] = from_str
        else:
            start_date = EARLIEST_DATE

        until_str = params.get(OAIParams.UNTIL)
        if until_str:
            end_date = _parse_datestamp(until_str, OAIParams.UNTIL)
            query_data[OAIParams.UNTIL] = until_str
        else:
            end_date = datetime.now(timezone.utc)

        #sets
        set_str = params.get(OAIParams.SET)
        if set_str:
            rq_set = _parse_set(set_str)
            query_data[OAIParams.SET] = set_str

    #TODO check that combined parameters are valid (dates are okay, sets are active and not test) combined with token data

    #TODO rest of function (token, meta_type, start_date, end_date and rq_set are not used yet)

    return "<a>b</a>", 200, {}

def _parse_set(set_str:str)-> Union[Group, Archive, Category]:
"""turns OAI style string into taxonomy item
validates item
"""
set_parts=set_str.split(":")
match len(set_parts):
case 1: #asking for a group
rq_set = GROUPS.get(f'grp_{set_str}')
if not rq_set:
raise OAIBadArgument("Set does not exist")
case 2: #archive (including archive as category)
grp_str, archive_str = set_parts
rq_set = ARCHIVES_ACTIVE.get(archive_str)
if not rq_set or f'grp_{grp_str}' != rq_set.in_group:
raise OAIBadArgument("Set does not exist")
case 3: #full category
grp_str, archive_str, category_suffix = set_parts
cat_str = f"{archive_str}.{category_suffix}"
if cat_str not in CATEGORIES_ACTIVE:
raise OAIBadArgument("Set does not exist")
rq_set= CATEGORIES_ACTIVE[cat_str]
archive= rq_set.get_archive()
if archive_str!= archive.id or f'grp_{grp_str}' != archive.in_group:
raise OAIBadArgument("Set does not exist")
case _:
raise OAIBadArgument("Set has too many levels")

return rq_set



Loading
Loading