Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pragerdom/be-366: OpenAIRE authority provider #199

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
e86dbce
feat: schema file for awards
pragerdom Nov 22, 2024
d711eb0
feat: OpenAIRE Provider constructor
pragerdom Nov 26, 2024
b849af8
Merge branch 'main' into pragerdom/be-366-implement-openaire-authorit…
pragerdom Nov 26, 2024
6f7f149
feat: introduce OpenAIRE to tests and model
pragerdom Nov 26, 2024
d23d456
feat: OpenAIRE Authority Provider implementation
pragerdom Dec 1, 2024
b430aee
fix: incorrect (test) docker-compose.yml file
pragerdom Dec 1, 2024
f911415
refactor: remove unused import, return vocab item faster
pragerdom Dec 1, 2024
000fa87
refactor: logger, token caching, config from current_app
pragerdom Dec 2, 2024
f037630
refactor: access app for keys in tests from context
pragerdom Dec 2, 2024
8f6041b
refactor: remove some try-except blocks
pragerdom Dec 3, 2024
70e268f
refactor: make relations (organizations) get more readable
pragerdom Dec 3, 2024
4b9c2e8
fix: prevent None value access
pragerdom Dec 3, 2024
e18e381
Merge branch 'main' into pragerdom/be-366-implement-openaire-authorit…
pragerdom Dec 3, 2024
4113437
refactor: unused import, change program finding method to recursive call
pragerdom Dec 3, 2024
5deb86c
temporary dependancy fix
Dec 4, 2024
b96ddda
refactor: more consistent relations fetch, unite tests with ORCID
pragerdom Dec 4, 2024
95acb2e
format: reformat OpenAIRE provider
pragerdom Dec 4, 2024
3545ddc
refactor: more readable NoneType checking
pragerdom Dec 4, 2024
e1f65e8
Merge branch 'main' into pragerdom/be-366-implement-openaire-authorit…
pragerdom Dec 5, 2024
ae6e22d
fix: ROR provider code update from main
pragerdom Dec 5, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ env:
OAREPO_VERSION: ${{ inputs.oarepo }}
INVENIO_ORCID_CLIENT_ID: ${{secrets.INVENIO_ORCID_CLIENT_ID}}
INVENIO_ORCID_CLIENT_SECRET: ${{secrets.INVENIO_ORCID_CLIENT_SECRET}}
INVENIO_OPENAIRE_CLIENT_ID: ${{secrets.INVENIO_OPENAIRE_CLIENT_ID}}
INVENIO_OPENAIRE_CLIENT_SECRET: ${{secrets.INVENIO_OPENAIRE_CLIENT_SECRET}}

jobs:
build:
Expand Down
38 changes: 9 additions & 29 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
version: '2.2'
name: test_services
services:
search:
image: bitnami/opensearch:2
restart: 'unless-stopped'
image: opensearchproject/opensearch:latest
restart: "unless-stopped"
environment:
# settings only for development. DO NOT use in production!
- bootstrap.memory_lock=true
- 'OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m'
- 'DISABLE_INSTALL_DEMO_CONFIG=true'
- 'DISABLE_SECURITY_PLUGIN=true'
- 'discovery.type=single-node'
- 'OPENSEARCH_PLUGINS=analysis-icu'
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
- "DISABLE_INSTALL_DEMO_CONFIG=true"
- "DISABLE_SECURITY_PLUGIN=true"
- "discovery.type=single-node"
ulimits:
memlock:
soft: -1
Expand All @@ -20,30 +18,12 @@ services:
soft: 65536
hard: 65536
mem_limit: 2g
expose:
- 9200
- 9600
ports:
- '127.0.0.1:9200:9200'
- "127.0.0.1:9200:9200"
- "127.0.0.1:9600:9600"
cache:
image: redis:7
restart: "unless-stopped"
read_only: true
ports:
- '127.0.0.1:6379:6379'
s3:
image: minio/minio:latest
restart: "unless-stopped"
environment:
MINIO_ROOT_USER: "tests"
MINIO_ROOT_PASSWORD: "teststests"
command: server /data --console-address :9001
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
ports:
- "127.0.0.1:19000:9000"
- "127.0.0.1:19001:9001"

- "127.0.0.1:6379:6379"
4 changes: 2 additions & 2 deletions oarepo_vocabularies/authorities/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .providers import AuthorityProvider, RORProviderV2, ORCIDProvider
from .providers import AuthorityProvider, RORProviderV2, ORCIDProvider, OpenAIREProvider

__all__ = ("AuthorityProvider", "RORProviderV2", "ORCIDProvider")
__all__ = ("AuthorityProvider", "RORProviderV2", "ORCIDProvider", "OpenAIREProvider")
4 changes: 3 additions & 1 deletion oarepo_vocabularies/authorities/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
from .base import AuthorityProvider
from .ror_provider import RORProviderV2
from .orcid_provider import ORCIDProvider
from .openaire_provider import OpenAIREProvider

__all__ = (
"AuthorityProvider",
"RORProviderV2",
"ORCIDProvider"
"ORCIDProvider",
"OpenAIREProvider"
)
254 changes: 254 additions & 0 deletions oarepo_vocabularies/authorities/providers/openaire_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
import base64
import logging
import os
from flask import current_app
import requests

from oarepo_vocabularies.authorities.providers.base import AuthorityProvider


# Module-level logger shared by the OpenAIRE client and provider defined below;
# it uses an explicit logger name rather than ``__name__``.
logger = logging.getLogger("oarepo-vocabularies.providers.openaire")

class OpenAIREClient(object):
    """Thin HTTP client for the OpenAIRE token and project-search endpoints.

    The client only performs the HTTP calls; translating the returned JSON
    into vocabulary items is left to the caller.
    """

    TOKEN_URL = "https://aai.openaire.eu/oidc/token"
    PROJECTS_URL = "https://api.openaire.eu/search/projects"

    # Seconds. ``requests`` timeouts are expressed in seconds, so the former
    # default of 10000 was effectively "no timeout" (and was never applied).
    DEFAULT_TIMEOUT = 30

    def __init__(self, client_id, client_secret, url=None, testing=False, timeout=None, **kwargs):
        """Store the credentials and request settings.

        :param client_id: OpenAIRE client id used for the token request.
        :param client_secret: OpenAIRE client secret used for the token request.
        :param url: accepted for signature compatibility; the endpoints are
            fixed class constants and this value is not used.
        :param testing: stored on the instance; not consulted by the client.
        :param timeout: per-request timeout in seconds; falls back to
            ``DEFAULT_TIMEOUT`` when falsy.
        :param kwargs: accepted and ignored.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.testing = testing
        self.timeout = timeout or self.DEFAULT_TIMEOUT

    def _get_token(self):
        """Request an access token with the client-credentials grant.

        :return: the ``access_token`` value from the token response, or
            ``None`` when the request fails (the failure is logged).
        """
        credentials = f"{self.client_id}:{self.client_secret}"
        encoded_credentials = base64.b64encode(credentials.encode("utf-8")).decode("utf-8")

        try:
            response = requests.post(
                self.TOKEN_URL,
                headers={"Authorization": f"Basic {encoded_credentials}"},
                data={"grant_type": "client_credentials"},
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json().get("access_token")
        except requests.exceptions.HTTPError as http_err:
            logger.error("HTTP error occurred: %s", http_err)
        except Exception as err:
            # Best effort by design: callers treat a missing token as
            # "no results" instead of failing the whole request.
            logger.error("Other error occurred: %s", err)
        return None

    def quick_search(self, access_token, search_query="", page=1, page_size=20):
        """Search projects by name.

        :param access_token: bearer token; when falsy no request is made.
        :param search_query: value sent as the ``name`` query parameter; when
            empty no request is made.
        :param page: page number sent as ``page``.
        :param page_size: page size sent as ``size``.
        :return: the decoded JSON response, or ``{}`` when the token or the
            query is missing.
        :raises requests.exceptions.HTTPError: on a 4xx/5xx response.
        """
        if not access_token or not search_query:
            return {}

        response = requests.get(
            self.PROJECTS_URL,
            headers={"Authorization": f"Bearer {access_token.strip()}"},
            params={
                "format": "json",
                "name": search_query,
                "page": page,
                "size": page_size,
            },
            timeout=self.timeout,
        )
        if response.status_code != 200:
            logger.error("Error response: %s", response.status_code)
            logger.error("Response content: %s", response.text)
            response.raise_for_status()
        return response.json()

    def get_record(self, item_id, access_token):
        """Fetch the search response for a single OpenAIRE project id.

        :param item_id: value sent as the ``openaireProjectID`` parameter.
        :param access_token: bearer token; when falsy no request is made.
        :return: the decoded JSON response, or ``None`` when no token is
            available.
        :raises requests.exceptions.HTTPError: on a 4xx/5xx response.
        """
        if not access_token:
            # _get_token() has already logged why the token is missing.
            return None

        response = requests.get(
            self.PROJECTS_URL,
            headers={"Authorization": f"Bearer {access_token.strip()}"},
            # Passed as params so that the id is URL-encoded by requests.
            params={"openaireProjectID": item_id, "format": "json"},
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()


class OpenAIREProvider(AuthorityProvider):
    """Authority provider that maps OpenAIRE project records to award items.

    All parsing helpers are None-safe: the OpenAIRE JSON is navigated with
    ``dict_get`` and single values are normalised to lists, so no
    ``try``/``except`` is needed around the lookups.
    """

    # Sentinel that tells "no default supplied" apart from an explicit default.
    _MISSING = object()

    # Access token obtained lazily by get_access_token().
    # NOTE(review): the token is cached without an expiry check; confirm that
    # provider instances do not outlive the token's validity.
    _cached_token = None

    def __init__(self, url=None, testing=False, **kwargs):
        """Create the client from ``OPENAIRE_CLIENT_ID``/``OPENAIRE_CLIENT_SECRET``.

        :raises KeyError: when either key is missing from the app config.
        """
        config = current_app.config
        self.openaire_client = OpenAIREClient(
            config["OPENAIRE_CLIENT_ID"],
            config["OPENAIRE_CLIENT_SECRET"],
            url,
            testing,
            **kwargs
        )

    def get_access_token(self):
        """Return the cached access token, requesting one on first use.

        A failed request leaves the cache empty, so the next call retries.
        """
        if self._cached_token is None:
            self._cached_token = self.openaire_client._get_token()
        return self._cached_token

    def search(self, identity, params, **kwargs):
        """Search OpenAIRE projects.

        :param identity: not used by this provider.
        :param params: mapping with optional ``q``, ``page`` and ``page_size``.
        :return: tuple ``(items, total)``; ``([], 0)`` when there is no response.
        """
        params = params or {}

        response = self.openaire_client.quick_search(
            access_token=self.get_access_token(),
            search_query=params.get("q", ""),
            page=params.get("page", 1),
            page_size=params.get("page_size", 20),
        )

        results = self.dict_get(response, "response", default=None)
        if not results:
            return [], 0

        # "results" may be null/absent and "result" may be a single object.
        hits = self._as_list(self.dict_get(results, "results", "result", default=None))
        items = [self.to_vocabulary_item(hit) for hit in hits]
        total = self._to_int(self.dict_get(results, "header", "total", "$", default=0))

        return items, total

    def get(self, identity, item_id, **kwargs):
        """Return the vocabulary item for one OpenAIRE project id.

        :raises KeyError: when the response contains no result for ``item_id``.
        """
        record = self.openaire_client.get_record(item_id, self.get_access_token())

        # The endpoint answers with the same envelope as a search, so the
        # project record has to be taken from results.result, not from the
        # envelope itself.
        hits = self._as_list(
            self.dict_get(record, "response", "results", "result", default=None)
        )
        if not hits:
            raise KeyError(f"OpenAIRE record {item_id} not found.")

        return self.to_vocabulary_item(hits[0])

    @staticmethod
    def dict_get(d, *args, default=_MISSING):
        """Walk ``args`` as a key path through nested dictionaries.

        :param d: the object to start from.
        :param args: keys to follow, outermost first.
        :param default: returned when the path cannot be followed; when not
            given, a new empty dict is returned.
        :return: the value at the end of the path (which may be ``None``).
        """
        for key in args:
            if not isinstance(d, dict) or key not in d:
                # A fresh dict per call: a shared default could be mutated
                # by one caller and leak into every later lookup.
                return {} if default is OpenAIREProvider._MISSING else default
            d = d[key]
        return d

    @staticmethod
    def _as_list(value):
        """Normalise ``None`` to ``[]`` and a single value to a one-item list."""
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @staticmethod
    def _to_int(value):
        """Convert ``value`` to ``int``, falling back to 0 when not possible."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    @staticmethod
    def _text(node, *path):
        """Return the value at ``path`` or ``""`` when it is missing or null."""
        value = OpenAIREProvider.dict_get(node, *path, default="")
        return "" if value is None else value

    @staticmethod
    def get_program_from_funding(funding_tree):
        """Return the first program ("class") found in the funding tree.

        For every dict-valued entry of the (first) funding tree node, the
        entries of its ``parent`` are checked first, then the entry itself.

        :param funding_tree: a funding tree dict, a list of them, or ``None``.
        :return: the ``class.$`` value, or ``"N/A"`` when none is found.
        """
        lookup = OpenAIREProvider.dict_get

        nodes = OpenAIREProvider._as_list(funding_tree)
        node = nodes[0] if nodes else None
        if not isinstance(node, dict):
            return "N/A"

        for level in node.values():
            if not isinstance(level, dict):
                continue

            parent = level.get("parent")
            candidates = list(parent.values()) if isinstance(parent, dict) else []
            # The level itself is checked last, after its parents.
            candidates.append(level)

            for candidate in candidates:
                program = lookup(candidate, "class", "$", default=None)
                if program is not None:
                    return program

        return "N/A"

    @staticmethod
    def to_vocabulary_item(record):
        """Convert one OpenAIRE result record to an award vocabulary item.

        :param record: a single entry of ``response.results.result``.
        :return: dict following ``local://awards/award-v1.0.0.json``; parts
            missing from the record become empty strings, lists or dicts.
        """
        lookup = OpenAIREProvider.dict_get
        text = OpenAIREProvider._text
        as_list = OpenAIREProvider._as_list

        header = lookup(record, "header")
        if not isinstance(header, dict):
            header = {}
        project = lookup(record, "metadata", "oaf:entity", "oaf:project")
        if not isinstance(project, dict):
            project = {}

        # Tags (keywords): comma separated, either a plain string or {"$": ...}.
        tags = []
        for entry in as_list(project.get("keywords")):
            raw = entry.get("$") if isinstance(entry, dict) else entry
            if isinstance(raw, str):
                tags.extend(part.strip() for part in raw.split(",") if part.strip())

        # Identifiers
        identifiers = [
            {
                "identifier": text(header, "dri:objIdentifier", "$"),
                "scheme": "dri:objIdentifier",
            },
            {
                "identifier": text(project, "originalId", "$"),
                "scheme": "openaire:originalId",
            },
        ]

        # Number (code), title (keyed by two-letter locale) and acronym
        number = text(project, "code", "$")
        locale = text(header, "locale", "$")
        if not isinstance(locale, str) or not locale:
            locale = "en"
        title = {locale[:2]: text(project, "title", "$")}
        acronym = text(project, "acronym", "$")

        # Funder and according program; the funding tree may be a dict or a
        # list of dicts, in which case the first entry is used (the same
        # choice get_program_from_funding makes).
        funding = project.get("fundingtree", [])
        funding_nodes = as_list(funding)
        funding_node = funding_nodes[0] if funding_nodes else {}
        funder_id = text(funding_node, "funder", "id", "$")
        funder_name = text(funding_node, "funder", "name", "$")
        funder = {"id": funder_id, "name": funder_name} if funder_id or funder_name else {}

        program = OpenAIREProvider.get_program_from_funding(funding)

        # Subjects
        subjects = [
            {"id": subject.get("@classid", ""), "subject": subject.get("$", "")}
            for subject in as_list(project.get("subject"))
            if isinstance(subject, dict)
        ]

        # Organizations: relations without a usable "to" target are skipped
        # instead of being emitted as empty entries.
        organizations = []
        for relation in as_list(lookup(project, "rels", "rel", default=None)):
            target = relation.get("to") if isinstance(relation, dict) else None
            if not isinstance(target, dict):
                continue
            organizations.append({
                "scheme": target.get("@scheme", ""),
                "id": target.get("$", ""),
                "organization": text(relation, "legalname", "$"),
            })

        return {
            "$schema": "local://awards/award-v1.0.0.json",
            "tags": tags,
            "identifiers": identifiers,
            "number": number,
            "title": title,
            "funder": funder,
            "acronym": acronym,
            "program": program,
            "subjects": subjects,
            "organizations": organizations,
        }
12 changes: 1 addition & 11 deletions oarepo_vocabularies/authorities/providers/orcid_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
from orcid import PublicAPI as PublicAPI




logger = logging.getLogger("oarepo-vocabularies.providers.orcid")

class ORCIDClient(PublicAPI):
Expand All @@ -35,15 +33,7 @@ def get_record(self, access_token, orcid_id):

class ORCIDProvider(AuthorityProvider):
def __init__(self, url=None, testing=False, **kwargs):
try:
client_id = current_app.config["ORCID_CLIENT_ID"]
client_secret = current_app.config["ORCID_CLIENT_SECRET"]
except RuntimeError:
client_id = os.environ["INVENIO_ORCID_CLIENT_ID"]
client_secret = os.environ["INVENIO_ORCID_CLIENT_SECRET"]
except KeyError:
raise KeyError("ORCID_CLIENT_ID and ORCID_CLIENT_SECRET must be set in the configuration or as environment variables.")
self.orcid_client = ORCIDClient(client_id, client_secret, testing, **kwargs)
self.orcid_client = ORCIDClient(current_app.config["ORCID_CLIENT_ID"], current_app.config["ORCID_CLIENT_SECRET"], testing, **kwargs)


def search(self, identity, params, **kwargs):
Expand Down
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,9 @@ def app_config(app_config):

app_config["ORCID_CLIENT_ID"] = os.environ["INVENIO_ORCID_CLIENT_ID"]
app_config["ORCID_CLIENT_SECRET"] = os.environ["INVENIO_ORCID_CLIENT_SECRET"]

app_config["OPENAIRE_CLIENT_ID"] = os.environ["INVENIO_OPENAIRE_CLIENT_ID"]
app_config["OPENAIRE_CLIENT_SECRET"] = os.environ["INVENIO_OPENAIRE_CLIENT_SECRET"]

return app_config

Expand Down
Loading
Loading