
Commit

Merge branch 'main' into dependabot/pip/invenio-accounts-5.1.7
GraemeWatt authored Dec 4, 2024
2 parents 122954e + ca52b85 commit cc0dce6
Showing 15 changed files with 523 additions and 37 deletions.
4 changes: 2 additions & 2 deletions hepdata/cli.py
@@ -223,9 +223,9 @@ def do_unload(records_to_unload):

@utils.command()
@with_appcontext
@click.option('--endpoint', '-e', type=str, help='Specific endpoint to update (e.g. "rivet" or "MadAnalysis" or "SModelS"). Omit for all.')
@click.option('--endpoint', '-e', type=str, help='Specific endpoint to update (e.g. "rivet" or "MadAnalysis" or "SModelS" or "Combine"). Omit for all.')
def find_and_add_record_analyses(endpoint):
"""Finds analyses such as Rivet, MadAnalysis 5 and SModelS and adds them to records."""
"""Finds analyses such as Rivet, MadAnalysis 5, SModelS and Combine and adds them to records."""
update_analyses(endpoint)


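The command above is a thin wrapper around update_analyses(). A minimal sketch (not part of this commit) of triggering the same refresh directly from Python, assuming the usual hepdata.factory.create_app application factory:

```python
# Hedged sketch: run the Combine analysis import inside an application
# context; the CLI command simply forwards the --endpoint value to this call.
from hepdata.factory import create_app
from hepdata.modules.records.utils.analyses import update_analyses

app = create_app()
with app.app_context():
    update_analyses('Combine')  # or None to refresh every configured endpoint
```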
12 changes: 11 additions & 1 deletion hepdata/config.py
@@ -193,6 +193,7 @@ def _(x):
CFG_DATA_TYPE = 'datatable'
CFG_SUBMISSIONS_TYPE = 'submission'
CFG_DATA_KEYWORDS = ['observables', 'reactions', 'cmenergies', 'phrases']
CFG_SEARCH_RANGE_TERMS = ["recid", "publication_recid", "inspire_id"] # Terms that can be used in OpenSearch API range searches

CFG_CONVERTER_URL = 'https://converter.hepdata.net'
CFG_SUPPORTED_FORMATS = ['yaml', 'root', 'csv', 'yoda', 'yoda1', 'original']
@@ -331,7 +332,16 @@ def _(x):
'endpoint_url': 'https://zenodo.org/records/13952092/files/smodels-analyses.hepdata.json?download=1',
'url_template': '{0}',
'subscribe_user_id': 7766
}
},
'Combine': {
'endpoint_url': 'https://cms-public-likelihoods-list.web.cern.ch/artifacts/output.json',
'url_template': 'https://doi.org/{0}',
'description': 'Statistical models',
'license': {
'name': 'cc-by-4.0',
'url': 'https://creativecommons.org/licenses/by/4.0'
},
},
#'ufo': {},
#'xfitter': {},
#'applgrid': {},
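Sketch of how an ANALYSES_ENDPOINTS entry such as the new 'Combine' block can be consumed: fetch the endpoint_url listing and build a resource URL per analysis via url_template. The JSON payload shape used below (INSPIRE ID mapped to a list of DOIs) is an assumption for illustration only; update_analyses() does the real work.

```python
# Illustration only; the payload structure is assumed, not taken from the
# actual CMS public likelihoods listing.
import requests

endpoint = {
    'endpoint_url': 'https://cms-public-likelihoods-list.web.cern.ch/artifacts/output.json',
    'url_template': 'https://doi.org/{0}',
}

payload = requests.get(endpoint['endpoint_url'], timeout=30).json()
for inspire_id, dois in payload.items():  # assumed shape: {inspire_id: [doi, ...]}
    for doi in dois:
        print(inspire_id, endpoint['url_template'].format(doi))
```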
59 changes: 40 additions & 19 deletions hepdata/ext/opensearch/api.py
@@ -96,8 +96,9 @@ def search(query,
('collaboration', collaboration_name), ('date', date)
:param size: [int] max number of hits that should be returned
:param offset: [int] offset for the results (used for pagination)
:param sort_by: [string] sorting field. Currently supported fields:
"title", "collaboration", "date", "relevance"
:param sort_field: [string] sorting field. Currently supported fields:
"title", "collaboration", "date", "relevance",
"recid", "inspire_id"
:param sort_order: [string] order of the sorting either original
(for a particular field) or reversed. Supported:
'' or 'rev'
@@ -108,23 +109,41 @@
if query == '' and not sort_field:
sort_field = 'date'

query = HEPDataQueryParser.parse_query(query)
# Create search with preference param to ensure consistency of results across shards
search = RecordsSearch(using=os, index=index).with_preference_param()

# Determine whether the query is range-based and whether data tables should be excluded
range_terms, exclude_tables, parsed_query = HEPDataQueryParser.parse_range_query(query)

# Pass the range-parsed query on for the standard keyword parsing
query = HEPDataQueryParser.parse_query(parsed_query)
fuzzy_query = QueryString(query=query, fuzziness='AUTO')

if query:
fuzzy_query = QueryString(query=query, fuzziness='AUTO')
if exclude_tables:
search.query = fuzzy_query

if query and not exclude_tables:
search.query = fuzzy_query | \
Q('has_child', type="child_datatable", query=fuzzy_query)

# Add filter to search for only "publication" objects
search = search.filter("term", doc_type=CFG_PUB_TYPE)
search = QueryBuilder.add_filters(search, filters)


if range_terms and not sort_field and not sort_order:
# Default to sorting on recid in descending order for range-based queries
sort_field = 'recid'
sort_order = 'desc'

try:
mapped_sort_field = sort_fields_mapping(sort_field)
except ValueError as ve:
return {'error': str(ve)}

search = search.sort({mapped_sort_field : {"order" : calculate_sort_order(sort_order, sort_field)}})

search = add_default_aggregations(search, filters)

if post_filter:
@@ -135,23 +154,25 @@

try:
pub_result = search.execute().to_dict()

parent_filter = {
"terms": {
"_id": [hit["_id"] for hit in pub_result['hits']['hits']]
data_result = {}
if not exclude_tables:
parent_filter = {
"terms": {
"_id": [hit["_id"] for hit in pub_result['hits']['hits']]
}
}
}

data_search = RecordsSearch(using=os, index=index)
data_search = data_search.query('has_parent',
parent_type="parent_publication",
query=parent_filter)
if query:
data_search = data_search.query(QueryString(query=query))
data_search = RecordsSearch(using=os, index=index)
data_search = data_search.query('has_parent',
parent_type="parent_publication",
query=parent_filter)

if query:
data_search = data_search.query(QueryString(query=query))

data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
data_search = data_search[0:data_search_size]
data_result = data_search.execute().to_dict()
data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
data_search = data_search[0:data_search_size]
data_result = data_search.execute().to_dict()

merged_results = merge_results(pub_result, data_result)
return map_result(merged_results, filters)
@@ -165,7 +186,7 @@ def search(query,
else:
log.error(f'An unexpected error occurred when searching: {e}')
reason = f'An unexpected error occurred: {e.error}'
return { 'error': reason }
return {'error': reason}


@author_index
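A hedged example of calling the updated search() with a range-based, publication-only query: as the code above shows, range queries default to sorting on recid in descending order, and publication_recid/inspire_id ranges skip the child data-table search. Assumes a configured application and a reachable OpenSearch index.

```python
from hepdata.ext.opensearch.api import search

# publication_recid is a range term, so tables are excluded and results
# default to recid descending.
results = search('publication_recid:[1 TO 10]')
print(results)
```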
6 changes: 6 additions & 0 deletions hepdata/ext/opensearch/config/os_config.py
@@ -113,6 +113,12 @@ def sort_fields_mapping(sort_by):
return 'creation_date'
elif sort_by == 'latest':
return 'last_updated'
elif sort_by == 'recid':
return 'recid' # No change required
elif sort_by == 'publication_recid':
return 'publication_recid' # No change required
elif sort_by == 'inspire_id':
return 'inspire_id' # No change required
elif not sort_by or sort_by == 'relevance':
return '_score'
else:
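A quick check of the extended mapping, assuming the function above is importable on its own; unsupported fields still raise ValueError, which search() converts into an error response.

```python
from hepdata.ext.opensearch.config.os_config import sort_fields_mapping

assert sort_fields_mapping('inspire_id') == 'inspire_id'
assert sort_fields_mapping('latest') == 'last_updated'
assert sort_fields_mapping(None) == '_score'  # no sort field falls back to relevance
```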
2 changes: 1 addition & 1 deletion hepdata/ext/opensearch/config/record_mapping.py
@@ -171,7 +171,7 @@
}
},
"inspire_id": {
"type": "text"
"type": "integer"
},
"keywords": {
"properties": {
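For context on the mapping change: numeric range filters like the one sketched below (opensearch-dsl syntax) only compare numerically when inspire_id is indexed as an integer rather than analysed text.

```python
# Sketch of the kind of range filter the integer mapping enables.
from opensearch_dsl import Q

range_filter = Q('range', inspire_id={'gte': 1, 'lte': 10000})
print(range_filter.to_dict())  # {'range': {'inspire_id': {'gte': 1, 'lte': 10000}}}
```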
2 changes: 1 addition & 1 deletion hepdata/ext/opensearch/document_enhancers.py
@@ -94,7 +94,7 @@ def add_shortened_authors(doc):

def add_analyses(doc):
"""
Add analyses links such as Rivet, MadAnalysis 5, SModelS, HistFactory and NUISANCE to the index.
Add analyses links such as Rivet, MadAnalysis 5, SModelS, Combine, HistFactory and NUISANCE to the index.
:param doc:
:return:
22 changes: 19 additions & 3 deletions hepdata/ext/opensearch/process_results.py
@@ -27,10 +27,26 @@
from hepdata.utils.miscellaneous import splitter


def merge_results(pub_result, data_result):
def merge_results(pub_result, data_result=None):
"""
Merge the results dictionaries of the publication and data table searches.
The data result does not exist for publication-only searches, so it defaults to None.
:param pub_result: Publication search data.
:param data_result: Data table search data.
:return: Merged search results dictionary.
"""
merge_dict = dict()
merge_dict['hits'] = pub_result['hits']['hits'] + \
data_result['hits']['hits']

# We don't need to merge if there is no data.
if data_result:
merge_dict['hits'] = pub_result['hits']['hits'] + \
data_result['hits']['hits']
else:
merge_dict['hits'] = pub_result['hits']['hits']

merge_dict['total'] = pub_result['hits']['total']['value']
merge_dict['aggregations'] = pub_result.get('aggregations', {})
return merge_dict
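A minimal shape check of the new publication-only branch, using hand-written dictionaries that mimic the OpenSearch response structure handled above:

```python
from hepdata.ext.opensearch.process_results import merge_results

pub_result = {
    'hits': {'hits': [{'_id': '1'}], 'total': {'value': 1}},
    'aggregations': {},
}
merged = merge_results(pub_result)  # no data_result: publication-only search
assert merged['hits'] == [{'_id': '1'}]
assert merged['total'] == 1
```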
40 changes: 39 additions & 1 deletion hepdata/ext/opensearch/query_builder.py
@@ -23,6 +23,8 @@
import re
from opensearch_dsl import Q

from hepdata.config import CFG_SEARCH_RANGE_TERMS


class QueryBuilder:

@@ -52,7 +54,8 @@ def parse_query(query_string):
"phrases": "data_keywords.phrases",
"reactions": "data_keywords.reactions",
"analysis": "analyses.type",
"resources": "resources.description" # Add shorthand for resource description
"resources": "resources.description", # Add shorthand for resource description
"publication_recid": "recid" # Shorthand for HEPData record ID
}
}

@@ -81,3 +84,38 @@ def _quote_phrase(phrase):
if '"' not in phrase and pattern.fullmatch(phrase):
return f'"{phrase}"'
return phrase

@staticmethod
def parse_range_query(query):
"""
Checks whether a query string contains a range-based query on one of the
supported terms (CFG_SEARCH_RANGE_TERMS).
Also determines whether the query is a publication-only search, in which case
data tables are excluded from the results.
The returned query has publication_recid replaced by 'recid' for OpenSearch.
Example: publication_recid:[321 TO 321] inspire_id:[123 TO 123]
:param query: The full query string
:return: A tuple of the list of matched range terms, a boolean set when tables
    should be excluded (the range term is publication_recid or inspire_id but
    not recid), and the query with the term replaced.
"""
# Pattern for a range clause such as term:[1 TO 10]; %s is the placeholder for the term
pattern = rf"(?:^|\s)%s:\s*\[\d+\s+TO\s+\d+]"
range_terms = []
exclude_tables = False
# For all terms that can be range searched
for term in CFG_SEARCH_RANGE_TERMS:
result = re.findall(pattern % term, query)
if result:
range_terms.append(term)

# If we are doing a range search on non-table objects
if ("publication_recid" in range_terms or "inspire_id" in range_terms) and "recid" not in range_terms:
exclude_tables = True

# Finally, we replace publication_recid with the correct mapping for OpenSearch
query = query.replace("publication_recid", "recid")

return range_terms, exclude_tables, query
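A hedged illustration of the new parser on the docstring's first example, assuming the method is exposed on HEPDataQueryParser as it is called from api.py:

```python
from hepdata.ext.opensearch.query_builder import HEPDataQueryParser

terms, exclude_tables, parsed = HEPDataQueryParser.parse_range_query(
    'publication_recid:[321 TO 321]')
# terms == ['publication_recid'], exclude_tables is True (publication-only
# search) and parsed == 'recid:[321 TO 321]', ready for the OpenSearch mapping.
```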
1 change: 1 addition & 0 deletions hepdata/modules/records/assets/js/hepdata_common.js
@@ -46,6 +46,7 @@ HEPDATA.file_type_to_details = {
"rivet": {"icon": "area-chart", "description": "Rivet Analysis"},
"madanalysis": {"icon": "area-chart", "description": "MadAnalysis 5 Analysis"},
"smodels": {"icon": "area-chart", "description": "SModelS Analysis"},
"combine": {"icon": "area-chart", "description": "Combine Analysis"},
"xfitter": {"icon": "area-chart", "description": "xFitter Analysis"},
"applgrid": {"icon": "area-chart", "description": "APPLgrid Analysis"},
"ufo": {"icon": "rocket", "description": "Universal Feynrules Output (UFO)"},
@@ -41,6 +41,7 @@ <h4>Add Resource for <span id="selected_resource_item">Submission</span></h4>
<option value="applgrid">APPLgrid</option>
<option value="MadAnalysis">MadAnalysis 5</option>
<option value="SModelS">SModelS</option>
<option value="Combine">Combine</option>
<option value="rivet">Rivet</option>
<option value="fastnlo">fastNLO</option>
<option value="ufo">Universal Feynrules Output (UFO)</option>
13 changes: 11 additions & 2 deletions hepdata/modules/records/utils/analyses.py
@@ -35,6 +35,7 @@
from hepdata.utils.users import get_user_from_id
from hepdata.modules.records.subscribers.rest import subscribe
from hepdata.modules.records.subscribers.api import is_current_user_subscribed_to_record
from hepdata.modules.records.utils.common import get_license

logging.basicConfig()
log = logging.getLogger(__name__)
@@ -43,10 +44,11 @@
@shared_task
def update_analyses(endpoint=None):
"""
Update (Rivet, MadAnalysis 5 and SModelS) analyses and remove outdated resources.
Update (Rivet, MadAnalysis 5, SModelS and Combine) analyses and remove outdated resources.
Allow bulk subscription to record update notifications if "subscribe_user_id" in endpoint.
Add optional "description" and "license" fields if present in endpoint.
:param endpoint: either "rivet" or "MadAnalysis" or "SModelS" or None (default) for both
:param endpoint: either "rivet" or "MadAnalysis" or "SModelS" or "Combine" or None (default) for all
"""
endpoints = current_app.config["ANALYSES_ENDPOINTS"]
for analysis_endpoint in endpoints:
@@ -86,6 +88,13 @@ def update_analyses(endpoint=None):
file_location=_resource_url,
file_type=analysis_endpoint)

if "description" in endpoints[analysis_endpoint]:
new_resource.file_description = str(endpoints[analysis_endpoint]["description"])

if "license" in endpoints[analysis_endpoint]:
resource_license = get_license(endpoints[analysis_endpoint]["license"])
new_resource.file_license = resource_license.id

submission.resources.append(new_resource)
num_new_resources += 1

@@ -243,6 +243,13 @@ <h4>Other useful searches</h4>
(SModelS analysis)
</span>
</li>
<li>
<a href='/search?q=analysis:Combine&sort_by=latest'
target="_new">analysis:Combine</a>
<span class="text-muted">
(CMS statistical models in Combine format)
</span>
</li>
<li>
<a href='/search?q=analysis:HistFactory&sort_by=latest'
target="_new">analysis:HistFactory</a>
@@ -279,6 +286,32 @@ <h4>Searching via Inspire</h4>
</ul>
</div>

<div class="well well-small">
<h4>Range-based Searching</h4>
<p>
We support searching for a range of records using their HEPData record ID or Inspire ID.
</p>
<ul>
<li>Range searching by HEPData record ID:
<ul>
<li>
<a href='/search?q=publication_recid:[1 TO 10]'
target="_new">publication_recid:[1 TO 10]</a>
</li>
</ul>
</li>
<br/>
<li>Range searching by Inspire ID:
<ul>
<li>
<a href='/search?q=inspire_id:[1 TO 10000]'
target="_new">inspire_id:[1 TO 10000]</a>
</li>
</ul>
</li>
</ul>
</div>

</div>
</div>
</div>
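The documented range searches can also be exercised over HTTP; a sketch assuming the public hepdata.net search endpoint and its format=json output (the response key used below is an assumption):

```python
import requests

resp = requests.get('https://www.hepdata.net/search/',
                    params={'q': 'inspire_id:[1 TO 10000]', 'format': 'json'})
resp.raise_for_status()
print(resp.json().get('total'))  # assumed key for the total hit count
```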
2 changes: 1 addition & 1 deletion hepdata/version.py
@@ -28,4 +28,4 @@
and parsed by ``setup.py``.
"""

__version__ = "0.9.4dev20241112"
__version__ = "0.9.4dev20241204"
19 changes: 18 additions & 1 deletion tests/records_test.py
@@ -1031,7 +1031,7 @@ def test_create_breadcrumb_text():


def test_update_analyses(app):
""" Test update of Rivet, MadAnalyses 5 and SModelS analyses """
""" Test update of Rivet, MadAnalyses 5, SModelS and Combine analyses """

# Import a record that already has a Rivet analysis attached (but with '#' in the URL)
import_records(['ins1203852'], synchronous=True)
@@ -1074,6 +1074,23 @@
submission = get_latest_hepsubmission(inspire_id='1847779', overall_status='finished')
assert is_current_user_subscribed_to_record(submission.publication_recid, user)

# Import a record that has an associated Combine analysis
import_records(['ins2796231'], synchronous=True)
analysis_resources = DataResource.query.filter_by(file_type='Combine').all()
assert len(analysis_resources) == 0
analysis_resources = DataResource.query.filter_by(file_location='https://doi.org/10.17181/bp9fx-6qs64').all()
assert len(analysis_resources) == 1
db.session.delete(analysis_resources[0]) # delete resource so it can be re-added in next step
db.session.commit()
update_analyses('Combine')
analysis_resources = DataResource.query.filter_by(file_type='Combine').all()
assert len(analysis_resources) == 1
assert analysis_resources[0].file_location == 'https://doi.org/10.17181/bp9fx-6qs64'
assert analysis_resources[0].file_description == 'Statistical models'
license_data = License.query.filter_by(id=analysis_resources[0].file_license).first()
assert license_data.name == 'cc-by-4.0'
assert license_data.url == 'https://creativecommons.org/licenses/by/4.0'


def test_generate_license_data_by_id(app):
"""