diff --git a/hepdata/cli.py b/hepdata/cli.py
index 36b4d54f..43849f50 100644
--- a/hepdata/cli.py
+++ b/hepdata/cli.py
@@ -223,9 +223,9 @@ def do_unload(records_to_unload):
 
 
 @utils.command()
 @with_appcontext
-@click.option('--endpoint', '-e', type=str, help='Specific endpoint to update (e.g. "rivet" or "MadAnalysis" or "SModelS"). Omit for all.')
+@click.option('--endpoint', '-e', type=str, help='Specific endpoint to update (e.g. "rivet" or "MadAnalysis" or "SModelS" or "Combine"). Omit for all.')
 def find_and_add_record_analyses(endpoint):
-    """Finds analyses such as Rivet, MadAnalysis 5 and SModelS and adds them to records."""
+    """Finds analyses such as Rivet, MadAnalysis 5, SModelS and Combine and adds them to records."""
     update_analyses(endpoint)
 
diff --git a/hepdata/config.py b/hepdata/config.py
index 713693dd..9925a619 100644
--- a/hepdata/config.py
+++ b/hepdata/config.py
@@ -193,6 +193,7 @@ def _(x):
 CFG_DATA_TYPE = 'datatable'
 CFG_SUBMISSIONS_TYPE = 'submission'
 CFG_DATA_KEYWORDS = ['observables', 'reactions', 'cmenergies', 'phrases']
+CFG_SEARCH_RANGE_TERMS = ["recid", "publication_recid", "inspire_id"]  # Possible terms used for OpenSearch API range searches
 
 CFG_CONVERTER_URL = 'https://converter.hepdata.net'
 CFG_SUPPORTED_FORMATS = ['yaml', 'root', 'csv', 'yoda', 'yoda1', 'original']
@@ -331,7 +332,16 @@ def _(x):
         'endpoint_url': 'https://zenodo.org/records/13952092/files/smodels-analyses.hepdata.json?download=1',
         'url_template': '{0}',
         'subscribe_user_id': 7766
-    }
+    },
+    'Combine': {
+        'endpoint_url': 'https://cms-public-likelihoods-list.web.cern.ch/artifacts/output.json',
+        'url_template': 'https://doi.org/{0}',
+        'description': 'Statistical models',
+        'license': {
+            'name': 'cc-by-4.0',
+            'url': 'https://creativecommons.org/licenses/by/4.0'
+        },
+    },
     #'ufo': {},
     #'xfitter': {},
     #'applgrid': {},
diff --git a/hepdata/ext/opensearch/api.py b/hepdata/ext/opensearch/api.py
index 9330ad90..9d1d9229 100644
--- a/hepdata/ext/opensearch/api.py
+++ b/hepdata/ext/opensearch/api.py
@@ -96,8 +96,9 @@ def search(query,
         ('collaboration', collaboration_name), ('date', date)
     :param size: [int] max number of hits that should be returned
     :param offset: [int] offset for the results (used for pagination)
-    :param sort_by: [string] sorting field. Currently supported fields:
-        "title", "collaboration", "date", "relevance"
+    :param sort_field: [string] sorting field. Currently supported fields:
+        "title", "collaboration", "date", "relevance",
+        "recid", "inspire_id"
     :param sort_order: [string] order of the sorting either original
         (for a particular field) or reversed. Supported: '' or 'rev'
@@ -108,23 +109,41 @@ def search(query,
     if query == '' and not sort_field:
         sort_field = 'date'
 
-    query = HEPDataQueryParser.parse_query(query)
     # Create search with preference param to ensure consistency of results across shards
     search = RecordsSearch(using=os, index=index).with_preference_param()
 
+    # Determine whether the query contains range-based terms, and get the rewritten query
+    range_terms, exclude_tables, parsed_query = HEPDataQueryParser.parse_range_query(query)
+
+    # Pass the range-parsed query on to the standard query parser
+    query = HEPDataQueryParser.parse_query(parsed_query)
+    fuzzy_query = QueryString(query=query, fuzziness='AUTO')
+
     if query:
-        fuzzy_query = QueryString(query=query, fuzziness='AUTO')
+        if exclude_tables:
+            search.query = fuzzy_query
+
+    if query and not exclude_tables:
         search.query = fuzzy_query | \
                        Q('has_child', type="child_datatable", query=fuzzy_query)
 
+    # Add filter to search for only "publication" objects
     search = search.filter("term", doc_type=CFG_PUB_TYPE)
     search = QueryBuilder.add_filters(search, filters)
 
+    if range_terms and not sort_field and not sort_order:
+        # Set default sort field and order (recid, descending) for range-based queries
+        sort_field = 'recid'
+        sort_order = 'desc'
+
     try:
         mapped_sort_field = sort_fields_mapping(sort_field)
     except ValueError as ve:
         return {'error': str(ve)}
+
     search = search.sort({mapped_sort_field : {"order" : calculate_sort_order(sort_order, sort_field)}})
+
     search = add_default_aggregations(search, filters)
 
     if post_filter:
@@ -135,23 +154,25 @@ def search(query,
 
     try:
         pub_result = search.execute().to_dict()
-
-        parent_filter = {
-            "terms": {
-                "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
+        data_result = {}
+        if not exclude_tables:
+            parent_filter = {
+                "terms": {
+                    "_id": [hit["_id"] for hit in pub_result['hits']['hits']]
+                }
             }
-        }
 
-        data_search = RecordsSearch(using=os, index=index)
-        data_search = data_search.query('has_parent',
-                                        parent_type="parent_publication",
-                                        query=parent_filter)
-        if query:
-            data_search = data_search.query(QueryString(query=query))
+            data_search = RecordsSearch(using=os, index=index)
+            data_search = data_search.query('has_parent',
+                                            parent_type="parent_publication",
+                                            query=parent_filter)
+
+            if query:
+                data_search = data_search.query(QueryString(query=query))
 
-        data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
-        data_search = data_search[0:data_search_size]
-        data_result = data_search.execute().to_dict()
+            data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
+            data_search = data_search[0:data_search_size]
+            data_result = data_search.execute().to_dict()
 
         merged_results = merge_results(pub_result, data_result)
         return map_result(merged_results, filters)
@@ -165,7 +186,7 @@ def search(query,
         else:
             log.error(f'An unexpected error occurred when searching: {e}')
             reason = f'An unexpected error occurred: {e.error}'
-            return { 'error': reason }
+            return {'error': reason}
 
 
 @author_index
diff --git a/hepdata/ext/opensearch/config/os_config.py b/hepdata/ext/opensearch/config/os_config.py
index 85596198..5117fe19 100644
--- a/hepdata/ext/opensearch/config/os_config.py
+++ b/hepdata/ext/opensearch/config/os_config.py
@@ -113,6 +113,12 @@ def sort_fields_mapping(sort_by):
         return 'creation_date'
     elif sort_by == 'latest':
         return 'last_updated'
+    elif sort_by == 'recid':
+        return 'recid'  # No change required
+    elif sort_by == 'publication_recid':
+        return 'publication_recid'  # No change required
+    elif sort_by == 'inspire_id':
+        return 'inspire_id'  # No change required
     elif not sort_by or sort_by == 'relevance':
         return '_score'
     else:
diff --git a/hepdata/ext/opensearch/config/record_mapping.py b/hepdata/ext/opensearch/config/record_mapping.py
index c7b9ddc6..cc55396c 100644
--- a/hepdata/ext/opensearch/config/record_mapping.py
+++ b/hepdata/ext/opensearch/config/record_mapping.py
@@ -171,7 +171,7 @@
         }
     },
     "inspire_id": {
-        "type": "text"
+        "type": "integer"
    },
    "keywords": {
        "properties": {
diff --git a/hepdata/ext/opensearch/document_enhancers.py b/hepdata/ext/opensearch/document_enhancers.py
index 6ca9173a..f5554b8b 100644
--- a/hepdata/ext/opensearch/document_enhancers.py
+++ b/hepdata/ext/opensearch/document_enhancers.py
@@ -94,7 +94,7 @@ def add_shortened_authors(doc):
 
 def add_analyses(doc):
     """
-    Add analyses links such as Rivet, MadAnalysis 5, SModelS, HistFactory and NUISANCE to the index.
+    Add analyses links such as Rivet, MadAnalysis 5, SModelS, Combine, HistFactory and NUISANCE to the index.
 
     :param doc:
     :return:
diff --git a/hepdata/ext/opensearch/process_results.py b/hepdata/ext/opensearch/process_results.py
index e5be6b5d..44e9b98f 100644
--- a/hepdata/ext/opensearch/process_results.py
+++ b/hepdata/ext/opensearch/process_results.py
@@ -27,10 +27,26 @@
 from hepdata.utils.miscellaneous import splitter
 
 
-def merge_results(pub_result, data_result):
+def merge_results(pub_result, data_result=None):
+    """
+    Merge the publication and data-table
+    search result dictionaries.
+    The data result does not exist for publication-only searches,
+    so it defaults to None.
+
+    :param pub_result: Publication search data.
+    :param data_result: Data table search data.
+    :return: Merged search results dictionary.
+    """
     merge_dict = dict()
-    merge_dict['hits'] = pub_result['hits']['hits'] + \
-        data_result['hits']['hits']
+
+    # We don't need to merge if there is no data.
+    if data_result:
+        merge_dict['hits'] = pub_result['hits']['hits'] + \
+            data_result['hits']['hits']
+    else:
+        merge_dict['hits'] = pub_result['hits']['hits']
+
     merge_dict['total'] = pub_result['hits']['total']['value']
     merge_dict['aggregations'] = pub_result.get('aggregations', {})
     return merge_dict
diff --git a/hepdata/ext/opensearch/query_builder.py b/hepdata/ext/opensearch/query_builder.py
index dd5f9d9f..bb928d6b 100644
--- a/hepdata/ext/opensearch/query_builder.py
+++ b/hepdata/ext/opensearch/query_builder.py
@@ -23,6 +23,8 @@
 import re
 
 from opensearch_dsl import Q
+from hepdata.config import CFG_SEARCH_RANGE_TERMS
+
 
 class QueryBuilder:
 
@@ -52,7 +54,8 @@ def parse_query(query_string):
                 "phrases": "data_keywords.phrases",
                 "reactions": "data_keywords.reactions",
                 "analysis": "analyses.type",
-                "resources": "resources.description"  # Add shorthand for resource description
+                "resources": "resources.description",  # Add shorthand for resource description
+                "publication_recid": "recid"  # Shorthand for HEPData record ID
             }
         }
 
@@ -81,3 +84,38 @@ def _quote_phrase(phrase):
         if '"' not in phrase and pattern.fullmatch(phrase):
             return f'"{phrase}"'
         return phrase
+
+    @staticmethod
+    def parse_range_query(query):
+        """
+        Determines whether the query string contains range-based search terms
+        (see CFG_SEARCH_RANGE_TERMS) and returns the terms that were found.
+        Also determines whether the query is a publication-only search, where data tables are excluded.
+        Returns the query with publication_recid replaced by 'recid' for OpenSearch.
+
+        Examples: publication_recid:[321 TO 321] inspire_id:[123 TO 123]
+
+        :param query: The full query string
+        :return: A tuple containing the list of parsed range terms,
+            a boolean determining whether table exclusion should occur
+            (True if a range term is publication_recid or inspire_id but not recid),
+            and the query with the term replaced.
+        """
+        # Pattern matching the docstring examples, with a placeholder for the term
+        pattern = rf"(?:^|\s)%s:\s*\[\d+\s+TO\s+\d+]"
+        range_terms = []
+        exclude_tables = False
+        # For all terms that can be range searched
+        for term in CFG_SEARCH_RANGE_TERMS:
+            result = re.findall(pattern % term, query)
+            if result:
+                range_terms.append(term)
+
+        # If we are doing a range search on non-table objects
+        if ("publication_recid" in range_terms or "inspire_id" in range_terms) and "recid" not in range_terms:
+            exclude_tables = True
+
+        # Finally, we replace publication_recid with the correct mapping for OpenSearch
+        query = query.replace("publication_recid", "recid")
+
+        return range_terms, exclude_tables, query
diff --git a/hepdata/modules/records/assets/js/hepdata_common.js b/hepdata/modules/records/assets/js/hepdata_common.js
index a55d6963..0aec95fb 100644
--- a/hepdata/modules/records/assets/js/hepdata_common.js
+++ b/hepdata/modules/records/assets/js/hepdata_common.js
@@ -46,6 +46,7 @@ HEPDATA.file_type_to_details = {
   "rivet": {"icon": "area-chart", "description": "Rivet Analysis"},
   "madanalysis": {"icon": "area-chart", "description": "MadAnalysis 5 Analysis"},
   "smodels": {"icon": "area-chart", "description": "SModelS Analysis"},
+  "combine": {"icon": "area-chart", "description": "Combine Analysis"},
   "xfitter": {"icon": "area-chart", "description": "xFitter Analysis"},
   "applgrid": {"icon": "area-chart", "description": "APPLgrid Analysis"},
   "ufo": {"icon": "rocket", "description": "Universal Feynrules Output (UFO)"},
diff --git a/hepdata/modules/records/templates/hepdata_records/components/resources-widget.html b/hepdata/modules/records/templates/hepdata_records/components/resources-widget.html
index 17c295b5..328a5be1 100644
--- a/hepdata/modules/records/templates/hepdata_records/components/resources-widget.html
+++ b/hepdata/modules/records/templates/hepdata_records/components/resources-widget.html
@@ -41,6 +41,7 @@

Add Resource for Submission

+
diff --git a/hepdata/modules/records/utils/analyses.py b/hepdata/modules/records/utils/analyses.py
index 35b4b730..5219e01a 100644
--- a/hepdata/modules/records/utils/analyses.py
+++ b/hepdata/modules/records/utils/analyses.py
@@ -35,6 +35,7 @@
 from hepdata.utils.users import get_user_from_id
 from hepdata.modules.records.subscribers.rest import subscribe
 from hepdata.modules.records.subscribers.api import is_current_user_subscribed_to_record
+from hepdata.modules.records.utils.common import get_license
 
 logging.basicConfig()
 log = logging.getLogger(__name__)
@@ -43,10 +44,11 @@
 @shared_task
 def update_analyses(endpoint=None):
     """
-    Update (Rivet, MadAnalysis 5 and SModelS) analyses and remove outdated resources.
+    Update (Rivet, MadAnalysis 5, SModelS and Combine) analyses and remove outdated resources.
     Allow bulk subscription to record update notifications if "subscribe_user_id" in endpoint.
+    Add optional "description" and "license" fields if present in endpoint.
 
-    :param endpoint: either "rivet" or "MadAnalysis" or "SModelS" or None (default) for all
+    :param endpoint: either "rivet" or "MadAnalysis" or "SModelS" or "Combine" or None (default) for all
     """
     endpoints = current_app.config["ANALYSES_ENDPOINTS"]
     for analysis_endpoint in endpoints:
@@ -86,6 +88,13 @@ def update_analyses(endpoint=None):
                             file_location=_resource_url,
                             file_type=analysis_endpoint)
 
+                        if "description" in endpoints[analysis_endpoint]:
+                            new_resource.file_description = str(endpoints[analysis_endpoint]["description"])
+
+                        if "license" in endpoints[analysis_endpoint]:
+                            resource_license = get_license(endpoints[analysis_endpoint]["license"])
+                            new_resource.file_license = resource_license.id
+
                         submission.resources.append(new_resource)
                         num_new_resources += 1
 
diff --git a/hepdata/modules/search/templates/hepdata_search/modals/search_help.html b/hepdata/modules/search/templates/hepdata_search/modals/search_help.html
index 357661b1..9feddd01 100644
--- a/hepdata/modules/search/templates/hepdata_search/modals/search_help.html
+++ b/hepdata/modules/search/templates/hepdata_search/modals/search_help.html
@@ -243,6 +243,13 @@

     Other useful searches
 
       • analysis:SModelS (SModelS analysis)
+      • analysis:Combine (CMS statistical models in Combine format)
       • analysis:HistFactory
@@ -279,6 +286,32 @@

     Searching via Inspire
 
+
+    Range-based Searching
+
+    We support searching for a range of records using their HEPData record ID or Inspire ID.
+
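As a reference for the help text above, this is a minimal standalone sketch (not part of the diff) of the range-term matching that parse_range_query() in query_builder.py performs, using the same term list and pattern shown in this diff; the helper name find_range_terms is illustrative only.

import re

# Same term list and pattern as CFG_SEARCH_RANGE_TERMS / parse_range_query() in this diff.
SEARCH_RANGE_TERMS = ["recid", "publication_recid", "inspire_id"]
PATTERN = r"(?:^|\s)%s:\s*\[\d+\s+TO\s+\d+]"

def find_range_terms(query):
    # Collect every configured term that appears as a numeric range, e.g. recid:[1 TO 10].
    return [term for term in SEARCH_RANGE_TERMS if re.findall(PATTERN % term, query)]

print(find_range_terms("publication_recid:[321 TO 321] AND year:2024"))    # ['publication_recid']
print(find_range_terms("recid:[0 TO 10000] AND inspire_id: [123 TO 456]")) # ['recid', 'inspire_id']
print(find_range_terms("inspire_id:[ 0 TO 10000]"))                        # [] - whitespace after '[' is not matched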
diff --git a/hepdata/version.py b/hepdata/version.py
index eeef8cfc..6be16374 100644
--- a/hepdata/version.py
+++ b/hepdata/version.py
@@ -28,4 +28,4 @@
 and parsed by ``setup.py``.
 """
 
-__version__ = "0.9.4dev20241112"
+__version__ = "0.9.4dev20241204"
diff --git a/tests/records_test.py b/tests/records_test.py
index 988f7692..2f6b4a14 100644
--- a/tests/records_test.py
+++ b/tests/records_test.py
@@ -1031,7 +1031,7 @@ def test_create_breadcrumb_text():
 
 
 def test_update_analyses(app):
-    """ Test update of Rivet, MadAnalyses 5 and SModelS analyses """
+    """ Test update of Rivet, MadAnalysis 5, SModelS and Combine analyses """
 
     # Import a record that already has a Rivet analysis attached (but with '#' in the URL)
     import_records(['ins1203852'], synchronous=True)
@@ -1074,6 +1074,23 @@ def test_update_analyses(app):
     submission = get_latest_hepsubmission(inspire_id='1847779', overall_status='finished')
     assert is_current_user_subscribed_to_record(submission.publication_recid, user)
 
+    # Import a record that has an associated Combine analysis
+    import_records(['ins2796231'], synchronous=True)
+    analysis_resources = DataResource.query.filter_by(file_type='Combine').all()
+    assert len(analysis_resources) == 0
+    analysis_resources = DataResource.query.filter_by(file_location='https://doi.org/10.17181/bp9fx-6qs64').all()
+    assert len(analysis_resources) == 1
+    db.session.delete(analysis_resources[0])  # delete resource so it can be re-added in next step
+    db.session.commit()
+    update_analyses('Combine')
+    analysis_resources = DataResource.query.filter_by(file_type='Combine').all()
+    assert len(analysis_resources) == 1
+    assert analysis_resources[0].file_location == 'https://doi.org/10.17181/bp9fx-6qs64'
+    assert analysis_resources[0].file_description == 'Statistical models'
+    license_data = License.query.filter_by(id=analysis_resources[0].file_license).first()
+    assert license_data.name == 'cc-by-4.0'
+    assert license_data.url == 'https://creativecommons.org/licenses/by/4.0'
+
 
 def test_generate_license_data_by_id(app):
     """
diff --git a/tests/search_test.py b/tests/search_test.py
index 8b997562..72131fa2 100644
--- a/tests/search_test.py
+++ b/tests/search_test.py
@@ -15,9 +15,11 @@
 # You should have received a copy of the GNU General Public License
 # along with HEPData; if not, write to the Free Software Foundation, Inc.,
 # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+import unittest
+
 from opensearchpy.exceptions import NotFoundError
 from opensearch_dsl import Search, Index
-import datetime
+from datetime import datetime
 import pytest
 from invenio_db import db
 from unittest.mock import call
@@ -173,6 +175,92 @@ def test_query_parser():
 
     assert (parsed_query_string6 == 'analyses.type:rivet')
 
+    _test_query7 = 'publication_recid:1'
+    parsed_query_string7 = HEPDataQueryParser.parse_query(_test_query7)
+    assert (parsed_query_string7 == 'recid:1')
+
+
+def test_parse_range_query():
+    """
+    Tests the range query verification function to ensure that parsed queries
+    correctly return the range term list and the data search table exclusion status.
+    """
+    test_data = [
+        {  # Expected to return recid.
+            "expected_result": ["recid"],
+            "exclude_tables": False,  # recid should include tables in search
+            "query_strings": [
+                "recid:[0 TO 10000]",  # Correct
+                "recid: [0 TO 10000]",  # Extra valid whitespace
+                " recid:[0 TO 10000] ",  # Left and right whitespace
+                "recid:[0 TO 10000] AND year:2024"
+            ]
+        },
+        {  # Expected to return publication_recid.
+ "expected_result": ["publication_recid"], + "exclude_tables": True, # publication_recid should exclude tables + "query_strings": [ + "publication_recid:[0 TO 10000]", + "publication_recid: [0 TO 10000]", # Extra valid whitespace + " publication_recid:[0 TO 10000] ", # Left and right whitespace + "publication_recid:[0 TO 10000] AND year:2024" + ] + }, + { # Test proper exclusion value of publication_recid and inspire_id + "expected_result": ["publication_recid", "inspire_id"], + "exclude_tables": True, + "query_strings": [ + "publication_recid:[0 TO 10000] AND inspire_id:[0 TO 10000]", # Correct + "publication_recid: [0 TO 10000] AND inspire_id: [0 TO 10000]", # Extra valid whitespace + " publication_recid:[0 TO 10000] AND inspire_id: [0 TO 10000]", # Left and right whitespace + ] + }, + { + "expected_result": ["recid", "inspire_id"], + "exclude_tables": False, + "query_strings": [ + "recid:[0 TO 10000] AND inspire_id:[0 TO 10000]", + "recid: [0 TO 10000] AND inspire_id: [0 TO 10000]", # Extra valid whitespace + " recid:[0 TO 10000] AND inspire_id: [0 TO 10000]", # Left and right whitespace + ] + }, + { + "expected_result": ["recid", "publication_recid"], + "exclude_tables": False, + "query_strings": [ + "recid:[0 TO 10000] AND publication_recid:[0 TO 10000]", + "recid: [0 TO 10000] AND publication_recid: [0 TO 10000]", # Extra valid whitespace + " recid:[0 TO 10000] AND publication_recid: [0 TO 10000]", # Left and right whitespace + ] + }, + { # Some incorrect cases + "expected_result": [], + "exclude_tables": False, + "query_strings": [ + " recid[0 TO 10000] ", + "recsid:[0 TO 10000]", + "INCORRECT:[46 TO 46]", # Mismatched term + "recid:[-0 TO 10000] OR inspire_id:[-123 TO -123]", # Negative numbers + "recid:[NOTINT TO 46]", # Mismatched int left + "recid:[46 TO NOTINT]", # Mismatched int right + "inspire_idd:[0 TO 10000]", # Misspelling + "inspire_id:[0 TO 10000 ]", # Invalid whitespace + "inspire_id:[ 0 TO 10000]", # Invalid whitespace + "inspire_id :[0 TO 10000]", # Invalid whitespace + ] + }, + + ] + + # Each test dictionary in the list has a different expected_result value + for test in test_data: + # For each query string for the current expected_result + for query in test["query_strings"]: + # Execute the verification with current string + result = HEPDataQueryParser.parse_range_query(query) + # Doing the same term replacement required for OpenSearch + parsed_query = query.replace("publication_recid", "recid") + # Expected result based on which test object we are on + assert result == (test["expected_result"], test["exclude_tables"], parsed_query) def test_search(app, load_default_data, identifiers): @@ -315,6 +403,252 @@ def test_search(app, load_default_data, identifiers): assert results == {'error': 'An unexpected error occurred: index_not_found_exception'} +def test_search_range_ids(app, load_default_data, identifiers): + """ + Tests range-based searching where ID-like entries are used + First checks whole range, then single entry + e.g. inspire_id and recid NOT cmenergies etc. 
+ """ + + # Test the parsed entries in config.CFG_SEARCH_RANGE_TERMS + test_queries = [ + "inspire_id", + "recid" + ] + + for test in test_queries: + # Create the range query formatting the keyword per query + range_query = f"{test}:[%d TO %d]" + + # Just do a huge range, to see if everything appears + results = os_api.search(range_query % (0, 100000000)) + # Result count should equal maximum number of entries + assert len(results['results']) == len(identifiers) + + # Testing a range query we know shouldn't work + zero_result = os_api.search(range_query % (0, 0)) + assert not zero_result.get('results') + + # Do a range search for a single result, for each result of the 'all' search above. + for result in results['results']: + # We get the inspire/recid from the current result + identifier_id = int(result[test]) + # Do a search, formatting the query to find a single result + specific_result = os_api.search(range_query % (identifier_id, identifier_id)) + # Check ID of single result + assert int(specific_result['results'][0][test]) == int(identifier_id) + + # Testing another bad result, where the numbers are completely invalid + bad_result = os_api.search(range_query % (identifier_id+1, identifier_id-1)) + assert not bad_result.get('results') + +def test_range_queries(app, load_default_data, identifiers): + """ + Tests search functionality to ensure range queries are functional, together + and alongside other search types + """ + current_year = datetime.today().year + + test_data = [ + { # Check all results are returned, and is sorted by inspire_id + "test_query": "inspire_id:[0 TO 10000000]", + "expected_result": { + "count": len(identifiers), + "expected_inspire_ids": [2751932, 1245023, 1283842], + "expected_rec_ids": [57, 16, 1] + } + }, + { # Check all results are returned, and is sorted by recid + "test_query": "publication_recid:[0 TO 10000000]", + "expected_result": { + "count": len(identifiers), + "expected_inspire_ids": [2751932, 1245023, 1283842], + "expected_rec_ids": [57, 16, 1] + } + }, + { # Check all results are returned, and is sorted by recid + "test_query": "recid:[0 TO 10000000]", + "expected_result": { + "count": len(identifiers), + "expected_inspire_ids": [2751932, 1245023, 1283842], + "expected_rec_ids": [57, 16, 1] + } + }, + { # Full boolean search + "test_query": "inspire_id:[1283842 TO 1283842] AND publication_recid:[1 TO 1] AND recid:[1 TO 1]", + "expected_result": { + "count": 1, "expected_inspire_ids": [1283842], "expected_rec_ids": [1] + } + }, + { # Check all results are returned, and is sorted by recid + "test_query": "publication_recid:[0 TO 10000000] AND recid:[0 TO 10000000]", + "expected_result": { + "count": len(identifiers), + "expected_inspire_ids": [2751932, 1245023, 1283842], + "expected_rec_ids": [57, 16, 1] + } + }, + { # Should cover every ID in the range, and equal the length of identifiers, sorted by recid + "test_query": "inspire_id:[0 TO 10000000] AND publication_recid:[0 TO 10000000]", + "expected_result": { + "count": len(identifiers), + "expected_inspire_ids": [2751932, 1245023, 1283842], + "expected_rec_ids": [57, 16, 1] + } + }, + { # Valid search for a specific entry + "test_query": "inspire_id:[2751932 TO 2751932] AND publication_recid:[57 TO 57]", + "expected_result": { + "count": 1, + "expected_inspire_ids": [2751932], + "expected_rec_ids": [57] + } + }, + { # Valid search for a specific entry using OR + "test_query": "inspire_id:[2751932 TO 2751932] OR publication_recid:[0 TO 0]", + "expected_result": { + "count": 1, + 
"expected_inspire_ids": [2751932], + "expected_rec_ids": [57] + } + }, + { # Valid search for a specific entry using OR + "test_query": "inspire_id:[0 TO 0] OR publication_recid:[57 TO 57]", + "expected_result": { + "count": 1, + "expected_inspire_ids": [2751932], + "expected_rec_ids": [57] + } + }, + { # Testing adding year to the range + "test_query": f"inspire_id:[2751932 TO 2751932] AND publication_recid:[57 TO 57] AND year:{current_year}", + "expected_result": { + "count": 1, + "expected_inspire_ids": [2751932], + "expected_rec_ids": [57] + } + }, + { # Should be invalid as all entries are set to current year + "test_query": f"inspire_id:[2751932 TO 2751932] AND publication_recid:[57 TO 57] AND year:{current_year - 1}", + "expected_result": { + "count": 0, + "expected_inspire_ids": [], + "expected_rec_ids": [] + } + }, + { # Search text is valid here + "test_query": "inspire_id:[2751932 TO 2751932] AND publication_recid:[57 TO 57] Production of higgsinos", + "expected_result": { + "count": 1, "expected_inspire_ids": [2751932], "expected_rec_ids": [57] + } + }, + { # Search text is valid here + "test_query": "inspire_id:[2751932 TO 2751932] AND publication_recid:[57 TO 57] AND Production of higgsinos", + "expected_result": { + "count": 1, "expected_inspire_ids": [2751932], "expected_rec_ids": [57] + } + }, + { # Search text is invalid as it has been garbled slightly + "test_query": "inspire_id:[2751932 TO 2751932] AND publication_recid:[57 TO 57] AND Prdction of igsnos", + "expected_result": { + "count": 0, "expected_inspire_ids": [], "expected_rec_ids": [] + } + }, + { # No result as the search string is invalid + "test_query": "inspire_id:[2751932 TO 2751932] AND publication_recid:[57 TO 57] AND \"abcdef\"", + "expected_result": { + "count": 0, "expected_inspire_ids": [], "expected_rec_ids": [] + } + }, + { # No result expected as inspire_id should not be matched + "test_query": "inspire_id:[2751933 TO 2751933] AND publication_recid:[57 TO 57]", + "expected_result": { + "count": 0, "expected_inspire_ids": [], "expected_rec_ids": [] + } + }, + { # Result expected as inner resource recid is searched matched + "test_query": "inspire_id:[2751932 TO 2751932] AND recid:[58 TO 58]", + "expected_result": { + "count": 1, "expected_inspire_ids": [2751932], "expected_rec_ids": [57] + } + }, + { # No result as publication_recid is incorrect + "test_query": "inspire_id:[2751932 TO 2751932] AND publication_recid:[5000 TO 5000]", + "expected_result": { + "count": 0, "expected_inspire_ids": [], "expected_rec_ids": [] + } + }, + { # Test specific selection of table by recid + "test_query": "recid:[58 TO 58]", + "expected_result": { + "count": 1, "expected_inspire_ids": [2751932], "expected_rec_ids": [57] + } + }, + { # Test range selection of table by recid, including table + "test_query": "recid:[57 TO 58]", + "expected_result": { + "count": 1, "expected_inspire_ids": [2751932], "expected_rec_ids": [57] + } + }, + { # Test specific selection of publication by recid + "test_query": "publication_recid:[57 TO 57]", + "expected_result": { + "count": 1, "expected_inspire_ids": [2751932], "expected_rec_ids": [57] + } + }, + { # Test specific selection of publication by publication + "test_query": "publication_recid:[57 TO 58]", + "expected_result": { + "count": 1, "expected_inspire_ids": [2751932], "expected_rec_ids": [57] + } + }, + { # ID 58 is included within 57, but not marked + "test_query": "publication_recid:[58 TO 58]", + "expected_result": { + "count": 0, "expected_inspire_ids": [], 
"expected_rec_ids": [] + } + }, + { # Searching for a specific inspire_id + "test_query": "inspire_id:[2751932 TO 2751932]", + "expected_result": { + "count": 1, "expected_inspire_ids": [2751932], "expected_rec_ids": [57] + } + }, + { # Test selection of a failed case + "test_query": "inspire_id:[2751933 TO 2751933]", + "expected_result": { + "count": 0, "expected_inspire_ids": [], "expected_rec_ids": [] + } + }, + { # Test selection of just the middle value + "test_query": "inspire_id:[1245024 TO 2751931]", + "expected_result": { + "count": 1, "expected_inspire_ids": [1283842], "expected_rec_ids": [1] + } + }, + { # Test selection of all inspire_id values + "test_query": "inspire_id:[1245023 TO 2751932]", + "expected_result": { + "count": 3, "expected_inspire_ids": [2751932, 1245023, 1283842], "expected_rec_ids": [57, 16, 1] + } + } + ] + + for test in test_data: + # Execute the search + results = os_api.search(test['test_query']) + + # Gather the recid and inspire_id results + recid_results = [result['recid'] for result in results['results']] + inspire_results = [int(result['inspire_id']) for result in results['results']] + + # Confirm expected count + assert len(results['results']) == test['expected_result']['count'] + # Confirm recid and inspire_id results are as expected + assert test['expected_result']["expected_inspire_ids"] == inspire_results + assert test['expected_result']["expected_rec_ids"] == recid_results + + def test_merge_results(): pub_result = { "hits": { @@ -710,13 +1044,13 @@ def test_get_all_ids(app, load_default_data, identifiers): # Check last_updated works # Default records were last updated on 2016-07-13 and 2013-12-17 - date_2013_1 = datetime.datetime(year=2013, month=12, day=16) + date_2013_1 = datetime(year=2013, month=12, day=16) assert(os_api.get_all_ids(last_updated=date_2013_1) == expected_record_ids) - date_2013_2 = datetime.datetime(year=2013, month=12, day=17) + date_2013_2 = datetime(year=2013, month=12, day=17) assert(os_api.get_all_ids(last_updated=date_2013_2) == expected_record_ids) - date_2013_3 = datetime.datetime(year=2013, month=12, day=18) + date_2013_3 = datetime(year=2013, month=12, day=18) assert(os_api.get_all_ids(last_updated=date_2013_3) == [1, 57]) - date_2120 = datetime.datetime(year=2120, month=1, day=1) + date_2120 = datetime(year=2120, month=1, day=1) assert(os_api.get_all_ids(last_updated=date_2120) == []) # Check sort by latest works - first record is newer than previous