
Commit

Merge branch 'main' into dependabot/pip/invenio-accounts-5.1.7
GraemeWatt authored Dec 4, 2024
2 parents 122954e + ca52b85 commit cc0dce6
Showing 15 changed files with 523 additions and 37 deletions.
4 changes: 2 additions & 2 deletions hepdata/cli.py
@@ -223,9 +223,9 @@ def do_unload(records_to_unload):

@utils.command()
@with_appcontext
@click.option('--endpoint', '-e', type=str, help='Specific endpoint to update (e.g. "rivet" or "MadAnalysis" or "SModelS"). Omit for all.')
@click.option('--endpoint', '-e', type=str, help='Specific endpoint to update (e.g. "rivet" or "MadAnalysis" or "SModelS" or "Combine"). Omit for all.')
def find_and_add_record_analyses(endpoint):
"""Finds analyses such as Rivet, MadAnalysis 5 and SModelS and adds them to records."""
"""Finds analyses such as Rivet, MadAnalysis 5, SModelS and Combine and adds them to records."""
update_analyses(endpoint)


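The command above is a thin wrapper around update_analyses(). A minimal sketch (not part of this commit) of triggering the same refresh directly from Python, assuming the usual hepdata.factory.create_app application factory:

```python
# Hedged sketch: run the Combine analysis import inside an application
# context; the CLI command simply forwards the --endpoint value to this call.
from hepdata.factory import create_app
from hepdata.modules.records.utils.analyses import update_analyses

app = create_app()
with app.app_context():
    update_analyses('Combine')  # or None to refresh every configured endpoint
```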
12 changes: 11 additions & 1 deletion hepdata/config.py
@@ -193,6 +193,7 @@ def _(x):
CFG_DATA_TYPE = 'datatable'
CFG_SUBMISSIONS_TYPE = 'submission'
CFG_DATA_KEYWORDS = ['observables', 'reactions', 'cmenergies', 'phrases']
CFG_SEARCH_RANGE_TERMS = ["recid", "publication_recid", "inspire_id"] # Terms that can be used in OpenSearch API range searches

CFG_CONVERTER_URL = 'https://converter.hepdata.net'
CFG_SUPPORTED_FORMATS = ['yaml', 'root', 'csv', 'yoda', 'yoda1', 'original']
@@ -331,7 +332,16 @@ def _(x):
'endpoint_url': 'https://zenodo.org/records/13952092/files/smodels-analyses.hepdata.json?download=1',
'url_template': '{0}',
'subscribe_user_id': 7766
}
},
'Combine': {
'endpoint_url': 'https://cms-public-likelihoods-list.web.cern.ch/artifacts/output.json',
'url_template': 'https://doi.org/{0}',
'description': 'Statistical models',
'license': {
'name': 'cc-by-4.0',
'url': 'https://creativecommons.org/licenses/by/4.0'
},
},
#'ufo': {},
#'xfitter': {},
#'applgrid': {},
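Sketch of how an ANALYSES_ENDPOINTS entry such as the new 'Combine' block can be consumed: fetch the endpoint_url listing and build a resource URL per analysis via url_template. The JSON payload shape used below (INSPIRE ID mapped to a list of DOIs) is an assumption for illustration only; update_analyses() does the real work.

```python
# Illustration only; the payload structure is assumed, not taken from the
# actual CMS public likelihoods listing.
import requests

endpoint = {
    'endpoint_url': 'https://cms-public-likelihoods-list.web.cern.ch/artifacts/output.json',
    'url_template': 'https://doi.org/{0}',
}

payload = requests.get(endpoint['endpoint_url'], timeout=30).json()
for inspire_id, dois in payload.items():  # assumed shape: {inspire_id: [doi, ...]}
    for doi in dois:
        print(inspire_id, endpoint['url_template'].format(doi))
```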
59 changes: 40 additions & 19 deletions hepdata/ext/opensearch/api.py
@@ -96,8 +96,9 @@ def search(query,
('collaboration', collaboration_name), ('date', date)
:param size: [int] max number of hits that should be returned
:param offset: [int] offset for the results (used for pagination)
:param sort_by: [string] sorting field. Currently supported fields:
"title", "collaboration", "date", "relevance"
:param sort_field: [string] sorting field. Currently supported fields:
"title", "collaboration", "date", "relevance",
"recid", "inspire_id"
:param sort_order: [string] order of the sorting either original
(for a particular field) or reversed. Supported:
'' or 'rev'
@@ -108,23 +109,41 @@
if query == '' and not sort_field:
sort_field = 'date'

query = HEPDataQueryParser.parse_query(query)
# Create search with preference param to ensure consistency of results across shards
search = RecordsSearch(using=os, index=index).with_preference_param()

# Determine whether the query is range-based and whether data tables should be excluded
range_terms, exclude_tables, parsed_query = HEPDataQueryParser.parse_range_query(query)

# Pass the range-parsed query on for the standard keyword parsing
query = HEPDataQueryParser.parse_query(parsed_query)
fuzzy_query = QueryString(query=query, fuzziness='AUTO')

if query:
fuzzy_query = QueryString(query=query, fuzziness='AUTO')
if exclude_tables:
search.query = fuzzy_query

if query and not exclude_tables:
search.query = fuzzy_query | \
Q('has_child', type="child_datatable", query=fuzzy_query)

# Add filter to search for only "publication" objects
search = search.filter("term", doc_type=CFG_PUB_TYPE)
search = QueryBuilder.add_filters(search, filters)


if range_terms and not sort_field and not sort_order:
# Default to sorting on recid in descending order for range-based queries
sort_field = 'recid'
sort_order = 'desc'

try:
mapped_sort_field = sort_fields_mapping(sort_field)
except ValueError as ve:
return {'error': str(ve)}

search = search.sort({mapped_sort_field : {"order" : calculate_sort_order(sort_order, sort_field)}})

search = add_default_aggregations(search, filters)

if post_filter:
@@ -135,23 +154,25 @@

try:
pub_result = search.execute().to_dict()

parent_filter = {
"terms": {
"_id": [hit["_id"] for hit in pub_result['hits']['hits']]
data_result = {}
if not exclude_tables:
parent_filter = {
"terms": {
"_id": [hit["_id"] for hit in pub_result['hits']['hits']]
}
}
}

data_search = RecordsSearch(using=os, index=index)
data_search = data_search.query('has_parent',
parent_type="parent_publication",
query=parent_filter)
if query:
data_search = data_search.query(QueryString(query=query))
data_search = RecordsSearch(using=os, index=index)
data_search = data_search.query('has_parent',
parent_type="parent_publication",
query=parent_filter)

if query:
data_search = data_search.query(QueryString(query=query))

data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
data_search = data_search[0:data_search_size]
data_result = data_search.execute().to_dict()
data_search_size = size * OPENSEARCH_MAX_RESULT_WINDOW // LIMIT_MAX_RESULTS_PER_PAGE
data_search = data_search[0:data_search_size]
data_result = data_search.execute().to_dict()

merged_results = merge_results(pub_result, data_result)
return map_result(merged_results, filters)
@@ -165,7 +186,7 @@ def search(query,
else:
log.error(f'An unexpected error occurred when searching: {e}')
reason = f'An unexpected error occurred: {e.error}'
return { 'error': reason }
return {'error': reason}


@author_index
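A hedged example of calling the updated search() with a range-based, publication-only query: as the code above shows, range queries default to sorting on recid in descending order, and publication_recid/inspire_id ranges skip the child data-table search. Assumes a configured application and a reachable OpenSearch index.

```python
from hepdata.ext.opensearch.api import search

# publication_recid is a range term, so tables are excluded and results
# default to recid descending.
results = search('publication_recid:[1 TO 10]')
print(results)
```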
6 changes: 6 additions & 0 deletions hepdata/ext/opensearch/config/os_config.py
@@ -113,6 +113,12 @@ def sort_fields_mapping(sort_by):
return 'creation_date'
elif sort_by == 'latest':
return 'last_updated'
elif sort_by == 'recid':
return 'recid' # No change required
elif sort_by == 'publication_recid':
return 'publication_recid' # No change required
elif sort_by == 'inspire_id':
return 'inspire_id' # No change required
elif not sort_by or sort_by == 'relevance':
return '_score'
else:
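A quick check of the extended mapping, assuming the function above is importable on its own; unsupported fields still raise ValueError, which search() converts into an error response.

```python
from hepdata.ext.opensearch.config.os_config import sort_fields_mapping

assert sort_fields_mapping('inspire_id') == 'inspire_id'
assert sort_fields_mapping('latest') == 'last_updated'
assert sort_fields_mapping(None) == '_score'  # no sort field falls back to relevance
```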
2 changes: 1 addition & 1 deletion hepdata/ext/opensearch/config/record_mapping.py
@@ -171,7 +171,7 @@
}
},
"inspire_id": {
"type": "text"
"type": "integer"
},
"keywords": {
"properties": {
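For context on the mapping change: numeric range filters like the one sketched below (opensearch-dsl syntax) only compare numerically when inspire_id is indexed as an integer rather than analysed text.

```python
# Sketch of the kind of range filter the integer mapping enables.
from opensearch_dsl import Q

range_filter = Q('range', inspire_id={'gte': 1, 'lte': 10000})
print(range_filter.to_dict())  # {'range': {'inspire_id': {'gte': 1, 'lte': 10000}}}
```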
2 changes: 1 addition & 1 deletion hepdata/ext/opensearch/document_enhancers.py
@@ -94,7 +94,7 @@ def add_shortened_authors(doc):

def add_analyses(doc):
"""
Add analyses links such as Rivet, MadAnalysis 5, SModelS, HistFactory and NUISANCE to the index.
Add analyses links such as Rivet, MadAnalysis 5, SModelS, Combine, HistFactory and NUISANCE to the index.
:param doc:
:return:
22 changes: 19 additions & 3 deletions hepdata/ext/opensearch/process_results.py
@@ -27,10 +27,26 @@
from hepdata.utils.miscellaneous import splitter


def merge_results(pub_result, data_result):
def merge_results(pub_result, data_result=None):
"""
Merge the results dictionaries of the publication and data table searches.
The data result does not exist for publication-only searches, so it defaults to None.
:param pub_result: Publication search data.
:param data_result: Data table search data.
:return: Merged search results dictionary.
"""
merge_dict = dict()
merge_dict['hits'] = pub_result['hits']['hits'] + \
data_result['hits']['hits']

# We don't need to merge if there is no data.
if data_result:
merge_dict['hits'] = pub_result['hits']['hits'] + \
data_result['hits']['hits']
else:
merge_dict['hits'] = pub_result['hits']['hits']

merge_dict['total'] = pub_result['hits']['total']['value']
merge_dict['aggregations'] = pub_result.get('aggregations', {})
return merge_dict
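A minimal shape check of the new publication-only branch, using hand-written dictionaries that mimic the OpenSearch response structure handled above:

```python
from hepdata.ext.opensearch.process_results import merge_results

pub_result = {
    'hits': {'hits': [{'_id': '1'}], 'total': {'value': 1}},
    'aggregations': {},
}
merged = merge_results(pub_result)  # no data_result: publication-only search
assert merged['hits'] == [{'_id': '1'}]
assert merged['total'] == 1
```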
40 changes: 39 additions & 1 deletion hepdata/ext/opensearch/query_builder.py
@@ -23,6 +23,8 @@
import re
from opensearch_dsl import Q

from hepdata.config import CFG_SEARCH_RANGE_TERMS


class QueryBuilder:

@@ -52,7 +54,8 @@ def parse_query(query_string):
"phrases": "data_keywords.phrases",
"reactions": "data_keywords.reactions",
"analysis": "analyses.type",
"resources": "resources.description" # Add shorthand for resource description
"resources": "resources.description", # Add shorthand for resource description
"publication_recid": "recid" # Shorthand for HEPData record ID
}
}

@@ -81,3 +84,38 @@ def _quote_phrase(phrase):
if '"' not in phrase and pattern.fullmatch(phrase):
return f'"{phrase}"'
return phrase

@staticmethod
def parse_range_query(query):
"""
Checks whether a query string contains a range-based query on one of the
supported terms (CFG_SEARCH_RANGE_TERMS).
Also determines whether the query is a publication-only search, in which case
data tables are excluded from the results.
The returned query has publication_recid replaced by 'recid' for OpenSearch.
Example: publication_recid:[321 TO 321] inspire_id:[123 TO 123]
:param query: The full query string
:return: A tuple of the list of matched range terms, a boolean set when tables
    should be excluded (the range term is publication_recid or inspire_id but
    not recid), and the query with the term replaced.
"""
# Pattern for a range clause such as term:[1 TO 10]; %s is the placeholder for the term
pattern = rf"(?:^|\s)%s:\s*\[\d+\s+TO\s+\d+]"
range_terms = []
exclude_tables = False
# For all terms that can be range searched
for term in CFG_SEARCH_RANGE_TERMS:
result = re.findall(pattern % term, query)
if result:
range_terms.append(term)

# If we are doing a range search on non-table objects
if ("publication_recid" in range_terms or "inspire_id" in range_terms) and "recid" not in range_terms:
exclude_tables = True

# Finally, we replace publication_recid with the correct mapping for OpenSearch
query = query.replace("publication_recid", "recid")

return range_terms, exclude_tables, query
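A hedged illustration of the new parser on the docstring's first example, assuming the method is exposed on HEPDataQueryParser as it is called from api.py:

```python
from hepdata.ext.opensearch.query_builder import HEPDataQueryParser

terms, exclude_tables, parsed = HEPDataQueryParser.parse_range_query(
    'publication_recid:[321 TO 321]')
# terms == ['publication_recid'], exclude_tables is True (publication-only
# search) and parsed == 'recid:[321 TO 321]', ready for the OpenSearch mapping.
```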
1 change: 1 addition & 0 deletions hepdata/modules/records/assets/js/hepdata_common.js
@@ -46,6 +46,7 @@ HEPDATA.file_type_to_details = {
"rivet": {"icon": "area-chart", "description": "Rivet Analysis"},
"madanalysis": {"icon": "area-chart", "description": "MadAnalysis 5 Analysis"},
"smodels": {"icon": "area-chart", "description": "SModelS Analysis"},
"combine": {"icon": "area-chart", "description": "Combine Analysis"},
"xfitter": {"icon": "area-chart", "description": "xFitter Analysis"},
"applgrid": {"icon": "area-chart", "description": "APPLgrid Analysis"},
"ufo": {"icon": "rocket", "description": "Universal Feynrules Output (UFO)"},
@@ -41,6 +41,7 @@ <h4>Add Resource for <span id="selected_resource_item">Submission</span></h4>
<option value="applgrid">APPLgrid</option>
<option value="MadAnalysis">MadAnalysis 5</option>
<option value="SModelS">SModelS</option>
<option value="Combine">Combine</option>
<option value="rivet">Rivet</option>
<option value="fastnlo">fastNLO</option>
<option value="ufo">Universal Feynrules Output (UFO)</option>
13 changes: 11 additions & 2 deletions hepdata/modules/records/utils/analyses.py
@@ -35,6 +35,7 @@
from hepdata.utils.users import get_user_from_id
from hepdata.modules.records.subscribers.rest import subscribe
from hepdata.modules.records.subscribers.api import is_current_user_subscribed_to_record
from hepdata.modules.records.utils.common import get_license

logging.basicConfig()
log = logging.getLogger(__name__)
@@ -43,10 +44,11 @@
@shared_task
def update_analyses(endpoint=None):
"""
Update (Rivet, MadAnalysis 5 and SModelS) analyses and remove outdated resources.
Update (Rivet, MadAnalysis 5, SModelS and Combine) analyses and remove outdated resources.
Allow bulk subscription to record update notifications if "subscribe_user_id" in endpoint.
Add optional "description" and "license" fields if present in endpoint.
:param endpoint: either "rivet" or "MadAnalysis" or "SModelS" or None (default) for both
:param endpoint: either "rivet" or "MadAnalysis" or "SModelS" or "Combine" or None (default) for all
"""
endpoints = current_app.config["ANALYSES_ENDPOINTS"]
for analysis_endpoint in endpoints:
@@ -86,6 +88,13 @@ def update_analyses(endpoint=None):
file_location=_resource_url,
file_type=analysis_endpoint)

if "description" in endpoints[analysis_endpoint]:
new_resource.file_description = str(endpoints[analysis_endpoint]["description"])

if "license" in endpoints[analysis_endpoint]:
resource_license = get_license(endpoints[analysis_endpoint]["license"])
new_resource.file_license = resource_license.id

submission.resources.append(new_resource)
num_new_resources += 1

@@ -243,6 +243,13 @@ <h4>Other useful searches</h4>
(SModelS analysis)
</span>
</li>
<li>
<a href='/search?q=analysis:Combine&sort_by=latest'
target="_new">analysis:Combine</a>
<span class="text-muted">
(CMS statistical models in Combine format)
</span>
</li>
<li>
<a href='/search?q=analysis:HistFactory&sort_by=latest'
target="_new">analysis:HistFactory</a>
@@ -279,6 +286,32 @@ <h4>Searching via Inspire</h4>
</ul>
</div>

<div class="well well-small">
<h4>Range-based Searching</h4>
<p>
We support searching for a range of records using their HEPData record ID or Inspire ID.
</p>
<ul>
<li>Range searching by HEPData record ID:
<ul>
<li>
<a href='/search?q=publication_recid:[1 TO 10]'
target="_new">publication_recid:[1 TO 10]</a>
</li>
</ul>
</li>
<br/>
<li>Range searching by Inspire ID:
<ul>
<li>
<a href='/search?q=inspire_id:[1 TO 10000]'
target="_new">inspire_id:[1 TO 10000]</a>
</li>
</ul>
</li>
</ul>
</div>

</div>
</div>
</div>
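The documented range searches can also be exercised over HTTP; a sketch assuming the public hepdata.net search endpoint and its format=json output (the response key used below is an assumption):

```python
import requests

resp = requests.get('https://www.hepdata.net/search/',
                    params={'q': 'inspire_id:[1 TO 10000]', 'format': 'json'})
resp.raise_for_status()
print(resp.json().get('total'))  # assumed key for the total hit count
```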
2 changes: 1 addition & 1 deletion hepdata/version.py
@@ -28,4 +28,4 @@
and parsed by ``setup.py``.
"""

__version__ = "0.9.4dev20241112"
__version__ = "0.9.4dev20241204"
19 changes: 18 additions & 1 deletion tests/records_test.py
@@ -1031,7 +1031,7 @@ def test_create_breadcrumb_text():


def test_update_analyses(app):
""" Test update of Rivet, MadAnalyses 5 and SModelS analyses """
""" Test update of Rivet, MadAnalyses 5, SModelS and Combine analyses """

# Import a record that already has a Rivet analysis attached (but with '#' in the URL)
import_records(['ins1203852'], synchronous=True)
@@ -1074,6 +1074,23 @@
submission = get_latest_hepsubmission(inspire_id='1847779', overall_status='finished')
assert is_current_user_subscribed_to_record(submission.publication_recid, user)

# Import a record that has an associated Combine analysis
import_records(['ins2796231'], synchronous=True)
analysis_resources = DataResource.query.filter_by(file_type='Combine').all()
assert len(analysis_resources) == 0
analysis_resources = DataResource.query.filter_by(file_location='https://doi.org/10.17181/bp9fx-6qs64').all()
assert len(analysis_resources) == 1
db.session.delete(analysis_resources[0]) # delete resource so it can be re-added in next step
db.session.commit()
update_analyses('Combine')
analysis_resources = DataResource.query.filter_by(file_type='Combine').all()
assert len(analysis_resources) == 1
assert analysis_resources[0].file_location == 'https://doi.org/10.17181/bp9fx-6qs64'
assert analysis_resources[0].file_description == 'Statistical models'
license_data = License.query.filter_by(id=analysis_resources[0].file_license).first()
assert license_data.name == 'cc-by-4.0'
assert license_data.url == 'https://creativecommons.org/licenses/by/4.0'


def test_generate_license_data_by_id(app):
"""