Commit 8349c67
add a test for all commondata files
scarlehoff committed Mar 8, 2024
1 parent d093c07 commit 8349c67
Showing 5 changed files with 126 additions and 11 deletions.
11 changes: 8 additions & 3 deletions validphys2/src/validphys/commondataparser.py
@@ -35,6 +35,7 @@
 by modifying the CommonMetaData using one of the loaded Variants one can change the resulting
 :py:class:`validphys.coredata.CommonData` object.
 """
+
 import dataclasses
 from functools import cached_property, lru_cache
 import logging
@@ -588,7 +589,6 @@ def load_kinematics(self, fill_to_three=True, drop_minmax=True):
             d["mid"] = 0.5 * (d["max"] + d["min"])

             if drop_minmax:
-                # TODO: for now we are dropping min/max information since it didn't exist in the past
                 d["min"] = None
                 d["max"] = None
             else:
@@ -786,6 +786,11 @@ def cm_energy(self):
             return None
         return float(energy_string[:-3].replace("P", ".")) * factor

+    @cached_property
+    def allowed_datasets(self):
+        """Return the implemented datasets as a list of <setname>_<observable>"""
+        return [f"{self.setname}_{i.observable_name}" for i in self.implemented_observables]
+
     @cached_property
     def allowed_observables(self):
         """
@@ -809,7 +814,7 @@ def select_observable(self, obs_name_raw):


 @lru_cache
-def _parse_entire_set_metadata(metadata_file):
+def parse_set_metadata(metadata_file):
     """Read the metadata file"""
     return parse_yaml_inp(metadata_file, SetMetaData)

@@ -822,7 +827,7 @@ def parse_new_metadata(metadata_file, observable_name, variant=None):
     The triplet (metadata_file, observable_name, variant) defines unequivocally the information
     to be parsed from the commondata library
     """
-    set_metadata = _parse_entire_set_metadata(metadata_file)
+    set_metadata = parse_set_metadata(metadata_file)

     # Select one observable from the entire metadata
     metadata = set_metadata.select_observable(observable_name)
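The rename of ``_parse_entire_set_metadata`` to ``parse_set_metadata`` promotes the cached set-level parser to the public interface, and ``allowed_datasets`` lets callers enumerate every <setname>_<observable> pair of a set. A minimal sketch of how the two combine (the metadata path and the printed names are hypothetical):

from validphys.commondataparser import parse_set_metadata

# Hypothetical path: any <set folder>/metadata.yaml under the commondata tree works
set_meta = parse_set_metadata("commondata/ATLAS_TTBAR_13TEV_HADR_DIF/metadata.yaml")
# One entry per implemented observable, e.g. "ATLAS_TTBAR_13TEV_HADR_DIF_MTTBAR" (illustrative)
print(set_meta.allowed_datasets)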
15 changes: 14 additions & 1 deletion validphys2/src/validphys/loader.py
@@ -2,6 +2,7 @@
 Resolve paths to useful objects, and query the existence of different resources
 within the specified paths.
 """
+
 import functools
 from functools import cached_property
 import logging
@@ -22,7 +23,7 @@
 from reportengine import filefinder
 from reportengine.compat import yaml
 from validphys import lhaindex
-from validphys.commondataparser import load_commondata_old, parse_new_metadata
+from validphys.commondataparser import load_commondata_old, parse_new_metadata, parse_set_metadata
 from validphys.core import (
     PDF,
     CommonDataSpec,
@@ -326,6 +327,18 @@ def available_datasets(self):
         old_datasets = [i for i in legacy_to_new_mapping.keys() if not i.startswith(skip)]
         return set(old_datasets)

+    @property
+    @functools.lru_cache()
+    def implemented_datasets(self):
+        """Provide all implemented datasets that can be found in the datafiles folder
+        regardless of whether they can be used for fits (i.e., whether they include a theory),
+        are "fake" (integrability/positivity) or are missing some information.
+        """
+        datasets = []
+        for metadata_file in self.commondata_folder.glob("*/metadata.yaml"):
+            datasets += parse_set_metadata(metadata_file).allowed_datasets
+        return datasets
+
     @property
     @functools.lru_cache()
     def available_pdfs(self):
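``implemented_datasets`` deliberately globs every */metadata.yaml under the commondata folder, so it also picks up sets without a theory or with missing pieces; this is exactly what the new test further down parametrizes over. A sketch of the intended use, mirroring the test module below:

from validphys.loader import FallbackLoader

l = FallbackLoader()
# All <setname>_<observable> names found in the datafiles folder,
# whether or not they are usable in a fit; each becomes one test case
print(len(l.implemented_datasets))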
5 changes: 3 additions & 2 deletions validphys2/src/validphys/plotoptions/kintransforms.py
@@ -98,8 +98,9 @@ def new_labels(self, k1, k2, k3):
         return k1, k2, k3

     def xq2map(self, k1, k2, k3, **extra_labels):
-        # This is going to be a problem
-        return k1, k2
+        raise NotImplementedError(
+            "xq2map is not implemented for this dataset (kin_override set to identity and process_options not implemented)"
+        )


 class Kintransform(metaclass=abc.ABCMeta):
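With this change, a dataset whose kinematics override is the identity transform and whose process has no xq2map fails loudly instead of returning a bogus (k1, k2) pair. A small sketch of the new behaviour, assuming ``identity`` can be instantiated with no arguments and using placeholder kinematic values:

from validphys.plotoptions.kintransforms import identity

try:
    # 0.1, 10.0, 91.2 are placeholder values for k1, k2, k3
    identity().xq2map(0.1, 10.0, 91.2)
except NotImplementedError as err:
    print(err)  # previously this silently returned (k1, k2)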
12 changes: 7 additions & 5 deletions validphys2/src/validphys/process_options.py
@@ -3,6 +3,7 @@
 Only variables included in the `_Vars` enum and processes included in the ``Processes`` dictionary are allowed.
 """
+
 import dataclasses
 from typing import Callable, Optional, Tuple, Union

@@ -66,7 +67,7 @@ class _Process:
     def __hash__(self):
         return hash(self.name)

-    def same_kin_variables(self, kin_cov):
+    def are_accepted_variables(self, kin_cov):
         """Check if the kinematic variables from the kinematic coverage are the same
         as the accepted variables."""
         # Accepting in any case the legacy variables
@@ -88,7 +89,7 @@ def xq2map(self, kin_df, metadata):

         # Check if the kinematic variables defined in metadata correspond to the
         # accepted variables
-        if not self.same_kin_variables(metadata.kinematic_coverage):
+        if not self.are_accepted_variables(metadata.kinematic_coverage):
             raise NotImplementedError(
                 f"kinematic variables are not supported for process {self.name}. You are using {metadata.kinematic_coverage}, please use {self.accepted_variables} ({metadata.name})"
             )
@@ -159,7 +160,8 @@ def _hqp_yq_xq2map(kin_dict):

 def _hqp_yqq_xq2map(kin_dict):
     # Compute x, Q2
-    ratio = np.sqrt(kin_dict[_Vars.m_t2]) / kin_dict[_Vars.sqrts]
+    mass2 = _get_or_fail(kin_dict, [_Vars.m_t2, _Vars.m_ttBar])
+    ratio = np.sqrt(mass2) / kin_dict[_Vars.sqrts]
     x1 = ratio * np.exp(kin_dict[_Vars.y_ttBar])
     x2 = ratio * np.exp(-kin_dict[_Vars.y_ttBar])
     q2 = kin_dict[_Vars.m_t2]
@@ -218,14 +220,14 @@ def _displusjet_xq2map(kin_dict):

 HQP_YQQ = _Process(
     "HQP_YQQ",
     "Differential cross section w.r.t. absolute rapidity of ttBar",
-    accepted_variables=(_Vars.y_ttBar, _Vars.m_t2, _Vars.sqrts),
+    accepted_variables=(_Vars.y_ttBar, _Vars.m_t2, _Vars.sqrts, _Vars.m_ttBar),
     xq2map_function=_hqp_yqq_xq2map,
 )

 HQP_PTQ = _Process(
     "HQP_PTQ",
     "Normalized double differential cross section w.r.t. absolute rapidity and transverse momentum of t",
-    accepted_variables=(_Vars.pT_t, _Vars.y_t, _Vars.sqrts),
+    accepted_variables=(_Vars.pT_t, _Vars.y_t, _Vars.sqrts, _Vars.m_t2),
     xq2map_function=_hqp_ptq_xq2map,
 )
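``_get_or_fail`` is not shown in this diff; from its call site a plausible reading (an assumption, not the committed code) is that it returns the value of the first listed variable present in the kinematics dictionary and fails otherwise:

def _get_or_fail(kin_dict, keys):
    """Assumed behaviour: return the first of ``keys`` found in ``kin_dict``,
    raising if none is present (m_t2 is preferred over m_ttBar at the call site)."""
    for key in keys:
        if key in kin_dict:
            return kin_dict[key]
    raise KeyError(f"None of {keys} found among the kinematic variables")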
94 changes: 94 additions & 0 deletions validphys2/src/validphys/tests/test_datafiles.py
@@ -0,0 +1,94 @@
"""
Test all datafiles
All tests are under ``test_all_datasets`` ran with all datasets so that one gets one failure per dataset in case of problems
"""

import pytest

from validphys.covmats import INTRA_DATASET_SYS_NAME
from validphys.loader import FallbackLoader
from validphys.plotoptions.kintransforms import identity as kintransform_identity

l = FallbackLoader()
all_datasets = sorted(l.implemented_datasets)


def _load_main_and_variants(dataset_name):
    """Given a dataset name, returns a list with the default load and all variants"""
    cds = [l.check_commondata(dataset_name)]
    for variant_name in cds[0].metadata.variants:
        cds.append(l.check_commondata(dataset_name, variant=variant_name))
    return cds


@pytest.mark.parametrize("dataset_name", all_datasets)
def test_all_datasets(dataset_name):
    """checks that a dataset can be loaded (together with its variants),
    that the kinematics, uncertainties and data can be read
    """
    # Load the data and all its variants
    cds = _load_main_and_variants(dataset_name)
    main_cd = cds[0]

    # Check ndata
    ndata = main_cd.ndata

    # kinematics check
    _ = main_cd.metadata.load_kinematics(drop_minmax=False)
    kin_df = main_cd.metadata.load_kinematics()

    # Check that the kinematic coverage is contained in the kinematics dataframe
    kin_cov = main_cd.metadata.kinematic_coverage
    assert set(kin_cov) <= set(kin_df.columns.get_level_values(0))

    process_type = main_cd.metadata.process_type

    # check whether the kin override is set to the identity
    # and if so, check that the process_type is not simply a string
    kin_override = main_cd.metadata.plotting.kinematics_override
    if isinstance(kin_override, kintransform_identity) and isinstance(process_type, str):
        # Skip for the time being the processes for which there is no implementation but have been
        # merged to master: issue #1991
        if process_type not in ("HQP_MQQ", "INC"):
            raise NotImplementedError(f"The {process_type=} is not implemented in process_options")

    elif not isinstance(process_type, str):
        if not process_type.are_accepted_variables(kin_cov):
            raise ValueError(
                f"The dataset {dataset_name} uses {kin_cov} while accepted variables for {process_type} are {process_type.accepted_variables}"
            )

    # load the central data for every variant
    all_dc = [cd.metadata.load_data_central() for cd in cds]
    # and check they have the same length (it should've been tested internally already)
    assert all(len(i) == ndata for i in all_dc)

    # check the uncertainties can be loaded
    # note that due to legacy data there might be datasets without data_uncertainties
    # but that would only happen for the non-variant case (member 0 of the list)
    all_unc = [cd.metadata.load_uncertainties() for cd in cds[1:]]
    if main_cd.metadata.data_uncertainties:
        all_unc.insert(0, main_cd.metadata.load_uncertainties())

    for unc in all_unc:
        # Check that, if present, the special `stat` key is ADD and UNCORR
        if "stat" in unc:
            stat = unc["stat"]
            assert stat.columns[0][0] == "ADD"
            assert stat.columns[0][1] == "UNCORR"

        intra_dataset = unc.columns.get_level_values("type").isin(
            list(INTRA_DATASET_SYS_NAME) + ["SKIP", "SCALEVAR"]
        )

        # Check that inter dataset correlations are unique
        inter_dataset_corrs = unc.loc[:, ~intra_dataset]
        inter_types = inter_dataset_corrs.columns.get_level_values("type")
        if not inter_types.is_unique:
            raise ValueError(
                f"The inter-dataset uncertainties for {dataset_name} are not unique: {inter_types.value_counts()}"
            )

        # Check that all treatments are either MULT or ADD
        assert set(unc.columns.get_level_values("treatment").unique()) <= {"MULT", "ADD"}
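
Because the test is parametrized over every implemented dataset, each dataset appears as its own test case and problems are reported one failure per dataset. A typical invocation restricted to a single experiment (the ``-k`` expression filter is standard pytest):

pytest validphys2/src/validphys/tests/test_datafiles.py -k ATLAS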
