diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py
index 5380ae18c6..c6e2c821b1 100644
--- a/validphys2/src/validphys/commondataparser.py
+++ b/validphys2/src/validphys/commondataparser.py
@@ -35,6 +35,7 @@
 by modifying the CommonMetaData using one of the loaded Variants
 one can change the resulting :py:class:`validphys.coredata.CommonData` object.
 """
+
 import dataclasses
 from functools import cached_property, lru_cache
 import logging
@@ -588,7 +589,6 @@ def load_kinematics(self, fill_to_three=True, drop_minmax=True):
                     d["mid"] = 0.5 * (d["max"] + d["min"])
 
                 if drop_minmax:
-                    # TODO: for now we are dropping min/max information since it didn't exist in the past
                     d["min"] = None
                     d["max"] = None
                 else:
@@ -786,6 +786,11 @@ def cm_energy(self):
             return None
         return float(energy_string[:-3].replace("P", ".")) * factor
 
+    @cached_property
+    def allowed_datasets(self):
+        """Return the implemented datasets as a list of ``{setname}_{observable_name}``"""
+        return [f"{self.setname}_{i.observable_name}" for i in self.implemented_observables]
+
     @cached_property
     def allowed_observables(self):
         """
@@ -809,7 +814,7 @@ def select_observable(self, obs_name_raw):
 
 
 @lru_cache
-def _parse_entire_set_metadata(metadata_file):
+def parse_set_metadata(metadata_file):
     """Read the metadata file"""
     return parse_yaml_inp(metadata_file, SetMetaData)
 
@@ -822,7 +827,7 @@ def parse_new_metadata(metadata_file, observable_name, variant=None):
     The triplet (metadata_file, observable_name, variant) define unequivocally the information
     to be parsed from the commondata library
     """
-    set_metadata = _parse_entire_set_metadata(metadata_file)
+    set_metadata = parse_set_metadata(metadata_file)
 
     # Select one observable from the entire metadata
     metadata = set_metadata.select_observable(observable_name)
diff --git a/validphys2/src/validphys/loader.py b/validphys2/src/validphys/loader.py
index c1093febe0..e08d8bacb9 100644
--- a/validphys2/src/validphys/loader.py
+++ b/validphys2/src/validphys/loader.py
@@ -2,6 +2,7 @@
 Resolve paths to useful objects, and query the existence of different
 resources within the specified paths.
 """
+
 import functools
 from functools import cached_property
 import logging
@@ -22,7 +23,7 @@
 from reportengine import filefinder
 from reportengine.compat import yaml
 from validphys import lhaindex
-from validphys.commondataparser import load_commondata_old, parse_new_metadata
+from validphys.commondataparser import load_commondata_old, parse_new_metadata, parse_set_metadata
 from validphys.core import (
     PDF,
     CommonDataSpec,
@@ -326,6 +327,18 @@ def available_datasets(self):
         old_datasets = [i for i in legacy_to_new_mapping.keys() if not i.startswith(skip)]
         return set(old_datasets)
 
+    @property
+    @functools.lru_cache()
+    def implemented_datasets(self):
+        """Provide all implemented datasets that can be found in the datafiles folder,
+        regardless of whether they can be used for fits (i.e., whether they include a theory),
+        are "fake" (integrability/positivity) or are missing some information.
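+        Each entry is a dataset name of the form ``{setname}_{observable_name}``.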
+        """
+        datasets = []
+        for metadata_file in self.commondata_folder.glob("*/metadata.yaml"):
+            datasets += parse_set_metadata(metadata_file).allowed_datasets
+        return datasets
+
     @property
     @functools.lru_cache()
     def available_pdfs(self):
diff --git a/validphys2/src/validphys/plotoptions/kintransforms.py b/validphys2/src/validphys/plotoptions/kintransforms.py
index a37ae6f6ec..6473b54f9e 100644
--- a/validphys2/src/validphys/plotoptions/kintransforms.py
+++ b/validphys2/src/validphys/plotoptions/kintransforms.py
@@ -98,8 +98,9 @@ def new_labels(self, k1, k2, k3):
         return k1, k2, k3
 
     def xq2map(self, k1, k2, k3, **extra_labels):
-        # This is going to be a problem
-        return k1, k2
+        raise NotImplementedError(
+            "xq2map is not implemented for this dataset (kin_override set to identity and process_options not implemented)"
+        )
 
 
 class Kintransform(metaclass=abc.ABCMeta):
diff --git a/validphys2/src/validphys/process_options.py b/validphys2/src/validphys/process_options.py
index dfba77a6cf..fea8aee6da 100644
--- a/validphys2/src/validphys/process_options.py
+++ b/validphys2/src/validphys/process_options.py
@@ -3,6 +3,7 @@
 Only variables included in the `_Vars` enum and processes included in the
 ``Processes`` dictionary are allowed.
 """
+
 import dataclasses
 from typing import Callable, Optional, Tuple, Union
 
@@ -66,7 +67,7 @@ class _Process:
     def __hash__(self):
         return hash(self.name)
 
-    def same_kin_variables(self, kin_cov):
+    def are_accepted_variables(self, kin_cov):
         """Check if the kinematic variables from the kinematic coverage
         are the same of the accepted variables."""
         # Accepting in any case the legacy variables
@@ -88,7 +89,7 @@ def xq2map(self, kin_df, metadata):
 
         # Check if the kinematic variables defined in metadata corresponds to the
         # accepted variables
-        if not self.same_kin_variables(metadata.kinematic_coverage):
+        if not self.are_accepted_variables(metadata.kinematic_coverage):
             raise NotImplementedError(
                 f"kinematic variables are not supported for process {self.name}. You are using {metadata.kinematic_coverage}, please use {self.accepted_variables} ({metadata.name})"
             )
@@ -159,7 +160,8 @@ def _hqp_yq_xq2map(kin_dict):
 
 def _hqp_yqq_xq2map(kin_dict):
     # Compute x, Q2
-    ratio = np.sqrt(kin_dict[_Vars.m_t2]) / kin_dict[_Vars.sqrts]
+    mass2 = _get_or_fail(kin_dict, [_Vars.m_t2, _Vars.m_ttBar])
+    ratio = np.sqrt(mass2) / kin_dict[_Vars.sqrts]
     x1 = ratio * np.exp(kin_dict[_Vars.y_ttBar])
     x2 = ratio * np.exp(-kin_dict[_Vars.y_ttBar])
-    q2 = kin_dict[_Vars.m_t2]
+    q2 = mass2
@@ -218,14 +220,14 @@ def _displusjet_xq2map(kin_dict):
 HQP_YQQ = _Process(
     "HQP_YQQ",
     "Differential cross section w.r.t. absolute rapidity of ttBar",
-    accepted_variables=(_Vars.y_ttBar, _Vars.m_t2, _Vars.sqrts),
+    accepted_variables=(_Vars.y_ttBar, _Vars.m_t2, _Vars.sqrts, _Vars.m_ttBar),
     xq2map_function=_hqp_yqq_xq2map,
 )
 
 HQP_PTQ = _Process(
     "HQP_PTQ",
     "Normalized double differential cross section w.r.t. absolute rapidity and transverse momentum of t",
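+    # assumption (the body of _hqp_ptq_xq2map is not shown in this diff):
+    # m_t2 is accepted below so that the top mass can enter the Q2 scale of the map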
-    accepted_variables=(_Vars.pT_t, _Vars.y_t, _Vars.sqrts),
+    accepted_variables=(_Vars.pT_t, _Vars.y_t, _Vars.sqrts, _Vars.m_t2),
     xq2map_function=_hqp_ptq_xq2map,
 )
diff --git a/validphys2/src/validphys/tests/test_datafiles.py b/validphys2/src/validphys/tests/test_datafiles.py
new file mode 100644
index 0000000000..e0c51c6186
--- /dev/null
+++ b/validphys2/src/validphys/tests/test_datafiles.py
@@ -0,0 +1,94 @@
+"""
+    Test all datafiles
+
+    All tests are under ``test_all_datasets``, run once per dataset so that one gets
+    at most one failure per dataset in case of problems
+"""
+
+import pytest
+
+from validphys.covmats import INTRA_DATASET_SYS_NAME
+from validphys.loader import FallbackLoader
+from validphys.plotoptions.kintransforms import identity as kintransform_identity
+
+l = FallbackLoader()
+all_datasets = sorted(l.implemented_datasets)
+
+
+def _load_main_and_variants(dataset_name):
+    """Given a dataset name, returns a list with the default load and all variants"""
+    cds = [l.check_commondata(dataset_name)]
+    for variant_name in cds[0].metadata.variants:
+        cds.append(l.check_commondata(dataset_name, variant=variant_name))
+    return cds
+
+
+@pytest.mark.parametrize("dataset_name", all_datasets)
+def test_all_datasets(dataset_name):
+    """Checks that a dataset can be loaded (together with its variants),
+    and that its kinematics, uncertainties and data can be read
+    """
+    # Load the data and all its variants
+    cds = _load_main_and_variants(dataset_name)
+    main_cd = cds[0]
+
+    # Check ndata
+    ndata = main_cd.ndata
+
+    # kinematics check
+    _ = main_cd.metadata.load_kinematics(drop_minmax=False)
+    kin_df = main_cd.metadata.load_kinematics()
+
+    # Check that the kinematic coverage is contained in the kinematics dataframe
+    kin_cov = main_cd.metadata.kinematic_coverage
+    assert set(kin_cov) <= set(kin_df.columns.get_level_values(0))
+
+    process_type = main_cd.metadata.process_type
+
+    # check whether the kin override is set to the identity
+    # and if so, check that the process_type is not simply a string
+    kin_override = main_cd.metadata.plotting.kinematics_override
+    if isinstance(kin_override, kintransform_identity) and isinstance(process_type, str):
+        # Skip for the time being the processes for which there is no implementation but have been
+        # merged to master: issue #1991
+        if process_type not in ("HQP_MQQ", "INC"):
+            raise NotImplementedError(f"The {process_type=} is not implemented in process_options")
+
+    elif not isinstance(process_type, str):
+        if not process_type.are_accepted_variables(kin_cov):
+            raise ValueError(
+                f"The dataset {dataset_name} uses {kin_cov} while accepted variables for {process_type} are {process_type.accepted_variables}"
+            )
+
+    # load the central data for every variant
+    all_dc = [cd.metadata.load_data_central() for cd in cds]
+    # and check they have the same length (it should've been tested internally already)
+    assert all(len(i) == ndata for i in all_dc)
+
+    # check the uncertainties can be loaded
+    # note that due to legacy data there might be datasets without data_uncertainties,
+    # but that would only happen for the default, non-variant, commondata (member 0 of the list)
+    all_unc = [cd.metadata.load_uncertainties() for cd in cds[1:]]
+    if main_cd.metadata.data_uncertainties:
+        all_unc.insert(0, main_cd.metadata.load_uncertainties())
+
+    for unc in all_unc:
+        # Check that, if present, the special `stat` key is ADD and UNCORR
+        if "stat" in unc:
+            stat = unc["stat"]
+            assert stat.columns[0][0] == "ADD"
+            assert stat.columns[0][1] == "UNCORR"
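+
+        # Types outside INTRA_DATASET_SYS_NAME (and the skipped SKIP/SCALEVAR
+        # entries) correlate across datasets, hence the uniqueness check below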
+        intra_dataset = unc.columns.get_level_values("type").isin(
+            list(INTRA_DATASET_SYS_NAME) + ["SKIP", "SCALEVAR"]
+        )
+
+        # Check that inter dataset correlations are unique
+        inter_dataset_corrs = unc.loc[:, ~intra_dataset]
+        inter_types = inter_dataset_corrs.columns.get_level_values("type")
+        if not inter_types.is_unique:
+            raise ValueError(
+                f"The inter-dataset uncertainties for {dataset_name} are not unique: {inter_types.value_counts()}"
+            )
+
+        # Check that all treatments are either MULT or ADD
+        assert set(unc.columns.get_level_values("treatment").unique()) <= {"MULT", "ADD"}
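+
+
+if __name__ == "__main__":
+    # Hypothetical convenience entry point, not required by pytest (which
+    # collects test_all_datasets on its own): run the same checks for one or
+    # more datasets given on the command line, or for everything when no
+    # argument is given, e.g. ``python test_datafiles.py <DATASET_NAME>``
+    import sys
+
+    for name in sys.argv[1:] or all_datasets:
+        test_all_datasets(name)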