
Commit 1ed351c: updating metadata
jacoterh committed Oct 31, 2024
1 parent 70a667b commit 1ed351c
Showing 8 changed files with 426 additions and 1,722 deletions.
@@ -0,0 +1,4 @@
data_central:
- 8.67300000e-01
- 1.19130000e+00
- 0.437
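(These three entries are presumably the shifted central values for the retained rapidity bins, written by the updated dump_commondata below to data_T-Y-NORM.yaml; the file name itself is not shown in this view.)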
175 changes: 84 additions & 91 deletions nnpdf_data/nnpdf_data/commondata/ATLAS_SINGLETOP_7TEV/filter.py
@@ -1,24 +1,20 @@
import pathlib

import numpy as np
import pandas
import pandas as pd
import yaml

from nnpdf_data.filter_utils.utils import prettify_float

yaml.add_representer(float, prettify_float)
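# prettify_float is assumed to render floats in a fixed-width scientific
# notation when dumping YAML, as seen in the generated data files below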

NB_POINTS = 3
MZ_VALUE = 91.1876 # GeV
MW_VALUE = 80.398 # GeV
NB_POINTS = 4
MT_VALUE = 172.5
SQRT_S = 7_000.0

# Correct tables to read values [[Z], [W+, W-]]
TABLES = {1: [0], 4: [0, 2]} # {table_id: [indexes]}

from nnpdf_data.filter_utils.utils import cormat_to_covmat, covmat_to_artunc
from nnpdf_data.filter_utils.utils import symmetrize_errors as se
from nnpdf_data.filter_utils.utils import cormat_to_covmat
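
For context on the helpers imported above: a minimal sketch, assuming
symmetrize_errors implements the usual d'Agostini-style prescription, returning
the relative shift applied to the central value and the symmetrised width.

import numpy as np

def symmetrize_errors_sketch(delta_plus: float, delta_minus: float):
    # the semi-difference shifts the central value, the average sets the width
    semi_diff = (delta_plus + delta_minus) / 2
    average = (delta_plus - delta_minus) / 2
    se_delta = semi_diff
    se_sigma = np.sqrt(average**2 + 2 * semi_diff**2)
    return se_delta, se_sigma

# hypothetical +5%/-3% asymmetric pair:
# se_delta, se_sigma = symmetrize_errors_sketch(0.05, -0.03)
# -> se_delta = 0.01 (a 1% upward shift); se_sigma = sqrt(0.0016 + 0.0002) ≈ 0.0424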


def load_yaml(table_id: int, version: int = 1) -> dict:
"""Load the HEP data table in yaml format.
@@ -39,6 +35,7 @@ def load_yaml(table_id: int, version: int = 1) -> dict:

return yaml.safe_load(table.read_text())


def get_kinematics(hepdata: dict) -> list:
"""Read the version and list of tables from metadata.
@@ -59,14 +56,15 @@ def get_kinematics(hepdata: dict) -> list:
for bin in rapbins[:-1]: # exclude the last bin (normalised)
ymin, ymax = [float(value) for value in bin["value"].split('-')]
kin_value = {
"k1": {"min": ymin, "mid": (ymin + ymax) / 2, "max": ymax},
"k2": {"min": None, "mid": MT_VALUE ** 2, "max": None},
"k3": {"min": None, "mid": SQRT_S, "max": None},
"y": {"min": ymin, "mid": (ymin + ymax) / 2, "max": ymax},
"M2": {"min": None, "mid": MT_VALUE**2, "max": None},
"sqrts": {"min": None, "mid": SQRT_S, "max": None},
}
kinematics.append(kin_value)

return kinematics
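
Note the rename of the generic k1/k2/k3 keys to the physical labels y, M2 and
sqrts. With MT_VALUE = 172.5 GeV the fixed M2 midpoint is 172.5**2 = 29756.25
GeV^2, exactly the value that replaces the old 30032.89 (= 173.3**2) in the
kinematics YAML further down.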


def get_data_values(hepdata: dict, indx: int = 0) -> list:
"""Extract the central values from the HepData yaml file.
@@ -86,6 +84,7 @@ def get_data_values(hepdata: dict, indx: int = 0) -> list:
central = hepdata["dependent_variables"][indx]["values"]
return np.array([central[i]["value"] for i in range(len(central))])


def get_errors(hepdata: list) -> tuple:
"""Extract the central values and uncertainties from the HepData yaml files.
Parameters
@@ -99,25 +98,11 @@ def get_errors(hepdata: list) -> dict:
source of uncertainties
"""


central_values = get_data_values(hepdata[0])

# parse the relative statistical uncertainties
abs_stat_uncs = []
rel_stat_uncs = []
for i, errors in enumerate(hepdata[0]["dependent_variables"][2]["values"]):
rel_stat_unc = errors["errors"][1]["symerror"]
rel_stat_uncs.append(rel_stat_unc)
abs_stat_uncs.append(central_values[i] * rel_stat_unc)

# statistical correlated uncertainties
cormat_list = [corr["value"] for corr in hepdata[1]["dependent_variables"][0]["values"]]
stat_covmat = cormat_to_covmat(err_list=abs_stat_uncs, cormat_list=cormat_list)
central_values = get_data_values(hepdata[0]).reshape(-1, 1)

# parse the systematics
# for hepdata[2]["dependent_variables"]
uncertainties = []
for rapbin in hepdata[2]["dependent_variables"]:
rel_uncertainties = []
for i, rapbin in enumerate(hepdata[2]["dependent_variables"]):

# loop over sources of uncertainty
uncertainties_rapbin = []
@@ -127,32 +112,51 @@ def get_errors(hepdata: list) -> dict:
if "symerror" in source["errors"][0]:
uncertainties_rapbin.append(source["errors"][0]["symerror"])
elif "asymerror" in source["errors"][0]:

delta_min = source["errors"][0]["asymerror"]["minus"]
delta_plus = source["errors"][0]["asymerror"]["plus"]
se_delta, se_sigma = se(delta_plus, delta_min)
#TODO shift central value

# shift central value. Note se_delta is the relative symmetrised unc
central_values[i] += se_delta * central_values[i]

uncertainties_rapbin.append(se_sigma)
uncertainties.append(uncertainties_rapbin)
rel_uncertainties.append(uncertainties_rapbin)
rel_uncertainties = np.array(rel_uncertainties)
abs_uncertainties = rel_uncertainties * central_values

# normalised distribution so drop the last bin and separate stat from systematics
central_values = central_values.flatten()[:-1]
abs_uncertainties = abs_uncertainties[:-1, :]
stat_unc = abs_uncertainties[:, 0]
sys_unc = abs_uncertainties[:, 1:-2]

stat = []
for data_i in hepdata[0]["dependent_variables"][0]["values"]:
# statistical correlated uncertainties
stat_cor_dict = hepdata[1]["dependent_variables"][0]["values"]
stat_cor = np.array([corr["value"] for corr in stat_cor_dict]).reshape(NB_POINTS, NB_POINTS)
stat_cor = stat_cor[:-1, :-1].flatten()
stat_covmat = cormat_to_covmat(err_list=stat_unc, cormat_list=stat_cor)

stat_i = data_i["errors"][0]["symerror"]
stat.append(stat_i)
# convert stat covmat to artificial systematics
stat_art = np.array(covmat_to_artunc(NB_POINTS - 1, stat_covmat))

if "asymerror" in data_i["errors"][1]:
delta_min = data_i["errors"][1]["asymerror"]["minus"]
delta_plus = data_i["errors"][1]["asymerror"]["plus"]
se_delta, se_sigma = se(delta_plus, delta_min)
else:
se_delta = 0
se_sigma = data_i["errors"][1]["symerror"]
# combine stat and sys uncertainties
sys_unc_all = np.concatenate([stat_art, sys_unc], axis=1)

cv_i = data_i["value"] + se_delta
systematics.append(se_sigma)
central_values.append(cv_i)
sys_names_dict = hepdata[2]["independent_variables"][0]["values"]
sys_to_drop = ["Data statistical", "Total systematic", "Total"]
stat_names = [f"Statistical uncertainty {i + 1}" for i in range(NB_POINTS - 1)]
sys_names = stat_names + [
key["value"]
for key in sys_names_dict
if len(key["value"]) > 0 and key["value"] not in sys_to_drop
]
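# dropping "Data statistical", "Total systematic" and "Total" mirrors the
# slicing above: column 0 holds the statistical error and the last two columns
# hold the totals, so the surviving names line up with the sys_unc columns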

sys_df = pd.DataFrame(
sys_unc_all, columns=sys_names, index=[f"Rapbin {i}" for i in range(NB_POINTS - 1)]
)
return central_values, {"systematics": sys_df}

return central_values, {"stat": stat, "sys_corr": systematics}

def format_uncertainties(uncs: dict) -> list:
"""Format the uncertainties to be dumped into the yaml file.
@@ -170,15 +174,18 @@ def format_uncertainties(uncs: dict) -> list:
"""

combined_errors = []
for i in range(NB_POINTS):
error_value = {}
error_value["stat"] = uncs["stat"][i]
for j, sys in enumerate(uncs["sys_corr"][i]):
error_value[f"sys_corr_{j+1}"] = float(sys)
combined_errors.append(error_value)
for i in range(NB_POINTS - 1):
errors = {}
for j, unc in enumerate(uncs["systematics"].iloc[i, :].values):
if j < 3:
errors[f"stat_corr_{j + 1}"] = float(unc)
else:
errors[f"sys_corr_{j + 1}"] = float(unc)
combined_errors.append(errors)

return combined_errors
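
Each entry in combined_errors thus mixes the three artificial statistical
columns with the genuine systematics; schematically, one bin looks like
(hypothetical values):

# {
#     "stat_corr_1": 0.0123,
#     "stat_corr_2": -0.0041,
#     "stat_corr_3": 0.0007,
#     "sys_corr_1": 0.0210,
#     ...
# }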


def dump_commondata(kinematics: list, data: list, errors: dict) -> None:
"""Function that generates and writes the commondata files.
@@ -193,62 +200,48 @@ def dump_commondata(kinematics: list, data: list, errors: dict) -> None:
"""

error_definition = {"stat": {
"description": "Uncorrelated statistical uncertainties",
"treatment": "ADD",
"type": "UNCORR"
}
}
error_definition = {}

n_sys = errors["sys_corr"].shape[1]
n_sys = errors["systematics"].shape[1]

for i in range(n_sys):
error_definition[f"sys_corr_{i + 1}"] = {
"description": f"Systematic uncertainty {i + 1}",
"treatment": "ADD",
"type": "CORR",
}

# update lumi entry
error_definition[f'sys_corr_{n_sys}']['type'] = "UNCORR"

error_definition["stat"] = {
"description": "Uncorrelated statistical uncertainties",
"treatment": "ADD",
"type": "UNCORR",
}
if i < 3:
error_definition[f"stat_corr_{i + 1}"] = {
"description": errors["systematics"].columns[i],
"treatment": "ADD",
"type": "CORR",
}
else:
error_definition[f"sys_corr_{i - 2}"] = {
"description": errors["systematics"].columns[i],
"treatment": "ADD",
"type": "CORR",
}

errors_formatted = format_uncertainties(errors)
with open("data_T-Y-NORM.yaml", "w") as file:
yaml.dump({"data_central": data.tolist()}, file, sort_keys=False)

with open("data_ASY.yaml", "w") as file:
yaml.dump({"data_central": data}, file, sort_keys=False)

with open("kinematics_ASY.yaml", "w") as file:
with open("kinematics_T-Y-NORM.yaml", "w") as file:
yaml.dump({"bins": kinematics}, file, sort_keys=False)

with open("uncertainties_ASY.yaml", "w") as file:
yaml.dump({"definitions": error_definition, "bins": errors_formatted}, file, sort_keys=False)
with open("uncertainties_T-Y_NORM.yaml", "w") as file:
yaml.dump(
{"definitions": error_definition, "bins": errors_formatted}, file, sort_keys=False
)

def main_filter() -> None:
"""Main driver of the filter that produces commmondata.

There are four main different sources of uncertainties.
1. Statistical uncertainties: ADD, UNCORR
2. Correlated Systematic uncertainties: ADD, CORR
3. Uncorrelated Systematic uncertainties: ADD, UNCORR
"""
def main_filter() -> None:
"""Main driver of the filter that produces commmondata."""

yaml_content_data = load_yaml(table_id=17, version=2)
yaml_stat_corr = load_yaml(table_id=34, version=2)
yaml_sys_sources = load_yaml(table_id=26, version=2)

# yaml_content_uncertainties = load_yaml(table_id=3, version=1)
kinematics = get_kinematics(yaml_content_data)
data_central = get_data_values(yaml_content_data)
uncertainties = get_errors([yaml_content_data, yaml_stat_corr, yaml_sys_sources])
data_central, uncertainties = get_errors([yaml_content_data, yaml_stat_corr, yaml_sys_sources])

# Generate all the necessary files
dump_commondata(kinematics, data_central, uncertainties)
@@ -257,4 +250,4 @@ def main_filter() -> None:


if __name__ == "__main__":
main_filter()
main_filter()
@@ -1,37 +1,37 @@
bins:
- k1:
min: null
- y:
min: 0.0
mid: 0.1
max: null
k2:
max: 0.2
M2:
min: null
mid: 30032.89
mid: 29756.25
max: null
k3:
sqrts:
min: null
mid: 7000.0
max: null
- k1:
min: null
- y:
min: 0.2
mid: 0.4
max: null
k2:
max: 0.6
M2:
min: null
mid: 30032.89
mid: 29756.25
max: null
k3:
sqrts:
min: null
mid: 7000.0
max: null
- k1:
min: null
mid: 0.85
max: null
k2:
- y:
min: 0.6
mid: 8.50000000e-01
max: 1.1
M2:
min: null
mid: 30032.89
mid: 29756.25
max: null
k3:
sqrts:
min: null
mid: 7000.0
max: null