
Commit 1ed351c: updating metadata
jacoterh committed Oct 31, 2024
1 parent 70a667b commit 1ed351c
Showing 8 changed files with 426 additions and 1,722 deletions.
@@ -0,0 +1,4 @@
data_central:
- 8.67300000e-01
- 1.19130000e+00
- 0.437
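(These three entries are presumably the shifted central values for the retained rapidity bins, written by the updated dump_commondata below to data_T-Y-NORM.yaml; the file name itself is not shown in this view.)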
175 changes: 84 additions & 91 deletions nnpdf_data/nnpdf_data/commondata/ATLAS_SINGLETOP_7TEV/filter.py
@@ -1,24 +1,20 @@
import pathlib

import numpy as np
import pandas
import pandas as pd
import yaml

from nnpdf_data.filter_utils.utils import prettify_float

yaml.add_representer(float, prettify_float)
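# prettify_float is assumed to render floats in a fixed-width scientific
# notation when dumping YAML, as seen in the generated data files below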

NB_POINTS = 3
MZ_VALUE = 91.1876 # GeV
MW_VALUE = 80.398 # GeV
NB_POINTS = 4
MT_VALUE = 172.5
SQRT_S = 7_000.0

# Correct tables to read values [[Z], [W+, W-]]
TABLES = {1: [0], 4: [0, 2]} # {table_id: [indexes]}

from nnpdf_data.filter_utils.utils import cormat_to_covmat, covmat_to_artunc
from nnpdf_data.filter_utils.utils import symmetrize_errors as se
from nnpdf_data.filter_utils.utils import cormat_to_covmat
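
For context on the helpers imported above: a minimal sketch, assuming
symmetrize_errors implements the usual d'Agostini-style prescription, returning
the relative shift applied to the central value and the symmetrised width.

import numpy as np

def symmetrize_errors_sketch(delta_plus: float, delta_minus: float):
    # the semi-difference shifts the central value, the average sets the width
    semi_diff = (delta_plus + delta_minus) / 2
    average = (delta_plus - delta_minus) / 2
    se_delta = semi_diff
    se_sigma = np.sqrt(average**2 + 2 * semi_diff**2)
    return se_delta, se_sigma

# hypothetical +5%/-3% asymmetric pair:
# se_delta, se_sigma = symmetrize_errors_sketch(0.05, -0.03)
# -> se_delta = 0.01 (a 1% upward shift); se_sigma = sqrt(0.0016 + 0.0002) ≈ 0.0424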


def load_yaml(table_id: int, version: int = 1) -> dict:
"""Load the HEP data table in yaml format.
@@ -39,6 +35,7 @@ def load_yaml(table_id: int, version: int = 1) -> dict:

return yaml.safe_load(table.read_text())


def get_kinematics(hepdata: dict) -> list:
"""Read the version and list of tables from metadata.
@@ -59,14 +56,15 @@ def get_kinematics(hepdata: dict) -> list:
for bin in rapbins[:-1]: # exclude the last bin (normalised)
ymin, ymax = [float(value) for value in bin["value"].split('-')]
kin_value = {
"k1": {"min": ymin, "mid": (ymin + ymax) / 2, "max": ymax},
"k2": {"min": None, "mid": MT_VALUE ** 2, "max": None},
"k3": {"min": None, "mid": SQRT_S, "max": None},
"y": {"min": ymin, "mid": (ymin + ymax) / 2, "max": ymax},
"M2": {"min": None, "mid": MT_VALUE**2, "max": None},
"sqrts": {"min": None, "mid": SQRT_S, "max": None},
}
kinematics.append(kin_value)

return kinematics
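
Note the rename of the generic k1/k2/k3 keys to the physical labels y, M2 and
sqrts. With MT_VALUE = 172.5 GeV the fixed M2 midpoint is 172.5**2 = 29756.25
GeV^2, exactly the value that replaces the old 30032.89 (= 173.3**2) in the
kinematics YAML further down.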


def get_data_values(hepdata: dict, indx: int = 0) -> list:
"""Extract the central values from the HepData yaml file.
@@ -86,6 +84,7 @@ def get_data_values(hepdata: dict, indx: int = 0) -> list:
central = hepdata["dependent_variables"][indx]["values"]
return np.array([central[i]["value"] for i in range(len(central))])


def get_errors(hepdata: list) -> tuple:
"""Extract the central values and uncertainties from the HepData yaml files.
Parameters
@@ -99,25 +98,11 @@ def get_errors(hepdata: list) -> dict:
source of uncertainties
"""


central_values = get_data_values(hepdata[0])

# parse the relative statistical uncertainties
abs_stat_uncs = []
rel_stat_uncs = []
for i, errors in enumerate(hepdata[0]["dependent_variables"][2]["values"]):
rel_stat_unc = errors["errors"][1]["symerror"]
rel_stat_uncs.append(rel_stat_unc)
abs_stat_uncs.append(central_values[i] * rel_stat_unc)

# statistical correlated uncertainties
cormat_list = [corr["value"] for corr in hepdata[1]["dependent_variables"][0]["values"]]
stat_covmat = cormat_to_covmat(err_list=abs_stat_uncs, cormat_list=cormat_list)
central_values = get_data_values(hepdata[0]).reshape(-1, 1)

# parse the systematics
# for hepdata[2]["dependent_variables"]
uncertainties = []
for rapbin in hepdata[2]["dependent_variables"]:
rel_uncertainties = []
for i, rapbin in enumerate(hepdata[2]["dependent_variables"]):

# loop over sources of uncertainty
uncertainties_rapbin = []
@@ -127,32 +112,51 @@ def get_errors(hepdata: list) -> dict:
if "symerror" in source["errors"][0]:
uncertainties_rapbin.append(source["errors"][0]["symerror"])
elif "asymerror" in source["errors"][0]:

delta_min = source["errors"][0]["asymerror"]["minus"]
delta_plus = source["errors"][0]["asymerror"]["plus"]
se_delta, se_sigma = se(delta_plus, delta_min)
#TODO shift central value

# shift central value. Note se_delta is the relative symmetrised unc
central_values[i] += se_delta * central_values[i]

uncertainties_rapbin.append(se_sigma)
uncertainties.append(uncertainties_rapbin)
rel_uncertainties.append(uncertainties_rapbin)
rel_uncertainties = np.array(rel_uncertainties)
abs_uncertainties = rel_uncertainties * central_values

# normalised distribution so drop the last bin and separate stat from systematics
central_values = central_values.flatten()[:-1]
abs_uncertainties = abs_uncertainties[:-1, :]
stat_unc = abs_uncertainties[:, 0]
sys_unc = abs_uncertainties[:, 1:-2]

stat = []
for data_i in hepdata[0]["dependent_variables"][0]["values"]:
# statistical correlated uncertainties
stat_cor_dict = hepdata[1]["dependent_variables"][0]["values"]
stat_cor = np.array([corr["value"] for corr in stat_cor_dict]).reshape(NB_POINTS, NB_POINTS)
stat_cor = stat_cor[:-1, :-1].flatten()
stat_covmat = cormat_to_covmat(err_list=stat_unc, cormat_list=stat_cor)

stat_i = data_i["errors"][0]["symerror"]
stat.append(stat_i)
# convert stat covmat to artificial systematics
stat_art = np.array(covmat_to_artunc(NB_POINTS - 1, stat_covmat))

if "asymerror" in data_i["errors"][1]:
delta_min = data_i["errors"][1]["asymerror"]["minus"]
delta_plus = data_i["errors"][1]["asymerror"]["plus"]
se_delta, se_sigma = se(delta_plus, delta_min)
else:
se_delta = 0
se_sigma = data_i["errors"][1]["symerror"]
# combine stat and sys uncertainties
sys_unc_all = np.concatenate([stat_art, sys_unc], axis=1)

cv_i = data_i["value"] + se_delta
systematics.append(se_sigma)
central_values.append(cv_i)
sys_names_dict = hepdata[2]["independent_variables"][0]["values"]
sys_to_drop = ["Data statistical", "Total systematic", "Total"]
stat_names = [f"Statistical uncertainty {i + 1}" for i in range(NB_POINTS - 1)]
sys_names = stat_names + [
key["value"]
for key in sys_names_dict
if len(key["value"]) > 0 and key["value"] not in sys_to_drop
]
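# dropping "Data statistical", "Total systematic" and "Total" mirrors the
# slicing above: column 0 holds the statistical error and the last two columns
# hold the totals, so the surviving names line up with the sys_unc columns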

sys_df = pd.DataFrame(
sys_unc_all, columns=sys_names, index=[f"Rapbin {i}" for i in range(NB_POINTS - 1)]
)
return central_values, {"systematics": sys_df}

return central_values, {"stat": stat, "sys_corr": systematics}

def format_uncertainties(uncs: dict) -> list:
"""Format the uncertainties to be dumped into the yaml file.
@@ -170,15 +174,18 @@ def format_uncertainties(uncs: dict) -> list:
"""

combined_errors = []
for i in range(NB_POINTS):
error_value = {}
error_value["stat"] = uncs["stat"][i]
for j, sys in enumerate(uncs["sys_corr"][i]):
error_value[f"sys_corr_{j+1}"] = float(sys)
combined_errors.append(error_value)
for i in range(NB_POINTS - 1):
errors = {}
for j, unc in enumerate(uncs["systematics"].iloc[i, :].values):
if j < 3:
errors[f"stat_corr_{j + 1}"] = float(unc)
else:
errors[f"sys_corr_{j + 1}"] = float(unc)
combined_errors.append(errors)

return combined_errors
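
Each entry in combined_errors thus mixes the three artificial statistical
columns with the genuine systematics; schematically, one bin looks like
(hypothetical values):

# {
#     "stat_corr_1": 0.0123,
#     "stat_corr_2": -0.0041,
#     "stat_corr_3": 0.0007,
#     "sys_corr_1": 0.0210,
#     ...
# }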


def dump_commondata(kinematics: list, data: list, errors: dict) -> None:
"""Function that generates and writes the commondata files.
@@ -193,62 +200,48 @@ def dump_commondata(kinematics: list, data: list, errors: dict) -> None:
"""

error_definition = {"stat": {
"description": "Uncorrelated statistical uncertainties",
"treatment": "ADD",
"type": "UNCORR"
}
}
error_definition = {}

n_sys = errors["sys_corr"].shape[1]
n_sys = errors["systematics"].shape[1]

for i in range(n_sys):
error_definition[f"sys_corr_{i + 1}"] = {
"description": f"Systematic uncertainty {i + 1}",
"treatment": "ADD",
"type": "CORR",
}

# update lumi entry
error_definition[f'sys_corr_{n_sys}']['type'] = "UNCORR"

error_definition["stat"] = {
"description": "Uncorrelated statistical uncertainties",
"treatment": "ADD",
"type": "UNCORR",
}
if i < 3:
error_definition[f"stat_corr_{i + 1}"] = {
"description": errors["systematics"].columns[i],
"treatment": "ADD",
"type": "CORR",
}
else:
error_definition[f"sys_corr_{i - 2}"] = {
"description": errors["systematics"].columns[i],
"treatment": "ADD",
"type": "CORR",
}

errors_formatted = format_uncertainties(errors)
with open("data_T-Y-NORM.yaml", "w") as file:
yaml.dump({"data_central": data.tolist()}, file, sort_keys=False)

with open("data_ASY.yaml", "w") as file:
yaml.dump({"data_central": data}, file, sort_keys=False)

with open("kinematics_ASY.yaml", "w") as file:
with open("kinematics_T-Y-NORM.yaml", "w") as file:
yaml.dump({"bins": kinematics}, file, sort_keys=False)

with open("uncertainties_ASY.yaml", "w") as file:
yaml.dump({"definitions": error_definition, "bins": errors_formatted}, file, sort_keys=False)
with open("uncertainties_T-Y_NORM.yaml", "w") as file:
yaml.dump(
{"definitions": error_definition, "bins": errors_formatted}, file, sort_keys=False
)

def main_filter() -> None:
"""Main driver of the filter that produces commmondata.

There are four main different sources of uncertainties.
1. Statistical uncertainties: ADD, UNCORR
2. Correlated Systematic uncertainties: ADD, CORR
3. Uncorrelated Systematic uncertainties: ADD, UNCORR
"""
def main_filter() -> None:
"""Main driver of the filter that produces commmondata."""

yaml_content_data = load_yaml(table_id=17, version=2)
yaml_stat_corr = load_yaml(table_id=34, version=2)
yaml_sys_sources = load_yaml(table_id=26, version=2)

# yaml_content_uncertainties = load_yaml(table_id=3, version=1)
kinematics = get_kinematics(yaml_content_data)
data_central = get_data_values(yaml_content_data)
uncertainties = get_errors([yaml_content_data, yaml_stat_corr, yaml_sys_sources])
data_central, uncertainties = get_errors([yaml_content_data, yaml_stat_corr, yaml_sys_sources])

# Generate all the necessary files
dump_commondata(kinematics, data_central, uncertainties)
@@ -257,4 +250,4 @@ def main_filter() -> None:


if __name__ == "__main__":
main_filter()
main_filter()
@@ -1,37 +1,37 @@
bins:
- k1:
min: null
- y:
min: 0.0
mid: 0.1
max: null
k2:
max: 0.2
M2:
min: null
mid: 30032.89
mid: 29756.25
max: null
k3:
sqrts:
min: null
mid: 7000.0
max: null
- k1:
min: null
- y:
min: 0.2
mid: 0.4
max: null
k2:
max: 0.6
M2:
min: null
mid: 30032.89
mid: 29756.25
max: null
k3:
sqrts:
min: null
mid: 7000.0
max: null
- k1:
min: null
mid: 0.85
max: null
k2:
- y:
min: 0.6
mid: 8.50000000e-01
max: 1.1
M2:
min: null
mid: 30032.89
mid: 29756.25
max: null
k3:
sqrts:
min: null
mid: 7000.0
max: null