Changes in rebinning task, removed negative weights from training
Lara813 committed Dec 19, 2024
1 parent e06606a commit 490fcc1
Showing 11 changed files with 507 additions and 96 deletions.
1 change: 1 addition & 0 deletions hbw/categorization/categories.py
@@ -272,6 +272,7 @@ def catid_2b(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, a
# TODO: not hard-coded -> use config?
ml_processes = [
"signal_ggf", "signal_ggf2", "signal_vbf", "signal_vbf2",
"signal_ggf4", "signal_ggf5", "signal_vbf4", "signal_vbf5",
"hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1",
"hh_ggf_hbb_hvvqqlnu_kl1_kt1", "hh_vbf_hbb_hvvqqlnu_kv1_k2v1_kl1",
"hh_ggf_hbb_hvv2l2nu_kl1_kt1", "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
27 changes: 27 additions & 0 deletions hbw/config/defaults_and_groups.py
@@ -128,6 +128,28 @@ def set_config_defaults_and_groups(config_inst):
# process groups for conveniently looping over certain processes
# (used in wrapper_factory and during plotting)
config_inst.x.process_groups = {
"ml_study": [
"hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4",
"hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3",
"hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96",
"hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1",
"hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1",
"hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43",
"hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94",
"hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36",
"hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39",
"hh_ggf_hbb_hww2l2nu_kl0_kt1",
"hh_ggf_hbb_hww2l2nu_kl1_kt1",
"hh_ggf_hbb_hww2l2nu_kl2p45_kt1",
"hh_ggf_hbb_hww2l2nu_kl5_kt1",
"st",
"tt",
"dy_m4to10", "dy_m10to50", "dy_m50toinf",
"w_lnu",
"vv",
"h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth",
],
"all": ["*"],
"default": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "dy", "st", "vv", "w_lnu", "h"], # noqa: E501
"sl": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "qcd", "st", "dy", "vv", "w_lnu", "h"], # noqa: E501
@@ -273,6 +295,10 @@ def set_config_defaults_and_groups(config_inst):
),
# Dilepton
"SR_dl": (
"sr__1b__ml_signal_ggf", "sr__1b__ml_signal_ggf2", "sr__2b__ml_signal_ggf", "sr__2b__ml_signal_ggf2",
"sr__1b__ml_signal_vbf", "sr__1b__ml_signal_vbf2", "sr__2b__ml_signal_vbf", "sr__2b__ml_signal_vbf2",
"sr__1b__ml_signal_ggf4", "sr__1b__ml_signal_ggf5", "sr__2b__ml_signal_ggf4", "sr__2b__ml_signal_ggf5",
"sr__1b__ml_signal_vbf4", "sr__1b__ml_signal_vbf5", "sr__2b__ml_signal_vbf4", "sr__2b__ml_signal_vbf5",
"sr__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1",
"sr__2mu__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2mu__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1",
"sr__2e__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2e__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1",
@@ -442,4 +468,5 @@ def set_config_defaults_and_groups(config_inst):
"vbfSR_dl_resolved": is_signal_sm,
"vbfSR_dl_boosted": is_signal_sm,
"BR_dl": is_background,

}
181 changes: 161 additions & 20 deletions hbw/inference/dl.py
@@ -153,6 +153,92 @@
# "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1",
# "hh_ggf_hbb_hvv2l2nu_kl5_kt1",

dl.derive("dl_ml_study_5", cls_dict={
"ml_model_name": "dl_22post_ml_study_5",
"config_categories": [
"sr__1b__ml_signal_ggf5",
"sr__1b__ml_signal_vbf5",
"sr__1b__ml_tt",
"sr__1b__ml_st",
"sr__1b__ml_dy",
"sr__1b__ml_h",
"sr__2b__ml_signal_ggf5",
"sr__2b__ml_signal_vbf5",
"sr__2b__ml_tt",
"sr__2b__ml_st",
"sr__2b__ml_dy",
"sr__2b__ml_h",
],
"processes": [
# qqHH_CV_m0p012_C2V_0p03_kl_10p2
# "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4",
"hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3",
"hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96",
"hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1",
"hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1",
"hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43",
"hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94",
"hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36",
"hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39",
"hh_ggf_hbb_hww2l2nu_kl0_kt1",
"hh_ggf_hbb_hww2l2nu_kl1_kt1",
"hh_ggf_hbb_hww2l2nu_kl2p45_kt1",
"hh_ggf_hbb_hww2l2nu_kl5_kt1",
"st",
"tt",
"dy",
"w_lnu",
"vv",
"h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth",
],
"systematics": rate_systematics,
})

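For orientation: `derive`, as used for these inference models, creates a named subclass of `dl` with the entries of `cls_dict` attached as class attributes. A rough sketch of the idea (hypothetical helper, not the actual columnflow implementation):

```python
# hypothetical sketch: a derive-style call boils down to creating a subclass
# of the base inference model with the cls_dict entries as class attributes
def derive_sketch(base_cls: type, name: str, cls_dict: dict) -> type:
    return type(name, (base_cls,), dict(cls_dict))

# comparable in spirit to: dl.derive("dl_ml_study_5", cls_dict={...})
# DlMlStudy5 = derive_sketch(dl, "dl_ml_study_5", {"ml_model_name": "dl_22post_ml_study_5"})
```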
dl.derive("dl_ml_study_4", cls_dict={
"ml_model_name": "dl_22post_ml_study_4",
"config_categories": [
"sr__1b__ml_signal_ggf4",
"sr__1b__ml_signal_vbf4",
"sr__1b__ml_tt",
"sr__1b__ml_st",
"sr__1b__ml_dy",
"sr__1b__ml_h",
"sr__2b__ml_signal_ggf4",
"sr__2b__ml_signal_vbf4",
"sr__2b__ml_tt",
"sr__2b__ml_st",
"sr__2b__ml_dy",
"sr__2b__ml_h",
],
"processes": [
# qqHH_CV_m0p012_C2V_0p03_kl_10p2
# "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4",
"hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3",
"hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96",
"hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1",
"hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1",
"hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43",
"hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94",
"hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36",
"hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39",
"hh_ggf_hbb_hww2l2nu_kl0_kt1",
"hh_ggf_hbb_hww2l2nu_kl1_kt1",
"hh_ggf_hbb_hww2l2nu_kl2p45_kt1",
"hh_ggf_hbb_hww2l2nu_kl5_kt1",
"st",
"tt",
"dy",
"w_lnu",
"vv",
"h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth",
],
"systematics": rate_systematics,
})


dl.derive("dl_ml_study_1", cls_dict={
"ml_model_name": "dl_22post_ml_study_1",
@@ -171,16 +257,65 @@
"sr__2b__ml_h",
],
"processes": [
"hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
"hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1",
"hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43",
"hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94",
"hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36",
"hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39",
"hh_ggf_hbb_hvv2l2nu_kl0_kt1",
"hh_ggf_hbb_hvv2l2nu_kl1_kt1",
"hh_ggf_hbb_hvv2l2nu_kl2p45_kt1",
"hh_ggf_hbb_hvv2l2nu_kl5_kt1",
# qqHH_CV_m0p012_C2V_0p03_kl_10p2
# "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4",
"hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3",
"hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96",
"hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1",
"hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1",
"hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43",
"hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94",
"hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36",
"hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39",
"hh_ggf_hbb_hww2l2nu_kl0_kt1",
"hh_ggf_hbb_hww2l2nu_kl1_kt1",
"hh_ggf_hbb_hww2l2nu_kl2p45_kt1",
"hh_ggf_hbb_hww2l2nu_kl5_kt1",
"st",
"tt",
"dy",
"w_lnu",
"vv",
"h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth",
],
"systematics": rate_systematics,
})

dl.derive("dl_ml_study_3", cls_dict={
"ml_model_name": "dl_22_procs1_w0",
"config_categories": [
"sr__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1",
"sr__1b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
"sr__1b__ml_tt",
"sr__1b__ml_st",
"sr__1b__ml_dy",
"sr__1b__ml_h",
"sr__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1",
"sr__2b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
"sr__2b__ml_tt",
"sr__2b__ml_st",
"sr__2b__ml_dy",
"sr__2b__ml_h",
],
"processes": [
# "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4",
"hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3",
"hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96",
"hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1",
"hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1",
"hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43",
"hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94",
"hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36",
"hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39",
"hh_ggf_hbb_hww2l2nu_kl0_kt1",
"hh_ggf_hbb_hww2l2nu_kl1_kt1",
"hh_ggf_hbb_hww2l2nu_kl2p45_kt1",
"hh_ggf_hbb_hww2l2nu_kl5_kt1",
"st",
"tt",
"dy",
"w_lnu",
Expand All @@ -207,16 +342,22 @@
"sr__2b__ml_h",
],
"processes": [
"hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
"hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1",
"hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43",
"hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94",
"hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36",
"hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39",
"hh_ggf_hbb_hvv2l2nu_kl0_kt1",
"hh_ggf_hbb_hvv2l2nu_kl1_kt1",
"hh_ggf_hbb_hvv2l2nu_kl2p45_kt1",
"hh_ggf_hbb_hvv2l2nu_kl5_kt1",
# "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4",
"hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3",
"hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96",
"hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1",
"hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1",
"hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43",
"hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94",
"hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36",
"hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39",
"hh_ggf_hbb_hww2l2nu_kl0_kt1",
"hh_ggf_hbb_hww2l2nu_kl1_kt1",
"hh_ggf_hbb_hww2l2nu_kl2p45_kt1",
"hh_ggf_hbb_hww2l2nu_kl5_kt1",
"st",
"tt",
"dy",
"w_lnu",
50 changes: 32 additions & 18 deletions hbw/ml/base.py
@@ -322,13 +322,23 @@ def open_model(self, target: law.LocalDirectoryTarget) -> dict[str, Any]:
models["parameters"] = yaml.load(f_in, Loader=yaml.Loader)

# custom loss needed due to output layer changes for negative weights
from hbw.ml.tf_util import cumulated_crossentropy
models["model"] = tf.keras.models.load_model(
target["mlmodel"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy},
)
models["best_model"] = tf.keras.models.load_model(
target["checkpoint"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy},
)
from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy

# Check for negative weight handling and assign loss function accordingly.
if self.negative_weights == "ignore":
models["model"] = tf.keras.models.load_model(
target["mlmodel"].path, custom_objects={categorical_crossentropy.__name__: categorical_crossentropy},
)
models["best_model"] = tf.keras.models.load_model(
target["checkpoint"].path, custom_objects={categorical_crossentropy.__name__: categorical_crossentropy},
)
else:
models["model"] = tf.keras.models.load_model(
target["mlmodel"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy},
)
models["best_model"] = tf.keras.models.load_model(
target["checkpoint"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy},
)

return models

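The two branches above differ only in which custom loss is passed via `custom_objects`. A minimal sketch of a helper that removes the duplication, assuming only what this diff shows (the helper name itself is not part of the repository):

```python
import tensorflow as tf
from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy

def _load_with_custom_loss(path: str, negative_weights: str):
    # pick the custom loss matching the negative-weight strategy and load the model with it
    loss_fn = categorical_crossentropy if negative_weights == "ignore" else cumulated_crossentropy
    return tf.keras.models.load_model(path, custom_objects={loss_fn.__name__: loss_fn})

# usage sketch inside open_model:
# models["model"] = _load_with_custom_loss(target["mlmodel"].path, self.negative_weights)
# models["best_model"] = _load_with_custom_loss(target["checkpoint"].path, self.negative_weights)
```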
@@ -360,7 +370,6 @@ def load_data(
# load into memory
validation.load_all
log_memory("loading validation data")

# store input features as an output
output["mlmodel"].child("input_features.pkl", type="f").dump(self.input_features_ordered, formatter="pickle")

@@ -401,7 +410,6 @@ def train(
#
# training
#

self.fit_ml_model(task, model, train, validation, output)
log_memory("training")
# save the model and history; TODO: use formatter
@@ -441,7 +449,7 @@ def evaluate(
"""
Evaluation function that is run as part of the MLEvaluation task
"""
use_best_model = False
use_best_model = False  # TODO (ML): should this be set to True?

if len(events) == 0:
logger.warning(f"Dataset {task.dataset} is empty. No columns are produced.")
Expand All @@ -454,7 +462,7 @@ def evaluate(
process = task.dataset_inst.x("ml_process", task.dataset_inst.processes.get_first().name)
process_inst = task.config_inst.get_process(process)

ml_dataset = self.data_loader(self, process_inst, events)
ml_dataset = self.data_loader(self, process_inst, events, skip_mask=True)

# # store the ml truth label in the events
# events = set_ak_column(
@@ -486,7 +494,6 @@
if len(pred[0]) != len(self.processes):
raise Exception("Number of output nodes should be equal to number of processes")
predictions.append(pred)

# store predictions for each model
for j, proc in enumerate(self.processes):
events = set_ak_column(
@@ -533,7 +540,7 @@ def prepare_ml_model(

from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from hbw.ml.tf_util import cumulated_crossentropy
from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy

n_inputs = len(set(self.input_features))
n_outputs = len(self.processes)
@@ -554,11 +561,18 @@
# compile the network
# NOTE: the custom loss needed due to output layer changes for negative weights
optimizer = keras.optimizers.Adam(learning_rate=0.00050)
model.compile(
loss=cumulated_crossentropy,
optimizer=optimizer,
weighted_metrics=["categorical_accuracy"],
)
if self.negative_weights == "ignore":
model.compile(
loss=categorical_crossentropy,
optimizer=optimizer,
weighted_metrics=["categorical_accuracy"],
)
else:
model.compile(
loss=cumulated_crossentropy,
optimizer=optimizer,
weighted_metrics=["categorical_accuracy"],
)

return model

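The custom `cumulated_crossentropy` lives in `hbw/ml/tf_util.py` and is not part of this diff. For orientation only, the `negative_weights == "ignore"` branch falls back to a plain categorical cross entropy; an illustrative stand-in with the Keras `(y_true, y_pred)` loss signature (not the repository's implementation) could look like this:

```python
import tensorflow as tf

def categorical_crossentropy_sketch(y_true, y_pred):
    # plain categorical cross entropy over one-hot targets,
    # clipped for numerical stability; illustrative only
    y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0)
    return -tf.reduce_sum(y_true * tf.math.log(y_pred), axis=-1)
```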
18 changes: 14 additions & 4 deletions hbw/ml/data_loader.py
@@ -119,7 +119,14 @@ class MLDatasetLoader:
# _ "equal_train_weights", "xsec_train_weights", "train_weights", "equal_weights")
evaluation_arrays: tuple = ("prediction",)

def __init__(self, ml_model_inst: MLModel, process: "str", events: ak.Array, stats: dict | None = None):
def __init__(
self,
ml_model_inst: MLModel,
process: "str",
events: ak.Array,
stats: dict | None = None,
skip_mask=False,
):
"""
Initializes the MLDatasetLoader with the given parameters.
@@ -136,10 +143,13 @@ def __init__(self, ml_model_inst: MLModel, process: "str", events: ak.Array, sta
self._process = process

proc_mask, _ = get_proc_mask(events, process, ml_model_inst.config_inst)
# TODO: the stats without _per_process still need handling as well; per-fold stats then no longer make sense -> DISCUSS
self._stats = stats
# del_sub_proc_stats(process, sub_id)
self._events = events[proc_mask]
if not skip_mask:
# keep only events of this process and drop negative-weight events for training
self._events = events[proc_mask & (events.event_weight >= 0.0)]
else:
self._events = events

def __repr__(self):
return f"{self.__class__.__name__}({self.ml_model_inst.cls_name}, {self.process})"
@@ -718,4 +728,4 @@ def prediction(self) -> np.ndarray:
raise Exception("No trained model found in the MLModel instance. Cannot calculate prediction.")
self._prediction = predict_numpy_on_batch(self._ml_model_inst.trained_model, self.features)

return self._prediction
return self._prediction  # TODO (ML): optionally use the best (checkpoint) model here
