From 490fcc107cef7e89800fa2544f1777ad3fc1b50a Mon Sep 17 00:00:00 2001 From: Lara Date: Thu, 19 Dec 2024 14:05:26 +0100 Subject: [PATCH 1/8] Changes in rebinning task, removed negative weights from training --- hbw/categorization/categories.py | 1 + hbw/config/defaults_and_groups.py | 27 +++++ hbw/inference/dl.py | 181 ++++++++++++++++++++++++---- hbw/ml/base.py | 50 +++++--- hbw/ml/data_loader.py | 18 ++- hbw/ml/derived/dl.py | 81 ++++++++++++- hbw/ml/mixins.py | 30 +++-- hbw/ml/tf_util.py | 5 + hbw/tasks/inference.py | 190 ++++++++++++++++++++++++------ hbw/tasks/ml.py | 15 ++- hbw/tasks/postfit_plots.py | 5 +- 11 files changed, 507 insertions(+), 96 deletions(-) diff --git a/hbw/categorization/categories.py b/hbw/categorization/categories.py index c5799e36..0320603b 100644 --- a/hbw/categorization/categories.py +++ b/hbw/categorization/categories.py @@ -272,6 +272,7 @@ def catid_2b(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, a # TODO: not hard-coded -> use config? ml_processes = [ "signal_ggf", "signal_ggf2", "signal_vbf", "signal_vbf2", + "signal_ggf4", "signal_ggf5", "signal_vbf4", "signal_vbf5", "hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "hh_ggf_hbb_hvvqqlnu_kl1_kt1", "hh_vbf_hbb_hvvqqlnu_kv1_k2v1_kl1", "hh_ggf_hbb_hvv2l2nu_kl1_kt1", "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", diff --git a/hbw/config/defaults_and_groups.py b/hbw/config/defaults_and_groups.py index d3351dc9..9bb728d7 100644 --- a/hbw/config/defaults_and_groups.py +++ b/hbw/config/defaults_and_groups.py @@ -128,6 +128,28 @@ def set_config_defaults_and_groups(config_inst): # process groups for conveniently looping over certain processs # (used in wrapper_factory and during plotting) config_inst.x.process_groups = { + "ml_study": [ + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy_m4to10", "dy_m10to50", "dy_m50toinf", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], "all": ["*"], "default": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "dy", "st", "vv", "w_lnu", "h"], # noqa: E501 "sl": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "qcd", "st", "dy", "vv", "w_lnu", "h"], # noqa: E501 @@ -273,6 +295,10 @@ def set_config_defaults_and_groups(config_inst): ), # Dilepton "SR_dl": ( + "sr__1b__ml_signal_ggf", "sr__1b__ml_signal_ggf2", "sr__2b__ml_signal_ggf", "sr__2b__ml_signal_ggf2", + "sr__1b__ml_signal_vbf", "sr__1b__ml_signal_vbf2", "sr__2b__ml_signal_vbf", "sr__2b__ml_signal_vbf2", + "sr__1b__ml_signal_ggf4", "sr__1b__ml_signal_ggf5", "sr__2b__ml_signal_ggf4", "sr__2b__ml_signal_ggf5", + "sr__1b__ml_signal_vbf4", "sr__1b__ml_signal_vbf5", "sr__2b__ml_signal_vbf4", "sr__2b__ml_signal_vbf5", "sr__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2mu__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2mu__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2e__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2e__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", @@ -442,4 +468,5 @@ def set_config_defaults_and_groups(config_inst): "vbfSR_dl_resolved": is_signal_sm, "vbfSR_dl_boosted": is_signal_sm, "BR_dl": is_background, + } diff --git a/hbw/inference/dl.py b/hbw/inference/dl.py index 9bde981f..a17c3dc2 100644 --- a/hbw/inference/dl.py +++ b/hbw/inference/dl.py @@ -153,6 +153,92 @@ # "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1", # "hh_ggf_hbb_hvv2l2nu_kl5_kt1", +dl.derive("dl_ml_study_5", cls_dict={ + "ml_model_name": "dl_22post_ml_study_5", + "config_categories": [ + "sr__1b__ml_signal_ggf5", + "sr__1b__ml_signal_vbf5", + "sr__1b__ml_tt", + "sr__1b__ml_st", + "sr__1b__ml_dy", + "sr__1b__ml_h", + "sr__2b__ml_signal_ggf5", + "sr__2b__ml_signal_vbf5", + "sr__2b__ml_tt", + "sr__2b__ml_st", + "sr__2b__ml_dy", + "sr__2b__ml_h", + ], + "processes": [ + # qqHH_CV_m0p012_C2V_0p03_kl_10p2 + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "systematics": rate_systematics, +}) + +dl.derive("dl_ml_study_4", cls_dict={ + "ml_model_name": "dl_22post_ml_study_4", + "config_categories": [ + "sr__1b__ml_signal_ggf4", + "sr__1b__ml_signal_vbf4", + "sr__1b__ml_tt", + "sr__1b__ml_st", + "sr__1b__ml_dy", + "sr__1b__ml_h", + "sr__2b__ml_signal_ggf4", + "sr__2b__ml_signal_vbf4", + "sr__2b__ml_tt", + "sr__2b__ml_st", + "sr__2b__ml_dy", + "sr__2b__ml_h", + ], + "processes": [ + # qqHH_CV_m0p012_C2V_0p03_kl_10p2 + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "systematics": rate_systematics, +}) + dl.derive("dl_ml_study_1", cls_dict={ "ml_model_name": "dl_22post_ml_study_1", @@ -171,16 +257,65 @@ "sr__2b__ml_h", ], "processes": [ - "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", - "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1", - "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43", - "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94", - "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36", - "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", - "hh_ggf_hbb_hvv2l2nu_kl0_kt1", - "hh_ggf_hbb_hvv2l2nu_kl1_kt1", - "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1", - "hh_ggf_hbb_hvv2l2nu_kl5_kt1", + # qqHH_CV_m0p012_C2V_0p03_kl_10p2 + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "systematics": rate_systematics, +}) + +dl.derive("dl_ml_study_3", cls_dict={ + "ml_model_name": "dl_22_procs1_w0", + "config_categories": [ + "sr__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "sr__1b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", + "sr__1b__ml_tt", + "sr__1b__ml_st", + "sr__1b__ml_dy", + "sr__1b__ml_h", + "sr__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "sr__2b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", + "sr__2b__ml_tt", + "sr__2b__ml_st", + "sr__2b__ml_dy", + "sr__2b__ml_h", + ], + "processes": [ + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", "tt", "dy", "w_lnu", @@ -207,16 +342,22 @@ "sr__2b__ml_h", ], "processes": [ - "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", - "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1", - "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43", - "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94", - "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36", - "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", - "hh_ggf_hbb_hvv2l2nu_kl0_kt1", - "hh_ggf_hbb_hvv2l2nu_kl1_kt1", - "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1", - "hh_ggf_hbb_hvv2l2nu_kl5_kt1", + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", "tt", "dy", "w_lnu", diff --git a/hbw/ml/base.py b/hbw/ml/base.py index c1774524..110d78a0 100644 --- a/hbw/ml/base.py +++ b/hbw/ml/base.py @@ -322,13 +322,23 @@ def open_model(self, target: law.LocalDirectoryTarget) -> dict[str, Any]: models["parameters"] = yaml.load(f_in, Loader=yaml.Loader) # custom loss needed due to output layer changes for negative weights - from hbw.ml.tf_util import cumulated_crossentropy - models["model"] = tf.keras.models.load_model( - target["mlmodel"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, - ) - models["best_model"] = tf.keras.models.load_model( - target["checkpoint"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, - ) + from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy + + # Check for negative weight handling and assign loss function accordingly. + if self.negative_weights == "ignore": + models["model"] = tf.keras.models.load_model( + target["mlmodel"].path, custom_objects={categorical_crossentropy.__name__: categorical_crossentropy}, + ) + models["best_model"] = tf.keras.models.load_model( + target["checkpoint"].path, custom_objects={categorical_crossentropy.__name__: categorical_crossentropy}, + ) + else: + models["model"] = tf.keras.models.load_model( + target["mlmodel"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, + ) + models["best_model"] = tf.keras.models.load_model( + target["checkpoint"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, + ) return models @@ -360,7 +370,6 @@ def load_data( # load into memory validation.load_all log_memory("loading validation data") - # store input features as an output output["mlmodel"].child("input_features.pkl", type="f").dump(self.input_features_ordered, formatter="pickle") @@ -401,7 +410,6 @@ def train( # # training # - self.fit_ml_model(task, model, train, validation, output) log_memory("training") # save the model and history; TODO: use formatter @@ -441,7 +449,7 @@ def evaluate( """ Evaluation function that is run as part of the MLEvaluation task """ - use_best_model = False + use_best_model = False # TODO ML, hier auf True setzen? if len(events) == 0: logger.warning(f"Dataset {task.dataset} is empty. No columns are produced.") @@ -454,7 +462,7 @@ def evaluate( process = task.dataset_inst.x("ml_process", task.dataset_inst.processes.get_first().name) process_inst = task.config_inst.get_process(process) - ml_dataset = self.data_loader(self, process_inst, events) + ml_dataset = self.data_loader(self, process_inst, events, skip_mask=True) # # store the ml truth label in the events # events = set_ak_column( @@ -486,7 +494,6 @@ def evaluate( if len(pred[0]) != len(self.processes): raise Exception("Number of output nodes should be equal to number of processes") predictions.append(pred) - # store predictions for each model for j, proc in enumerate(self.processes): events = set_ak_column( @@ -533,7 +540,7 @@ def prepare_ml_model( from keras.models import Sequential from keras.layers import Dense, BatchNormalization - from hbw.ml.tf_util import cumulated_crossentropy + from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy n_inputs = len(set(self.input_features)) n_outputs = len(self.processes) @@ -554,11 +561,18 @@ def prepare_ml_model( # compile the network # NOTE: the custom loss needed due to output layer changes for negative weights optimizer = keras.optimizers.Adam(learning_rate=0.00050) - model.compile( - loss=cumulated_crossentropy, - optimizer=optimizer, - weighted_metrics=["categorical_accuracy"], - ) + if self.negative_weights == "ignore": + model.compile( + loss=categorical_crossentropy, + optimizer=optimizer, + weighted_metrics=["categorical_accuracy"], + ) + else: + model.compile( + loss=cumulated_crossentropy, + optimizer=optimizer, + weighted_metrics=["categorical_accuracy"], + ) return model diff --git a/hbw/ml/data_loader.py b/hbw/ml/data_loader.py index 55f7b11f..e0d60c08 100644 --- a/hbw/ml/data_loader.py +++ b/hbw/ml/data_loader.py @@ -119,7 +119,14 @@ class MLDatasetLoader: # _ "equal_train_weights", "xsec_train_weights", "train_weights", "equal_weights") evaluation_arrays: tuple = ("prediction",) - def __init__(self, ml_model_inst: MLModel, process: "str", events: ak.Array, stats: dict | None = None): + def __init__( + self, + ml_model_inst: MLModel, + process: "str", + events: ak.Array, + stats: dict | None = None, + skip_mask=False, + ): """ Initializes the MLDatasetLoader with the given parameters. @@ -136,10 +143,13 @@ def __init__(self, ml_model_inst: MLModel, process: "str", events: ak.Array, sta self._process = process proc_mask, _ = get_proc_mask(events, process, ml_model_inst.config_inst) - # TODO: die ohne _per_process müssen auch noch, still, per fold never make sense then anymore -> DISCUSS self._stats = stats # del_sub_proc_stats(process, sub_id) - self._events = events[proc_mask] + if not skip_mask: + self._events = events[proc_mask] + self._events = events[events.event_weight >= 0.0] + else: + self._events = events def __repr__(self): return f"{self.__class__.__name__}({self.ml_model_inst.cls_name}, {self.process})" @@ -718,4 +728,4 @@ def prediction(self) -> np.ndarray: raise Exception("No trained model found in the MLModel instance. Cannot calculate prediction.") self._prediction = predict_numpy_on_batch(self._ml_model_inst.trained_model, self.features) - return self._prediction + return self._prediction # TODO ML best model diff --git a/hbw/ml/derived/dl.py b/hbw/ml/derived/dl.py index 59b4a631..c7774484 100644 --- a/hbw/ml/derived/dl.py +++ b/hbw/ml/derived/dl.py @@ -81,7 +81,7 @@ class DenseClassifierDL(DenseModelMixin, ModelFitMixin, MLClassifierBase): store_name: str = "inputs_inclusive" folds: int = 5 - negative_weights: str = "handle" + negative_weights: str = "ignore" # overwriting DenseModelMixin parameters activation: str = "relu" @@ -269,6 +269,80 @@ def setup(self): "processes": ["signal", "top"], }) +dl_22post_ml_study_5 = dl_22post.derive("dl_22post_ml_study_5", cls_dict={ + "training_configs": lambda self, requested_configs: ["c22post"], + "combine_processes": { + "signal_ggf5": { + # "name": "tt_and_st", + "label": "Signal GGF", + "sub_processes": [ + "hh_ggf_hbb_hvv2l2nu_kl0_kt1", + "hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "hh_ggf_hbb_hvv2l2nu_kl5_kt1", + ], + "weighting": "equal", + }, + "signal_vbf5": { + # "name": "tt_and_st", + "label": "Signal VBF", + "sub_processes": [ + "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", + ], + "weighting": "xsec", + }, + }, + "processes": [ + "signal_ggf5", + "signal_vbf5", + "tt", + "st", + "dy", + "h", + ], +}) + +dl_22post_ml_study_4 = dl_22post.derive("dl_22post_ml_study_4", cls_dict={ + "training_configs": lambda self, requested_configs: ["c22post"], + "combine_processes": { + "signal_ggf4": { + # "name": "tt_and_st", + "label": "Signal GGF", + "sub_processes": [ + "hh_ggf_hbb_hvv2l2nu_kl0_kt1", + "hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1", + ], + "weighting": "xsec", + }, + "signal_vbf4": { + # "name": "tt_and_st", + "label": "Signal VBF", + "sub_processes": [ + "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", + ], + "weighting": "xsec", + }, + }, + "processes": [ + "signal_ggf4", + "signal_vbf4", + "tt", + "st", + "dy", + "h", + ], +}) + dl_22post_ml_study_1 = dl_22post.derive("dl_22post_ml_study_1", cls_dict={ "training_configs": lambda self, requested_configs: ["c22post"], "combine_processes": { @@ -359,14 +433,15 @@ def setup(self): "training_configs": lambda self, requested_configs: ["c22post", "c22pre"], "processes": ["hh_ggf_hbb_hvv2l2nu_kl1_kt1", "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", "tt", "st", "dy", "h"], }) -dl_22_procs1_w0 = dl_22_procs1.derive("dl_22_procs1_w1", cls_dict={ +dl_22_procs1_w0 = dl_22_procs1.derive("dl_22_procs1_w0", cls_dict={ + "training_configs": lambda self, requested_configs: ["c22post"], "ml_process_weights": { "hh_ggf_hbb_hvv2l2nu_kl1_kt1": 1, "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1": 1, "tt": 2, "st": 2, "dy": 2, - "h": 2, + "h": 1, }, }) dl_22_procs1_w1 = dl_22_procs1.derive("dl_22_procs1_w1", cls_dict={ diff --git a/hbw/ml/mixins.py b/hbw/ml/mixins.py index 38c632e2..bac5967a 100644 --- a/hbw/ml/mixins.py +++ b/hbw/ml/mixins.py @@ -55,7 +55,7 @@ def prepare_ml_model( import tensorflow.keras as keras from keras.models import Sequential from keras.layers import Dense, BatchNormalization - from hbw.ml.tf_util import cumulated_crossentropy + # from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy n_inputs = len(set(self.input_features)) n_outputs = len(self.processes) @@ -97,16 +97,24 @@ def prepare_ml_model( epsilon=1e-6, amsgrad=False, ) - model.compile( - loss=cumulated_crossentropy, - # NOTE: we'd preferrably use the Keras CCE, but it does not work when assigning one event - # to multiple classes (target with multiple entries != 0) - # loss ="categorical_crossentropy", - # loss=tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE), - optimizer=optimizer, - metrics=["categorical_accuracy"], - weighted_metrics=["categorical_accuracy"], - ) + if self.negative_weights == "ignore": + model.compile( + # NOTE: we'd preferrably use the Keras CCE, but it does not work when assigning one event + # to multiple classes (target with multiple entries != 0) + loss="categorical_crossentropy", + optimizer=optimizer, + metrics=["categorical_accuracy"], + weighted_metrics=["categorical_accuracy"], + ) + else: + model.compile( + # NOTE: we'd preferrably use the Keras CCE, but it does not work when assigning one event + # to multiple classes (target with multiple entries != 0) + loss="cumulated_crossentropy", + optimizer=optimizer, + metrics=["categorical_accuracy"], + weighted_metrics=["categorical_accuracy"], + ) return model diff --git a/hbw/ml/tf_util.py b/hbw/ml/tf_util.py index d187bfcf..7b9c6bc7 100644 --- a/hbw/ml/tf_util.py +++ b/hbw/ml/tf_util.py @@ -165,6 +165,11 @@ def _cumulated_crossenropy_from_logits(y_true, y_pred, axis): return ((numerator - denominator) / tf.math.reduce_sum(y_true, axis=axis))[:, 0] +@tf.function +def categorical_crossentropy(): + return tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE) + + @tf.function def cumulated_crossentropy(y_true, y_pred, from_logits=False, axis=-1): if from_logits: diff --git a/hbw/tasks/inference.py b/hbw/tasks/inference.py index 98b76320..86bfadf1 100644 --- a/hbw/tasks/inference.py +++ b/hbw/tasks/inference.py @@ -12,24 +12,84 @@ import law import order as od -from columnflow.tasks.framework.base import Requirements, RESOLVE_DEFAULT, AnalysisTask +from columnflow.tasks.framework.base import Requirements, RESOLVE_DEFAULT, AnalysisTask, ConfigTask from columnflow.tasks.framework.parameters import SettingsParameter from columnflow.tasks.framework.mixins import ( CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, InferenceModelMixin, ) from columnflow.tasks.framework.remote import RemoteWorkflow from columnflow.tasks.cms.inference import CreateDatacards -from columnflow.util import dev_sandbox, maybe_import +from columnflow.util import dev_sandbox, maybe_import # import_ROOT from hbw.tasks.base import HBWTask +# from hbw.tasks.postfit_plots import load_hists_uproot # import hbw.inference.constants as const array = maybe_import("array") +uproot = maybe_import("uproot") +np = maybe_import("numpy") +hist = maybe_import("hist") logger = law.logger.get_logger(__name__) +# Function copied from Mathis Hist hook commit +def apply_rebinning_edges(h: hist.Histogram, axis_name: str, edges: list): + """ + Generalized rebinning of a single axis from a hist.Histogram, using predefined edges. + + :param h: histogram to rebin + :param axis_name: string representing the axis to rebin + :param edges: list of floats representing the new bin edges. Must be a subset of the original edges. + :return: rebinned histogram + """ + if isinstance(edges, int): + return h[{axis_name: hist.rebin(edges)}] + + ax = h.axes[axis_name] + ax_idx = [a.name for a in h.axes].index(axis_name) + if not all([np.isclose(x, ax.edges).any() for x in edges]): + raise ValueError( + f"Cannot rebin histogram due to incompatible edges for axis '{ax.name}'\n" + f"Edges of histogram are {ax.edges}, requested rebinning to {edges}", + ) + + # If you rebin to a subset of initial range, keep the overflow and underflow + overflow = ax.traits.overflow or (edges[-1] < ax.edges[-1] and not np.isclose(edges[-1], ax.edges[-1])) + underflow = ax.traits.underflow or (edges[0] > ax.edges[0] and not np.isclose(edges[0], ax.edges[0])) + flow = overflow or underflow + new_ax = hist.axis.Variable(edges, name=ax.name, overflow=overflow, underflow=underflow) + axes = list(h.axes) + axes[ax_idx] = new_ax + + hnew = hist.Hist(*axes, name=h.name, storage=h._storage_type()) + + # Offset from bin edge to avoid numeric issues + offset = 0.5 * np.min(ax.edges[1:] - ax.edges[:-1]) + edges_eval = edges + offset + edge_idx = ax.index(edges_eval) + # Avoid going outside the range, reduceat will add the last index anyway + if edge_idx[-1] == ax.size + ax.traits.overflow: + edge_idx = edge_idx[:-1] + + if underflow: + # Only if the original axis had an underflow should you offset + if ax.traits.underflow: + edge_idx += 1 + edge_idx = np.insert(edge_idx, 0, 0) + + # Take is used because reduceat sums i:len(array) for the last entry, in the case + # where the final bin isn't the same between the initial and rebinned histogram, you + # want to drop this value. Add tolerance of 1/2 min bin width to avoid numeric issues + hnew.values(flow=flow)[...] = np.add.reduceat(h.values(flow=flow), edge_idx, + axis=ax_idx).take(indices=range(new_ax.size + underflow + overflow), axis=ax_idx) + if hnew._storage_type() == hist.storage.Weight(): + hnew.variances(flow=flow)[...] = np.add.reduceat(h.variances(flow=flow), edge_idx, + axis=ax_idx).take(indices=range(new_ax.size + underflow + overflow), axis=ax_idx) + return hnew + + def get_hist_name(cat_name: str, proc_name: str, syst_name: str | None = None) -> str: hist_name = f"{cat_name}/{proc_name}" if syst_name: @@ -68,7 +128,8 @@ def get_rebin_values(hist, N_bins_final: int = 10): Function that determines how to rebin a hist to *N_bins_final* bins such that the resulting histogram is flat """ - N_bins_input = hist.GetNbinsX() + N_bins_input = hist.axes[0].size + # N_bins_input = hist.GetNbinsX() # # replace empty bin values (TODO: remove as soon as we can disable the empty bin filling) # EMPTY_BIN_VALUE = 1e-5 @@ -77,32 +138,38 @@ def get_rebin_values(hist, N_bins_final: int = 10): # hist.SetBinContent(i, 0) # determine events per bin the final histogram should have - events_per_bin = hist.Integral() / N_bins_final + events_per_bin = hist.sum().value / N_bins_final + # events_per_bin = hist.Integral() / N_bins_final logger.info(f"============ {round(events_per_bin, 3)} events per bin") # bookkeeping number of bins and number of events bin_count = 1 N_events = 0 - rebin_values = [hist.GetBinLowEdge(1)] + # rebin_values = [hist.GetBinLowEdge(1)] + rebin_values = [int(hist.axes[0].edges[0])] # starting loop at 1 to exclude underflow # ending at N_bins_input + 1 excludes overflow - for i in range(1, N_bins_input + 1): + # for i in range(1, N_bins_input + 1): + for i in range(1, N_bins_input): if bin_count == N_bins_final: # break as soon as N-1 bin edges have been determined --> final bin is x_max break - - N_events += hist.GetBinContent(i) + N_events += hist.view()["value"][i] + # N_events += hist.GetBinContent(i) if i % 100 == 0: logger.info(f"========== Bin {i} of {N_bins_input}, {N_events} events") if N_events >= events_per_bin * bin_count: # when *N_events* surpasses threshold, append the corresponding bin edge and count - logger.info(f"++++++++++ Append bin edge {bin_count} of {N_bins_final} at edge {hist.GetBinLowEdge(i)}") - rebin_values.append(hist.GetBinLowEdge(i + 1)) + # logger.info(f"++++++++++ Append bin edge {bin_count} of {N_bins_final} at edge {hist.GetBinLowEdge(i)}") + logger.info(f"++++++++++ Append bin edge {bin_count} of {N_bins_final} at edge {hist.axes[0].edges[i]}") + # rebin_values.append(hist.GetBinLowEdge(i + 1)) + rebin_values.append(hist.axes[0].edges[i]) bin_count += 1 # final bin is x_max - x_max = hist.GetBinLowEdge(N_bins_input + 1) + x_max = hist.axes[0].edges[N_bins_input] + # x_max = hist.GetBinLowEdge(N_bins_input + 1) rebin_values.append(x_max) logger.info(f"final bin edges: {rebin_values}") return rebin_values @@ -111,7 +178,7 @@ def get_rebin_values(hist, N_bins_final: int = 10): def apply_binning(hist, rebin_values: list): N_bins = len(rebin_values) - 1 rebin_values_ptr = array.array("d", rebin_values) - + # rebin_values_ptr = np.array(rebin_values, dtype="float64") h_out = hist.Rebin(N_bins, hist.GetName(), rebin_values_ptr) return h_out @@ -202,10 +269,14 @@ class ModifyDatacardsFlatRebin( ProducersMixin, SelectorStepsMixin, CalibratorsMixin, + ConfigTask, law.LocalWorkflow, RemoteWorkflow, ): - sandbox = dev_sandbox("bash::$CF_BASE/sandboxes/cmssw_default.sh") + + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + # sandbox = dev_sandbox("bash::$HBW_BASE/sandboxes/venv_ml_plotting.sh") + # sandbox = dev_sandbox("bash::$CF_BASE/sandboxes/cmssw_default.sh") bins_per_category = SettingsParameter( default=(RESOLVE_DEFAULT,), @@ -272,11 +343,31 @@ def get_rebin_processes(self): rebin_process_condition = self.inference_category_rebin_processes.get(config_category, None) if not rebin_process_condition: - logger.warning( - f"No rebin condition found for category {config_category}; rebinning will be flat " - f"on all processes {[proc.config_process for proc in processes]}", - ) - return processes + if "ggf" in config_category: + for proc in processes.copy(): + proc_name = proc.config_process + + # if not "ggf" in proc_name: + # processes.remove(proc) + # elif proc_name == 'h_ggf': # This is quiet custom made + # processes.remove(proc) + # elif "kl5" in proc_name: + # processes.remove(proc) + # return processes + # elif 'vbf' in config_category: + # for proc in processes.copy(): + # proc_name = proc.config_process + # if not 'vbf' in proc_name: + # processes.remove(proc) + # if proc_name == 'h_vbf': + # processes.remove(proc) + # return processes + # else: + logger.warning( + f"No rebin condition found for category {config_category}; rebinning will be flat " + f"on all processes {[proc.config_process for proc in processes]}", + ) + return processes # transform `rebin_process_condition` into Callable if required if not isinstance(rebin_process_condition, Callable): @@ -321,8 +412,9 @@ def output(self): } def run(self): - import uproot - from ROOT import TH1 + # import uproot + # from ROOT import TH1 + # import_ROOT() # config_inst = self.config_inst # inference_model = self.inference_model_inst @@ -355,7 +447,8 @@ def run(self): # get all nominal histograms nominal_hists = { - proc_name: file[get_hist_name(cat_name, proc_name)].to_pyroot() + proc_name: file[get_hist_name(cat_name, proc_name)].to_hist() + # proc_name: file[get_hist_name(cat_name, proc_name)].to_pyroot() for proc_name in proc_names } @@ -379,22 +472,48 @@ def run(self): # apply rebinning on all histograms and store resulting hists in a ROOT file out_file = uproot.recreate(outputs["shapes"].fn) + rebinned_histograms = {} for key, h in file.items(): - # remove ";1" appendix - key = key.split(";")[0] - try: - # convert histograms to pyroot - h = h.to_pyroot() - except AttributeError: - # skip non-histograms - continue - if not isinstance(h, TH1): - raise Exception(f"{h} is not a TH1 histogram") - h_rebin = apply_binning(h, rebin_values) - problematic_bin_count = check_empty_bins(h_rebin) # noqa - logger.info(f"Inserting histogram with name {key}") - out_file[key] = uproot.from_pyroot(h_rebin) + key = key.split(";")[0] + if "TH1" in str(h): + try: + h = file[key].to_hist() + except AttributeError: + continue + h_rebin = apply_rebinning_edges(h, h.axes[0].name, rebin_values) + # bin_edges = h_rebin.axes[0].edges # Bin edges (first axis) + # bin_values = h_rebin.values() + # bin_variance = h_rebin.variances() # Bin values + # h_rebin_root = uproot.newhist.Histogram( + # bins=(len(h_rebin.edges)-1), + # x_low=h_rebin.edges[0], + # x_high=h_rebin.edges[-1], + # values=h_rebin.values(), + # ) + rebinned_histograms[key] = h_rebin + + logger.info(f"Inserting histogram with name {key}") + out_file[key] = h_rebin + + # for key, h in file.items(): + # # remove ";1" appendix + # key = key.split(";")[0] + # __import__("IPython").embed() + # # try: + # # convert histograms to pyroot + # # h = h.to_pyroot() + # # except AttributeError: + # # skip non-histograms + # # continue + # # if not isinstance(h, TH1): + # # raise Exception(f"{h} is not a TH1 histogram") + + # # h_rebin = apply_binning(h, rebin_values) + # h_rebin = apply_rebinning_edges(h, "0", rebin_values) + # # problematic_bin_count = check_empty_bins(h_rebin) + # logger.info(f"Inserting histogram with name {key}") + # out_file[key] = uproot.recreate(h_rebin) class PrepareInferenceTaskCalls( @@ -404,6 +523,7 @@ class PrepareInferenceTaskCalls( ProducersMixin, SelectorStepsMixin, CalibratorsMixin, + ConfigTask, ): """ Simple task that produces string to run certain tasks in Inference diff --git a/hbw/tasks/ml.py b/hbw/tasks/ml.py index 5e45e9d7..9546f4dd 100644 --- a/hbw/tasks/ml.py +++ b/hbw/tasks/ml.py @@ -398,17 +398,24 @@ def run(self): outputs["stats"].dump(stats, formatter="json") # initialize the DatasetLoader - ml_dataset = self.ml_model_inst.data_loader(self.ml_model_inst, process, events, stats) - # NOTE: Almost closure + if self.ml_model_inst.negative_weights == "ignore": + ml_dataset = self.ml_model_inst.data_loader(self.ml_model_inst, process, events, stats) + logger.info( + f"{self.ml_model_inst.negative_weights} method chosen to hanlde negative weights:", + " All negative weights will be removed from training.", + ) + else: + ml_dataset = self.ml_model_inst.data_loader(self.ml_model_inst, process, events, stats, skip_mask=True) + sum_train_weights = np.sum(ml_dataset.train_weights) n_events_per_fold = len(ml_dataset.train_weights) - logger.info(f"Sum of traing weights is: {sum_train_weights} for {n_events_per_fold} {process} events") + logger.info(f"Sum of training weights is: {sum_train_weights} for {n_events_per_fold} {process} events") if self.ml_model_inst.config_inst.get_process(process).x("ml_config", None): if self.ml_model_inst.config_inst.get_process(process).x.ml_config.weighting == "equal": for sub_proc in self.ml_model_inst.config_inst.get_process(process).x.ml_config.sub_processes: - proc_mask, sub_id = get_proc_mask(events, sub_proc, self.ml_model_inst.config_inst) + proc_mask, _ = get_proc_mask(ml_dataset._events, sub_proc, self.ml_model_inst.config_inst) xcheck_weight_sum = np.sum(ml_dataset.train_weights[proc_mask]) xcheck_n_events = len(ml_dataset.train_weights[proc_mask]) logger.info( diff --git a/hbw/tasks/postfit_plots.py b/hbw/tasks/postfit_plots.py index 9df3745a..96fa8f82 100644 --- a/hbw/tasks/postfit_plots.py +++ b/hbw/tasks/postfit_plots.py @@ -35,12 +35,13 @@ def load_hists_uproot(fit_diagnostics_path): """ Helper to load histograms from a fit_diagnostics file """ # prepare output dict hists = DotDict() + __import__("IPython").embed() with uproot.open(fit_diagnostics_path) as tfile: keys = [key.split("/") for key in tfile.keys()] for key in keys: if len(key) != 3: continue - + __import__("IPython").embed() # get the histogram from the tfile h_in = tfile["/".join(key)] @@ -167,6 +168,8 @@ def run(self): else: fit_type = "fit_s" + __import__("IPython").embed() + all_hists = all_hists[f"shapes_{fit_type}"] for channel, hists in all_hists.items(): From 13970d5043b392e9f89a3e1a69a6475c3a87fea1 Mon Sep 17 00:00:00 2001 From: Lara Date: Fri, 17 Jan 2025 10:32:48 +0100 Subject: [PATCH 2/8] Modified PostfitPlot task --- hbw/config/defaults_and_groups.py | 17 ++- hbw/inference/base.py | 2 +- hbw/inference/dl.py | 54 ++++++++- hbw/ml/data_loader.py | 46 +++++++- hbw/ml/derived/dl.py | 21 +++- hbw/ml/mixins.py | 4 +- hbw/ml/stats.py | 9 +- hbw/plotting/plot_fits.py | 3 +- hbw/tasks/inference.py | 77 +----------- hbw/tasks/ml.py | 12 +- hbw/tasks/postfit_plots.py | 190 ++++++++++++++++++++++-------- 11 files changed, 287 insertions(+), 148 deletions(-) diff --git a/hbw/config/defaults_and_groups.py b/hbw/config/defaults_and_groups.py index 9bb728d7..dac08f38 100644 --- a/hbw/config/defaults_and_groups.py +++ b/hbw/config/defaults_and_groups.py @@ -150,6 +150,16 @@ def set_config_defaults_and_groups(config_inst): "vv", "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", ], + "test_postfit": [ + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h", + ], "all": ["*"], "default": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "dy", "st", "vv", "w_lnu", "h"], # noqa: E501 "sl": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "qcd", "st", "dy", "vv", "w_lnu", "h"], # noqa: E501 @@ -303,6 +313,7 @@ def set_config_defaults_and_groups(config_inst): "sr__2mu__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2mu__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2e__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2e__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__emu__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__emu__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "sr__1b", "sr__2b", ), "vbfSR_dl": ( "sr__1b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", "sr__2b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", @@ -375,10 +386,8 @@ def set_config_defaults_and_groups(config_inst): for proc, _, _ in config_inst.walk_processes() if proc.has_tag("is_signal") }, "dilep": { - "hh_ggf_hbb_hvv2l2nu_kl0_kt1": {"scale": 10000, "unstack": True}, - "hh_ggf_hbb_hvv2l2nu_kl1_kt1": {"scale": 10000, "unstack": True}, - "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1": {"scale": 10000, "unstack": True}, - "hh_ggf_hbb_hvv2l2nu_kl5_kt1": {"scale": 10000, "unstack": True}, + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1": {"scale": 90000, "unstack": True}, + "hh_ggf_hbb_hww2l2nu_kl1_kt1": {"scale": 10000, "unstack": True}, }, "dileptest": { "hh_ggf_hbb_hvv2l2nu_kl1_kt1": {"scale": 10000, "unstack": True}, diff --git a/hbw/inference/base.py b/hbw/inference/base.py index 5ee3d013..22f1c17b 100644 --- a/hbw/inference/base.py +++ b/hbw/inference/base.py @@ -89,7 +89,7 @@ def config_variable(self: InferenceModel, config_cat_inst: od.Config): dnn_proc = dnn_cat.replace("ml_", "") return f"mlscore.{dnn_proc}" else: - return "mli_mbb" + return "mli_lep_pt" def customize_category(self: InferenceModel, cat_inst: DotDict, config_cat_inst: od.Config): """ Function to allow customizing the inference category """ diff --git a/hbw/inference/dl.py b/hbw/inference/dl.py index a17c3dc2..5a4f42fc 100644 --- a/hbw/inference/dl.py +++ b/hbw/inference/dl.py @@ -240,7 +240,7 @@ }) -dl.derive("dl_ml_study_1", cls_dict={ +dl_ml_study_1 = dl.derive("dl_ml_study_1", cls_dict={ "ml_model_name": "dl_22post_ml_study_1", "config_categories": [ "sr__1b__ml_signal_ggf", @@ -283,7 +283,11 @@ "systematics": rate_systematics, }) -dl.derive("dl_ml_study_3", cls_dict={ +dl_ml_study_1.derive("dl_ml_study_1_handle", cls_dict={ + "ml_model_name": "dl_22post_ml_study_1_handle", +}) + +dl_ml_study_3 = dl.derive("dl_ml_study_3", cls_dict={ "ml_model_name": "dl_22_procs1_w0", "config_categories": [ "sr__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", @@ -325,7 +329,11 @@ "systematics": rate_systematics, }) -dl.derive("dl_ml_study_2", cls_dict={ +dl_ml_study_3.derive("dl_ml_study_3_handle", cls_dict={ + "ml_model_name": "dl_22_procs1_w0_handle", +}) + +dl_ml_study_2 = dl.derive("dl_ml_study_2", cls_dict={ "ml_model_name": "dl_22post_ml_study_2", "config_categories": [ "sr__1b__ml_signal_ggf2", @@ -367,6 +375,14 @@ "systematics": rate_systematics, }) +dl_ml_study_2.derive("dl_ml_study_2_handle", cls_dict={ + "ml_model_name": "dl_22post_ml_study_2_handle", +}) + +dl_ml_study_2.derive("dl_ml_study_2_ignore", cls_dict={ + "ml_model_name": "dl_22post_ml_study_2", +}) + dl.derive("dl_hww_and_hzz", cls_dict={ "processes": [ "hh_ggf_hbb_hww_kl0_kt1", @@ -531,3 +547,35 @@ "systematics": rate_systematics}, ) dl.derive("dl_rates_only", cls_dict={"systematics": rate_systematics}) + +dl.derive("dl_postfit_test", cls_dict={ + "ml_model_name": None, + "config_categories": [ + "sr__1b", + "sr__2b", + ], + "processes": [ + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "systematics": rate_systematics, +}) diff --git a/hbw/ml/data_loader.py b/hbw/ml/data_loader.py index e0d60c08..64057c0a 100644 --- a/hbw/ml/data_loader.py +++ b/hbw/ml/data_loader.py @@ -141,9 +141,11 @@ def __init__( """ self._ml_model_inst = ml_model_inst self._process = process + self._skip_mask = skip_mask proc_mask, _ = get_proc_mask(events, process, ml_model_inst.config_inst) self._stats = stats + # __import__("IPython").embed() # del_sub_proc_stats(process, sub_id) if not skip_mask: self._events = events[proc_mask] @@ -177,6 +179,10 @@ def parameters(self): } return self._parameters + @property + def skip_mask(self): + return self._skip_mask + @property def ml_model_inst(self): return self._ml_model_inst @@ -255,6 +261,14 @@ def shuffle_indices(self) -> np.ndarray: self._shuffle_indices = np.random.permutation(self.n_events) return self._shuffle_indices + @property + def num_event_per_process(self) -> str: + if not self.skip_mask: + self._num_events_per_process = "num_events_pos_weights_per_process" + else: + self._num_events_per_process = "num_events_per_process" + return self._num_events_per_process + def get_xsec_train_weights(self) -> np.ndarray: """ Weighting such that each event has roughly the same weight, @@ -267,10 +281,20 @@ def get_xsec_train_weights(self) -> np.ndarray: raise Exception("cannot determine train weights without stats") _, sub_id = get_proc_mask(self._events, self.process, self.ml_model_inst.config_inst) - sum_abs_weights = np.sum([self.stats[self.process]["sum_abs_weights_per_process"][str(id)] for id in sub_id]) - num_events = np.sum([self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id]) + sum_weights = np.sum([self.stats[self.process]["sum_pos_weights_per_process"][str(id)] for id in sub_id]) + num_events = np.sum( + [self.stats[self.process][self.num_event_per_process][str(id)] for id in sub_id], + ) + # if not self.skip_mask: + # num_events = np.sum( + # [self.stats[self.process]["num_events_pos_weights_per_process"][str(id)] for id in sub_id], + # ) + # else: + # num_events = np.sum( + # [self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id], + # ) - xsec_train_weights = self.weights / sum_abs_weights * num_events + xsec_train_weights = self.weights / sum_weights * num_events return xsec_train_weights @@ -286,7 +310,15 @@ def get_equal_train_weights(self) -> np.ndarray: combined_proc_inst = self.ml_model_inst.config_inst.get_process(self.process) _, sub_id_proc = get_proc_mask(self._events, self.process, self.ml_model_inst.config_inst) - num_events = np.sum([self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id_proc]) + num_events = np.sum( + [self.stats[self.process][self.num_event_per_process][str(id)] for id in sub_id_proc], + ) + # if not self.skip_mask: + # num_events = np.sum( + # [self.stats[self.process]["num_events_pos_weights_per_process"][str(id)] for id in sub_id_proc], + # ) + # else: + # num_events = np.sum([self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id_proc]) targeted_sum_of_weights_per_process = ( num_events / len(combined_proc_inst.x.ml_config.sub_processes) ) @@ -724,8 +756,10 @@ def prediction(self) -> np.ndarray: self._prediction = self.load_data("prediction") else: # calcluate prediction if needed - if not hasattr(self._ml_model_inst, "trained_model"): + if not hasattr(self._ml_model_inst, "best_model"): + # if not hasattr(self._ml_model_inst, "trained_model"): raise Exception("No trained model found in the MLModel instance. Cannot calculate prediction.") - self._prediction = predict_numpy_on_batch(self._ml_model_inst.trained_model, self.features) + # self._prediction = predict_numpy_on_batch(self._ml_model_inst.trained_model, self.features) + self._prediction = predict_numpy_on_batch(self._ml_model_inst.best_model, self.features) return self._prediction # TODO ML best model diff --git a/hbw/ml/derived/dl.py b/hbw/ml/derived/dl.py index c7774484..c0a77d7c 100644 --- a/hbw/ml/derived/dl.py +++ b/hbw/ml/derived/dl.py @@ -293,7 +293,7 @@ def setup(self): "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36", "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", ], - "weighting": "xsec", + "weighting": "equal", }, }, "processes": [ @@ -345,6 +345,7 @@ def setup(self): dl_22post_ml_study_1 = dl_22post.derive("dl_22post_ml_study_1", cls_dict={ "training_configs": lambda self, requested_configs: ["c22post"], + "negative_weights": "ignore", "combine_processes": { "signal_ggf": { # "name": "tt_and_st", @@ -381,8 +382,13 @@ def setup(self): ], }) +dl_22post_ml_study_1_handle = dl_22post_ml_study_1.derive("dl_22post_ml_study_1_handle", cls_dict={ + "negative_weights": "handle", +}) + dl_22post_ml_study_2 = dl_22post.derive("dl_22post_ml_study_2", cls_dict={ "training_configs": lambda self, requested_configs: ["c22post"], + "negative_weights": "ignore", "combine_processes": { "signal_ggf2": { # "name": "tt_and_st", @@ -407,7 +413,7 @@ def setup(self): "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", ], - "weighting": "xsec", + "weighting": "equal", }, }, "processes": [ @@ -419,6 +425,11 @@ def setup(self): "h", ], }) + +dl_22post_ml_study_2_handle = dl_22post_ml_study_2.derive("dl_22post_ml_study_2_handle", cls_dict={ + "negative_weights": "handle", +}) + # # setups with different processes (0: baseline, 1: add SM vbf + single H, 2: add SL+all HH variations) # NOTE: we should decide which signal processes exactly to use: @@ -435,6 +446,7 @@ def setup(self): }) dl_22_procs1_w0 = dl_22_procs1.derive("dl_22_procs1_w0", cls_dict={ "training_configs": lambda self, requested_configs: ["c22post"], + "negative_weights": "ignore", "ml_process_weights": { "hh_ggf_hbb_hvv2l2nu_kl1_kt1": 1, "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1": 1, @@ -444,6 +456,11 @@ def setup(self): "h": 1, }, }) + +dl_22_procs1_w0_handle = dl_22_procs1_w0.derive("dl_22_procs1_w0_handle", cls_dict={ + "negative_weights": "handle", +}) + dl_22_procs1_w1 = dl_22_procs1.derive("dl_22_procs1_w1", cls_dict={ "ml_process_weights": { "hh_ggf_hbb_hvv2l2nu_kl1_kt1": 1, diff --git a/hbw/ml/mixins.py b/hbw/ml/mixins.py index bac5967a..2fc36b01 100644 --- a/hbw/ml/mixins.py +++ b/hbw/ml/mixins.py @@ -55,7 +55,7 @@ def prepare_ml_model( import tensorflow.keras as keras from keras.models import Sequential from keras.layers import Dense, BatchNormalization - # from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy + from hbw.ml.tf_util import cumulated_crossentropy # , categorical_crossentropy n_inputs = len(set(self.input_features)) n_outputs = len(self.processes) @@ -110,7 +110,7 @@ def prepare_ml_model( model.compile( # NOTE: we'd preferrably use the Keras CCE, but it does not work when assigning one event # to multiple classes (target with multiple entries != 0) - loss="cumulated_crossentropy", + loss=cumulated_crossentropy, optimizer=optimizer, metrics=["categorical_accuracy"], weighted_metrics=["categorical_accuracy"], diff --git a/hbw/ml/stats.py b/hbw/ml/stats.py index 1a4b0a88..9a68c8e6 100644 --- a/hbw/ml/stats.py +++ b/hbw/ml/stats.py @@ -76,15 +76,16 @@ def ml_preparation( events = set_ak_column_f32(events, "event_weight", weight) stats["sum_weights"] += float(ak.sum(weight, axis=0)) weight_map["sum_weights"] = weight - weight_map["sum_abs_weights"] = (weight, weight > 0) - weight_map["sum_pos_weights"] = np.abs(weight) + weight_map["sum_pos_weights"] = (weight, weight > 0) + weight_map["sum_abs_weights"] = np.abs(weight) + weight_map["num_events_pos_weights"] = weight > 0 # normalization weight only norm_weight = events["stitched_normalization_weight"] stats["sum_norm_weights"] += float(ak.sum(norm_weight, axis=0)) weight_map["sum_norm_weights"] = norm_weight - weight_map["sum_abs_norm_weights"] = (norm_weight, norm_weight > 0) - weight_map["sum_pos_norm_weights"] = np.abs(norm_weight) + weight_map["sum_pos_norm_weights"] = (norm_weight, norm_weight > 0) + weight_map["sum_abs_norm_weights"] = np.abs(norm_weight) group_map = { "process": { diff --git a/hbw/plotting/plot_fits.py b/hbw/plotting/plot_fits.py index c1d9af4e..e7cb8f13 100644 --- a/hbw/plotting/plot_fits.py +++ b/hbw/plotting/plot_fits.py @@ -52,7 +52,8 @@ def scalable_exponnorm(x, A, loc, scale, K=1): def plot_fit( - hists: OrderedDict[od.Process, hist.Hist], + hists: dict[str, OrderedDict[od.Process, hist.Hist]], + # hists: OrderedDict[od.Process, hist.Hist], config_inst: od.Config, category_inst: od.Category, variable_insts: list[od.Variable], diff --git a/hbw/tasks/inference.py b/hbw/tasks/inference.py index 86bfadf1..0bb35882 100644 --- a/hbw/tasks/inference.py +++ b/hbw/tasks/inference.py @@ -8,7 +8,6 @@ from typing import Callable -# import luigi import law import order as od @@ -19,11 +18,8 @@ ) from columnflow.tasks.framework.remote import RemoteWorkflow from columnflow.tasks.cms.inference import CreateDatacards -from columnflow.util import dev_sandbox, maybe_import # import_ROOT +from columnflow.util import dev_sandbox, maybe_import from hbw.tasks.base import HBWTask -# from hbw.tasks.postfit_plots import load_hists_uproot - -# import hbw.inference.constants as const array = maybe_import("array") uproot = maybe_import("uproot") @@ -129,47 +125,33 @@ def get_rebin_values(hist, N_bins_final: int = 10): the resulting histogram is flat """ N_bins_input = hist.axes[0].size - # N_bins_input = hist.GetNbinsX() - - # # replace empty bin values (TODO: remove as soon as we can disable the empty bin filling) - # EMPTY_BIN_VALUE = 1e-5 - # for i in range(1, N_bins_input + 1): - # if hist.GetBinContent(i) == EMPTY_BIN_VALUE: - # hist.SetBinContent(i, 0) # determine events per bin the final histogram should have events_per_bin = hist.sum().value / N_bins_final - # events_per_bin = hist.Integral() / N_bins_final logger.info(f"============ {round(events_per_bin, 3)} events per bin") # bookkeeping number of bins and number of events bin_count = 1 N_events = 0 - # rebin_values = [hist.GetBinLowEdge(1)] rebin_values = [int(hist.axes[0].edges[0])] # starting loop at 1 to exclude underflow # ending at N_bins_input + 1 excludes overflow - # for i in range(1, N_bins_input + 1): for i in range(1, N_bins_input): if bin_count == N_bins_final: # break as soon as N-1 bin edges have been determined --> final bin is x_max break N_events += hist.view()["value"][i] - # N_events += hist.GetBinContent(i) if i % 100 == 0: logger.info(f"========== Bin {i} of {N_bins_input}, {N_events} events") if N_events >= events_per_bin * bin_count: # when *N_events* surpasses threshold, append the corresponding bin edge and count - # logger.info(f"++++++++++ Append bin edge {bin_count} of {N_bins_final} at edge {hist.GetBinLowEdge(i)}") logger.info(f"++++++++++ Append bin edge {bin_count} of {N_bins_final} at edge {hist.axes[0].edges[i]}") - # rebin_values.append(hist.GetBinLowEdge(i + 1)) rebin_values.append(hist.axes[0].edges[i]) bin_count += 1 # final bin is x_max x_max = hist.axes[0].edges[N_bins_input] - # x_max = hist.GetBinLowEdge(N_bins_input + 1) rebin_values.append(x_max) logger.info(f"final bin edges: {rebin_values}") return rebin_values @@ -275,8 +257,6 @@ class ModifyDatacardsFlatRebin( ): sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) - # sandbox = dev_sandbox("bash::$HBW_BASE/sandboxes/venv_ml_plotting.sh") - # sandbox = dev_sandbox("bash::$CF_BASE/sandboxes/cmssw_default.sh") bins_per_category = SettingsParameter( default=(RESOLVE_DEFAULT,), @@ -339,7 +319,6 @@ def get_rebin_processes(self): """ config_category = self.branch_data.config_category processes = self.branch_data.processes.copy() - # inf_proc_names = [self.inference_model_inst.inf_proc(proc.name) for proc in self.branch_data.processes] rebin_process_condition = self.inference_category_rebin_processes.get(config_category, None) if not rebin_process_condition: @@ -347,22 +326,6 @@ def get_rebin_processes(self): for proc in processes.copy(): proc_name = proc.config_process - # if not "ggf" in proc_name: - # processes.remove(proc) - # elif proc_name == 'h_ggf': # This is quiet custom made - # processes.remove(proc) - # elif "kl5" in proc_name: - # processes.remove(proc) - # return processes - # elif 'vbf' in config_category: - # for proc in processes.copy(): - # proc_name = proc.config_process - # if not 'vbf' in proc_name: - # processes.remove(proc) - # if proc_name == 'h_vbf': - # processes.remove(proc) - # return processes - # else: logger.warning( f"No rebin condition found for category {config_category}; rebinning will be flat " f"on all processes {[proc.config_process for proc in processes]}", @@ -412,12 +375,6 @@ def output(self): } def run(self): - # import uproot - # from ROOT import TH1 - # import_ROOT() - - # config_inst = self.config_inst - # inference_model = self.inference_model_inst inputs = self.input() outputs = self.output() @@ -431,7 +388,6 @@ def run(self): outputs["card"].dump(datacard, formatter="text") with uproot.open(inp_shapes.fn) as file: - # logger.info(f"File keys: {file.keys()}") # determine which histograms are present cat_names, proc_names, syst_names = get_cat_proc_syst_names(file) @@ -482,39 +438,14 @@ def run(self): except AttributeError: continue h_rebin = apply_rebinning_edges(h, h.axes[0].name, rebin_values) - # bin_edges = h_rebin.axes[0].edges # Bin edges (first axis) - # bin_values = h_rebin.values() - # bin_variance = h_rebin.variances() # Bin values - # h_rebin_root = uproot.newhist.Histogram( - # bins=(len(h_rebin.edges)-1), - # x_low=h_rebin.edges[0], - # x_high=h_rebin.edges[-1], - # values=h_rebin.values(), - # ) + # problematic_bin_count = check_empty_bins(h_rebin) + # TODO: we should reimplement check empty bins, check for all bkg not per process + rebinned_histograms[key] = h_rebin logger.info(f"Inserting histogram with name {key}") out_file[key] = h_rebin - # for key, h in file.items(): - # # remove ";1" appendix - # key = key.split(";")[0] - # __import__("IPython").embed() - # # try: - # # convert histograms to pyroot - # # h = h.to_pyroot() - # # except AttributeError: - # # skip non-histograms - # # continue - # # if not isinstance(h, TH1): - # # raise Exception(f"{h} is not a TH1 histogram") - - # # h_rebin = apply_binning(h, rebin_values) - # h_rebin = apply_rebinning_edges(h, "0", rebin_values) - # # problematic_bin_count = check_empty_bins(h_rebin) - # logger.info(f"Inserting histogram with name {key}") - # out_file[key] = uproot.recreate(h_rebin) - class PrepareInferenceTaskCalls( HBWTask, diff --git a/hbw/tasks/ml.py b/hbw/tasks/ml.py index 9546f4dd..2aeb7ce8 100644 --- a/hbw/tasks/ml.py +++ b/hbw/tasks/ml.py @@ -330,6 +330,8 @@ def output(self): # the stats dict is created per process+fold, but should always be identical, therefore we store it only once outputs["stats"] = self.target("stats.json") + outputs["cross_check_weights"] = self.target(f"cross_check_weights_{process}_{self.fold}.yaml") + outputs["parameters"] = self.target("parameters.yaml") return outputs @@ -410,9 +412,9 @@ def run(self): sum_train_weights = np.sum(ml_dataset.train_weights) n_events_per_fold = len(ml_dataset.train_weights) logger.info(f"Sum of training weights is: {sum_train_weights} for {n_events_per_fold} {process} events") - + xcheck = {} if self.ml_model_inst.config_inst.get_process(process).x("ml_config", None): - + xcheck[process] = {} if self.ml_model_inst.config_inst.get_process(process).x.ml_config.weighting == "equal": for sub_proc in self.ml_model_inst.config_inst.get_process(process).x.ml_config.sub_processes: proc_mask, _ = get_proc_mask(ml_dataset._events, sub_proc, self.ml_model_inst.config_inst) @@ -422,6 +424,12 @@ def run(self): f"For the equal weighting method the sum of weights for {sub_proc} is {xcheck_weight_sum} " f"(No. of events: {xcheck_n_events})", ) + if sub_proc not in xcheck[process]: + xcheck[process][sub_proc] = {} + xcheck[process][sub_proc]["weight_sum"] = int(xcheck_weight_sum) + xcheck[process][sub_proc]["num_events"] = xcheck_n_events + # __import__("IPython").embed() + outputs["cross_check_weights"].dump(xcheck, formatter="yaml") for input_array in self.ml_model_inst.data_loader.input_arrays: logger.info(f"loading data for input array {input_array}") diff --git a/hbw/tasks/postfit_plots.py b/hbw/tasks/postfit_plots.py index 96fa8f82..949f0b7e 100644 --- a/hbw/tasks/postfit_plots.py +++ b/hbw/tasks/postfit_plots.py @@ -6,55 +6,136 @@ from __future__ import annotations -from collections import OrderedDict +from collections import OrderedDict, defaultdict import law import luigi import order as od - +import re from columnflow.tasks.framework.base import ConfigTask from columnflow.tasks.framework.mixins import ( InferenceModelMixin, MLModelsMixin, ProducersMixin, SelectorStepsMixin, - CalibratorsMixin, + CalibratorsMixin, DatasetsProcessesMixin, ) from columnflow.tasks.framework.plotting import ( - PlotBase, PlotBase1D, + PlotBase, PlotBase1D, ProcessPlotSettingMixin, # VariablesPlotSettingMixin, ) from columnflow.tasks.framework.decorators import view_output_plots from hbw.tasks.base import HBWTask from columnflow.util import dev_sandbox, DotDict, maybe_import +# from hbw.inference.base import inf_proc + uproot = maybe_import("uproot") +hist = maybe_import("hist") logger = law.logger.get_logger(__name__) -def load_hists_uproot(fit_diagnostics_path): +def reverse_inf_proc(proc): + """ + Helper function that reverses the transformations done by inf_proc. + """ + if proc.startswith("ggHH_"): + # Adjust pattern to split the last part into two groups + pattern = r"ggHH_kl_([mp\d]+)_kt_([mp\d]+)_([a-zA-Z\d]{3})([a-zA-Z\d]+)" + replacement = r"hh_ggf_\3_\4_kl\1_kt\2" + return re.sub(pattern, replacement, proc) + elif proc.startswith("qqHH_"): + # Adjust pattern to split the last part into two groups + pattern = r"qqHH_CV_([mp\d]+)_C2V_([mp\d]+)_kl_([mp\d]+)_([a-zA-Z\d]{3})([a-zA-Z\d]+)" + replacement = r"hh_vbf_\4_\5_kv\1_k2v\2_kl\3" + return re.sub(pattern, replacement, proc) + elif proc == "qqH": + pattern = r"qqH" + replacement = r"h_vbf" + return re.sub(pattern, replacement, proc) + elif proc == "ggH": + pattern = r"ggH" + replacement = r"h_ggf" + return re.sub(pattern, replacement, proc) + elif proc == "ggZH": + pattern = r"ggZH" + replacement = r"zh_gg" + return re.sub(pattern, replacement, proc) + elif "H" in proc: + proc = proc.lower() + return proc + else: + # If the string doesn't match the patterns, return it unchanged + return proc + + +def load_hists_uproot(fit_diagnostics_path, fit_type): """ Helper to load histograms from a fit_diagnostics file """ + with uproot.open(fit_diagnostics_path) as tfile: + if any("shapes_fit_s" in _k for _k in tfile.keys()): + if fit_type != "prefit": + fit_type = "fit_s" + hists = get_hists_from_fit_diagnostics(tfile)[f"shapes_{fit_type}"] + else: + # if fit_type != "prefit": + # fit_type = "postfit" + hists = get_hists_from_multidimfit(tfile)[f"{fit_type}"] + + return hists + + +def get_hists_from_fit_diagnostics(tfile): # prepare output dict hists = DotDict() - __import__("IPython").embed() - with uproot.open(fit_diagnostics_path) as tfile: - keys = [key.split("/") for key in tfile.keys()] - for key in keys: - if len(key) != 3: - continue - __import__("IPython").embed() - # get the histogram from the tfile - h_in = tfile["/".join(key)] - # unpack key - fit, channel, process = key - process = process.split(";")[0] + keys = [key.split("/") for key in tfile.keys()] + for key in keys: + if len(key) != 3: + continue - if "data" not in process: - # transform TH1F to hist - h_in = h_in.to_hist() + # get the histogram from the tfile + h_in = tfile["/".join(key)] - # set the histogram in a deep dictionary - hists = law.util.merge_dicts(hists, DotDict.wrap({fit: {channel: {process: h_in}}}), deep=True) + # unpack key + fit, channel, process = key + process = process.split(";")[0] + + if "data" in process or "total" in process: + continue + else: + # transform TH1F to hist + h_in = h_in.to_hist() + + # set the histogram in a deep dictionary + hists = law.util.merge_dicts(hists, DotDict.wrap({fit: {channel: {process: h_in}}}), deep=True) + return hists + + +def get_hists_from_multidimfit(tfile): + """ Helper to load histograms from a fit_diagnostics file """ + # prepare output dict + hists = DotDict() + keys = [key.split("/") for key in tfile.keys()] + for key in keys: + if len(key) != 2: + continue + # get the histogram from the tfile + h_in = tfile["/".join(key)] + + # unpack key + fit_and_channel, process = key + fit = fit_and_channel.split("_")[-1] + fit = fit.replace("_", "") + channel = fit_and_channel.replace(f"_{fit}", "") + process = process.split(";")[0] + + if "data" in process or "Total" in process: + continue + # transform TH1F to hist + else: + h_in = h_in.to_hist() + + # set the histogram in a deep dictionary + hists = law.util.merge_dicts(hists, DotDict.wrap({fit: {channel: {process: h_in}}}), deep=True) return hists @@ -68,11 +149,12 @@ def load_hists_uproot(fit_diagnostics_path): from columnflow.plotting.plot_util import ( prepare_plot_config, prepare_style_config, + apply_process_settings, ) def plot_postfit_shapes( - hists: OrderedDict, + hists: OrderedDict[od.Process, hist.Hist], config_inst: od.Config, category_inst: od.Category, variable_insts: list[od.Variable], @@ -86,7 +168,7 @@ def plot_postfit_shapes( **kwargs, ) -> tuple(plt.Figure, tuple(plt.Axes)): variable_inst = law.util.make_tuple(variable_insts)[0] - + hists = apply_process_settings(hists, process_settings) plot_config = prepare_plot_config( hists, shape_norm=shape_norm, @@ -112,6 +194,8 @@ class PlotPostfitShapes( # NOTE: mixins might be wrong and could (should?) be extended to MultiConfigTask HBWTask, PlotBase1D, + ProcessPlotSettingMixin, + DatasetsProcessesMixin, # to correctly setup our InferenceModel, we need all these mixins, but hopefully, all these # parameters are automatically resolved correctly InferenceModelMixin, @@ -134,7 +218,7 @@ class PlotPostfitShapes( sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) plot_function = PlotBase.plot_function.copy( - default="hbw.tasks.plotting.plot_postfit_shapes", + default="hbw.tasks.postfit_plots.plot_postfit_shapes", add_default_to_description=True, ) @@ -160,47 +244,53 @@ def run(self): f"Note! It is important that the requested inference_model {self.inference_model} " "is identical to the one that has been used to create the datacards", ) - all_hists = load_hists_uproot(self.fit_diagnostics_file) outp = self.output() if self.prefit: fit_type = "prefit" else: - fit_type = "fit_s" + fit_type = "postfit" - __import__("IPython").embed() + all_hists = load_hists_uproot(self.fit_diagnostics_file, fit_type) + process_insts = list(map(self.config_inst.get_process, self.processes)) - all_hists = all_hists[f"shapes_{fit_type}"] - - for channel, hists in all_hists.items(): + for channel, h_in in all_hists.items(): has_category = self.inference_model_inst.has_category(channel) if not has_category: logger.warning(f"Category {channel} is not part of the inference model {self.inference_model}") - for proc_key in list(hists.keys()): - # remove unnecessary histograms - if "data" in proc_key or "total" in proc_key: - hists.pop(proc_key) + hists = defaultdict(OrderedDict) + + for proc_key in list(h_in.keys()): + proc_inst = None + # try getting the config process via InferenceModel + if has_category: + # TODO: process customization based on inference process? e.g. scale + inference_process = self.inference_model_inst.get_process(proc_key, channel) + proc_inst = self.config_inst.get_process(inference_process.config_process) + else: + # try getting proc inst directly via config + proc_inst = self.config_inst.get_process(proc_key, default=None) + + # replace string keys with process instances + if proc_inst: + plot_proc = [ + proc for proc in process_insts if proc.has_process(proc_inst) or proc.name == proc_inst.name + ] + if len(plot_proc) != 1: + if len(plot_proc) > 1: + raise Exception(f"{proc_key} was assigned to more then one porcess insts ({plot_proc}) ") + else: + logger.warning(f"{proc_key} in root file, but won't be plotted.") continue - proc_inst = None - # try getting the config process via InferenceModel - if has_category: - # TODO: process customization based on inference process? e.g. scale - inference_process = self.inference_model_inst.get_process(proc_key, channel) - proc_inst = self.config_inst.get_process(inference_process.config_process) + if plot_proc[0] not in hists: + hists[plot_proc[0]] = {} + hists[plot_proc[0]] = h_in[proc_key] else: - # try getting proc inst directly via config - proc_inst = self.config_inst.get_process(proc_key, default=None) - - # replace string keys with process instances - if proc_inst: - hists[proc_inst] = hists[proc_key] - hists.pop(proc_key) + hists[plot_proc[0]] = hists[plot_proc[0]] + h_in[proc_key] - # try getting the config category and variable via InferenceModel if has_category: - # TODO: category/variable customization based on inference model? inference_category = self.inference_model_inst.get_category(channel) config_category = self.config_inst.get_category(inference_category.config_category) variable_inst = self.config_inst.get_variable(inference_category.config_variable) From 1ff9bb3fdf349e53e4334b051c6e923accb364f7 Mon Sep 17 00:00:00 2001 From: Lara Sophie Markus Date: Fri, 24 Jan 2025 08:10:31 +0100 Subject: [PATCH 3/8] cleanup and fixes in postfit plotting --- hbw/config/defaults_and_groups.py | 8 +-- hbw/config/styling.py | 2 + hbw/tasks/inference.py | 10 ++++ hbw/tasks/postfit_plots.py | 88 +++++++++++++------------------ 4 files changed, 52 insertions(+), 56 deletions(-) diff --git a/hbw/config/defaults_and_groups.py b/hbw/config/defaults_and_groups.py index dac08f38..883f758f 100644 --- a/hbw/config/defaults_and_groups.py +++ b/hbw/config/defaults_and_groups.py @@ -151,8 +151,8 @@ def set_config_defaults_and_groups(config_inst): "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", ], "test_postfit": [ - "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", - "hh_ggf_hbb_hww2l2nu_kl1_kt1", + # "hh_vbf_hbb_hww2l2nu", + "hh_ggf_hbb_hww2l2nu", "st", "tt", "dy", @@ -386,8 +386,8 @@ def set_config_defaults_and_groups(config_inst): for proc, _, _ in config_inst.walk_processes() if proc.has_tag("is_signal") }, "dilep": { - "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1": {"scale": 90000, "unstack": True}, - "hh_ggf_hbb_hww2l2nu_kl1_kt1": {"scale": 10000, "unstack": True}, + "hh_vbf_hbb_hww2l2nu": {"scale": 90000, "unstack": True}, + "hh_ggf_hbb_hww2l2nu": {"scale": 10000, "unstack": True}, }, "dileptest": { "hh_ggf_hbb_hvv2l2nu_kl1_kt1": {"scale": 10000, "unstack": True}, diff --git a/hbw/config/styling.py b/hbw/config/styling.py index 318f7db4..83fd2aa3 100644 --- a/hbw/config/styling.py +++ b/hbw/config/styling.py @@ -203,7 +203,9 @@ def stylize_processes(config: od.Config) -> None: # unstack signal in plotting if "hh_" in proc.name.lower(): + proc.has_tag("is_signal") proc.unstack = True + proc.scale = "stack" # labels used for ML categories proc.x.ml_label = ml_labels.get(proc.name, proc.name) diff --git a/hbw/tasks/inference.py b/hbw/tasks/inference.py index 0bb35882..1cd0357b 100644 --- a/hbw/tasks/inference.py +++ b/hbw/tasks/inference.py @@ -508,6 +508,9 @@ def run(self): # combine category names with card fn to a single string datacards = ",".join([f"{cat_name}=$CARDS_PATH/{card_fn}" for cat_name, card_fn in zip(cat_names, card_fns)]) + # # name of the output root file that contains the Pre+Postfit shapes + # output_file = "" + base_cmd = f"export CARDS_PATH={cards_path}" + "\n" print("\n\n") @@ -541,4 +544,11 @@ def run(self): f"--order-by-impact" ) print(cmd, "\n\n") + + # running PreAndPostfitShapes for Pre+Postfit plots + cmd = base_cmd + ( + f"law run PreAndPostFitShapes --version {identifier} --datacards {datacards} " + # f"--output-name {output_file}" + ) + print(cmd, "\n\n") output["FitDiagnostics"].dump(cmd, formatter="text") diff --git a/hbw/tasks/postfit_plots.py b/hbw/tasks/postfit_plots.py index 949f0b7e..faa0dfbf 100644 --- a/hbw/tasks/postfit_plots.py +++ b/hbw/tasks/postfit_plots.py @@ -11,7 +11,6 @@ import law import luigi import order as od -import re from columnflow.tasks.framework.base import ConfigTask from columnflow.tasks.framework.mixins import ( InferenceModelMixin, MLModelsMixin, ProducersMixin, SelectorStepsMixin, @@ -34,40 +33,6 @@ logger = law.logger.get_logger(__name__) -def reverse_inf_proc(proc): - """ - Helper function that reverses the transformations done by inf_proc. - """ - if proc.startswith("ggHH_"): - # Adjust pattern to split the last part into two groups - pattern = r"ggHH_kl_([mp\d]+)_kt_([mp\d]+)_([a-zA-Z\d]{3})([a-zA-Z\d]+)" - replacement = r"hh_ggf_\3_\4_kl\1_kt\2" - return re.sub(pattern, replacement, proc) - elif proc.startswith("qqHH_"): - # Adjust pattern to split the last part into two groups - pattern = r"qqHH_CV_([mp\d]+)_C2V_([mp\d]+)_kl_([mp\d]+)_([a-zA-Z\d]{3})([a-zA-Z\d]+)" - replacement = r"hh_vbf_\4_\5_kv\1_k2v\2_kl\3" - return re.sub(pattern, replacement, proc) - elif proc == "qqH": - pattern = r"qqH" - replacement = r"h_vbf" - return re.sub(pattern, replacement, proc) - elif proc == "ggH": - pattern = r"ggH" - replacement = r"h_ggf" - return re.sub(pattern, replacement, proc) - elif proc == "ggZH": - pattern = r"ggZH" - replacement = r"zh_gg" - return re.sub(pattern, replacement, proc) - elif "H" in proc: - proc = proc.lower() - return proc - else: - # If the string doesn't match the patterns, return it unchanged - return proc - - def load_hists_uproot(fit_diagnostics_path, fit_type): """ Helper to load histograms from a fit_diagnostics file """ with uproot.open(fit_diagnostics_path) as tfile: @@ -154,7 +119,7 @@ def get_hists_from_multidimfit(tfile): def plot_postfit_shapes( - hists: OrderedDict[od.Process, hist.Hist], + h: OrderedDict, # [od.Process, hist.Hist], config_inst: od.Config, category_inst: od.Category, variable_insts: list[od.Variable], @@ -168,7 +133,7 @@ def plot_postfit_shapes( **kwargs, ) -> tuple(plt.Figure, tuple(plt.Axes)): variable_inst = law.util.make_tuple(variable_insts)[0] - hists = apply_process_settings(hists, process_settings) + hists = apply_process_settings(h.copy(), process_settings) plot_config = prepare_plot_config( hists, shape_norm=shape_norm, @@ -232,11 +197,20 @@ class PlotPostfitShapes( description="Whether to do prefit or postfit plots; defaults to False", ) + @property + def fit_type(self) -> str: + if self.prefit: + fit_type = "prefit" + else: + fit_type = "postfit" + self._fit_type = fit_type + return self._fit_type + def requires(self): return {} def output(self): - return {"plots": self.target("plots", dir=True)} + return {"plots": self.target(f"plots_{self.fit_type}", dir=True)} @view_output_plots def run(self): @@ -246,12 +220,8 @@ def run(self): ) outp = self.output() - if self.prefit: - fit_type = "prefit" - else: - fit_type = "postfit" - all_hists = load_hists_uproot(self.fit_diagnostics_file, fit_type) + all_hists = load_hists_uproot(self.fit_diagnostics_file, self.fit_type) process_insts = list(map(self.config_inst.get_process, self.processes)) for channel, h_in in all_hists.items(): @@ -259,8 +229,9 @@ def run(self): if not has_category: logger.warning(f"Category {channel} is not part of the inference model {self.inference_model}") - hists = defaultdict(OrderedDict) + hist_map = defaultdict(list) + # First map process inst for plotting to processes of root shapes for proc_key in list(h_in.keys()): proc_inst = None # try getting the config process via InferenceModel @@ -273,22 +244,31 @@ def run(self): proc_inst = self.config_inst.get_process(proc_key, default=None) # replace string keys with process instances + # map HHinference processes to plotting proc_inst if proc_inst: plot_proc = [ proc for proc in process_insts if proc.has_process(proc_inst) or proc.name == proc_inst.name ] if len(plot_proc) != 1: if len(plot_proc) > 1: - raise Exception(f"{proc_key} was assigned to more then one porcess insts ({plot_proc}) ") + logger.warning(f"{proc_key} was assigned to more then one process insts ({plot_proc}) ") else: logger.warning(f"{proc_key} in root file, but won't be plotted.") - continue + continue - if plot_proc[0] not in hists: - hists[plot_proc[0]] = {} - hists[plot_proc[0]] = h_in[proc_key] + if plot_proc[0] not in hist_map: + hist_map[plot_proc[0]] = [proc_key] else: - hists[plot_proc[0]] = hists[plot_proc[0]] + h_in[proc_key] + hist_map[plot_proc[0]].append(proc_key) + + # Plot Pre/Postfit plot for each channel + for channel, h_in in all_hists.items(): + hists = defaultdict(OrderedDict) + for proc in hist_map: + plot_proc = proc.copy() + hists[plot_proc] = h_in[hist_map[proc][0]] + for p in hist_map[proc][1:]: + hists[plot_proc] += h_in[p] if has_category: inference_category = self.inference_model_inst.get_category(channel) @@ -299,14 +279,18 @@ def run(self): config_category = od.Category(channel, id=1) variable_inst = od.Variable("dummy") + # take copy of proc_inst so labeling, sclaing etc is not modified on proc inst directly + + # __import__("IPYthon").embed() + h = hists.copy() # call the plot function fig, _ = self.call_plot_func( self.plot_function, - hists=hists, + h=h, config_inst=self.config_inst, category_inst=config_category, variable_insts=variable_inst, **self.get_plot_parameters(), ) - outp["plots"].child(f"{channel}_{fit_type}.pdf", type="f").dump(fig, formatter="mpl") + outp["plots"].child(f"{channel}_{self.fit_type}.pdf", type="f").dump(fig, formatter="mpl") From c38c4583c6ad15bb350c1b7e78ef2a1363cc35e7 Mon Sep 17 00:00:00 2001 From: lamarkus <67378765+Lara813@users.noreply.github.com> Date: Fri, 24 Jan 2025 10:12:23 +0100 Subject: [PATCH 4/8] Update hbw/ml/data_loader.py Co-authored-by: Mathis Frahm <49306645+mafrahm@users.noreply.github.com> --- hbw/ml/data_loader.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hbw/ml/data_loader.py b/hbw/ml/data_loader.py index ff71d0da..87826ee9 100644 --- a/hbw/ml/data_loader.py +++ b/hbw/ml/data_loader.py @@ -240,10 +240,9 @@ def shuffle_indices(self) -> np.ndarray: @property def num_event_per_process(self) -> str: if not self.skip_mask: - self._num_events_per_process = "num_events_pos_weights_per_process" + return "num_events_pos_weights_per_process" else: - self._num_events_per_process = "num_events_per_process" - return self._num_events_per_process + return "num_events_per_process" def get_xsec_train_weights(self) -> np.ndarray: """ From 69460b11c42c00b2d5287511adfd16842521c896 Mon Sep 17 00:00:00 2001 From: lamarkus <67378765+Lara813@users.noreply.github.com> Date: Fri, 24 Jan 2025 10:23:22 +0100 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: Mathis Frahm <49306645+mafrahm@users.noreply.github.com> --- hbw/plotting/plot_fits.py | 3 +-- hbw/tasks/inference.py | 15 ++++++--------- hbw/tasks/postfit_plots.py | 28 ++++++++++++---------------- 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/hbw/plotting/plot_fits.py b/hbw/plotting/plot_fits.py index e7cb8f13..c1d9af4e 100644 --- a/hbw/plotting/plot_fits.py +++ b/hbw/plotting/plot_fits.py @@ -52,8 +52,7 @@ def scalable_exponnorm(x, A, loc, scale, K=1): def plot_fit( - hists: dict[str, OrderedDict[od.Process, hist.Hist]], - # hists: OrderedDict[od.Process, hist.Hist], + hists: OrderedDict[od.Process, hist.Hist], config_inst: od.Config, category_inst: od.Category, variable_insts: list[od.Variable], diff --git a/hbw/tasks/inference.py b/hbw/tasks/inference.py index 1cd0357b..f291765d 100644 --- a/hbw/tasks/inference.py +++ b/hbw/tasks/inference.py @@ -31,6 +31,7 @@ # Function copied from Mathis Hist hook commit +# TODO: define once at central place (hist_util.py) def apply_rebinning_edges(h: hist.Histogram, axis_name: str, edges: list): """ Generalized rebinning of a single axis from a hist.Histogram, using predefined edges. @@ -322,15 +323,11 @@ def get_rebin_processes(self): rebin_process_condition = self.inference_category_rebin_processes.get(config_category, None) if not rebin_process_condition: - if "ggf" in config_category: - for proc in processes.copy(): - proc_name = proc.config_process - - logger.warning( - f"No rebin condition found for category {config_category}; rebinning will be flat " - f"on all processes {[proc.config_process for proc in processes]}", - ) - return processes + logger.warning( + f"No rebin condition found for category {config_category}; rebinning will be flat " + f"on all processes {[proc.config_process for proc in processes]}", + ) + return processes # transform `rebin_process_condition` into Callable if required if not isinstance(rebin_process_condition, Callable): diff --git a/hbw/tasks/postfit_plots.py b/hbw/tasks/postfit_plots.py index faa0dfbf..b931a7a0 100644 --- a/hbw/tasks/postfit_plots.py +++ b/hbw/tasks/postfit_plots.py @@ -37,12 +37,10 @@ def load_hists_uproot(fit_diagnostics_path, fit_type): """ Helper to load histograms from a fit_diagnostics file """ with uproot.open(fit_diagnostics_path) as tfile: if any("shapes_fit_s" in _k for _k in tfile.keys()): - if fit_type != "prefit": + if fit_type == "postfit": fit_type = "fit_s" hists = get_hists_from_fit_diagnostics(tfile)[f"shapes_{fit_type}"] else: - # if fit_type != "prefit": - # fit_type = "postfit" hists = get_hists_from_multidimfit(tfile)[f"{fit_type}"] return hists @@ -119,7 +117,7 @@ def get_hists_from_multidimfit(tfile): def plot_postfit_shapes( - h: OrderedDict, # [od.Process, hist.Hist], + hists: OrderedDict[od.Process, hist.Hist], config_inst: od.Config, category_inst: od.Category, variable_insts: list[od.Variable], @@ -133,7 +131,7 @@ def plot_postfit_shapes( **kwargs, ) -> tuple(plt.Figure, tuple(plt.Axes)): variable_inst = law.util.make_tuple(variable_insts)[0] - hists = apply_process_settings(h.copy(), process_settings) + hists = apply_process_settings(hists, process_settings) plot_config = prepare_plot_config( hists, shape_norm=shape_norm, @@ -200,11 +198,9 @@ class PlotPostfitShapes( @property def fit_type(self) -> str: if self.prefit: - fit_type = "prefit" + return "prefit" else: - fit_type = "postfit" - self._fit_type = fit_type - return self._fit_type + return "postfit" def requires(self): return {} @@ -232,11 +228,10 @@ def run(self): hist_map = defaultdict(list) # First map process inst for plotting to processes of root shapes - for proc_key in list(h_in.keys()): + for proc_key in h_in.keys(): proc_inst = None # try getting the config process via InferenceModel if has_category: - # TODO: process customization based on inference process? e.g. scale inference_process = self.inference_model_inst.get_process(proc_key, channel) proc_inst = self.config_inst.get_process(inference_process.config_process) else: @@ -249,12 +244,13 @@ def run(self): plot_proc = [ proc for proc in process_insts if proc.has_process(proc_inst) or proc.name == proc_inst.name ] - if len(plot_proc) != 1: - if len(plot_proc) > 1: - logger.warning(f"{proc_key} was assigned to more then one process insts ({plot_proc}) ") - else: - logger.warning(f"{proc_key} in root file, but won't be plotted.") + if len(plot_proc) > 1: + logger.warning(f"{proc_key} was assigned to ({", ".join([p.name for p in plot_proc])}) but {plot_proc[].name was chosen}") + elif len(plot_proc) == 0: + logger.warning(f"{proc_key} in root file, but won't be plotted.") continue + plot_proc = plot_proc[0] + if plot_proc[0] not in hist_map: hist_map[plot_proc[0]] = [proc_key] From 9dba52ea2f9b658d30c219c95fdee00e3c443881 Mon Sep 17 00:00:00 2001 From: Lara Date: Fri, 24 Jan 2025 14:10:45 +0100 Subject: [PATCH 6/8] Review changes --- hbw/ml/tf_util.py | 5 --- hbw/tasks/postfit_plots.py | 87 ++++++++++++++++++++++++-------------- 2 files changed, 55 insertions(+), 37 deletions(-) diff --git a/hbw/ml/tf_util.py b/hbw/ml/tf_util.py index 7b9c6bc7..d187bfcf 100644 --- a/hbw/ml/tf_util.py +++ b/hbw/ml/tf_util.py @@ -165,11 +165,6 @@ def _cumulated_crossenropy_from_logits(y_true, y_pred, axis): return ((numerator - denominator) / tf.math.reduce_sum(y_true, axis=axis))[:, 0] -@tf.function -def categorical_crossentropy(): - return tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE) - - @tf.function def cumulated_crossentropy(y_true, y_pred, from_logits=False, axis=-1): if from_logits: diff --git a/hbw/tasks/postfit_plots.py b/hbw/tasks/postfit_plots.py index b931a7a0..4069b211 100644 --- a/hbw/tasks/postfit_plots.py +++ b/hbw/tasks/postfit_plots.py @@ -47,6 +47,8 @@ def load_hists_uproot(fit_diagnostics_path, fit_type): def get_hists_from_fit_diagnostics(tfile): + """ Helper function to load histograms from root file created by FitDiagnostics """ + # prepare output dict hists = DotDict() @@ -74,7 +76,7 @@ def get_hists_from_fit_diagnostics(tfile): def get_hists_from_multidimfit(tfile): - """ Helper to load histograms from a fit_diagnostics file """ + """ Helper function to load histograms from root file created by MultiDimFit """ # prepare output dict hists = DotDict() keys = [key.split("/") for key in tfile.keys()] @@ -179,6 +181,7 @@ class PlotPostfitShapes( """ sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + datasets = None plot_function = PlotBase.plot_function.copy( default="hbw.tasks.postfit_plots.plot_postfit_shapes", @@ -208,29 +211,18 @@ def requires(self): def output(self): return {"plots": self.target(f"plots_{self.fit_type}", dir=True)} - @view_output_plots - def run(self): - logger.warning( - f"Note! It is important that the requested inference_model {self.inference_model} " - "is identical to the one that has been used to create the datacards", - ) - - outp = self.output() - - all_hists = load_hists_uproot(self.fit_diagnostics_file, self.fit_type) - process_insts = list(map(self.config_inst.get_process, self.processes)) - - for channel, h_in in all_hists.items(): - has_category = self.inference_model_inst.has_category(channel) - if not has_category: - logger.warning(f"Category {channel} is not part of the inference model {self.inference_model}") + # map processes in root shape to corresponding process instance used for plotting + def prepare_hist_map(self, hist_processes: set, process_insts: list) -> defaultdict(list): hist_map = defaultdict(list) - # First map process inst for plotting to processes of root shapes - for proc_key in h_in.keys(): + for proc_key in hist_processes: proc_inst = None # try getting the config process via InferenceModel + # NOTE: Only the first category is taken to get the config process instance, + # assuming they are the same for all categories + channel = self.inference_model_inst.get_categories_with_process(proc_key)[0] + has_category = self.inference_model_inst.has_category(channel) if has_category: inference_process = self.inference_model_inst.get_process(proc_key, channel) proc_inst = self.config_inst.get_process(inference_process.config_process) @@ -239,29 +231,63 @@ def run(self): proc_inst = self.config_inst.get_process(proc_key, default=None) # replace string keys with process instances - # map HHinference processes to plotting proc_inst + # map HHinference processes to process instances for plotting if proc_inst: plot_proc = [ proc for proc in process_insts if proc.has_process(proc_inst) or proc.name == proc_inst.name ] if len(plot_proc) > 1: - logger.warning(f"{proc_key} was assigned to ({", ".join([p.name for p in plot_proc])}) but {plot_proc[].name was chosen}") + logger.warning( + f"{proc_key} was assigned to ({','.join([p.name for p in plot_proc])})", + f" but {plot_proc[0].name} was chosen", + ) elif len(plot_proc) == 0: logger.warning(f"{proc_key} in root file, but won't be plotted.") - continue + continue plot_proc = plot_proc[0] - - if plot_proc[0] not in hist_map: - hist_map[plot_proc[0]] = [proc_key] + if plot_proc not in hist_map: + hist_map[plot_proc] = [proc_key] else: - hist_map[plot_proc[0]].append(proc_key) + hist_map[plot_proc].append(proc_key) + + return hist_map + + @view_output_plots + def run(self): + logger.warning( + f"Note! It is important that the requested inference_model {self.inference_model} " + "is identical to the one that has been used to create the datacards", + ) + + outp = self.output() + + # Load all required histograms corresponding to the fit_type from the root input file + all_hists = load_hists_uproot(self.fit_diagnostics_file, self.fit_type) + + # Get list of all process instances required for plotting + process_insts = list(map(self.config_inst.get_process, self.processes)) + + # map processes in root shape to corresponding process instance used for plotting + hist_processes = {key for _, h_in in all_hists.items() for key in h_in.keys()} + hist_map = self.prepare_hist_map(hist_processes, process_insts) # Plot Pre/Postfit plot for each channel for channel, h_in in all_hists.items(): + + # Check for coherence between inference and pre/postfit categories + has_category = self.inference_model_inst.has_category(channel) + if not has_category: + logger.warning(f"Category {channel} is not part of the inference model {self.inference_model}") + continue + + # Create Histograms hists = defaultdict(OrderedDict) for proc in hist_map: - plot_proc = proc.copy() + plot_proc = proc.copy() # NOTE: copy produced, so actual process is not modified by process settings + if hist_map[proc][0] not in h_in: + logger.warning(f"No histograms for {proc.name} found in {channel}.") + continue hists[plot_proc] = h_in[hist_map[proc][0]] for p in hist_map[proc][1:]: hists[plot_proc] += h_in[p] @@ -275,14 +301,11 @@ def run(self): config_category = od.Category(channel, id=1) variable_inst = od.Variable("dummy") - # take copy of proc_inst so labeling, sclaing etc is not modified on proc inst directly - - # __import__("IPYthon").embed() - h = hists.copy() # call the plot function + h = hists.copy() # NOTE: copy produced, so actual process is not modified by process settings fig, _ = self.call_plot_func( self.plot_function, - h=h, + hists=h, config_inst=self.config_inst, category_inst=config_category, variable_insts=variable_inst, From caccecd8e0db55a6175125222819c620c170f1b4 Mon Sep 17 00:00:00 2001 From: lamarkus <67378765+Lara813@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:28:38 +0100 Subject: [PATCH 7/8] Apply suggestions from code review Co-authored-by: Mathis Frahm <49306645+mafrahm@users.noreply.github.com> --- hbw/ml/mixins.py | 25 +++++++------------------ hbw/tasks/inference.py | 1 + 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/hbw/ml/mixins.py b/hbw/ml/mixins.py index 2fc36b01..4e05e4dd 100644 --- a/hbw/ml/mixins.py +++ b/hbw/ml/mixins.py @@ -97,24 +97,13 @@ def prepare_ml_model( epsilon=1e-6, amsgrad=False, ) - if self.negative_weights == "ignore": - model.compile( - # NOTE: we'd preferrably use the Keras CCE, but it does not work when assigning one event - # to multiple classes (target with multiple entries != 0) - loss="categorical_crossentropy", - optimizer=optimizer, - metrics=["categorical_accuracy"], - weighted_metrics=["categorical_accuracy"], - ) - else: - model.compile( - # NOTE: we'd preferrably use the Keras CCE, but it does not work when assigning one event - # to multiple classes (target with multiple entries != 0) - loss=cumulated_crossentropy, - optimizer=optimizer, - metrics=["categorical_accuracy"], - weighted_metrics=["categorical_accuracy"], - ) + model_compile_kwargs = { + "loss": "categorical_crossentropy" if self.negative_weights == "ignore" else cumulated_crossentropy, + "optimizer": optimizer, + "metrics": ["categorical_accuracy"], + "weighted_metrics": ["categorical_accuracy"], + } + model.compile(**model_compile_kwargs) return model diff --git a/hbw/tasks/inference.py b/hbw/tasks/inference.py index f291765d..536f670d 100644 --- a/hbw/tasks/inference.py +++ b/hbw/tasks/inference.py @@ -32,6 +32,7 @@ # Function copied from Mathis Hist hook commit # TODO: define once at central place (hist_util.py) +# TODO: define once at central place (hist_util.py) def apply_rebinning_edges(h: hist.Histogram, axis_name: str, edges: list): """ Generalized rebinning of a single axis from a hist.Histogram, using predefined edges. From 7641039614690fc24d5cda35216ce852fad6f361 Mon Sep 17 00:00:00 2001 From: lamarkus <67378765+Lara813@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:29:58 +0100 Subject: [PATCH 8/8] Update hbw/tasks/inference.py Co-authored-by: Mathis Frahm <49306645+mafrahm@users.noreply.github.com> --- hbw/ml/base.py | 27 +++++++++------------------ hbw/ml/mixins.py | 2 +- hbw/tasks/inference.py | 1 - 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/hbw/ml/base.py b/hbw/ml/base.py index ec54be53..7644f7a2 100644 --- a/hbw/ml/base.py +++ b/hbw/ml/base.py @@ -335,23 +335,14 @@ def open_model(self, target: law.LocalDirectoryTarget) -> dict[str, Any]: models["parameters"] = yaml.load(f_in, Loader=yaml.Loader) # custom loss needed due to output layer changes for negative weights - from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy + from hbw.ml.tf_util import cumulated_crossentropy - # Check for negative weight handling and assign loss function accordingly. - if self.negative_weights == "ignore": - models["model"] = tf.keras.models.load_model( - target["mlmodel"].path, custom_objects={categorical_crossentropy.__name__: categorical_crossentropy}, - ) - models["best_model"] = tf.keras.models.load_model( - target["checkpoint"].path, custom_objects={categorical_crossentropy.__name__: categorical_crossentropy}, - ) - else: - models["model"] = tf.keras.models.load_model( - target["mlmodel"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, - ) - models["best_model"] = tf.keras.models.load_model( - target["checkpoint"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, - ) + models["model"] = tf.keras.models.load_model( + target["mlmodel"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, + ) + models["best_model"] = tf.keras.models.load_model( + target["checkpoint"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, + ) return models @@ -553,7 +544,7 @@ def prepare_ml_model( from keras.models import Sequential from keras.layers import Dense, BatchNormalization - from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy + from hbw.ml.tf_util import cumulated_crossentropy n_inputs = len(set(self.input_features)) n_outputs = len(self.processes) @@ -576,7 +567,7 @@ def prepare_ml_model( optimizer = keras.optimizers.Adam(learning_rate=0.00050) if self.negative_weights == "ignore": model.compile( - loss=categorical_crossentropy, + loss="categorical_crossentropy", optimizer=optimizer, weighted_metrics=["categorical_accuracy"], ) diff --git a/hbw/ml/mixins.py b/hbw/ml/mixins.py index 4e05e4dd..3a423119 100644 --- a/hbw/ml/mixins.py +++ b/hbw/ml/mixins.py @@ -55,7 +55,7 @@ def prepare_ml_model( import tensorflow.keras as keras from keras.models import Sequential from keras.layers import Dense, BatchNormalization - from hbw.ml.tf_util import cumulated_crossentropy # , categorical_crossentropy + from hbw.ml.tf_util import cumulated_crossentropy n_inputs = len(set(self.input_features)) n_outputs = len(self.processes) diff --git a/hbw/tasks/inference.py b/hbw/tasks/inference.py index 536f670d..f291765d 100644 --- a/hbw/tasks/inference.py +++ b/hbw/tasks/inference.py @@ -32,7 +32,6 @@ # Function copied from Mathis Hist hook commit # TODO: define once at central place (hist_util.py) -# TODO: define once at central place (hist_util.py) def apply_rebinning_edges(h: hist.Histogram, axis_name: str, edges: list): """ Generalized rebinning of a single axis from a hist.Histogram, using predefined edges.