diff --git a/hbw/categorization/categories.py b/hbw/categorization/categories.py index c5799e36..0320603b 100644 --- a/hbw/categorization/categories.py +++ b/hbw/categorization/categories.py @@ -272,6 +272,7 @@ def catid_2b(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, a # TODO: not hard-coded -> use config? ml_processes = [ "signal_ggf", "signal_ggf2", "signal_vbf", "signal_vbf2", + "signal_ggf4", "signal_ggf5", "signal_vbf4", "signal_vbf5", "hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "hh_ggf_hbb_hvvqqlnu_kl1_kt1", "hh_vbf_hbb_hvvqqlnu_kv1_k2v1_kl1", "hh_ggf_hbb_hvv2l2nu_kl1_kt1", "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", diff --git a/hbw/config/defaults_and_groups.py b/hbw/config/defaults_and_groups.py index d3351dc9..9bb728d7 100644 --- a/hbw/config/defaults_and_groups.py +++ b/hbw/config/defaults_and_groups.py @@ -128,6 +128,28 @@ def set_config_defaults_and_groups(config_inst): # process groups for conveniently looping over certain processs # (used in wrapper_factory and during plotting) config_inst.x.process_groups = { + "ml_study": [ + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy_m4to10", "dy_m10to50", "dy_m50toinf", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], "all": ["*"], "default": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "dy", "st", "vv", "w_lnu", "h"], # noqa: E501 "sl": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "qcd", "st", "dy", "vv", "w_lnu", "h"], # noqa: E501 @@ -273,6 +295,10 @@ def set_config_defaults_and_groups(config_inst): ), # Dilepton "SR_dl": ( + "sr__1b__ml_signal_ggf", "sr__1b__ml_signal_ggf2", "sr__2b__ml_signal_ggf", "sr__2b__ml_signal_ggf2", + "sr__1b__ml_signal_vbf", "sr__1b__ml_signal_vbf2", "sr__2b__ml_signal_vbf", "sr__2b__ml_signal_vbf2", + "sr__1b__ml_signal_ggf4", "sr__1b__ml_signal_ggf5", "sr__2b__ml_signal_ggf4", "sr__2b__ml_signal_ggf5", + "sr__1b__ml_signal_vbf4", "sr__1b__ml_signal_vbf5", "sr__2b__ml_signal_vbf4", "sr__2b__ml_signal_vbf5", "sr__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2mu__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2mu__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2e__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2e__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", @@ -442,4 +468,5 @@ def set_config_defaults_and_groups(config_inst): "vbfSR_dl_resolved": is_signal_sm, "vbfSR_dl_boosted": is_signal_sm, "BR_dl": is_background, + } diff --git a/hbw/inference/dl.py b/hbw/inference/dl.py index 9bde981f..a17c3dc2 100644 --- a/hbw/inference/dl.py +++ b/hbw/inference/dl.py @@ -153,6 +153,92 @@ # "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1", # "hh_ggf_hbb_hvv2l2nu_kl5_kt1", +dl.derive("dl_ml_study_5", cls_dict={ + "ml_model_name": "dl_22post_ml_study_5", + "config_categories": [ + "sr__1b__ml_signal_ggf5", + "sr__1b__ml_signal_vbf5", + "sr__1b__ml_tt", + "sr__1b__ml_st", + "sr__1b__ml_dy", + "sr__1b__ml_h", + "sr__2b__ml_signal_ggf5", + "sr__2b__ml_signal_vbf5", + 
"sr__2b__ml_tt", + "sr__2b__ml_st", + "sr__2b__ml_dy", + "sr__2b__ml_h", + ], + "processes": [ + # qqHH_CV_m0p012_C2V_0p03_kl_10p2 + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "systematics": rate_systematics, +}) + +dl.derive("dl_ml_study_4", cls_dict={ + "ml_model_name": "dl_22post_ml_study_4", + "config_categories": [ + "sr__1b__ml_signal_ggf4", + "sr__1b__ml_signal_vbf4", + "sr__1b__ml_tt", + "sr__1b__ml_st", + "sr__1b__ml_dy", + "sr__1b__ml_h", + "sr__2b__ml_signal_ggf4", + "sr__2b__ml_signal_vbf4", + "sr__2b__ml_tt", + "sr__2b__ml_st", + "sr__2b__ml_dy", + "sr__2b__ml_h", + ], + "processes": [ + # qqHH_CV_m0p012_C2V_0p03_kl_10p2 + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "systematics": rate_systematics, +}) + dl.derive("dl_ml_study_1", cls_dict={ "ml_model_name": "dl_22post_ml_study_1", @@ -171,16 +257,65 @@ "sr__2b__ml_h", ], "processes": [ - "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", - "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1", - "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43", - "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94", - "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36", - "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", - "hh_ggf_hbb_hvv2l2nu_kl0_kt1", - "hh_ggf_hbb_hvv2l2nu_kl1_kt1", - "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1", - "hh_ggf_hbb_hvv2l2nu_kl5_kt1", + # qqHH_CV_m0p012_C2V_0p03_kl_10p2 + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "systematics": rate_systematics, +}) + +dl.derive("dl_ml_study_3", cls_dict={ + 
"ml_model_name": "dl_22_procs1_w0", + "config_categories": [ + "sr__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "sr__1b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", + "sr__1b__ml_tt", + "sr__1b__ml_st", + "sr__1b__ml_dy", + "sr__1b__ml_h", + "sr__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "sr__2b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", + "sr__2b__ml_tt", + "sr__2b__ml_st", + "sr__2b__ml_dy", + "sr__2b__ml_h", + ], + "processes": [ + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", "tt", "dy", "w_lnu", @@ -207,16 +342,22 @@ "sr__2b__ml_h", ], "processes": [ - "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", - "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1", - "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43", - "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94", - "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36", - "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", - "hh_ggf_hbb_hvv2l2nu_kl0_kt1", - "hh_ggf_hbb_hvv2l2nu_kl1_kt1", - "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1", - "hh_ggf_hbb_hvv2l2nu_kl5_kt1", + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", "tt", "dy", "w_lnu", diff --git a/hbw/ml/base.py b/hbw/ml/base.py index c1774524..110d78a0 100644 --- a/hbw/ml/base.py +++ b/hbw/ml/base.py @@ -322,13 +322,23 @@ def open_model(self, target: law.LocalDirectoryTarget) -> dict[str, Any]: models["parameters"] = yaml.load(f_in, Loader=yaml.Loader) # custom loss needed due to output layer changes for negative weights - from hbw.ml.tf_util import cumulated_crossentropy - models["model"] = tf.keras.models.load_model( - target["mlmodel"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, - ) - models["best_model"] = tf.keras.models.load_model( - target["checkpoint"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, - ) + from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy + + # Check for negative weight handling and assign loss function accordingly. 
+ if self.negative_weights == "ignore": + models["model"] = tf.keras.models.load_model( + target["mlmodel"].path, custom_objects={categorical_crossentropy.__name__: categorical_crossentropy}, + ) + models["best_model"] = tf.keras.models.load_model( + target["checkpoint"].path, custom_objects={categorical_crossentropy.__name__: categorical_crossentropy}, + ) + else: + models["model"] = tf.keras.models.load_model( + target["mlmodel"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, + ) + models["best_model"] = tf.keras.models.load_model( + target["checkpoint"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, + ) return models @@ -360,7 +370,6 @@ def load_data( # load into memory validation.load_all log_memory("loading validation data") - # store input features as an output output["mlmodel"].child("input_features.pkl", type="f").dump(self.input_features_ordered, formatter="pickle") @@ -401,7 +410,6 @@ def train( # # training # - self.fit_ml_model(task, model, train, validation, output) log_memory("training") # save the model and history; TODO: use formatter @@ -441,7 +449,7 @@ def evaluate( """ Evaluation function that is run as part of the MLEvaluation task """ - use_best_model = False + use_best_model = False # TODO ML, hier auf True setzen? if len(events) == 0: logger.warning(f"Dataset {task.dataset} is empty. No columns are produced.") @@ -454,7 +462,7 @@ def evaluate( process = task.dataset_inst.x("ml_process", task.dataset_inst.processes.get_first().name) process_inst = task.config_inst.get_process(process) - ml_dataset = self.data_loader(self, process_inst, events) + ml_dataset = self.data_loader(self, process_inst, events, skip_mask=True) # # store the ml truth label in the events # events = set_ak_column( @@ -486,7 +494,6 @@ def evaluate( if len(pred[0]) != len(self.processes): raise Exception("Number of output nodes should be equal to number of processes") predictions.append(pred) - # store predictions for each model for j, proc in enumerate(self.processes): events = set_ak_column( @@ -533,7 +540,7 @@ def prepare_ml_model( from keras.models import Sequential from keras.layers import Dense, BatchNormalization - from hbw.ml.tf_util import cumulated_crossentropy + from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy n_inputs = len(set(self.input_features)) n_outputs = len(self.processes) @@ -554,11 +561,18 @@ def prepare_ml_model( # compile the network # NOTE: the custom loss needed due to output layer changes for negative weights optimizer = keras.optimizers.Adam(learning_rate=0.00050) - model.compile( - loss=cumulated_crossentropy, - optimizer=optimizer, - weighted_metrics=["categorical_accuracy"], - ) + if self.negative_weights == "ignore": + model.compile( + loss=categorical_crossentropy, + optimizer=optimizer, + weighted_metrics=["categorical_accuracy"], + ) + else: + model.compile( + loss=cumulated_crossentropy, + optimizer=optimizer, + weighted_metrics=["categorical_accuracy"], + ) return model diff --git a/hbw/ml/data_loader.py b/hbw/ml/data_loader.py index 55f7b11f..e0d60c08 100644 --- a/hbw/ml/data_loader.py +++ b/hbw/ml/data_loader.py @@ -119,7 +119,14 @@ class MLDatasetLoader: # _ "equal_train_weights", "xsec_train_weights", "train_weights", "equal_weights") evaluation_arrays: tuple = ("prediction",) - def __init__(self, ml_model_inst: MLModel, process: "str", events: ak.Array, stats: dict | None = None): + def __init__( + self, + ml_model_inst: MLModel, + process: "str", + events: 
ak.Array,
+        stats: dict | None = None,
+        skip_mask=False,
+    ):
         """
         Initializes the MLDatasetLoader with the given parameters.

@@ -136,10 +143,13 @@ def __init__(self, ml_model_inst: MLModel, process: "str", events: ak.Array, sta
         self._process = process

         proc_mask, _ = get_proc_mask(events, process, ml_model_inst.config_inst)
-        # TODO: die ohne _per_process müssen auch noch, still, per fold never make sense then anymore -> DISCUSS
         self._stats = stats
         # del_sub_proc_stats(process, sub_id)
-        self._events = events[proc_mask]
+        if not skip_mask:
+            self._events = events[proc_mask]
+            self._events = self._events[self._events.event_weight >= 0.0]
+        else:
+            self._events = events

     def __repr__(self):
         return f"{self.__class__.__name__}({self.ml_model_inst.cls_name}, {self.process})"
@@ -718,4 +728,4 @@ def prediction(self) -> np.ndarray:
             raise Exception("No trained model found in the MLModel instance. Cannot calculate prediction.")

         self._prediction = predict_numpy_on_batch(self._ml_model_inst.trained_model, self.features)
-        return self._prediction
+        return self._prediction  # TODO ML best model
diff --git a/hbw/ml/derived/dl.py b/hbw/ml/derived/dl.py
index 59b4a631..c7774484 100644
--- a/hbw/ml/derived/dl.py
+++ b/hbw/ml/derived/dl.py
@@ -81,7 +81,7 @@ class DenseClassifierDL(DenseModelMixin, ModelFitMixin, MLClassifierBase):
     store_name: str = "inputs_inclusive"

     folds: int = 5
-    negative_weights: str = "handle"
+    negative_weights: str = "ignore"

     # overwriting DenseModelMixin parameters
     activation: str = "relu"
@@ -269,6 +269,80 @@ def setup(self):
         "processes": ["signal", "top"],
     })

+dl_22post_ml_study_5 = dl_22post.derive("dl_22post_ml_study_5", cls_dict={
+    "training_configs": lambda self, requested_configs: ["c22post"],
+    "combine_processes": {
+        "signal_ggf5": {
+            # "name": "tt_and_st",
+            "label": "Signal GGF",
+            "sub_processes": [
+                "hh_ggf_hbb_hvv2l2nu_kl0_kt1",
+                "hh_ggf_hbb_hvv2l2nu_kl1_kt1",
+                "hh_ggf_hbb_hvv2l2nu_kl5_kt1",
+            ],
+            "weighting": "equal",
+        },
+        "signal_vbf5": {
+            # "name": "tt_and_st",
+            "label": "Signal VBF",
+            "sub_processes": [
+                "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
+                "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1",
+                "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43",
+                "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94",
+                "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36",
+                "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39",
+            ],
+            "weighting": "xsec",
+        },
+    },
+    "processes": [
+        "signal_ggf5",
+        "signal_vbf5",
+        "tt",
+        "st",
+        "dy",
+        "h",
+    ],
+})
+
+dl_22post_ml_study_4 = dl_22post.derive("dl_22post_ml_study_4", cls_dict={
+    "training_configs": lambda self, requested_configs: ["c22post"],
+    "combine_processes": {
+        "signal_ggf4": {
+            # "name": "tt_and_st",
+            "label": "Signal GGF",
+            "sub_processes": [
+                "hh_ggf_hbb_hvv2l2nu_kl0_kt1",
+                "hh_ggf_hbb_hvv2l2nu_kl1_kt1",
+                "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1",
+            ],
+            "weighting": "xsec",
+        },
+        "signal_vbf4": {
+            # "name": "tt_and_st",
+            "label": "Signal VBF",
+            "sub_processes": [
+                "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
+                "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1",
+                "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43",
+                "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94",
+                "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36",
+                "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39",
+            ],
+            "weighting": "xsec",
+        },
+    },
+    "processes": [
+        "signal_ggf4",
+        "signal_vbf4",
+        "tt",
+        "st",
+        "dy",
+        "h",
+    ],
+})
+
 dl_22post_ml_study_1 = dl_22post.derive("dl_22post_ml_study_1", cls_dict={
     "training_configs": lambda self, requested_configs: ["c22post"],
     "combine_processes": {
@@ -359,14 +433,15 @@ def 
setup(self): "training_configs": lambda self, requested_configs: ["c22post", "c22pre"], "processes": ["hh_ggf_hbb_hvv2l2nu_kl1_kt1", "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", "tt", "st", "dy", "h"], }) -dl_22_procs1_w0 = dl_22_procs1.derive("dl_22_procs1_w1", cls_dict={ +dl_22_procs1_w0 = dl_22_procs1.derive("dl_22_procs1_w0", cls_dict={ + "training_configs": lambda self, requested_configs: ["c22post"], "ml_process_weights": { "hh_ggf_hbb_hvv2l2nu_kl1_kt1": 1, "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1": 1, "tt": 2, "st": 2, "dy": 2, - "h": 2, + "h": 1, }, }) dl_22_procs1_w1 = dl_22_procs1.derive("dl_22_procs1_w1", cls_dict={ diff --git a/hbw/ml/mixins.py b/hbw/ml/mixins.py index 38c632e2..bac5967a 100644 --- a/hbw/ml/mixins.py +++ b/hbw/ml/mixins.py @@ -55,7 +55,7 @@ def prepare_ml_model( import tensorflow.keras as keras from keras.models import Sequential from keras.layers import Dense, BatchNormalization - from hbw.ml.tf_util import cumulated_crossentropy + # from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy n_inputs = len(set(self.input_features)) n_outputs = len(self.processes) @@ -97,16 +97,24 @@ def prepare_ml_model( epsilon=1e-6, amsgrad=False, ) - model.compile( - loss=cumulated_crossentropy, - # NOTE: we'd preferrably use the Keras CCE, but it does not work when assigning one event - # to multiple classes (target with multiple entries != 0) - # loss ="categorical_crossentropy", - # loss=tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE), - optimizer=optimizer, - metrics=["categorical_accuracy"], - weighted_metrics=["categorical_accuracy"], - ) + if self.negative_weights == "ignore": + model.compile( + # NOTE: we'd preferrably use the Keras CCE, but it does not work when assigning one event + # to multiple classes (target with multiple entries != 0) + loss="categorical_crossentropy", + optimizer=optimizer, + metrics=["categorical_accuracy"], + weighted_metrics=["categorical_accuracy"], + ) + else: + model.compile( + # NOTE: we'd preferrably use the Keras CCE, but it does not work when assigning one event + # to multiple classes (target with multiple entries != 0) + loss="cumulated_crossentropy", + optimizer=optimizer, + metrics=["categorical_accuracy"], + weighted_metrics=["categorical_accuracy"], + ) return model diff --git a/hbw/ml/tf_util.py b/hbw/ml/tf_util.py index d187bfcf..7b9c6bc7 100644 --- a/hbw/ml/tf_util.py +++ b/hbw/ml/tf_util.py @@ -165,6 +165,11 @@ def _cumulated_crossenropy_from_logits(y_true, y_pred, axis): return ((numerator - denominator) / tf.math.reduce_sum(y_true, axis=axis))[:, 0] +@tf.function +def categorical_crossentropy(): + return tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE) + + @tf.function def cumulated_crossentropy(y_true, y_pred, from_logits=False, axis=-1): if from_logits: diff --git a/hbw/tasks/inference.py b/hbw/tasks/inference.py index 98b76320..86bfadf1 100644 --- a/hbw/tasks/inference.py +++ b/hbw/tasks/inference.py @@ -12,24 +12,84 @@ import law import order as od -from columnflow.tasks.framework.base import Requirements, RESOLVE_DEFAULT, AnalysisTask +from columnflow.tasks.framework.base import Requirements, RESOLVE_DEFAULT, AnalysisTask, ConfigTask from columnflow.tasks.framework.parameters import SettingsParameter from columnflow.tasks.framework.mixins import ( CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, InferenceModelMixin, ) from columnflow.tasks.framework.remote import RemoteWorkflow from columnflow.tasks.cms.inference import 
CreateDatacards -from columnflow.util import dev_sandbox, maybe_import +from columnflow.util import dev_sandbox, maybe_import # import_ROOT from hbw.tasks.base import HBWTask +# from hbw.tasks.postfit_plots import load_hists_uproot # import hbw.inference.constants as const array = maybe_import("array") +uproot = maybe_import("uproot") +np = maybe_import("numpy") +hist = maybe_import("hist") logger = law.logger.get_logger(__name__) +# Function copied from Mathis Hist hook commit +def apply_rebinning_edges(h: hist.Histogram, axis_name: str, edges: list): + """ + Generalized rebinning of a single axis from a hist.Histogram, using predefined edges. + + :param h: histogram to rebin + :param axis_name: string representing the axis to rebin + :param edges: list of floats representing the new bin edges. Must be a subset of the original edges. + :return: rebinned histogram + """ + if isinstance(edges, int): + return h[{axis_name: hist.rebin(edges)}] + + ax = h.axes[axis_name] + ax_idx = [a.name for a in h.axes].index(axis_name) + if not all([np.isclose(x, ax.edges).any() for x in edges]): + raise ValueError( + f"Cannot rebin histogram due to incompatible edges for axis '{ax.name}'\n" + f"Edges of histogram are {ax.edges}, requested rebinning to {edges}", + ) + + # If you rebin to a subset of initial range, keep the overflow and underflow + overflow = ax.traits.overflow or (edges[-1] < ax.edges[-1] and not np.isclose(edges[-1], ax.edges[-1])) + underflow = ax.traits.underflow or (edges[0] > ax.edges[0] and not np.isclose(edges[0], ax.edges[0])) + flow = overflow or underflow + new_ax = hist.axis.Variable(edges, name=ax.name, overflow=overflow, underflow=underflow) + axes = list(h.axes) + axes[ax_idx] = new_ax + + hnew = hist.Hist(*axes, name=h.name, storage=h._storage_type()) + + # Offset from bin edge to avoid numeric issues + offset = 0.5 * np.min(ax.edges[1:] - ax.edges[:-1]) + edges_eval = edges + offset + edge_idx = ax.index(edges_eval) + # Avoid going outside the range, reduceat will add the last index anyway + if edge_idx[-1] == ax.size + ax.traits.overflow: + edge_idx = edge_idx[:-1] + + if underflow: + # Only if the original axis had an underflow should you offset + if ax.traits.underflow: + edge_idx += 1 + edge_idx = np.insert(edge_idx, 0, 0) + + # Take is used because reduceat sums i:len(array) for the last entry, in the case + # where the final bin isn't the same between the initial and rebinned histogram, you + # want to drop this value. Add tolerance of 1/2 min bin width to avoid numeric issues + hnew.values(flow=flow)[...] = np.add.reduceat(h.values(flow=flow), edge_idx, + axis=ax_idx).take(indices=range(new_ax.size + underflow + overflow), axis=ax_idx) + if hnew._storage_type() == hist.storage.Weight(): + hnew.variances(flow=flow)[...] 
= np.add.reduceat(h.variances(flow=flow), edge_idx, + axis=ax_idx).take(indices=range(new_ax.size + underflow + overflow), axis=ax_idx) + return hnew + + def get_hist_name(cat_name: str, proc_name: str, syst_name: str | None = None) -> str: hist_name = f"{cat_name}/{proc_name}" if syst_name: @@ -68,7 +128,8 @@ def get_rebin_values(hist, N_bins_final: int = 10): Function that determines how to rebin a hist to *N_bins_final* bins such that the resulting histogram is flat """ - N_bins_input = hist.GetNbinsX() + N_bins_input = hist.axes[0].size + # N_bins_input = hist.GetNbinsX() # # replace empty bin values (TODO: remove as soon as we can disable the empty bin filling) # EMPTY_BIN_VALUE = 1e-5 @@ -77,32 +138,38 @@ def get_rebin_values(hist, N_bins_final: int = 10): # hist.SetBinContent(i, 0) # determine events per bin the final histogram should have - events_per_bin = hist.Integral() / N_bins_final + events_per_bin = hist.sum().value / N_bins_final + # events_per_bin = hist.Integral() / N_bins_final logger.info(f"============ {round(events_per_bin, 3)} events per bin") # bookkeeping number of bins and number of events bin_count = 1 N_events = 0 - rebin_values = [hist.GetBinLowEdge(1)] + # rebin_values = [hist.GetBinLowEdge(1)] + rebin_values = [int(hist.axes[0].edges[0])] # starting loop at 1 to exclude underflow # ending at N_bins_input + 1 excludes overflow - for i in range(1, N_bins_input + 1): + # for i in range(1, N_bins_input + 1): + for i in range(1, N_bins_input): if bin_count == N_bins_final: # break as soon as N-1 bin edges have been determined --> final bin is x_max break - - N_events += hist.GetBinContent(i) + N_events += hist.view()["value"][i] + # N_events += hist.GetBinContent(i) if i % 100 == 0: logger.info(f"========== Bin {i} of {N_bins_input}, {N_events} events") if N_events >= events_per_bin * bin_count: # when *N_events* surpasses threshold, append the corresponding bin edge and count - logger.info(f"++++++++++ Append bin edge {bin_count} of {N_bins_final} at edge {hist.GetBinLowEdge(i)}") - rebin_values.append(hist.GetBinLowEdge(i + 1)) + # logger.info(f"++++++++++ Append bin edge {bin_count} of {N_bins_final} at edge {hist.GetBinLowEdge(i)}") + logger.info(f"++++++++++ Append bin edge {bin_count} of {N_bins_final} at edge {hist.axes[0].edges[i]}") + # rebin_values.append(hist.GetBinLowEdge(i + 1)) + rebin_values.append(hist.axes[0].edges[i]) bin_count += 1 # final bin is x_max - x_max = hist.GetBinLowEdge(N_bins_input + 1) + x_max = hist.axes[0].edges[N_bins_input] + # x_max = hist.GetBinLowEdge(N_bins_input + 1) rebin_values.append(x_max) logger.info(f"final bin edges: {rebin_values}") return rebin_values @@ -111,7 +178,7 @@ def get_rebin_values(hist, N_bins_final: int = 10): def apply_binning(hist, rebin_values: list): N_bins = len(rebin_values) - 1 rebin_values_ptr = array.array("d", rebin_values) - + # rebin_values_ptr = np.array(rebin_values, dtype="float64") h_out = hist.Rebin(N_bins, hist.GetName(), rebin_values_ptr) return h_out @@ -202,10 +269,14 @@ class ModifyDatacardsFlatRebin( ProducersMixin, SelectorStepsMixin, CalibratorsMixin, + ConfigTask, law.LocalWorkflow, RemoteWorkflow, ): - sandbox = dev_sandbox("bash::$CF_BASE/sandboxes/cmssw_default.sh") + + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + # sandbox = dev_sandbox("bash::$HBW_BASE/sandboxes/venv_ml_plotting.sh") + # sandbox = dev_sandbox("bash::$CF_BASE/sandboxes/cmssw_default.sh") bins_per_category = SettingsParameter( default=(RESOLVE_DEFAULT,), @@ -272,11 +343,31 
@@ def get_rebin_processes(self): rebin_process_condition = self.inference_category_rebin_processes.get(config_category, None) if not rebin_process_condition: - logger.warning( - f"No rebin condition found for category {config_category}; rebinning will be flat " - f"on all processes {[proc.config_process for proc in processes]}", - ) - return processes + if "ggf" in config_category: + for proc in processes.copy(): + proc_name = proc.config_process + + # if not "ggf" in proc_name: + # processes.remove(proc) + # elif proc_name == 'h_ggf': # This is quiet custom made + # processes.remove(proc) + # elif "kl5" in proc_name: + # processes.remove(proc) + # return processes + # elif 'vbf' in config_category: + # for proc in processes.copy(): + # proc_name = proc.config_process + # if not 'vbf' in proc_name: + # processes.remove(proc) + # if proc_name == 'h_vbf': + # processes.remove(proc) + # return processes + # else: + logger.warning( + f"No rebin condition found for category {config_category}; rebinning will be flat " + f"on all processes {[proc.config_process for proc in processes]}", + ) + return processes # transform `rebin_process_condition` into Callable if required if not isinstance(rebin_process_condition, Callable): @@ -321,8 +412,9 @@ def output(self): } def run(self): - import uproot - from ROOT import TH1 + # import uproot + # from ROOT import TH1 + # import_ROOT() # config_inst = self.config_inst # inference_model = self.inference_model_inst @@ -355,7 +447,8 @@ def run(self): # get all nominal histograms nominal_hists = { - proc_name: file[get_hist_name(cat_name, proc_name)].to_pyroot() + proc_name: file[get_hist_name(cat_name, proc_name)].to_hist() + # proc_name: file[get_hist_name(cat_name, proc_name)].to_pyroot() for proc_name in proc_names } @@ -379,22 +472,48 @@ def run(self): # apply rebinning on all histograms and store resulting hists in a ROOT file out_file = uproot.recreate(outputs["shapes"].fn) + rebinned_histograms = {} for key, h in file.items(): - # remove ";1" appendix - key = key.split(";")[0] - try: - # convert histograms to pyroot - h = h.to_pyroot() - except AttributeError: - # skip non-histograms - continue - if not isinstance(h, TH1): - raise Exception(f"{h} is not a TH1 histogram") - h_rebin = apply_binning(h, rebin_values) - problematic_bin_count = check_empty_bins(h_rebin) # noqa - logger.info(f"Inserting histogram with name {key}") - out_file[key] = uproot.from_pyroot(h_rebin) + key = key.split(";")[0] + if "TH1" in str(h): + try: + h = file[key].to_hist() + except AttributeError: + continue + h_rebin = apply_rebinning_edges(h, h.axes[0].name, rebin_values) + # bin_edges = h_rebin.axes[0].edges # Bin edges (first axis) + # bin_values = h_rebin.values() + # bin_variance = h_rebin.variances() # Bin values + # h_rebin_root = uproot.newhist.Histogram( + # bins=(len(h_rebin.edges)-1), + # x_low=h_rebin.edges[0], + # x_high=h_rebin.edges[-1], + # values=h_rebin.values(), + # ) + rebinned_histograms[key] = h_rebin + + logger.info(f"Inserting histogram with name {key}") + out_file[key] = h_rebin + + # for key, h in file.items(): + # # remove ";1" appendix + # key = key.split(";")[0] + # __import__("IPython").embed() + # # try: + # # convert histograms to pyroot + # # h = h.to_pyroot() + # # except AttributeError: + # # skip non-histograms + # # continue + # # if not isinstance(h, TH1): + # # raise Exception(f"{h} is not a TH1 histogram") + + # # h_rebin = apply_binning(h, rebin_values) + # h_rebin = apply_rebinning_edges(h, "0", rebin_values) + # # 
problematic_bin_count = check_empty_bins(h_rebin)
+            # logger.info(f"Inserting histogram with name {key}")
+            # out_file[key] = uproot.recreate(h_rebin)


 class PrepareInferenceTaskCalls(
@@ -404,6 +523,7 @@ class PrepareInferenceTaskCalls(
     ProducersMixin,
     SelectorStepsMixin,
     CalibratorsMixin,
+    ConfigTask,
 ):
     """
     Simple task that produces string to run certain tasks in Inference
diff --git a/hbw/tasks/ml.py b/hbw/tasks/ml.py
index 5e45e9d7..9546f4dd 100644
--- a/hbw/tasks/ml.py
+++ b/hbw/tasks/ml.py
@@ -398,17 +398,24 @@ def run(self):
         outputs["stats"].dump(stats, formatter="json")

         # initialize the DatasetLoader
-        ml_dataset = self.ml_model_inst.data_loader(self.ml_model_inst, process, events, stats)
-        # NOTE: Almost closure
+        if self.ml_model_inst.negative_weights == "ignore":
+            ml_dataset = self.ml_model_inst.data_loader(self.ml_model_inst, process, events, stats)
+            logger.info(
+                f"{self.ml_model_inst.negative_weights} method chosen to handle negative weights: "
+                "all negative weights will be removed from training.",
+            )
+        else:
+            ml_dataset = self.ml_model_inst.data_loader(self.ml_model_inst, process, events, stats, skip_mask=True)
+
         sum_train_weights = np.sum(ml_dataset.train_weights)
         n_events_per_fold = len(ml_dataset.train_weights)
-        logger.info(f"Sum of traing weights is: {sum_train_weights} for {n_events_per_fold} {process} events")
+        logger.info(f"Sum of training weights is: {sum_train_weights} for {n_events_per_fold} {process} events")

         if self.ml_model_inst.config_inst.get_process(process).x("ml_config", None):
             if self.ml_model_inst.config_inst.get_process(process).x.ml_config.weighting == "equal":
                 for sub_proc in self.ml_model_inst.config_inst.get_process(process).x.ml_config.sub_processes:
-                    proc_mask, sub_id = get_proc_mask(events, sub_proc, self.ml_model_inst.config_inst)
+                    proc_mask, _ = get_proc_mask(ml_dataset._events, sub_proc, self.ml_model_inst.config_inst)
                     xcheck_weight_sum = np.sum(ml_dataset.train_weights[proc_mask])
                     xcheck_n_events = len(ml_dataset.train_weights[proc_mask])
                     logger.info(
diff --git a/hbw/tasks/postfit_plots.py b/hbw/tasks/postfit_plots.py
index 9df3745a..96fa8f82 100644
--- a/hbw/tasks/postfit_plots.py
+++ b/hbw/tasks/postfit_plots.py
@@ -35,12 +35,13 @@ def load_hists_uproot(fit_diagnostics_path):
     """ Helper to load histograms from a fit_diagnostics file """
     # prepare output dict
     hists = DotDict()
+    __import__("IPython").embed()
    with uproot.open(fit_diagnostics_path) as tfile:
         keys = [key.split("/") for key in tfile.keys()]
         for key in keys:
             if len(key) != 3:
                 continue
-
+            __import__("IPython").embed()
             # get the histogram from the tfile
             h_in = tfile["/".join(key)]

@@ -167,6 +168,8 @@ def run(self):
         else:
             fit_type = "fit_s"

+        __import__("IPython").embed()
+
         all_hists = all_hists[f"shapes_{fit_type}"]

         for channel, hists in all_hists.items():
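
For reference, a minimal usage sketch of the hist-based apply_rebinning_edges helper added in hbw/tasks/inference.py above. The toy histogram, the axis name "mlscore", and the edge values are illustrative assumptions and not part of the patch; only the helper itself and its subset-edges requirement come from the diff.

    import hist
    import numpy as np

    from hbw.tasks.inference import apply_rebinning_edges  # helper introduced by this patch

    # toy histogram with 20 uniform bins and weighted storage, similar to the datacard shapes
    h = hist.Hist.new.Reg(20, 0.0, 1.0, name="mlscore").Weight()
    h.fill(mlscore=np.random.default_rng(0).uniform(0.0, 1.0, 10_000), weight=1.0)

    # the new edges must be a subset of the existing bin edges, otherwise a ValueError is raised
    h_rebinned = apply_rebinning_edges(h, "mlscore", [0.0, 0.25, 0.5, 0.75, 1.0])

    print(h_rebinned.axes[0].edges)               # [0.   0.25 0.5  0.75 1.  ]
    print(h.sum().value, h_rebinned.sum().value)  # total yield is preserved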