Modified PostfitPlot task
Lara813 committed Jan 21, 2025
1 parent 490fcc1 commit 13970d5
Showing 11 changed files with 287 additions and 148 deletions.
17 changes: 13 additions & 4 deletions hbw/config/defaults_and_groups.py
@@ -150,6 +150,16 @@ def set_config_defaults_and_groups(config_inst):
"vv",
"h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth",
],
"test_postfit": [
"hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1",
"hh_ggf_hbb_hww2l2nu_kl1_kt1",
"st",
"tt",
"dy",
"w_lnu",
"vv",
"h",
],
"all": ["*"],
"default": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "dy", "st", "vv", "w_lnu", "h"], # noqa: E501
"sl": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "qcd", "st", "dy", "vv", "w_lnu", "h"], # noqa: E501
@@ -303,6 +313,7 @@ def set_config_defaults_and_groups(config_inst):
"sr__2mu__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2mu__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1",
"sr__2e__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2e__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1",
"sr__emu__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__emu__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1",
"sr__1b", "sr__2b",
),
"vbfSR_dl": (
"sr__1b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", "sr__2b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
@@ -375,10 +386,8 @@ def set_config_defaults_and_groups(config_inst):
for proc, _, _ in config_inst.walk_processes() if proc.has_tag("is_signal")
},
"dilep": {
"hh_ggf_hbb_hvv2l2nu_kl0_kt1": {"scale": 10000, "unstack": True},
"hh_ggf_hbb_hvv2l2nu_kl1_kt1": {"scale": 10000, "unstack": True},
"hh_ggf_hbb_hvv2l2nu_kl2p45_kt1": {"scale": 10000, "unstack": True},
"hh_ggf_hbb_hvv2l2nu_kl5_kt1": {"scale": 10000, "unstack": True},
"hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1": {"scale": 90000, "unstack": True},
"hh_ggf_hbb_hww2l2nu_kl1_kt1": {"scale": 10000, "unstack": True},
},
"dileptest": {
"hh_ggf_hbb_hvv2l2nu_kl1_kt1": {"scale": 10000, "unstack": True},
2 changes: 1 addition & 1 deletion hbw/inference/base.py
@@ -89,7 +89,7 @@ def config_variable(self: InferenceModel, config_cat_inst: od.Config):
dnn_proc = dnn_cat.replace("ml_", "")
return f"mlscore.{dnn_proc}"
else:
return "mli_mbb"
return "mli_lep_pt"

def customize_category(self: InferenceModel, cat_inst: DotDict, config_cat_inst: od.Config):
""" Function to allow customizing the inference category """
54 changes: 51 additions & 3 deletions hbw/inference/dl.py
@@ -240,7 +240,7 @@
})


dl.derive("dl_ml_study_1", cls_dict={
dl_ml_study_1 = dl.derive("dl_ml_study_1", cls_dict={
"ml_model_name": "dl_22post_ml_study_1",
"config_categories": [
"sr__1b__ml_signal_ggf",
@@ -283,7 +283,11 @@
"systematics": rate_systematics,
})

dl.derive("dl_ml_study_3", cls_dict={
dl_ml_study_1.derive("dl_ml_study_1_handle", cls_dict={
"ml_model_name": "dl_22post_ml_study_1_handle",
})

dl_ml_study_3 = dl.derive("dl_ml_study_3", cls_dict={
"ml_model_name": "dl_22_procs1_w0",
"config_categories": [
"sr__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1",
@@ -325,7 +329,11 @@
"systematics": rate_systematics,
})

dl.derive("dl_ml_study_2", cls_dict={
dl_ml_study_3.derive("dl_ml_study_3_handle", cls_dict={
"ml_model_name": "dl_22_procs1_w0_handle",
})

dl_ml_study_2 = dl.derive("dl_ml_study_2", cls_dict={
"ml_model_name": "dl_22post_ml_study_2",
"config_categories": [
"sr__1b__ml_signal_ggf2",
@@ -367,6 +375,14 @@
"systematics": rate_systematics,
})

dl_ml_study_2.derive("dl_ml_study_2_handle", cls_dict={
"ml_model_name": "dl_22post_ml_study_2_handle",
})

dl_ml_study_2.derive("dl_ml_study_2_ignore", cls_dict={
"ml_model_name": "dl_22post_ml_study_2",
})

dl.derive("dl_hww_and_hzz", cls_dict={
"processes": [
"hh_ggf_hbb_hww_kl0_kt1",
@@ -531,3 +547,35 @@
"systematics": rate_systematics},
)
dl.derive("dl_rates_only", cls_dict={"systematics": rate_systematics})

dl.derive("dl_postfit_test", cls_dict={
"ml_model_name": None,
"config_categories": [
"sr__1b",
"sr__2b",
],
"processes": [
# "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4",
"hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3",
"hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2",
"hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96",
"hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1",
"hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1",
"hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43",
"hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94",
"hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36",
"hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39",
"hh_ggf_hbb_hww2l2nu_kl0_kt1",
"hh_ggf_hbb_hww2l2nu_kl1_kt1",
"hh_ggf_hbb_hww2l2nu_kl2p45_kt1",
"hh_ggf_hbb_hww2l2nu_kl5_kt1",
"st",
"tt",
"dy",
"w_lnu",
"vv",
"h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth",
],
"systematics": rate_systematics,
})
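
Note on the pattern used throughout this file: the commit switches from bare dl.derive(...) calls to capturing the returned class (e.g. dl_ml_study_1 = dl.derive(...)), which is what allows the new chained derivations such as dl_ml_study_1.derive("dl_ml_study_1_handle", ...). A minimal sketch of that derive mechanism, assuming the usual derivable-class behaviour (illustrative only, not the actual columnflow implementation):

    # illustrative sketch: derive() builds a subclass carrying the given class attributes
    class Derivable:
        @classmethod
        def derive(cls, cls_name, cls_dict=None):
            # the returned object is a new class; keeping a reference to it
            # is what makes further .derive(...) calls on it possible
            return type(cls_name, (cls,), dict(cls_dict or {}))


    class dl(Derivable):
        ml_model_name = None
        systematics = []


    dl_ml_study_1 = dl.derive("dl_ml_study_1", cls_dict={"ml_model_name": "dl_22post_ml_study_1"})
    dl_ml_study_1_handle = dl_ml_study_1.derive(
        "dl_ml_study_1_handle",
        cls_dict={"ml_model_name": "dl_22post_ml_study_1_handle"},
    )

    assert issubclass(dl_ml_study_1_handle, dl_ml_study_1)
    assert dl_ml_study_1_handle.ml_model_name == "dl_22post_ml_study_1_handle"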
46 changes: 40 additions & 6 deletions hbw/ml/data_loader.py
@@ -141,9 +141,11 @@ def __init__(
"""
self._ml_model_inst = ml_model_inst
self._process = process
self._skip_mask = skip_mask

proc_mask, _ = get_proc_mask(events, process, ml_model_inst.config_inst)
self._stats = stats
# __import__("IPython").embed()
# del_sub_proc_stats(process, sub_id)
if not skip_mask:
self._events = events[proc_mask]
@@ -177,6 +179,10 @@ def parameters(self):
}
return self._parameters

@property
def skip_mask(self):
return self._skip_mask

@property
def ml_model_inst(self):
return self._ml_model_inst
@@ -255,6 +261,14 @@ def shuffle_indices(self) -> np.ndarray:
self._shuffle_indices = np.random.permutation(self.n_events)
return self._shuffle_indices

@property
def num_event_per_process(self) -> str:
if not self.skip_mask:
self._num_events_per_process = "num_events_pos_weights_per_process"
else:
self._num_events_per_process = "num_events_per_process"
return self._num_events_per_process

def get_xsec_train_weights(self) -> np.ndarray:
"""
Weighting such that each event has roughly the same weight,
@@ -267,10 +281,20 @@ def get_xsec_train_weights(self) -> np.ndarray:
raise Exception("cannot determine train weights without stats")

_, sub_id = get_proc_mask(self._events, self.process, self.ml_model_inst.config_inst)
sum_abs_weights = np.sum([self.stats[self.process]["sum_abs_weights_per_process"][str(id)] for id in sub_id])
num_events = np.sum([self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id])
sum_weights = np.sum([self.stats[self.process]["sum_pos_weights_per_process"][str(id)] for id in sub_id])
num_events = np.sum(
[self.stats[self.process][self.num_event_per_process][str(id)] for id in sub_id],
)
# if not self.skip_mask:
# num_events = np.sum(
# [self.stats[self.process]["num_events_pos_weights_per_process"][str(id)] for id in sub_id],
# )
# else:
# num_events = np.sum(
# [self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id],
# )

xsec_train_weights = self.weights / sum_abs_weights * num_events
xsec_train_weights = self.weights / sum_weights * num_events

return xsec_train_weights

@@ -286,7 +310,15 @@ def get_equal_train_weights(self) -> np.ndarray:

combined_proc_inst = self.ml_model_inst.config_inst.get_process(self.process)
_, sub_id_proc = get_proc_mask(self._events, self.process, self.ml_model_inst.config_inst)
num_events = np.sum([self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id_proc])
num_events = np.sum(
[self.stats[self.process][self.num_event_per_process][str(id)] for id in sub_id_proc],
)
# if not self.skip_mask:
# num_events = np.sum(
# [self.stats[self.process]["num_events_pos_weights_per_process"][str(id)] for id in sub_id_proc],
# )
# else:
# num_events = np.sum([self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id_proc])
targeted_sum_of_weights_per_process = (
num_events / len(combined_proc_inst.x.ml_config.sub_processes)
)
@@ -724,8 +756,10 @@ def prediction(self) -> np.ndarray:
self._prediction = self.load_data("prediction")
else:
# calculate prediction if needed
if not hasattr(self._ml_model_inst, "trained_model"):
if not hasattr(self._ml_model_inst, "best_model"):
# if not hasattr(self._ml_model_inst, "trained_model"):
raise Exception("No trained model found in the MLModel instance. Cannot calculate prediction.")
self._prediction = predict_numpy_on_batch(self._ml_model_inst.trained_model, self.features)
# self._prediction = predict_numpy_on_batch(self._ml_model_inst.trained_model, self.features)
self._prediction = predict_numpy_on_batch(self._ml_model_inst.best_model, self.features)

return self._prediction # TODO ML best model
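
A compact sketch of the cross-section training-weight logic after this change. The stats key names mirror the diff above, and the choice of event-count key is read off the new num_event_per_process property; the function itself is only an illustration, not the actual data loader code:

    import numpy as np

    def xsec_train_weights(weights, stats, process, sub_id, skip_mask):
        # sum of positive event weights over all sub-process ids of this process
        sum_weights = np.sum(
            [stats[process]["sum_pos_weights_per_process"][str(i)] for i in sub_id],
        )
        # the event-count key depends on whether the process mask was skipped,
        # matching the new num_event_per_process property
        key = "num_events_per_process" if skip_mask else "num_events_pos_weights_per_process"
        num_events = np.sum([stats[process][key][str(i)] for i in sub_id])
        # rescale so that the weights of this process sum to (roughly) its event count
        return weights / sum_weights * num_events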
21 changes: 19 additions & 2 deletions hbw/ml/derived/dl.py
@@ -293,7 +293,7 @@ def setup(self):
"hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36",
"hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39",
],
"weighting": "xsec",
"weighting": "equal",
},
},
"processes": [
@@ -345,6 +345,7 @@ def setup(self):

dl_22post_ml_study_1 = dl_22post.derive("dl_22post_ml_study_1", cls_dict={
"training_configs": lambda self, requested_configs: ["c22post"],
"negative_weights": "ignore",
"combine_processes": {
"signal_ggf": {
# "name": "tt_and_st",
@@ -381,8 +382,13 @@ def setup(self):
],
})

dl_22post_ml_study_1_handle = dl_22post_ml_study_1.derive("dl_22post_ml_study_1_handle", cls_dict={
"negative_weights": "handle",
})

dl_22post_ml_study_2 = dl_22post.derive("dl_22post_ml_study_2", cls_dict={
"training_configs": lambda self, requested_configs: ["c22post"],
"negative_weights": "ignore",
"combine_processes": {
"signal_ggf2": {
# "name": "tt_and_st",
@@ -407,7 +413,7 @@
"hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39",

],
"weighting": "xsec",
"weighting": "equal",
},
},
"processes": [
@@ -419,6 +425,11 @@ def setup(self):
"h",
],
})

dl_22post_ml_study_2_handle = dl_22post_ml_study_2.derive("dl_22post_ml_study_2_handle", cls_dict={
"negative_weights": "handle",
})

#
# setups with different processes (0: baseline, 1: add SM vbf + single H, 2: add SL+all HH variations)
# NOTE: we should decide which signal processes exactly to use:
@@ -435,6 +446,7 @@ def setup(self):
})
dl_22_procs1_w0 = dl_22_procs1.derive("dl_22_procs1_w0", cls_dict={
"training_configs": lambda self, requested_configs: ["c22post"],
"negative_weights": "ignore",
"ml_process_weights": {
"hh_ggf_hbb_hvv2l2nu_kl1_kt1": 1,
"hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1": 1,
Expand All @@ -444,6 +456,11 @@ def setup(self):
"h": 1,
},
})

dl_22_procs1_w0_handle = dl_22_procs1_w0.derive("dl_22_procs1_w0_handle", cls_dict={
"negative_weights": "handle",
})

dl_22_procs1_w1 = dl_22_procs1.derive("dl_22_procs1_w1", cls_dict={
"ml_process_weights": {
"hh_ggf_hbb_hvv2l2nu_kl1_kt1": 1,
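
The new *_handle derivations in this file differ from their parents only in the negative_weights setting ("ignore" vs. "handle"). The actual semantics live in the hbw ML base model; the following is a purely hypothetical reading of the two modes, included only for orientation:

    import numpy as np

    def apply_negative_weight_mode(weights, mode):
        # hypothetical illustration only; the real behaviour is defined by the ML base model
        if mode == "ignore":
            # negative-weight events contribute nothing to the training
            return np.where(weights > 0, weights, 0.0)
        if mode == "handle":
            # negative-weight events are kept but treated specially, e.g. via |w|
            return np.abs(weights)
        raise ValueError(f"unknown negative_weights mode: {mode}")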
4 changes: 2 additions & 2 deletions hbw/ml/mixins.py
@@ -55,7 +55,7 @@ def prepare_ml_model(
import tensorflow.keras as keras
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
# from hbw.ml.tf_util import cumulated_crossentropy, categorical_crossentropy
from hbw.ml.tf_util import cumulated_crossentropy # , categorical_crossentropy

n_inputs = len(set(self.input_features))
n_outputs = len(self.processes)
@@ -110,7 +110,7 @@ def prepare_ml_model(
model.compile(
# NOTE: we'd preferably use the Keras CCE, but it does not work when assigning one event
# to multiple classes (target with multiple entries != 0)
loss="cumulated_crossentropy",
loss=cumulated_crossentropy,
optimizer=optimizer,
metrics=["categorical_accuracy"],
weighted_metrics=["categorical_accuracy"],
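
The loss change above matters because Keras only resolves string loss names for built-in, registered losses; a custom loss such as cumulated_crossentropy has to be passed as a callable (or registered explicitly). A self-contained illustration with a stand-in loss function (custom_loss is a placeholder, not the hbw implementation):

    import tensorflow as tf
    from tensorflow import keras

    def custom_loss(y_true, y_pred):
        # plain categorical cross entropy written out by hand
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1.0)
        return -tf.reduce_sum(y_true * tf.math.log(y_pred), axis=-1)

    model = keras.Sequential([
        keras.layers.Input(shape=(8,)),
        keras.layers.Dense(4, activation="softmax"),
    ])
    # passing the callable works; loss="custom_loss" would raise, since the
    # string is not a registered Keras identifier
    model.compile(loss=custom_loss, optimizer="adam", metrics=["categorical_accuracy"])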
9 changes: 5 additions & 4 deletions hbw/ml/stats.py
@@ -76,15 +76,16 @@ def ml_preparation(
events = set_ak_column_f32(events, "event_weight", weight)
stats["sum_weights"] += float(ak.sum(weight, axis=0))
weight_map["sum_weights"] = weight
weight_map["sum_abs_weights"] = (weight, weight > 0)
weight_map["sum_pos_weights"] = np.abs(weight)
weight_map["sum_pos_weights"] = (weight, weight > 0)
weight_map["sum_abs_weights"] = np.abs(weight)
weight_map["num_events_pos_weights"] = weight > 0

# normalization weight only
norm_weight = events["stitched_normalization_weight"]
stats["sum_norm_weights"] += float(ak.sum(norm_weight, axis=0))
weight_map["sum_norm_weights"] = norm_weight
weight_map["sum_abs_norm_weights"] = (norm_weight, norm_weight > 0)
weight_map["sum_pos_norm_weights"] = np.abs(norm_weight)
weight_map["sum_pos_norm_weights"] = (norm_weight, norm_weight > 0)
weight_map["sum_abs_norm_weights"] = np.abs(norm_weight)

group_map = {
"process": {
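
The weight_map entries above follow the convention visible in the diff: a plain array is summed as-is, a (values, mask) tuple is summed only where the mask holds, and a boolean array acts as an event counter. A small sketch of such a reduction (an assumed consumer, not the actual columnflow stats machinery):

    import awkward as ak

    def reduce_weight_map(weight_map, stats):
        for name, entry in weight_map.items():
            if isinstance(entry, tuple):
                values, mask = entry
                # e.g. sum_pos_weights: sum of weights restricted to weight > 0
                contribution = ak.sum(values[mask])
            else:
                # arrays are summed directly; boolean arrays count events
                contribution = ak.sum(entry)
            stats[name] = stats.get(name, 0.0) + float(contribution)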
3 changes: 2 additions & 1 deletion hbw/plotting/plot_fits.py
@@ -52,7 +52,8 @@ def scalable_exponnorm(x, A, loc, scale, K=1):


def plot_fit(
hists: OrderedDict[od.Process, hist.Hist],
hists: dict[str, OrderedDict[od.Process, hist.Hist]],
# hists: OrderedDict[od.Process, hist.Hist],
config_inst: od.Config,
category_inst: od.Category,
variable_insts: list[od.Variable],
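
The new plot_fit signature wraps the per-process histograms in an outer dict keyed by a string; given the commit message, that outer key is presumably the fit stage (for example "prefit" vs. "postfit"), though the exact naming is an assumption. Iteration then becomes two nested loops:

    def iter_fit_hists(hists):
        # hists: {fit_label: OrderedDict[od.Process -> hist.Hist]}
        for fit_label, proc_hists in hists.items():
            for process, h in proc_hists.items():
                # one histogram per process per fit stage
                print(fit_label, getattr(process, "name", process), h)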