Merge pull request #96 from uhh-cms/ML_combine_proc

Ml combine proc
uhh-cms · Dec 11, 2024 · 4b437be · 4b437be
2 parents 1ae1663 + 4e3b009
commit 4b437be
Show file tree

Hide file tree

Showing 13 changed files with 479 additions and 21 deletions.
diff --git a/hbw/categorization/categories.py b/hbw/categorization/categories.py
@@ -271,6 +271,7 @@ def catid_2b(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, a
 
 # TODO: not hard-coded -> use config?
 ml_processes = [
+    "signal_ggf", "signal_ggf2", "signal_vbf", "signal_vbf2",
     "hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1",
     "hh_ggf_hbb_hvvqqlnu_kl1_kt1", "hh_vbf_hbb_hvvqqlnu_kv1_k2v1_kl1",
     "hh_ggf_hbb_hvv2l2nu_kl1_kt1", "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",

diff --git a/hbw/config/categories.py b/hbw/config/categories.py
@@ -323,16 +323,17 @@ def add_categories_ml(config, ml_model_inst):
     # add ml categories directly to the config
     # NOTE: this is a bit dangerous, because our ID depends on the MLModel, but
     #       we can reconfigure our MLModel after having created these categories
+    # TODO: config is empty and therefore fails
     ml_categories = []
     for i, proc in enumerate(ml_model_inst.processes):
-        cat_label = config.get_process(proc).x.ml_label
+        # cat_label = config.get_process(proc).x.ml_label
         ml_categories.append(config.add_category(
             # NOTE: name and ID is unique as long as we don't use
             #       multiple ml_models simutaneously
             name=f"ml_{proc}",
             id=(i + 1) * 1000,
             selection=f"catid_ml_{proc}",
-            label=f"{cat_label} category",
+            # label=f"{cat_label} category",
             aux={"ml_proc": proc},
         ))
 

diff --git a/hbw/config/processes.py b/hbw/config/processes.py
@@ -10,6 +10,7 @@
 from scinum import Number
 
 from cmsdb.util import add_decay_process
+from columnflow.util import DotDict
 
 from hbw.config.styling import color_palette
 
@@ -199,3 +200,23 @@ def configure_hbw_processes(config: od.Config):
         if config.has_process(bg):
             bg = config.get_process(bg)
             background.add_process(bg)
+
+
+from random import randint
+
+
+def create_combined_proc_forML(config: od.Config, proc_name: str, proc_dict: dict, color=None):
+
+    combining_proc = []
+    for proc in proc_dict.sub_processes:
+        combining_proc.append(config.get_process(proc, default=None))
+    proc_name = add_parent_process(config,
+        combining_proc,
+        name=proc_name,
+        id=randint(10000000, 99999999),
+        # TODO: the random ID could by chance collide with an already used ID --> should be checked
+        label=proc_dict.get("label", "combined custom process"),
+        color=proc_dict.get("color", None),
+    )
+    ml_config = DotDict({"weighting": proc_dict.get("weighting", None), "sub_processes": proc_dict.sub_processes})
+    proc_name.x.ml_config = ml_config
diff --git a/hbw/inference/dl.py b/hbw/inference/dl.py
@@ -142,6 +142,90 @@
 
 dl = HBWInferenceModelBase.derive("dl", cls_dict=default_cls_dict)
 
+# "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
+# "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1",
+# "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43",
+# "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94",
+# "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36",
+# "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39",
+# "hh_ggf_hbb_hvv2l2nu_kl0_kt1",
+# "hh_ggf_hbb_hvv2l2nu_kl1_kt1",
+# "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1",
+# "hh_ggf_hbb_hvv2l2nu_kl5_kt1",
+
+
+dl.derive("dl_ml_study_1", cls_dict={
+    "ml_model_name": "dl_22post_ml_study_1",
+    "config_categories": [
+        "sr__1b__ml_signal_ggf",
+        "sr__1b__ml_signal_vbf",
+        "sr__1b__ml_tt",
+        "sr__1b__ml_st",
+        "sr__1b__ml_dy",
+        "sr__1b__ml_h",
+        "sr__2b__ml_signal_ggf",
+        "sr__2b__ml_signal_vbf",
+        "sr__2b__ml_tt",
+        "sr__2b__ml_st",
+        "sr__2b__ml_dy",
+        "sr__2b__ml_h",
+    ],
+    "processes": [
+        "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
+        "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1",
+        "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43",
+        "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94",
+        "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36",
+        "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39",
+        "hh_ggf_hbb_hvv2l2nu_kl0_kt1",
+        "hh_ggf_hbb_hvv2l2nu_kl1_kt1",
+        "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1",
+        "hh_ggf_hbb_hvv2l2nu_kl5_kt1",
+        "tt",
+        "dy",
+        "w_lnu",
+        "vv",
+        "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth",
+    ],
+    "systematics": rate_systematics,
+})
+
+dl.derive("dl_ml_study_2", cls_dict={
+    "ml_model_name": "dl_22post_ml_study_2",
+    "config_categories": [
+        "sr__1b__ml_signal_ggf2",
+        "sr__1b__ml_signal_vbf2",
+        "sr__1b__ml_tt",
+        "sr__1b__ml_st",
+        "sr__1b__ml_dy",
+        "sr__1b__ml_h",
+        "sr__2b__ml_signal_ggf2",
+        "sr__2b__ml_signal_vbf2",
+        "sr__2b__ml_tt",
+        "sr__2b__ml_st",
+        "sr__2b__ml_dy",
+        "sr__2b__ml_h",
+    ],
+    "processes": [
+        "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
+        "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1",
+        "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43",
+        "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94",
+        "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36",
+        "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39",
+        "hh_ggf_hbb_hvv2l2nu_kl0_kt1",
+        "hh_ggf_hbb_hvv2l2nu_kl1_kt1",
+        "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1",
+        "hh_ggf_hbb_hvv2l2nu_kl5_kt1",
+        "tt",
+        "dy",
+        "w_lnu",
+        "vv",
+        "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth",
+    ],
+    "systematics": rate_systematics,
+})
+
 dl.derive("dl_hww_and_hzz", cls_dict={
     "processes": [
         "hh_ggf_hbb_hww_kl0_kt1",

diff --git a/hbw/ml/base.py b/hbw/ml/base.py
@@ -273,6 +273,7 @@ def uses(self, config_inst: od.Config) -> set[Route | str]:
         columns = {"mli_*"}
         # TODO: switch to full event weight
         # TODO: this might not work with data, to be checked
+        columns.add("process_id")
         columns.add("normalization_weight")
         columns.add("stitched_normalization_weight")
         columns.add("event_weight")

diff --git a/hbw/ml/data_loader.py b/hbw/ml/data_loader.py
@@ -19,6 +19,40 @@
 logger = law.logger.get_logger(__name__)
 
 
+def get_proc_mask(
+    events: ak.Array,
+    proc: str | od.Process,
+    config_inst: od.Config | None = None,
+) -> tuple(np.ndarray, list):
+    """
+    Creates the mask selecting events belonging to the process *proc* and a list of all ids belonging to this process.
+
+    :param events: Event array.
+    :param proc: Either string or process instance.
+    :param config_inst: An instance of the Config, can be None if a Process instance is given.
+    :return: process mask and the corresponding process ids
+    """
+    # get process instance
+    if config_inst:
+        proc_inst = config_inst.get_process(proc)
+    elif isinstance(proc, od.Process):
+        proc_inst = proc
+
+    proc_id = events.process_id
+    unique_proc_ids = set(proc_id)
+
+    # get list of Ids that are belonging to the process and are present in the event array
+    sub_id = [
+        proc_inst.id
+        for proc_inst, _, _ in proc_inst.walk_processes(include_self=True)
+        if proc_inst.id in unique_proc_ids
+    ]
+
+    # Create process mask
+    proc_mask = np.isin(proc_id, sub_id)
+    return proc_mask, sub_id
+
+
 def input_features_sanity_checks(ml_model_inst: MLModel, input_features: list[str]):
     """
     Perform sanity checks on the input features.
@@ -78,8 +112,10 @@ def __init__(self, ml_model_inst: MLModel, process: "str", events: ak.Array, sta
         """
         self._ml_model_inst = ml_model_inst
         self._process = process
+
+        proc_mask, _ = get_proc_mask(events, process, ml_model_inst.config_inst)
         self._stats = stats
-        self._events = events
+        self._events = events[proc_mask]
 
     def __repr__(self):
         return f"{self.__class__.__name__}({self.ml_model_inst.cls_name}, {self.process})"
@@ -185,21 +221,89 @@ def shuffle_indices(self) -> np.ndarray:
         self._shuffle_indices = np.random.permutation(self.n_events)
         return self._shuffle_indices
 
+    def get_xsec_train_weights(self) -> np.ndarray:
+        """
+        Weighting such that each event has roughly the same weight,
+        sub-processes are weighted according to their cross section
+        """
+        if hasattr(self, "_xsec_train_weights"):
+            return self._xsec_train_weights
+
+        if not self.stats:
+            raise Exception("cannot determine train weights without stats")
+
+        _, sub_id = get_proc_mask(self._events, self.process, self.ml_model_inst.config_inst)
+        sum_abs_weights = np.sum([self.stats[self.process]["sum_abs_weights_per_process"][str(id)] for id in sub_id])
+        num_events = np.sum([self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id])
+
+        xsec_train_weights = self.weights / sum_abs_weights * num_events
+
+        return xsec_train_weights
+
+    def get_equal_train_weights(self) -> np.ndarray:
+        """
+        Weighting such that the events of each sub-process are weighted equally
+        """
+        if hasattr(self, "_equally_train_weights"):
+            return self._equal_train_weights
+
+        if not self.stats:
+            raise Exception("cannot determine train weights without stats")
+
+        combined_proc_inst = self.ml_model_inst.config_inst.get_process(self.process)
+        _, sub_id_proc = get_proc_mask(self._events, self.process, self.ml_model_inst.config_inst)
+        num_events = np.sum([self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id_proc])
+        targeted_sum_of_weights_per_process = (
+            num_events / len(combined_proc_inst.x.ml_config.sub_processes)
+        )
+        equal_train_weights = ak.full_like(self.weights, 1.)
+        sub_class_factors = {}
+
+        for proc in combined_proc_inst.x.ml_config.sub_processes:
+            proc_mask, sub_id = get_proc_mask(self._events, proc, self.ml_model_inst.config_inst)
+            sum_pos_weights_per_sub_proc = 0.
+            sum_pos_weights_per_proc = self.stats[self.process]["sum_pos_weights_per_process"]
+
+            for id in sub_id:
+                id = str(id)
+                if id in self.stats[self.process]["num_events_per_process"]:
+                    sum_pos_weights_per_sub_proc += sum_pos_weights_per_proc[id]
+
+            if sum_pos_weights_per_sub_proc == 0:
+                norm_const_per_proc = 1.
+                logger.info(
+                    f"No weight sum found in stats for sub process {proc}."
+                    f"Normalization constant set to 1 but results are probably not correct.")
+            else:
+                norm_const_per_proc = targeted_sum_of_weights_per_process / sum_pos_weights_per_sub_proc
+                logger.info(f"Normalizing constant for {proc} is {norm_const_per_proc}")
+
+            sub_class_factors[proc] = norm_const_per_proc
+            equal_train_weights = np.where(proc_mask, self.weights * norm_const_per_proc, equal_train_weights)
+
+        return equal_train_weights
+
     @property
     def train_weights(self) -> np.ndarray:
         """
-        Weighting such that each event has roughly the same weight
+        Weighting according to the parameters set in the ML model config
         """
         if hasattr(self, "_train_weights"):
             return self._train_weights
 
         if not self.stats:
             raise Exception("cannot determine train weights without stats")
 
-        sum_abs_weights = self.stats[self.process]["sum_abs_weights"]
-        num_events = self.stats[self.process]["num_events"]
+        # TODO: this needs to be converted to np.float
+        proc = self.process
+        proc_inst = self.ml_model_inst.config_inst.get_process(proc)
+        if proc_inst.x("ml_config", None) and proc_inst.x.ml_config.weighting == "equal":
+            train_weights = self.get_equal_train_weights()
+        else:
+            train_weights = self.get_xsec_train_weights()
+
+        self._train_weights = ak.to_numpy(train_weights).astype(np.float32)
 
-        self._train_weights = self.weights / sum_abs_weights * num_events
         return self._train_weights
 
     @property
@@ -213,11 +317,26 @@ def equal_weights(self) -> np.ndarray:
         if not self.stats:
             raise Exception("cannot determine val weights without stats")
 
+        # TODO: the per-process computation is done; this block still needs tidying up
         processes = self.ml_model_inst.processes
-        sum_abs_weights = self.stats[self.process]["sum_abs_weights"]
-        num_events_per_process = {proc: self.stats[proc]["num_events"] for proc in processes}
-
-        self._validation_weights = self.weights / sum_abs_weights * max(num_events_per_process.values())
+        num_events_per_process = {}
+        for proc in processes:
+            id_list = list(self.stats[proc]["num_events_per_process"].keys())
+            proc_inst = self.ml_model_inst.config_inst.get_process(proc)
+            sub_id = [
+                p_inst.id
+                for p_inst, _, _ in proc_inst.walk_processes(include_self=True)
+                if str(p_inst.id) in id_list
+            ]
+            if proc == self.process:
+                sum_abs_weights = np.sum([
+                    self.stats[self.process]["sum_abs_weights_per_process"][str(id)] for id in sub_id
+                ])
+            num_events_per_proc = np.sum([self.stats[proc]["num_events_per_process"][str(id)] for id in sub_id])
+            num_events_per_process[proc] = num_events_per_proc
+
+        validation_weights = self.weights / sum_abs_weights * max(num_events_per_process.values())
+        self._validation_weights = ak.to_numpy(validation_weights).astype(np.float32)
 
         return self._validation_weights
 
@@ -544,6 +663,7 @@ def target(self) -> np.ndarray:
         if self._ml_model_inst.negative_weights == "handle":
             target[self.m_negative_weights] = 1 - target[self.m_negative_weights]
 
+        # NOTE: the targets here seem to end up as float64 -- the dtype should be checked
         self._target = target
         return self._target