From da23e9faf7d39234cabb8398b1a25268ac2b46e0 Mon Sep 17 00:00:00 2001
From: Mathis Frahm <mathisfrahm@gmx.de>
Date: Mon, 8 Jan 2024 14:09:29 +0100
Subject: [PATCH 1/2] implement categorization based on gen particles

---
 hbw/config/categories.py    | 57 +++++++++++++++++++++++++++
 hbw/config/config_run2.py   |  2 +
 hbw/selection/categories.py | 78 +++++++++++++++++++++++++++++++++++++
 hbw/selection/common.py     |  9 +++--
 hbw/selection/gen.py        | 56 ++++++++++++++++++++++++++
 5 files changed, 199 insertions(+), 3 deletions(-)
 create mode 100644 hbw/selection/gen.py

diff --git a/hbw/config/categories.py b/hbw/config/categories.py
index 4edaaa63..8dbac682 100644
--- a/hbw/config/categories.py
+++ b/hbw/config/categories.py
@@ -2,6 +2,21 @@
 
 """
 Definition of categories.
+
+Categorizer modules (used to determine category masks) are defined in hbw.selection.categories
+
+Ids for combinations of categories are built as the sum of category ids.
+To avoid reusing category ids, each category block (e.g. leptons, jets, ...) uses ids of a different
+power of 10.
+
+power of 10 | category block
+
+0: free (only used for inclusive category)
+1: jet (resolved vs boosted)
+2: bjet (1 vs geq 2)
+3: lepton
+4: dnn
+5: gen-level leptons (not combined with other categories)
 """
 
 from collections import OrderedDict
@@ -17,12 +32,54 @@
 logger = law.logger.get_logger(__name__)
 
 
+@call_once_on_config()
+def add_gen_categories(config: od.Config) -> None:
+    gen_0lep = config.add_category(  # noqa
+        name="gen_0lep",
+        id=100000,
+        selection="catid_gen_0lep",  # this should not be called!
+        label="No gen lepton",
+    )
+    gen_1lep = config.add_category(
+        name="gen_1lep",
+        id=200000,
+        label="1 gen lepton",
+    )
+    gen_1lep.add_category(
+        name="gen_1e",
+        id=300000,
+        selection="catid_gen_1e",
+        label="1 gen electron",
+    )
+    gen_1lep.add_category(
+        name="gen_1mu",
+        id=400000,
+        selection="catid_gen_1mu",
+        label="1 gen muon",
+    )
+    gen_1lep.add_category(
+        name="gen_1tau",
+        id=500000,
+        selection="catid_gen_1tau",
+        label="1 gen tau",
+    )
+    gen_2lep = config.add_category(  # noqa
+        name="gen_geq2lep",
+        id=600000,
+        selection="catid_geq_2_gen_leptons",
+        label=r"$\geq 2$ gen leptons",
+    )
+
+
 @call_once_on_config()
 def add_categories_selection(config: od.Config) -> None:
     """
     Adds categories to a *config*, that are typically produced in `SelectEvents`.
     """
 
+    # adds categories based on the existence of gen particles
+    add_gen_categories(config)
+
     config.x.lepton_channels = {
         "sl": ("1e", "1mu"),
         "dl": ("2e", "2mu", "emu"),
diff --git a/hbw/config/config_run2.py b/hbw/config/config_run2.py
index 0b7b048f..ddcbf2bd 100644
--- a/hbw/config/config_run2.py
+++ b/hbw/config/config_run2.py
@@ -432,6 +432,8 @@ def make_jme_filename(jme_aux, sample_type, name, era=None):
             "run", "luminosityBlock", "event",
             # columns added during selection, required in general
             "mc_weight", "PV.npvs", "process_id", "category_ids", "deterministic_seed",
+            # Gen information (for categorization)
+            "HardGenPart.pdgId",
             # weight-related columns
             "pu_weight*", "pdf_weight*",
             "murf_envelope_weight*", "mur_weight*", "muf_weight*",
diff --git a/hbw/selection/categories.py b/hbw/selection/categories.py
index 7aa2da2f..a8cd44cf 100644
--- a/hbw/selection/categories.py
+++ b/hbw/selection/categories.py
@@ -4,9 +4,12 @@
 Selection methods defining categories based on selection step results.
 """
 
+from __future__ import annotations
+
 from columnflow.util import maybe_import
 from columnflow.categorization import Categorizer, categorizer
 from columnflow.selection import SelectionResult
+from columnflow.columnar_util import has_ak_column, optional_column
 
 np = maybe_import("numpy")
 ak = maybe_import("awkward")
@@ -17,6 +20,81 @@ def catid_selection_incl(self: Categorizer, events: ak.Array, **kwargs) -> tuple
     mask = ak.ones_like(events.event) > 0
     return events, mask
 
+#
+# Categorizers based on gen info
+#
+
+
+@categorizer(
+    uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId"),
+    gp_dict={},  # dict with pdgId + number of required prompt particles with this pdgId
+    ignore_charge=True,
+    call_force=True,
+)
+def catid_n_gen_particles(
+    self: Categorizer, events: ak.Array, results: SelectionResult | None = None, **kwargs,
+) -> tuple[ak.Array, ak.Array]:
+    """ Categorizer to select events with a certain number of prompt gen particles """
+    # start with true mask
+    mask = np.ones(len(events), dtype=bool)
+    if self.dataset_inst.is_data:
+        # for data, always return true mask
+        return events, mask
+
+    if has_ak_column(events, "HardGenPart.pdgId"):
+        gp_id = events.HardGenPart.pdgId
+    else:
+        # try to get gp_id column via SelectionResult
+        gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart]
+
+    if self.ignore_charge:
+        gp_id = abs(gp_id)
+
+    for pdgId, num_particles in self.gp_dict.items():
+        mask = mask & (ak.sum(gp_id == pdgId, axis=1) == num_particles)
+
+    return events, mask
+
+
+catid_gen_0lep = catid_n_gen_particles.derive("catid_gen_0lep", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 0}})
+catid_gen_1e = catid_n_gen_particles.derive("catid_gen_1e", cls_dict={"gp_dict": {11: 1, 13: 0, 15: 0}})
+catid_gen_1mu = catid_n_gen_particles.derive("catid_gen_1mu", cls_dict={"gp_dict": {11: 0, 13: 1, 15: 0}})
+catid_gen_1tau = catid_n_gen_particles.derive("catid_gen_1tau", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 1}})
+# catid_gen_2e = catid_n_gen_particles.derive("catid_gen_2e", cls_dict={"gp_dict": {11: 2, 13: 0, 15: 0}})
+# catid_gen_2mu = catid_n_gen_particles.derive("catid_gen_2mu", cls_dict={"gp_dict": {11: 0, 13: 2, 15: 0}})
+# catid_gen_2tau = catid_n_gen_particles.derive("catid_gen_2tau", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 2}})
+# catid_gen_emu = catid_n_gen_particles.derive("catid_gen_emu", cls_dict={"gp_dict": {11: 1, 13: 1, 15: 0}})
+# catid_gen_etau = catid_n_gen_particles.derive("catid_gen_etau", cls_dict={"gp_dict": {11: 1, 13: 0, 15: 1}})
+# catid_gen_mutau = catid_n_gen_particles.derive("catid_gen_mutau", cls_dict={"gp_dict": {11: 0, 13: 1, 15: 1}})
+
+
+@categorizer(
+    uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId"),
+    call_force=True,
+)
+def catid_geq_2_gen_leptons(
+    self: Categorizer, events: ak.Array, results: SelectionResult | None = None, **kwargs,
+) -> tuple[ak.Array, ak.Array]:
+    """ Categorizer to select events with a certain number of hard gen particles """
+
+    if self.dataset_inst.is_data:
+        # for data, always return true mask
+        return events, np.ones(len(events), dtype=bool)
+
+    if has_ak_column(events, "HardGenPart.pdgId"):
+        gp_id = events.HardGenPart.pdgId
+    else:
+        # try to get gp_id column via SelectionResult
+        gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart]
+
+    abs_id = abs(gp_id)
+    mask = (
+        ak.sum(abs_id == 11, axis=1) + ak.sum(abs_id == 13, axis=1) + ak.sum(abs_id == 15, axis=1)
+    ) >= 2
+
+    return events, mask
+
+
 #
 # Categorizer called as part of cf.SelectEvents
 #
diff --git a/hbw/selection/common.py b/hbw/selection/common.py
index 24d3bf1e..d13594a9 100644
--- a/hbw/selection/common.py
+++ b/hbw/selection/common.py
@@ -22,6 +22,7 @@
 from columnflow.production.categories import category_ids
 from columnflow.production.processes import process_ids
 
+from hbw.selection.gen import hard_gen_particles
 from hbw.production.weights import event_weights_to_normalize, large_weights_killer
 from hbw.selection.stats import hbw_increment_stats
 from hbw.selection.cutflow_features import cutflow_features
@@ -328,6 +329,9 @@ def post_selection(
 ) -> Tuple[ak.Array, SelectionResult]:
     """ Methods that are called for both SL and DL after calling the selection modules """
 
+    if self.dataset_inst.is_mc:
+        events, results = self[hard_gen_particles](events, results, **kwargs)
+
     # build categories
     events = self[category_ids](events, results=results, **kwargs)
 
@@ -358,7 +362,6 @@ def log_fraction(stats_key: str, msg: str | None = None):
     events = ak.fill_none(events, EMPTY_FLOAT)
 
     logger.info(f"Selected {ak.sum(results.event)} from {len(events)} events")
-
     return events, results
 
 
@@ -371,5 +374,5 @@ def post_selection_init(self: Selector) -> None:
     if not getattr(self, "dataset_inst", None) or self.dataset_inst.is_data:
         return
 
-    self.uses.add(event_weights_to_normalize)
-    self.produces.add(event_weights_to_normalize)
+    self.uses.update({event_weights_to_normalize, hard_gen_particles})
+    self.produces.update({event_weights_to_normalize, hard_gen_particles})
diff --git a/hbw/selection/gen.py b/hbw/selection/gen.py
new file mode 100644
index 00000000..47223294
--- /dev/null
+++ b/hbw/selection/gen.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+
+"""
+Selectors related to gen-level particles.
+"""
+
+import law
+
+from columnflow.selection import Selector, SelectionResult, selector
+from columnflow.util import maybe_import
+
+np = maybe_import("numpy")
+ak = maybe_import("awkward")
+
+logger = law.logger.get_logger(__name__)
+
+
+pdgId_map = {
+    1: "down",
+    2: "up",
+    3: "strange",
+    4: "charm",
+    5: "bottom",
+    6: "top",
+    11: "electron",
+    12: "e_neutrino",
+    13: "muon",
+    14: "mu_neutrino",
+    15: "tau",
+    16: "tau_neutrino",
+    21: "gluon",
+    22: "photon",
+    23: "Z",
+    24: "W",
+    25: "Higgs",
+}
+
+
+@selector(
+    uses={"GenPart.statusFlags"},
+    mc_only=True,
+)
+def hard_gen_particles(
+    self: Selector,
+    events: ak.Array,
+    results,
+    **kwargs,
+) -> tuple[ak.Array, SelectionResult]:
+
+    gp_mask = events.GenPart.hasFlags("isHardProcess")
+
+    results = results + SelectionResult(
+        objects={"GenPart": {"HardGenPart": gp_mask}},
+    )
+
+    return events, results

From b02307fe20e9cbeb4a8fd9eae73097e144cc8726 Mon Sep 17 00:00:00 2001
From: Mathis Frahm <mathisfrahm@gmx.de>
Date: Tue, 9 Jan 2024 09:36:29 +0100
Subject: [PATCH 2/2] further generalize gen particles categorizer

---
 hbw/selection/categories.py | 59 +++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 32 deletions(-)

diff --git a/hbw/selection/categories.py b/hbw/selection/categories.py
index a8cd44cf..0049fb46 100644
--- a/hbw/selection/categories.py
+++ b/hbw/selection/categories.py
@@ -6,6 +6,8 @@
 
 from __future__ import annotations
 
+import law
+
 from columnflow.util import maybe_import
 from columnflow.categorization import Categorizer, categorizer
 from columnflow.selection import SelectionResult
@@ -26,15 +28,20 @@ def catid_selection_incl(self: Categorizer, events: ak.Array, **kwargs) -> tuple
 
 
 @categorizer(
-    uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId"),
-    gp_dict={},  # dict with pdgId + number of required prompt particles with this pdgId
+    uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId", "GenPart.statusFlags"),
+    gp_dict={},  # dict with (tuple of) pdgId + number of required prompt particles with this pdgId
     ignore_charge=True,
     call_force=True,
+    _operator="eq",
 )
 def catid_n_gen_particles(
     self: Categorizer, events: ak.Array, results: SelectionResult | None = None, **kwargs,
 ) -> tuple[ak.Array, ak.Array]:
     """ Categorizer to select events with a certain number of prompt gen particles """
+
+    # possible options to compare number of particles with required number of particles: ==, >=, >, <=, <
+    assert self._operator in ("eq", "ge", "gt", "le", "lt")
+
     # start with true mask
     mask = np.ones(len(events), dtype=bool)
     if self.dataset_inst.is_data:
@@ -44,14 +51,26 @@ def catid_n_gen_particles(
     if has_ak_column(events, "HardGenPart.pdgId"):
         gp_id = events.HardGenPart.pdgId
     else:
-        # try to get gp_id column via SelectionResult
-        gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart]
+        try:
+            # try to get gp_id column via SelectionResult
+            gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart]
+        except AttributeError:
+            # try to select hard gen particles via status flags
+            gp_id = events.GenPart.pdgId[events.GenPart.hasFlags("isHardProcess")]
 
     if self.ignore_charge:
         gp_id = abs(gp_id)
 
-    for pdgId, num_particles in self.gp_dict.items():
-        mask = mask & (ak.sum(gp_id == pdgId, axis=1) == num_particles)
+    for pdgIds, required_n_particles in self.gp_dict.items():
+        # make sure that 'pdgIds' is a tuple
+        pdgIds = law.util.make_tuple(pdgIds)
+
+        # get number of gen particles with requested pdgIds for each event
+        n_particles = sum([ak.sum(gp_id == pdgId, axis=1) for pdgId in pdgIds])
+
+        # compare number of gen particles with required number of particles with requested operator
+        this_mask = getattr(n_particles, f"__{self._operator}__")(required_n_particles)
+        mask = mask & this_mask
 
     return events, mask
 
@@ -66,33 +85,9 @@ def catid_n_gen_particles(
 # catid_gen_emu = catid_n_gen_particles.derive("catid_gen_emu", cls_dict={"gp_dict": {11: 1, 13: 1, 15: 0}})
 # catid_gen_etau = catid_n_gen_particles.derive("catid_gen_etau", cls_dict={"gp_dict": {11: 1, 13: 0, 15: 1}})
 # catid_gen_mutau = catid_n_gen_particles.derive("catid_gen_mutau", cls_dict={"gp_dict": {11: 0, 13: 1, 15: 1}})
-
-
-@categorizer(
-    uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId"),
-    call_force=True,
+catid_geq_2_gen_leptons = catid_n_gen_particles.derive(
+    "catid_geq_2_gen_leptons", cls_dict={"gp_dict": {(11, 13, 15): 2}, "_operator": "ge"},
 )
-def catid_geq_2_gen_leptons(
-    self: Categorizer, events: ak.Array, results: SelectionResult | None = None, **kwargs,
-) -> tuple[ak.Array, ak.Array]:
-    """ Categorizer to select events with a certain number of hard gen particles """
-
-    if self.dataset_inst.is_data:
-        # for data, always return true mask
-        return events, np.ones(len(events), dtype=bool)
-
-    if has_ak_column(events, "HardGenPart.pdgId"):
-        gp_id = events.HardGenPart.pdgId
-    else:
-        # try to get gp_id column via SelectionResult
-        gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart]
-
-    abs_id = abs(gp_id)
-    mask = (
-        ak.sum(abs_id == 11, axis=1) + ak.sum(abs_id == 13, axis=1) + ak.sum(abs_id == 15, axis=1)
-    ) >= 2
-
-    return events, mask
 
 
 #