implement categorization based on gen particles

uhh-cms · Jan 8, 2024 · 610685b · 610685b
1 parent 7345eaa
commit 610685b
Show file tree

Hide file tree

Showing 4 changed files with 153 additions and 2 deletions.
diff --git a/hbw/config/categories.py b/hbw/config/categories.py
@@ -2,6 +2,21 @@
 
 """
 Definition of categories.
+
+Categorizer modules (used to determine category masks) are defined in hbw.selection.categories
+
+Ids for combinations of categories are built as the sum of category ids.
+To avoid reusing category ids, each category block (e.g. leptons, jets, ...) uses ids of a different
+power of 10.
+
+digit position n (ids are multiples of 10^(n-1)) | category block
+
+1: free (only used for inclusive category)
+2: jet (resolved vs boosted)
+3: bjet (1 vs geq 2)
+4: lepton
+5: dnn
+6: gen leptons
 """
 
 from collections import OrderedDict
@@ -17,12 +32,43 @@
 logger = law.logger.get_logger(__name__)
 
 
+@call_once_on_config()
+def add_gen_categories(config: od.Config) -> None:
+    """
+    Adds categories based on generator-level leptons to a *config*.
+
+    A parent category "gen_1lep" is added to the *config* and one child category per lepton flavour
+    (electron, muon, tau) is attached to it.
+    """
+    # parent category that groups the per-flavour categories
+    parent = config.add_category(
+        name="gen_1lep",
+        id=100000,
+        selection="catid_selection_incl",  # this should not be called!
+        label="1 gen lepton",
+    )
+
+    # (name, id, selection, label) of each child category
+    leaf_specs = (
+        ("gen_1e", 200000, "catid_gen_1e", "1 gen electron"),
+        ("gen_1mu", 300000, "catid_gen_1mu", "1 gen muon"),
+        ("gen_1tau", 400000, "catid_gen_1tau", "1 gen tau"),
+    )
+    for leaf_name, leaf_id, leaf_selection, leaf_label in leaf_specs:
+        parent.add_category(name=leaf_name, id=leaf_id, selection=leaf_selection, label=leaf_label)
+
+
 @call_once_on_config()
 def add_categories_selection(config: od.Config) -> None:
     """
     Adds categories to a *config*, that are typically produced in `SelectEvents`.
     """
 
+    # adds categories based on the existence of gen particles
+    # add_gen_categories(config)
+
     config.x.lepton_channels = {
         "sl": ("1e", "1mu"),
         "dl": ("2e", "2mu", "emu"),

diff --git a/hbw/selection/categories.py b/hbw/selection/categories.py
@@ -4,9 +4,12 @@
 Selection methods defining categories based on selection step results.
 """
 
+from __future__ import annotations
+
 from columnflow.util import maybe_import
 from columnflow.categorization import Categorizer, categorizer
 from columnflow.selection import SelectionResult
+from columnflow.columnar_util import has_ak_column, optional_column
 
 np = maybe_import("numpy")
 ak = maybe_import("awkward")
@@ -17,6 +20,50 @@ def catid_selection_incl(self: Categorizer, events: ak.Array, **kwargs) -> tuple
     mask = ak.ones_like(events.event) > 0
     return events, mask
 
+#
+# Categorizers based on gen info
+#
+
+
+@categorizer(
+    uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId"),
+    n_particles={},  # maps pdgId -> exact number of hard gen particles required with that pdgId
+    consider_charge=False,  # if False, particles and antiparticles are counted together (via |pdgId|)
+    call_force=True,
+)
+def catid_n_gen_particles(
+    self: Categorizer, events: ak.Array, results: SelectionResult | None = None, **kwargs,
+) -> tuple[ak.Array, ak.Array]:
+    """
+    Categorizer to select events with a certain number of hard gen particles.
+
+    For each (pdgId, number) pair in *n_particles*, an event only passes if it contains exactly that
+    number of hard gen particles with that pdgId. The pdgIds are taken from "HardGenPart.pdgId" if this
+    column exists; otherwise "GenPart.pdgId" is filtered with the "HardGenPart" entry of the "GenPart"
+    objects in *results*, which has to be provided in that case. For data, all events pass.
+
+    Returns the unchanged *events* and the boolean per-event mask.
+    """
+    # start with true mask
+    mask = np.ones(len(events), dtype=bool)
+    if self.dataset_inst.is_data:
+        # for data, always return true mask
+        return events, mask
+
+    if has_ak_column(events, "HardGenPart.pdgId"):
+        gp_id = events.HardGenPart.pdgId
+    else:
+        # get gp_id column via the hard gen particle selection stored in the SelectionResult
+        gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart]
+
+    if not self.consider_charge:
+        # drop the sign of the pdgId such that particles and antiparticles are counted together
+        gp_id = abs(gp_id)
+
+    for pdgId, num_particles in self.n_particles.items():
+        # require exactly *num_particles* hard gen particles with this pdgId per event
+        mask = mask & (ak.sum(gp_id == pdgId, axis=1) == num_particles)
+
+    return events, mask
+
+
+# derived categorizers: each one requests one hard gen particle of a single lepton pdgId (11, 13 or 15)
+# and zero hard gen particles of the other two lepton pdgIds
+catid_gen_1e = catid_n_gen_particles.derive(
+    "catid_gen_1e",
+    cls_dict={"n_particles": {11: 1, 13: 0, 15: 0}},
+)
+catid_gen_1mu = catid_n_gen_particles.derive(
+    "catid_gen_1mu",
+    cls_dict={"n_particles": {11: 0, 13: 1, 15: 0}},
+)
+catid_gen_1tau = catid_n_gen_particles.derive(
+    "catid_gen_1tau",
+    cls_dict={"n_particles": {11: 0, 13: 0, 15: 1}},
+)
+# catid_gen_2e = catid_n_gen_particles.derive("catid_gen_2e", cls_dict={"n_particles": {11: 2, 13: 0, 15: 0}})
+# catid_gen_2mu = catid_n_gen_particles.derive("catid_gen_2mu", cls_dict={"n_particles": {11: 0, 13: 2, 15: 0}})
+# catid_gen_2tau = catid_n_gen_particles.derive("catid_gen_2tau", cls_dict={"n_particles": {11: 0, 13: 0, 15: 2}})
+# catid_gen_emu = catid_n_gen_particles.derive("catid_gen_emu", cls_dict={"n_particles": {11: 1, 13: 1, 15: 0}})
+# catid_gen_etau = catid_n_gen_particles.derive("catid_gen_etau", cls_dict={"n_particles": {11: 1, 13: 0, 15: 1}})
+# catid_gen_mutau = catid_n_gen_particles.derive("catid_gen_mutau", cls_dict={"n_particles": {11: 0, 13: 1, 15: 1}})
+
+
 #
 # Categorizer called as part of cf.SelectEvents
 #

diff --git a/hbw/selection/common.py b/hbw/selection/common.py
@@ -22,6 +22,7 @@
 from columnflow.production.categories import category_ids
 from columnflow.production.processes import process_ids
 
+from hbw.selection.gen import hard_gen_particles
 from hbw.production.weights import event_weights_to_normalize, large_weights_killer
 from hbw.selection.stats import hbw_increment_stats
 from hbw.selection.cutflow_features import cutflow_features
@@ -328,6 +329,9 @@ def post_selection(
 ) -> Tuple[ak.Array, SelectionResult]:
     """ Methods that are called for both SL and DL after calling the selection modules """
 
+    if self.dataset_inst.is_mc:
+        events = self[hard_gen_particles](events, **kwargs)
+
     # build categories
     events = self[category_ids](events, results=results, **kwargs)
 
@@ -371,5 +375,5 @@ def post_selection_init(self: Selector) -> None:
     if not getattr(self, "dataset_inst", None) or self.dataset_inst.is_data:
         return
 
-    self.uses.add(event_weights_to_normalize)
-    self.produces.add(event_weights_to_normalize)
+    self.uses.update({event_weights_to_normalize, hard_gen_particles})
+    self.produces.update({event_weights_to_normalize, hard_gen_particles})
diff --git a/hbw/selection/gen.py b/hbw/selection/gen.py
@@ -0,0 +1,54 @@
+# coding: utf-8
+
+"""
+Selectors related to gen-level particles.
+"""
+
+import law
+
+from columnflow.selection import Selector, SelectionResult, selector
+from columnflow.util import maybe_import
+
+np = maybe_import("numpy")
+ak = maybe_import("awkward")
+
+logger = law.logger.get_logger(__name__)
+
+
+# mapping from the (positive) pdgId of a particle to a human-readable particle name
+pdgId_map = {
+    1: "down",
+    2: "up",
+    3: "strange",
+    4: "charm",
+    5: "bottom",
+    6: "top",
+    11: "electron",
+    12: "e_neutrino",
+    13: "muon",
+    14: "mu_neutrino",
+    15: "tau",
+    16: "tau_neutrino",
+    21: "gluon",
+    22: "photon",
+    23: "Z",
+    24: "W",
+    25: "Higgs",
+}
+
+
+@selector(
+    uses={"GenPart.statusFlags"},
+    mc_only=True,
+)
+def hard_gen_particles(
+    self: Selector,
+    events: ak.Array,
+    results: "SelectionResult | None" = None,
+    **kwargs,
+) -> tuple[ak.Array, SelectionResult]:
+    """
+    Selector that marks the gen particles carrying the "isHardProcess" status flag.
+
+    The per-particle mask is stored as the "HardGenPart" entry of the "GenPart" objects of the returned
+    SelectionResult; *events* is returned unchanged. *results* is not used by this selector and is
+    therefore optional, such that the selector can also be called with *events* only.
+    """
+    # per-particle boolean mask of gen particles with the "isHardProcess" flag
+    gp_mask = events.GenPart.hasFlags("isHardProcess")
+
+    return events, SelectionResult(
+        objects={"GenPart": {"HardGenPart": gp_mask}},
+    )