Merge pull request #65 from uhh-cms/feature/gen_lepton_tag

Categorization based on number of gen particles
uhh-cms · Jan 10, 2024 · 5c03497 · 5c03497
2 parents 7345eaa + b02307f
commit 5c03497
Show file tree

Hide file tree

Showing 5 changed files with 194 additions and 3 deletions.
diff --git a/hbw/config/categories.py b/hbw/config/categories.py
@@ -2,6 +2,21 @@
 
 """
 Definition of categories.
+
+Categorizer modules (used to determine category masks) are defined in hbw.selection.categories
+
+Ids for combinations of categories are built as the sum of category ids.
+To avoid reusing category ids, each category block (e.g. leptons, jets, ...) uses ids of a different
+power of 10.
+
+power of 10 | category block
+
+0: free (only used for inclusive category)
+1: jet (resolved vs boosted)
+2: bjet (1 vs geq 2)
+3: lepton
+4: dnn
+5: gen-level leptons (not combined with other categories)
 """
 
 from collections import OrderedDict
@@ -17,12 +32,54 @@
 logger = law.logger.get_logger(__name__)
 
 
+@call_once_on_config()
+def add_gen_categories(config: od.Config) -> None:
+    gen_0lep = config.add_category(  # noqa
+        name="gen_0lep",
+        id=100000,
+        selection="catid_gen_0lep",  # this should not be called!
+        label="No gen lepton",
+    )
+    gen_1lep = config.add_category(
+        name="gen_1lep",
+        id=200000,
+        label="1 gen lepton",
+    )
+    gen_1lep.add_category(
+        name="gen_1e",
+        id=300000,
+        selection="catid_gen_1e",
+        label="1 gen electron",
+    )
+    gen_1lep.add_category(
+        name="gen_1mu",
+        id=400000,
+        selection="catid_gen_1mu",
+        label="1 gen muon",
+    )
+    gen_1lep.add_category(
+        name="gen_1tau",
+        id=500000,
+        selection="catid_gen_1tau",
+        label="1 gen tau",
+    )
+    gen_2lep = config.add_category(  # noqa
+        name="gen_geq2lep",
+        id=600000,
+        selection="catid_geq_2_gen_leptons",
+        label=r"$\geq 2$ gen leptons",
+    )
+
+
 @call_once_on_config()
 def add_categories_selection(config: od.Config) -> None:
     """
     Adds categories to a *config*, that are typically produced in `SelectEvents`.
     """
 
+    # adds categories based on the existence of gen particles
+    add_gen_categories(config)
+
     config.x.lepton_channels = {
         "sl": ("1e", "1mu"),
         "dl": ("2e", "2mu", "emu"),

diff --git a/hbw/config/config_run2.py b/hbw/config/config_run2.py
@@ -432,6 +432,8 @@ def make_jme_filename(jme_aux, sample_type, name, era=None):
             "run", "luminosityBlock", "event",
             # columns added during selection, required in general
             "mc_weight", "PV.npvs", "process_id", "category_ids", "deterministic_seed",
+            # Gen information (for categorization)
+            "HardGenPart.pdgId",
             # weight-related columns
             "pu_weight*", "pdf_weight*",
             "murf_envelope_weight*", "mur_weight*", "muf_weight*",

diff --git a/hbw/selection/categories.py b/hbw/selection/categories.py
@@ -4,9 +4,14 @@
 Selection methods defining categories based on selection step results.
 """
 
+from __future__ import annotations
+
+import law
+
 from columnflow.util import maybe_import
 from columnflow.categorization import Categorizer, categorizer
 from columnflow.selection import SelectionResult
+from columnflow.columnar_util import has_ak_column, optional_column
 
 np = maybe_import("numpy")
 ak = maybe_import("awkward")
@@ -17,6 +22,74 @@ def catid_selection_incl(self: Categorizer, events: ak.Array, **kwargs) -> tuple
     mask = ak.ones_like(events.event) > 0
     return events, mask
 
+#
+# Categorizers based on gen info
+#
+
+
+@categorizer(
+    uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId", "GenPart.statusFlags"),
+    gp_dict={},  # dict with (tuple of) pdgId + number of required prompt particles with this pdgId
+    ignore_charge=True,
+    call_force=True,
+    _operator="eq",
+)
+def catid_n_gen_particles(
+    self: Categorizer, events: ak.Array, results: SelectionResult | None = None, **kwargs,
+) -> tuple[ak.Array, ak.Array]:
+    """ Categorizer to select events with a certain number of prompt gen particles """
+
+    # possible options to compare number of particles with required number of particles: ==, >=, >, <=, <
+    assert self._operator in ("eq", "ge", "gt", "le", "lt")
+
+    # start with true mask
+    mask = np.ones(len(events), dtype=bool)
+    if self.dataset_inst.is_data:
+        # for data, always return true mask
+        return events, mask
+
+    if has_ak_column(events, "HardGenPart.pdgId"):
+        gp_id = events.HardGenPart.pdgId
+    else:
+        try:
+            # try to get gp_id column via SelectionResult
+            gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart]
+        except AttributeError:
+            # try to select hard gen particles via status flags
+            gp_id = events.GenPart.pdgId[events.GenPart.hasFlags("isHardProcess")]
+
+    if self.ignore_charge:
+        gp_id = abs(gp_id)
+
+    for pdgIds, required_n_particles in self.gp_dict.items():
+        # make sure that 'pdgIds' is a tuple
+        pdgIds = law.util.make_tuple(pdgIds)
+
+        # get number of gen particles with requested pdgIds for each event
+        n_particles = sum([ak.sum(gp_id == pdgId, axis=1) for pdgId in pdgIds])
+
+        # compare number of gen particles with required number of particles with requested operator
+        this_mask = getattr(n_particles, f"__{self._operator}__")(required_n_particles)
+        mask = mask & this_mask
+
+    return events, mask
+
+
+catid_gen_0lep = catid_n_gen_particles.derive("catid_gen_0lep", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 0}})
+catid_gen_1e = catid_n_gen_particles.derive("catid_gen_1e", cls_dict={"gp_dict": {11: 1, 13: 0, 15: 0}})
+catid_gen_1mu = catid_n_gen_particles.derive("catid_gen_1mu", cls_dict={"gp_dict": {11: 0, 13: 1, 15: 0}})
+catid_gen_1tau = catid_n_gen_particles.derive("catid_gen_1tau", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 1}})
+# catid_gen_2e = catid_n_gen_particles.derive("catid_gen_2e", cls_dict={"gp_dict": {11: 2, 13: 0, 15: 0}})
+# catid_gen_2mu = catid_n_gen_particles.derive("catid_gen_2mu", cls_dict={"gp_dict": {11: 0, 13: 2, 15: 0}})
+# catid_gen_2tau = catid_n_gen_particles.derive("catid_gen_2tau", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 2}})
+# catid_gen_emu = catid_n_gen_particles.derive("catid_gen_emu", cls_dict={"gp_dict": {11: 1, 13: 1, 15: 0}})
+# catid_gen_etau = catid_n_gen_particles.derive("catid_gen_etau", cls_dict={"gp_dict": {11: 1, 13: 0, 15: 1}})
+# catid_gen_mutau = catid_n_gen_particles.derive("catid_gen_mutau", cls_dict={"gp_dict": {11: 0, 13: 1, 15: 1}})
+catid_geq_2_gen_leptons = catid_n_gen_particles.derive(
+    "catid_geq_2_gen_leptons", cls_dict={"gp_dict": {(11, 13, 15): 2}, "_operator": "ge"},
+)
+
+
 #
 # Categorizer called as part of cf.SelectEvents
 #

diff --git a/hbw/selection/common.py b/hbw/selection/common.py
@@ -22,6 +22,7 @@
 from columnflow.production.categories import category_ids
 from columnflow.production.processes import process_ids
 
+from hbw.selection.gen import hard_gen_particles
 from hbw.production.weights import event_weights_to_normalize, large_weights_killer
 from hbw.selection.stats import hbw_increment_stats
 from hbw.selection.cutflow_features import cutflow_features
@@ -328,6 +329,9 @@ def post_selection(
 ) -> Tuple[ak.Array, SelectionResult]:
     """ Methods that are called for both SL and DL after calling the selection modules """
 
+    if self.dataset_inst.is_mc:
+        events, results = self[hard_gen_particles](events, results, **kwargs)
+
     # build categories
     events = self[category_ids](events, results=results, **kwargs)
 
@@ -358,7 +362,6 @@ def log_fraction(stats_key: str, msg: str | None = None):
     events = ak.fill_none(events, EMPTY_FLOAT)
 
     logger.info(f"Selected {ak.sum(results.event)} from {len(events)} events")
-
     return events, results
 
 
@@ -371,5 +374,5 @@ def post_selection_init(self: Selector) -> None:
     if not getattr(self, "dataset_inst", None) or self.dataset_inst.is_data:
         return
 
-    self.uses.add(event_weights_to_normalize)
-    self.produces.add(event_weights_to_normalize)
+    self.uses.update({event_weights_to_normalize, hard_gen_particles})
+    self.produces.update({event_weights_to_normalize, hard_gen_particles})
diff --git a/hbw/selection/gen.py b/hbw/selection/gen.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+
+"""
+Selectors related to gen-level particles.
+"""
+
+import law
+
+from columnflow.selection import Selector, SelectionResult, selector
+from columnflow.util import maybe_import
+
+np = maybe_import("numpy")
+ak = maybe_import("awkward")
+
+logger = law.logger.get_logger(__name__)
+
+
+pdgId_map = {
+    1: "down",
+    2: "up",
+    3: "strange",
+    4: "charm",
+    5: "bottom",
+    6: "top",
+    11: "electron",
+    12: "e_neutrino",
+    13: "muon",
+    14: "mu_neutrino",
+    15: "tau",
+    16: "tau_neutrino",
+    21: "gluon",
+    22: "photon",
+    23: "Z",
+    24: "W",
+    25: "Higgs",
+}
+
+
+@selector(
+    uses={"GenPart.statusFlags"},
+    mc_only=True,
+)
+def hard_gen_particles(
+    self: Selector,
+    events: ak.Array,
+    results,
+    **kwargs,
+) -> tuple[ak.Array, SelectionResult]:
+
+    gp_mask = events.GenPart.hasFlags("isHardProcess")
+
+    results = results + SelectionResult(
+        objects={"GenPart": {"HardGenPart": gp_mask}},
+    )
+
+    return events, results