From da23e9faf7d39234cabb8398b1a25268ac2b46e0 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Mon, 8 Jan 2024 14:09:29 +0100 Subject: [PATCH 1/2] implement categorization based on gen particles --- hbw/config/categories.py | 57 +++++++++++++++++++++++++++ hbw/config/config_run2.py | 2 + hbw/selection/categories.py | 78 +++++++++++++++++++++++++++++++++++++ hbw/selection/common.py | 9 +++-- hbw/selection/gen.py | 56 ++++++++++++++++++++++++++ 5 files changed, 199 insertions(+), 3 deletions(-) create mode 100644 hbw/selection/gen.py diff --git a/hbw/config/categories.py b/hbw/config/categories.py index 4edaaa63..8dbac682 100644 --- a/hbw/config/categories.py +++ b/hbw/config/categories.py @@ -2,6 +2,21 @@ """ Definition of categories. + +Categorizer modules (used to determine category masks) are defined in hbw.selection.categories + +Ids for combinations of categories are built as the sum of category ids. +To avoid reusing category ids, each category block (e.g. leptons, jets, ...) uses ids of a different +power of 10. + +power of 10 | category block + +0: free (only used for inclusive category) +1: jet (resolved vs boosted) +2: bjet (1 vs geq 2) +3: lepton +4: dnn +5: gen-level leptons (not combined with other categories) """ from collections import OrderedDict @@ -17,12 +32,54 @@ logger = law.logger.get_logger(__name__) +@call_once_on_config() +def add_gen_categories(config: od.Config) -> None: + gen_0lep = config.add_category( # noqa + name="gen_0lep", + id=100000, + selection="catid_gen_0lep", # this should not be called! + label="No gen lepton", + ) + gen_1lep = config.add_category( + name="gen_1lep", + id=200000, + label="1 gen lepton", + ) + gen_1lep.add_category( + name="gen_1e", + id=300000, + selection="catid_gen_1e", + label="1 gen electron", + ) + gen_1lep.add_category( + name="gen_1mu", + id=400000, + selection="catid_gen_1mu", + label="1 gen muon", + ) + gen_1lep.add_category( + name="gen_1tau", + id=500000, + selection="catid_gen_1tau", + label="1 gen tau", + ) + gen_2lep = config.add_category( # noqa + name="gen_geq2lep", + id=600000, + selection="catid_geq_2_gen_leptons", + label=r"$\geq 2$ gen leptons", + ) + + @call_once_on_config() def add_categories_selection(config: od.Config) -> None: """ Adds categories to a *config*, that are typically produced in `SelectEvents`. """ + # adds categories based on the existence of gen particles + add_gen_categories(config) + config.x.lepton_channels = { "sl": ("1e", "1mu"), "dl": ("2e", "2mu", "emu"), diff --git a/hbw/config/config_run2.py b/hbw/config/config_run2.py index 0b7b048f..ddcbf2bd 100644 --- a/hbw/config/config_run2.py +++ b/hbw/config/config_run2.py @@ -432,6 +432,8 @@ def make_jme_filename(jme_aux, sample_type, name, era=None): "run", "luminosityBlock", "event", # columns added during selection, required in general "mc_weight", "PV.npvs", "process_id", "category_ids", "deterministic_seed", + # Gen information (for categorization) + "HardGenPart.pdgId", # weight-related columns "pu_weight*", "pdf_weight*", "murf_envelope_weight*", "mur_weight*", "muf_weight*", diff --git a/hbw/selection/categories.py b/hbw/selection/categories.py index 7aa2da2f..a8cd44cf 100644 --- a/hbw/selection/categories.py +++ b/hbw/selection/categories.py @@ -4,9 +4,12 @@ Selection methods defining categories based on selection step results. """ +from __future__ import annotations + from columnflow.util import maybe_import from columnflow.categorization import Categorizer, categorizer from columnflow.selection import SelectionResult +from columnflow.columnar_util import has_ak_column, optional_column np = maybe_import("numpy") ak = maybe_import("awkward") @@ -17,6 +20,81 @@ def catid_selection_incl(self: Categorizer, events: ak.Array, **kwargs) -> tuple mask = ak.ones_like(events.event) > 0 return events, mask +# +# Categorizers based on gen info +# + + +@categorizer( + uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId"), + gp_dict={}, # dict with pdgId + number of required prompt particles with this pdgId + ignore_charge=True, + call_force=True, +) +def catid_n_gen_particles( + self: Categorizer, events: ak.Array, results: SelectionResult | None = None, **kwargs, +) -> tuple[ak.Array, ak.Array]: + """ Categorizer to select events with a certain number of prompt gen particles """ + # start with true mask + mask = np.ones(len(events), dtype=bool) + if self.dataset_inst.is_data: + # for data, always return true mask + return events, mask + + if has_ak_column(events, "HardGenPart.pdgId"): + gp_id = events.HardGenPart.pdgId + else: + # try to get gp_id column via SelectionResult + gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart] + + if self.ignore_charge: + gp_id = abs(gp_id) + + for pdgId, num_particles in self.gp_dict.items(): + mask = mask & (ak.sum(gp_id == pdgId, axis=1) == num_particles) + + return events, mask + + +catid_gen_0lep = catid_n_gen_particles.derive("catid_gen_0lep", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 0}}) +catid_gen_1e = catid_n_gen_particles.derive("catid_gen_1e", cls_dict={"gp_dict": {11: 1, 13: 0, 15: 0}}) +catid_gen_1mu = catid_n_gen_particles.derive("catid_gen_1mu", cls_dict={"gp_dict": {11: 0, 13: 1, 15: 0}}) +catid_gen_1tau = catid_n_gen_particles.derive("catid_gen_1tau", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 1}}) +# catid_gen_2e = catid_n_gen_particles.derive("catid_gen_2e", cls_dict={"gp_dict": {11: 2, 13: 0, 15: 0}}) +# catid_gen_2mu = catid_n_gen_particles.derive("catid_gen_2mu", cls_dict={"gp_dict": {11: 0, 13: 2, 15: 0}}) +# catid_gen_2tau = catid_n_gen_particles.derive("catid_gen_2tau", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 2}}) +# catid_gen_emu = catid_n_gen_particles.derive("catid_gen_emu", cls_dict={"gp_dict": {11: 1, 13: 1, 15: 0}}) +# catid_gen_etau = catid_n_gen_particles.derive("catid_gen_etau", cls_dict={"gp_dict": {11: 1, 13: 0, 15: 1}}) +# catid_gen_mutau = catid_n_gen_particles.derive("catid_gen_mutau", cls_dict={"gp_dict": {11: 0, 13: 1, 15: 1}}) + + +@categorizer( + uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId"), + call_force=True, +) +def catid_geq_2_gen_leptons( + self: Categorizer, events: ak.Array, results: SelectionResult | None = None, **kwargs, +) -> tuple[ak.Array, ak.Array]: + """ Categorizer to select events with a certain number of hard gen particles """ + + if self.dataset_inst.is_data: + # for data, always return true mask + return events, np.ones(len(events), dtype=bool) + + if has_ak_column(events, "HardGenPart.pdgId"): + gp_id = events.HardGenPart.pdgId + else: + # try to get gp_id column via SelectionResult + gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart] + + abs_id = abs(gp_id) + mask = ( + ak.sum(abs_id == 11, axis=1) + ak.sum(abs_id == 13, axis=1) + ak.sum(abs_id == 15, axis=1) + ) >= 2 + + return events, mask + + # # Categorizer called as part of cf.SelectEvents # diff --git a/hbw/selection/common.py b/hbw/selection/common.py index 24d3bf1e..d13594a9 100644 --- a/hbw/selection/common.py +++ b/hbw/selection/common.py @@ -22,6 +22,7 @@ from columnflow.production.categories import category_ids from columnflow.production.processes import process_ids +from hbw.selection.gen import hard_gen_particles from hbw.production.weights import event_weights_to_normalize, large_weights_killer from hbw.selection.stats import hbw_increment_stats from hbw.selection.cutflow_features import cutflow_features @@ -328,6 +329,9 @@ def post_selection( ) -> Tuple[ak.Array, SelectionResult]: """ Methods that are called for both SL and DL after calling the selection modules """ + if self.dataset_inst.is_mc: + events, results = self[hard_gen_particles](events, results, **kwargs) + # build categories events = self[category_ids](events, results=results, **kwargs) @@ -358,7 +362,6 @@ def log_fraction(stats_key: str, msg: str | None = None): events = ak.fill_none(events, EMPTY_FLOAT) logger.info(f"Selected {ak.sum(results.event)} from {len(events)} events") - return events, results @@ -371,5 +374,5 @@ def post_selection_init(self: Selector) -> None: if not getattr(self, "dataset_inst", None) or self.dataset_inst.is_data: return - self.uses.add(event_weights_to_normalize) - self.produces.add(event_weights_to_normalize) + self.uses.update({event_weights_to_normalize, hard_gen_particles}) + self.produces.update({event_weights_to_normalize, hard_gen_particles}) diff --git a/hbw/selection/gen.py b/hbw/selection/gen.py new file mode 100644 index 00000000..47223294 --- /dev/null +++ b/hbw/selection/gen.py @@ -0,0 +1,56 @@ +# coding: utf-8 + +""" +Selectors related to gen-level particles. +""" + +import law + +from columnflow.selection import Selector, SelectionResult, selector +from columnflow.util import maybe_import + +np = maybe_import("numpy") +ak = maybe_import("awkward") + +logger = law.logger.get_logger(__name__) + + +pdgId_map = { + 1: "down", + 2: "up", + 3: "strange", + 4: "charm", + 5: "bottom", + 6: "top", + 11: "electron", + 12: "e_neutrino", + 13: "muon", + 14: "mu_neutrino", + 15: "tau", + 16: "tau_neutrino", + 21: "gluon", + 22: "photon", + 23: "Z", + 24: "W", + 25: "Higgs", +} + + +@selector( + uses={"GenPart.statusFlags"}, + mc_only=True, +) +def hard_gen_particles( + self: Selector, + events: ak.Array, + results, + **kwargs, +) -> tuple[ak.Array, SelectionResult]: + + gp_mask = events.GenPart.hasFlags("isHardProcess") + + results = results + SelectionResult( + objects={"GenPart": {"HardGenPart": gp_mask}}, + ) + + return events, results From b02307fe20e9cbeb4a8fd9eae73097e144cc8726 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Tue, 9 Jan 2024 09:36:29 +0100 Subject: [PATCH 2/2] further generalize gen particles categorizer --- hbw/selection/categories.py | 59 +++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/hbw/selection/categories.py b/hbw/selection/categories.py index a8cd44cf..0049fb46 100644 --- a/hbw/selection/categories.py +++ b/hbw/selection/categories.py @@ -6,6 +6,8 @@ from __future__ import annotations +import law + from columnflow.util import maybe_import from columnflow.categorization import Categorizer, categorizer from columnflow.selection import SelectionResult @@ -26,15 +28,20 @@ def catid_selection_incl(self: Categorizer, events: ak.Array, **kwargs) -> tuple @categorizer( - uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId"), - gp_dict={}, # dict with pdgId + number of required prompt particles with this pdgId + uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId", "GenPart.statusFlags"), + gp_dict={}, # dict with (tuple of) pdgId + number of required prompt particles with this pdgId ignore_charge=True, call_force=True, + _operator="eq", ) def catid_n_gen_particles( self: Categorizer, events: ak.Array, results: SelectionResult | None = None, **kwargs, ) -> tuple[ak.Array, ak.Array]: """ Categorizer to select events with a certain number of prompt gen particles """ + + # possible options to compare number of particles with required number of particles: ==, >=, >, <=, < + assert self._operator in ("eq", "ge", "gt", "le", "lt") + # start with true mask mask = np.ones(len(events), dtype=bool) if self.dataset_inst.is_data: @@ -44,14 +51,26 @@ def catid_n_gen_particles( if has_ak_column(events, "HardGenPart.pdgId"): gp_id = events.HardGenPart.pdgId else: - # try to get gp_id column via SelectionResult - gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart] + try: + # try to get gp_id column via SelectionResult + gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart] + except AttributeError: + # try to select hard gen particles via status flags + gp_id = events.GenPart.pdgId[events.GenPart.hasFlags("isHardProcess")] if self.ignore_charge: gp_id = abs(gp_id) - for pdgId, num_particles in self.gp_dict.items(): - mask = mask & (ak.sum(gp_id == pdgId, axis=1) == num_particles) + for pdgIds, required_n_particles in self.gp_dict.items(): + # make sure that 'pdgIds' is a tuple + pdgIds = law.util.make_tuple(pdgIds) + + # get number of gen particles with requested pdgIds for each event + n_particles = sum([ak.sum(gp_id == pdgId, axis=1) for pdgId in pdgIds]) + + # compare number of gen particles with required number of particles with requested operator + this_mask = getattr(n_particles, f"__{self._operator}__")(required_n_particles) + mask = mask & this_mask return events, mask @@ -66,33 +85,9 @@ def catid_n_gen_particles( # catid_gen_emu = catid_n_gen_particles.derive("catid_gen_emu", cls_dict={"gp_dict": {11: 1, 13: 1, 15: 0}}) # catid_gen_etau = catid_n_gen_particles.derive("catid_gen_etau", cls_dict={"gp_dict": {11: 1, 13: 0, 15: 1}}) # catid_gen_mutau = catid_n_gen_particles.derive("catid_gen_mutau", cls_dict={"gp_dict": {11: 0, 13: 1, 15: 1}}) - - -@categorizer( - uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId"), - call_force=True, +catid_geq_2_gen_leptons = catid_n_gen_particles.derive( + "catid_geq_2_gen_leptons", cls_dict={"gp_dict": {(11, 13, 15): 2}, "_operator": "ge"}, ) -def catid_geq_2_gen_leptons( - self: Categorizer, events: ak.Array, results: SelectionResult | None = None, **kwargs, -) -> tuple[ak.Array, ak.Array]: - """ Categorizer to select events with a certain number of hard gen particles """ - - if self.dataset_inst.is_data: - # for data, always return true mask - return events, np.ones(len(events), dtype=bool) - - if has_ak_column(events, "HardGenPart.pdgId"): - gp_id = events.HardGenPart.pdgId - else: - # try to get gp_id column via SelectionResult - gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart] - - abs_id = abs(gp_id) - mask = ( - ak.sum(abs_id == 11, axis=1) + ak.sum(abs_id == 13, axis=1) + ak.sum(abs_id == 15, axis=1) - ) >= 2 - - return events, mask #