Skip to content

Commit

Permalink
Merge pull request #65 from uhh-cms/feature/gen_lepton_tag
Browse files Browse the repository at this point in the history
Categorization based on number of gen particles
  • Loading branch information
apaasch authored Jan 10, 2024
2 parents 7345eaa + b02307f commit 5c03497
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 3 deletions.
57 changes: 57 additions & 0 deletions hbw/config/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,21 @@

"""
Definition of categories.
Categorizer modules (used to determine category masks) are defined in hbw.selection.categories
Ids for combinations of categories are built as the sum of category ids.
To avoid reusing category ids, each category block (e.g. leptons, jets, ...) uses ids of a different
power of 10.
power of 10 | category block
0: free (only used for inclusive category)
1: jet (resolved vs boosted)
2: bjet (1 vs geq 2)
3: lepton
4: dnn
5: gen-level leptons (not combined with other categories)
"""

from collections import OrderedDict
Expand All @@ -17,12 +32,54 @@
logger = law.logger.get_logger(__name__)


@call_once_on_config()
def add_gen_categories(config: od.Config) -> None:
gen_0lep = config.add_category( # noqa
name="gen_0lep",
id=100000,
selection="catid_gen_0lep", # this should not be called!
label="No gen lepton",
)
gen_1lep = config.add_category(
name="gen_1lep",
id=200000,
label="1 gen lepton",
)
gen_1lep.add_category(
name="gen_1e",
id=300000,
selection="catid_gen_1e",
label="1 gen electron",
)
gen_1lep.add_category(
name="gen_1mu",
id=400000,
selection="catid_gen_1mu",
label="1 gen muon",
)
gen_1lep.add_category(
name="gen_1tau",
id=500000,
selection="catid_gen_1tau",
label="1 gen tau",
)
gen_2lep = config.add_category( # noqa
name="gen_geq2lep",
id=600000,
selection="catid_geq_2_gen_leptons",
label=r"$\geq 2$ gen leptons",
)


@call_once_on_config()
def add_categories_selection(config: od.Config) -> None:
"""
Adds categories to a *config*, that are typically produced in `SelectEvents`.
"""

# adds categories based on the existence of gen particles
add_gen_categories(config)

config.x.lepton_channels = {
"sl": ("1e", "1mu"),
"dl": ("2e", "2mu", "emu"),
Expand Down
2 changes: 2 additions & 0 deletions hbw/config/config_run2.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,8 @@ def make_jme_filename(jme_aux, sample_type, name, era=None):
"run", "luminosityBlock", "event",
# columns added during selection, required in general
"mc_weight", "PV.npvs", "process_id", "category_ids", "deterministic_seed",
# Gen information (for categorization)
"HardGenPart.pdgId",
# weight-related columns
"pu_weight*", "pdf_weight*",
"murf_envelope_weight*", "mur_weight*", "muf_weight*",
Expand Down
73 changes: 73 additions & 0 deletions hbw/selection/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,14 @@
Selection methods defining categories based on selection step results.
"""

from __future__ import annotations

import law

from columnflow.util import maybe_import
from columnflow.categorization import Categorizer, categorizer
from columnflow.selection import SelectionResult
from columnflow.columnar_util import has_ak_column, optional_column

np = maybe_import("numpy")
ak = maybe_import("awkward")
Expand All @@ -17,6 +22,74 @@ def catid_selection_incl(self: Categorizer, events: ak.Array, **kwargs) -> tuple
mask = ak.ones_like(events.event) > 0
return events, mask

#
# Categorizers based on gen info
#


@categorizer(
uses=optional_column("HardGenPart.pdgId", "GenPart.pdgId", "GenPart.statusFlags"),
gp_dict={}, # dict with (tuple of) pdgId + number of required prompt particles with this pdgId
ignore_charge=True,
call_force=True,
_operator="eq",
)
def catid_n_gen_particles(
self: Categorizer, events: ak.Array, results: SelectionResult | None = None, **kwargs,
) -> tuple[ak.Array, ak.Array]:
""" Categorizer to select events with a certain number of prompt gen particles """

# possible options to compare number of particles with required number of particles: ==, >=, >, <=, <
assert self._operator in ("eq", "ge", "gt", "le", "lt")

# start with true mask
mask = np.ones(len(events), dtype=bool)
if self.dataset_inst.is_data:
# for data, always return true mask
return events, mask

if has_ak_column(events, "HardGenPart.pdgId"):
gp_id = events.HardGenPart.pdgId
else:
try:
# try to get gp_id column via SelectionResult
gp_id = events.GenPart.pdgId[results.objects.GenPart.HardGenPart]
except AttributeError:
# try to select hard gen particles via status flags
gp_id = events.GenPart.pdgId[events.GenPart.hasFlags("isHardProcess")]

if self.ignore_charge:
gp_id = abs(gp_id)

for pdgIds, required_n_particles in self.gp_dict.items():
# make sure that 'pdgIds' is a tuple
pdgIds = law.util.make_tuple(pdgIds)

# get number of gen particles with requested pdgIds for each event
n_particles = sum([ak.sum(gp_id == pdgId, axis=1) for pdgId in pdgIds])

# compare number of gen particles with required number of particles with requested operator
this_mask = getattr(n_particles, f"__{self._operator}__")(required_n_particles)
mask = mask & this_mask

return events, mask


catid_gen_0lep = catid_n_gen_particles.derive("catid_gen_0lep", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 0}})
catid_gen_1e = catid_n_gen_particles.derive("catid_gen_1e", cls_dict={"gp_dict": {11: 1, 13: 0, 15: 0}})
catid_gen_1mu = catid_n_gen_particles.derive("catid_gen_1mu", cls_dict={"gp_dict": {11: 0, 13: 1, 15: 0}})
catid_gen_1tau = catid_n_gen_particles.derive("catid_gen_1tau", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 1}})
# catid_gen_2e = catid_n_gen_particles.derive("catid_gen_2e", cls_dict={"gp_dict": {11: 2, 13: 0, 15: 0}})
# catid_gen_2mu = catid_n_gen_particles.derive("catid_gen_2mu", cls_dict={"gp_dict": {11: 0, 13: 2, 15: 0}})
# catid_gen_2tau = catid_n_gen_particles.derive("catid_gen_2tau", cls_dict={"gp_dict": {11: 0, 13: 0, 15: 2}})
# catid_gen_emu = catid_n_gen_particles.derive("catid_gen_emu", cls_dict={"gp_dict": {11: 1, 13: 1, 15: 0}})
# catid_gen_etau = catid_n_gen_particles.derive("catid_gen_etau", cls_dict={"gp_dict": {11: 1, 13: 0, 15: 1}})
# catid_gen_mutau = catid_n_gen_particles.derive("catid_gen_mutau", cls_dict={"gp_dict": {11: 0, 13: 1, 15: 1}})
catid_geq_2_gen_leptons = catid_n_gen_particles.derive(
"catid_geq_2_gen_leptons", cls_dict={"gp_dict": {(11, 13, 15): 2}, "_operator": "ge"},
)


#
# Categorizer called as part of cf.SelectEvents
#
Expand Down
9 changes: 6 additions & 3 deletions hbw/selection/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from columnflow.production.categories import category_ids
from columnflow.production.processes import process_ids

from hbw.selection.gen import hard_gen_particles
from hbw.production.weights import event_weights_to_normalize, large_weights_killer
from hbw.selection.stats import hbw_increment_stats
from hbw.selection.cutflow_features import cutflow_features
Expand Down Expand Up @@ -328,6 +329,9 @@ def post_selection(
) -> Tuple[ak.Array, SelectionResult]:
""" Methods that are called for both SL and DL after calling the selection modules """

if self.dataset_inst.is_mc:
events, results = self[hard_gen_particles](events, results, **kwargs)

# build categories
events = self[category_ids](events, results=results, **kwargs)

Expand Down Expand Up @@ -358,7 +362,6 @@ def log_fraction(stats_key: str, msg: str | None = None):
events = ak.fill_none(events, EMPTY_FLOAT)

logger.info(f"Selected {ak.sum(results.event)} from {len(events)} events")

return events, results


Expand All @@ -371,5 +374,5 @@ def post_selection_init(self: Selector) -> None:
if not getattr(self, "dataset_inst", None) or self.dataset_inst.is_data:
return

self.uses.add(event_weights_to_normalize)
self.produces.add(event_weights_to_normalize)
self.uses.update({event_weights_to_normalize, hard_gen_particles})
self.produces.update({event_weights_to_normalize, hard_gen_particles})
56 changes: 56 additions & 0 deletions hbw/selection/gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# coding: utf-8

"""
Selectors related to gen-level particles.
"""

import law

from columnflow.selection import Selector, SelectionResult, selector
from columnflow.util import maybe_import

np = maybe_import("numpy")
ak = maybe_import("awkward")

logger = law.logger.get_logger(__name__)


pdgId_map = {
1: "down",
2: "up",
3: "strange",
4: "charm",
5: "bottom",
6: "top",
11: "electron",
12: "e_neutrino",
13: "muon",
14: "mu_neutrino",
15: "tau",
16: "tau_neutrino",
21: "gluon",
22: "photon",
23: "Z",
24: "W",
25: "Higgs",
}


@selector(
uses={"GenPart.statusFlags"},
mc_only=True,
)
def hard_gen_particles(
self: Selector,
events: ak.Array,
results,
**kwargs,
) -> tuple[ak.Array, SelectionResult]:

gp_mask = events.GenPart.hasFlags("isHardProcess")

results = results + SelectionResult(
objects={"GenPart": {"HardGenPart": gp_mask}},
)

return events, results

0 comments on commit 5c03497

Please sign in to comment.