
Commit

add preparation_producer
mafrahm committed Nov 16, 2023
1 parent 5fd9a84 commit 37ecad0
Showing 6 changed files with 85 additions and 4 deletions.
4 changes: 4 additions & 0 deletions hbw/ml/base.py
@@ -89,6 +89,10 @@ def setup(self):
                 x_title=f"DNN output score {self.config_inst.get_process(proc).x.ml_label}",
             )
 
+    def preparation_producer(self: MLModel, config_inst: od.Config):
+        """ producer that is run as part of PrepareMLEvents and MLEvaluation (before `evaluate`) """
+        return "ml_preparation"
+
     def requires(self, task: law.Task) -> dict:
         # Custom requirements (none currently)
         return {}
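For context, the returned string refers to the ml_preparation producer added in hbw/ml/stats.py below, which PrepareMLEvents and MLEvaluation run before evaluate. A minimal sketch (not part of this commit; class and producer names are hypothetical, and the other methods an MLModel must implement are omitted) of a derived model overriding this hook to use a different preparation producer:

    import order as od

    from columnflow.ml import MLModel


    class MyModel(MLModel):
        # other required MLModel methods omitted in this sketch

        def preparation_producer(self: MLModel, config_inst: od.Config):
            # hypothetical producer name; it must be registered and discoverable,
            # analogous to "ml_preparation" in hbw/ml/stats.py
            return "my_ml_preparation"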
73 changes: 73 additions & 0 deletions hbw/ml/stats.py
@@ -0,0 +1,73 @@
# coding: utf-8

"""Production methods regarding ML stats."""

from __future__ import annotations

import functools

from columnflow.production import Producer, producer
from columnflow.util import maybe_import
from columnflow.ml import MLModel
from columnflow.columnar_util import set_ak_column
from columnflow.selection.stats import increment_stats


ak = maybe_import("awkward")
np = maybe_import("numpy")

# helper
set_ak_column_f32 = functools.partial(set_ak_column, value_type=np.float32)


@producer(
    uses={"normalization_weight"},
)
def ml_preparation(
    self: Producer,
    events: ak.Array,
    stats: dict = {},
    fold_indices: ak.Array | None = None,
    ml_model_inst: MLModel | None = None,
    **kwargs,
) -> ak.Array:
    """
    Producer that is run as part of PrepareMLEvents to collect relevant stats.
    """
    weight = events["normalization_weight"]
    stats["num_events"] += len(events)
    stats["sum_weights"] += ak.sum(weight, axis=0)

    weight_map = {
        "num_events": Ellipsis,  # all events
    }

    if self.dataset_inst.is_mc:
        weight_map["sum_weights"] = weight
        weight_map["sum_abs_weights"] = np.abs(weight)
        weight_map["sum_pos_weights"] = (weight, weight > 0)

    group_map = {
        "process": {
            "values": events.process_id,
            "mask_fn": (lambda v: events.process_id == v),
        },
        "fold": {
            "values": fold_indices,
            "mask_fn": (lambda v: fold_indices == v),
        },
    }

    group_combinations = [("process", "fold")]

    self[increment_stats](
        events,
        None,
        stats,
        weight_map=weight_map,
        group_map=group_map,
        group_combinations=group_combinations,
        **kwargs,
    )

    return events
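For orientation, and assuming increment_stats interprets a 2-tuple entry as (weights, mask) — which is how the (weight, weight > 0) entry above is used — the booked per-chunk sums correspond roughly to the sketch below (not repository code; it reuses the ak and np imports from the file above). The same quantities are additionally grouped per process id, per fold, and per (process, fold) combination via group_map and group_combinations.

    # sketch: what the weight_map entries amount to for one chunk of events
    def summarize(events, weight):
        return {
            "num_events": len(events),                      # Ellipsis -> plain event count
            "sum_weights": ak.sum(weight),                  # signed sum of normalization weights
            "sum_abs_weights": ak.sum(np.abs(weight)),      # sum of absolute weights
            "sum_pos_weights": ak.sum(weight[weight > 0]),  # (weights, mask): mask applied before summing
        }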
6 changes: 4 additions & 2 deletions hbw/production/ml_inputs.py
@@ -62,11 +62,13 @@ def ml_inputs(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
 
     # low-level features
     # TODO: this could be more generalized
-    for var in ["pt", "eta"]:
+    for var in ["pt", "eta", "btagDeepFlavB"]:
         events = set_ak_column_f32(events, f"mli_b1_{var}", events.Bjet[:, 0][var])
         events = set_ak_column_f32(events, f"mli_b2_{var}", events.Bjet[:, 1][var])
         events = set_ak_column_f32(events, f"mli_j1_{var}", events.Lightjet[:, 0][var])
         events = set_ak_column_f32(events, f"mli_j2_{var}", events.Lightjet[:, 1][var])
+        if var == "btagDeepFlavB":
+            continue
         events = set_ak_column_f32(events, f"mli_lep_{var}", events.Lepton[:, 0][var])
         events = set_ak_column_f32(events, f"mli_met_{var}", events.MET[var])

@@ -81,7 +83,7 @@ def ml_inputs(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
     # all possible jet pairs
     jet_pairs = ak.combinations(events.Jet, 2)
     dr = jet_pairs[:, :, "0"].delta_r(jet_pairs[:, :, "1"])
-    events = set_ak_column_f32(events, "mindr_jj", ak.min(dr, axis=1))
+    events = set_ak_column_f32(events, "mli_mindr_jj", ak.min(dr, axis=1))
 
     # vbf jet pair features
     events = set_ak_column_f32(events, "mli_vbf_deta", abs(events.VBFJet[:, 0].eta - events.VBFJet[:, 1].eta))
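To spell out what the extended loop above writes, a small hypothetical snippet (not in the repository) that enumerates the resulting low-level columns; the b-tag score only exists for jets, so the early continue skips the lepton and MET variants:

    jet_objs = ["b1", "b2", "j1", "j2"]
    all_objs = jet_objs + ["lep", "met"]
    expected_columns = [
        f"mli_{obj}_{var}"
        for var in ["pt", "eta", "btagDeepFlavB"]
        for obj in (all_objs if var != "btagDeepFlavB" else jet_objs)
    ]
    # 6 (pt) + 6 (eta) + 4 (btagDeepFlavB) = 16 columns in total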
2 changes: 1 addition & 1 deletion law.cfg
@@ -26,7 +26,7 @@ default_analysis: hbw.analysis.hbw_merged.hbw_merged
 default_config: c17
 default_dataset: ggHH_kl_1_kt_1_sl_hbbhww_powheg
 
-production_modules: columnflow.production.{categories,processes,pileup,normalization,seeds}, hbw.production.{weights,features,ml_inputs,categories,gen_hbw_decay}
+production_modules: columnflow.production.{categories,processes,pileup,normalization,seeds}, hbw.production.{weights,features,ml_inputs,categories,gen_hbw_decay}, hbw.ml.stats
 calibration_modules: columnflow.calibration.jets, hbw.calibration.default
 selection_modules: hbw.selection.{common,sl,dl}
 categorization_modules: hbw.selection.categories
3 changes: 2 additions & 1 deletion law.nocert.cfg
@@ -31,7 +31,7 @@ cf.BundleBashSandbox: local
 cf.BundleCMSSWSandbox: local
 cf.BundleExternalFiles: local
 # GetDatasetLFNs requires a Grid certificate -> use a common space to store the output
-cf.GetDatasetLFNs: local, /nfs/dust/cms/user/frahmmat/data
+cf.GetDatasetLFNs: local, /nfs/dust/cms/user/frahmmat/public/hh2bbww/data/common_store
 cf.CalibrateEvents: local
 cf.SelectEvents: local
 cf.CreateCutflowHistograms: local
@@ -42,6 +42,7 @@ cf.MergeReducedEvents: local
 cf.ProduceColumns: local
 cf.PrepareMLEvents: local
 cf.MergeMLEvents: local
+cf.MergeMLStats: local
 cf.MLTraining: local
 cf.MLEvaluation: local
 cf.CreateHistograms: local
1 change: 1 addition & 0 deletions law.shared.cfg
@@ -51,6 +51,7 @@ cf.MergeReducedEvents: local, %(shared_hbw_location)s
 cf.ProduceColumns: local, %(shared_hbw_location)s
 cf.PrepareMLEvents: local, %(shared_hbw_location)s
 cf.MergeMLEvents: local, %(shared_hbw_location)s
+cf.MergeMLStats: local, %(shared_hbw_location)s
 cf.MLTraining: local, %(shared_hbw_location)s
 cf.MLEvaluation: local, %(shared_hbw_location)s

