Skip to content

Commit

Permalink
Merge pull request #96 from uhh-cms/ML_combine_proc
Browse files Browse the repository at this point in the history
Ml combine proc
  • Loading branch information
mafrahm authored Dec 11, 2024
2 parents 1ae1663 + 4e3b009 commit 4b437be
Show file tree
Hide file tree
Showing 13 changed files with 479 additions and 21 deletions.
1 change: 1 addition & 0 deletions hbw/categorization/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ def catid_2b(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, a

# TODO: not hard-coded -> use config?
ml_processes = [
"signal_ggf", "signal_ggf2", "signal_vbf", "signal_vbf2",
"hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1",
"hh_ggf_hbb_hvvqqlnu_kl1_kt1", "hh_vbf_hbb_hvvqqlnu_kv1_k2v1_kl1",
"hh_ggf_hbb_hvv2l2nu_kl1_kt1", "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
Expand Down
5 changes: 3 additions & 2 deletions hbw/config/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,16 +323,17 @@ def add_categories_ml(config, ml_model_inst):
# add ml categories directly to the config
# NOTE: this is a bit dangerous, because our ID depends on the MLModel, but
# we can reconfigure our MLModel after having created these categories
# TODO: config is empty and therefore fails
ml_categories = []
for i, proc in enumerate(ml_model_inst.processes):
cat_label = config.get_process(proc).x.ml_label
# cat_label = config.get_process(proc).x.ml_label
ml_categories.append(config.add_category(
# NOTE: name and ID is unique as long as we don't use
# multiple ml_models simultaneously
name=f"ml_{proc}",
id=(i + 1) * 1000,
selection=f"catid_ml_{proc}",
label=f"{cat_label} category",
# label=f"{cat_label} category",
aux={"ml_proc": proc},
))

Expand Down
21 changes: 21 additions & 0 deletions hbw/config/processes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from scinum import Number

from cmsdb.util import add_decay_process
from columnflow.util import DotDict

from hbw.config.styling import color_palette

Expand Down Expand Up @@ -199,3 +200,23 @@ def configure_hbw_processes(config: od.Config):
if config.has_process(bg):
bg = config.get_process(bg)
background.add_process(bg)


from random import randint


def create_combined_proc_forML(config: od.Config, proc_name: str, proc_dict: dict, color=None):
    """
    Create a parent process that combines several sub processes for the ML training.

    :param config: Config from which the sub processes are looked up and which is passed on to
        *add_parent_process*.
    :param proc_name: Name of the combined process to create.
    :param proc_dict: Mapping with the mandatory key "sub_processes" (names of the processes to
        combine) and the optional keys "label", "color" and "weighting".
    :param color: Fallback color, used when *proc_dict* has no "color" entry.
    :return: The object returned by *add_parent_process* (presumably the new process instance),
        with the ML settings attached as ``x.ml_config``.
    """
    # item access works for plain dicts as well as for DotDicts
    sub_processes = proc_dict["sub_processes"]

    # NOTE: sub processes that are missing in the config show up as None entries here;
    # dealing with them is left to add_parent_process
    combining_proc = [config.get_process(proc, default=None) for proc in sub_processes]

    combined_proc = add_parent_process(
        config,
        combining_proc,
        name=proc_name,
        # TODO: the random id could by chance collide with an already used id and differs
        # between runs --> should be checked
        id=randint(10000000, 99999999),
        label=proc_dict.get("label", "combined custom process"),
        color=proc_dict.get("color", color),
    )

    # store the ML-specific settings on the combined process
    combined_proc.x.ml_config = DotDict({
        "weighting": proc_dict.get("weighting", None),
        "sub_processes": sub_processes,
    })
    return combined_proc
84 changes: 84 additions & 0 deletions hbw/inference/dl.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,90 @@

dl = HBWInferenceModelBase.derive("dl", cls_dict=default_cls_dict)

# "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
# "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1",
# "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43",
# "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94",
# "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36",
# "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39",
# "hh_ggf_hbb_hvv2l2nu_kl0_kt1",
# "hh_ggf_hbb_hvv2l2nu_kl1_kt1",
# "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1",
# "hh_ggf_hbb_hvv2l2nu_kl5_kt1",


# signal and background processes that enter both ML study inference models
_ml_study_processes = (
    "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1",
    "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1",
    "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43",
    "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94",
    "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36",
    "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39",
    "hh_ggf_hbb_hvv2l2nu_kl0_kt1",
    "hh_ggf_hbb_hvv2l2nu_kl1_kt1",
    "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1",
    "hh_ggf_hbb_hvv2l2nu_kl5_kt1",
    "tt",
    "dy",
    "w_lnu",
    "vv",
    "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth",
)

# background ML output nodes that define categories in both studies
_ml_study_background_nodes = ("tt", "st", "dy", "h")

# study 1: categories built from the "signal_ggf" / "signal_vbf" ML output nodes
dl.derive("dl_ml_study_1", cls_dict={
    "ml_model_name": "dl_22post_ml_study_1",
    "config_categories": [
        f"sr__{n_b}__ml_{ml_node}"
        for n_b in ("1b", "2b")
        for ml_node in ("signal_ggf", "signal_vbf", *_ml_study_background_nodes)
    ],
    # each model gets its own list object
    "processes": list(_ml_study_processes),
    "systematics": rate_systematics,
})

# study 2: same setup, but using the "signal_ggf2" / "signal_vbf2" ML output nodes
dl.derive("dl_ml_study_2", cls_dict={
    "ml_model_name": "dl_22post_ml_study_2",
    "config_categories": [
        f"sr__{n_b}__ml_{ml_node}"
        for n_b in ("1b", "2b")
        for ml_node in ("signal_ggf2", "signal_vbf2", *_ml_study_background_nodes)
    ],
    "processes": list(_ml_study_processes),
    "systematics": rate_systematics,
})

dl.derive("dl_hww_and_hzz", cls_dict={
"processes": [
"hh_ggf_hbb_hww_kl0_kt1",
Expand Down
1 change: 1 addition & 0 deletions hbw/ml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ def uses(self, config_inst: od.Config) -> set[Route | str]:
columns = {"mli_*"}
# TODO: switch to full event weight
# TODO: this might not work with data, to be checked
columns.add("process_id")
columns.add("normalization_weight")
columns.add("stitched_normalization_weight")
columns.add("event_weight")
Expand Down
138 changes: 129 additions & 9 deletions hbw/ml/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,40 @@
logger = law.logger.get_logger(__name__)


def get_proc_mask(
    events: ak.Array,
    proc: str | od.Process,
    config_inst: od.Config | None = None,
) -> tuple[np.ndarray, list]:
    """
    Creates the mask selecting events belonging to the process *proc* and a list of all ids
    belonging to this process.

    :param events: Event array; its ``process_id`` field is read.
    :param proc: Either string or process instance.
    :param config_inst: An instance of the Config, can be None if a Process instance is given.
    :return: Process mask and the ids of *proc* and its sub processes that are present in
        *events*.
    :raises ValueError: If *config_inst* is None and *proc* is not a process instance.
    """
    # get process instance
    if config_inst is not None:
        proc_inst = config_inst.get_process(proc)
    elif isinstance(proc, od.Process):
        proc_inst = proc
    else:
        raise ValueError(
            f"cannot resolve process '{proc}': either pass a config_inst or a process instance",
        )

    # determine the ids present in the events in a vectorized way instead of iterating
    # over every single event in Python
    proc_id = np.asarray(events.process_id)
    present_ids = set(np.unique(proc_id).tolist())

    # get list of ids that belong to the process (including itself) and are present in the event array
    sub_id = [
        sub_proc.id
        for sub_proc, _, _ in proc_inst.walk_processes(include_self=True)
        if sub_proc.id in present_ids
    ]

    # create process mask
    proc_mask = np.isin(proc_id, sub_id)
    return proc_mask, sub_id


def input_features_sanity_checks(ml_model_inst: MLModel, input_features: list[str]):
"""
Perform sanity checks on the input features.
Expand Down Expand Up @@ -78,8 +112,10 @@ def __init__(self, ml_model_inst: MLModel, process: "str", events: ak.Array, sta
"""
self._ml_model_inst = ml_model_inst
self._process = process

proc_mask, _ = get_proc_mask(events, process, ml_model_inst.config_inst)
self._stats = stats
self._events = events
self._events = events[proc_mask]

def __repr__(self):
return f"{self.__class__.__name__}({self.ml_model_inst.cls_name}, {self.process})"
Expand Down Expand Up @@ -185,21 +221,89 @@ def shuffle_indices(self) -> np.ndarray:
self._shuffle_indices = np.random.permutation(self.n_events)
return self._shuffle_indices

def get_xsec_train_weights(self) -> np.ndarray:
    """
    Weighting such that each event has roughly the same weight,
    sub processes are weighted according to their cross section.

    The event weights are divided by the summed absolute weights and multiplied by the summed
    number of events, both taken from the stats of all process ids present in the events.
    The result is cached on the instance.

    :return: The rescaled event weights.
    :raises Exception: If no stats are available.
    """
    if hasattr(self, "_xsec_train_weights"):
        return self._xsec_train_weights

    if not self.stats:
        raise Exception("cannot determine train weights without stats")

    # ids of this process (and its sub processes) that are present in the events
    _, sub_id = get_proc_mask(self._events, self.process, self.ml_model_inst.config_inst)

    proc_stats = self.stats[self.process]
    sum_abs_weights = np.sum([proc_stats["sum_abs_weights_per_process"][str(pid)] for pid in sub_id])
    num_events = np.sum([proc_stats["num_events_per_process"][str(pid)] for pid in sub_id])

    # store the result so that the cache lookup above actually takes effect
    self._xsec_train_weights = self.weights / sum_abs_weights * num_events
    return self._xsec_train_weights

def get_equal_train_weights(self) -> np.ndarray:
    """
    Weighting such that events of each sub process are weighted equally.

    The sub processes are taken from ``x.ml_config.sub_processes`` of the combined process.
    Each of them is rescaled such that its sum of positive weights (from the stats) matches
    the total number of events divided by the number of sub processes.
    The result is cached on the instance.

    :return: The rescaled event weights.
    :raises Exception: If no stats are available.
    """
    if hasattr(self, "_equal_train_weights"):
        return self._equal_train_weights

    if not self.stats:
        raise Exception("cannot determine train weights without stats")

    config_inst = self.ml_model_inst.config_inst
    combined_proc_inst = config_inst.get_process(self.process)
    sub_processes = combined_proc_inst.x.ml_config.sub_processes

    # stats lookups do not depend on the sub process, so do them once
    proc_stats = self.stats[self.process]
    num_events_per_process = proc_stats["num_events_per_process"]

    _, sub_id_proc = get_proc_mask(self._events, self.process, config_inst)
    num_events = np.sum([num_events_per_process[str(pid)] for pid in sub_id_proc])
    targeted_sum_of_weights_per_process = num_events / len(sub_processes)

    sum_pos_weights_per_process = proc_stats["sum_pos_weights_per_process"]

    # events that belong to none of the sub processes keep a weight of 1
    equal_train_weights = ak.full_like(self.weights, 1.)

    for proc in sub_processes:
        proc_mask, sub_id = get_proc_mask(self._events, proc, config_inst)

        # sum of positive weights over all ids of this sub process that are known to the stats
        sum_pos_weights_per_sub_proc = sum(
            (
                sum_pos_weights_per_process[str(pid)]
                for pid in sub_id
                if str(pid) in num_events_per_process
            ),
            0.,
        )

        if sum_pos_weights_per_sub_proc == 0:
            norm_const_per_proc = 1.
            logger.warning(
                f"No weight sum found in stats for sub process {proc}. "
                "Normalization constant set to 1 but results are probably not correct.",
            )
        else:
            norm_const_per_proc = targeted_sum_of_weights_per_process / sum_pos_weights_per_sub_proc
            logger.info(f"Normalizing constant for {proc} is {norm_const_per_proc}")

        equal_train_weights = np.where(proc_mask, self.weights * norm_const_per_proc, equal_train_weights)

    # store the result so that the cache lookup above actually takes effect
    self._equal_train_weights = equal_train_weights
    return self._equal_train_weights

@property
def train_weights(self) -> np.ndarray:
    """
    Weighting according to the parameters set in the ML model config.

    Processes whose ``x.ml_config.weighting`` is "equal" use *get_equal_train_weights*,
    all other processes use *get_xsec_train_weights*. The result is converted to a float32
    numpy array and cached on the instance.

    :return: The training weights as float32 numpy array.
    :raises Exception: If no stats are available.
    """
    if hasattr(self, "_train_weights"):
        return self._train_weights

    if not self.stats:
        raise Exception("cannot determine train weights without stats")

    proc_inst = self.ml_model_inst.config_inst.get_process(self.process)
    ml_config = proc_inst.x("ml_config", None)
    if ml_config and ml_config.weighting == "equal":
        train_weights = self.get_equal_train_weights()
    else:
        train_weights = self.get_xsec_train_weights()

    # NOTE: the former "weights / sum_abs_weights * num_events" assignment must not follow here,
    # it would overwrite the configured weighting and drop the float32 cast
    self._train_weights = ak.to_numpy(train_weights).astype(np.float32)
    return self._train_weights

@property
Expand All @@ -213,11 +317,26 @@ def equal_weights(self) -> np.ndarray:
if not self.stats:
raise Exception("cannot determine val weights without stats")

# TODO: per process pls [done] and now please tidy up
processes = self.ml_model_inst.processes
sum_abs_weights = self.stats[self.process]["sum_abs_weights"]
num_events_per_process = {proc: self.stats[proc]["num_events"] for proc in processes}

self._validation_weights = self.weights / sum_abs_weights * max(num_events_per_process.values())
num_events_per_process = {}
for proc in processes:
id_list = list(self.stats[proc]["num_events_per_process"].keys())
proc_inst = self.ml_model_inst.config_inst.get_process(proc)
sub_id = [
p_inst.id
for p_inst, _, _ in proc_inst.walk_processes(include_self=True)
if str(p_inst.id) in id_list
]
if proc == self.process:
sum_abs_weights = np.sum([
self.stats[self.process]["sum_abs_weights_per_process"][str(id)] for id in sub_id
])
num_events_per_proc = np.sum([self.stats[proc]["num_events_per_process"][str(id)] for id in sub_id])
num_events_per_process[proc] = num_events_per_proc

validation_weights = self.weights / sum_abs_weights * max(num_events_per_process.values())
self._validation_weights = ak.to_numpy(validation_weights).astype(np.float32)

return self._validation_weights

Expand Down Expand Up @@ -544,6 +663,7 @@ def target(self) -> np.ndarray:
if self._ml_model_inst.negative_weights == "handle":
target[self.m_negative_weights] = 1 - target[self.m_negative_weights]

# NOTE: I think here the targets are somehow 64floats... Maybe check that
self._target = target
return self._target

Expand Down
Loading

0 comments on commit 4b437be

Please sign in to comment.