From 860cdd48d03945a3d374cc07b2c044ce899dbf34 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Wed, 4 Dec 2024 09:04:06 +0100 Subject: [PATCH 01/28] cleanup in scripts --- hbw/scripts/hbwtasks.sh | 6 +++--- hbw/scripts/test_config.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/hbw/scripts/hbwtasks.sh b/hbw/scripts/hbwtasks.sh index 7ebc65ff..2486a8a1 100644 --- a/hbw/scripts/hbwtasks.sh +++ b/hbw/scripts/hbwtasks.sh @@ -40,7 +40,7 @@ hbw_calibration(){ --cf.CalibrateEvents-no-poll \ --cf.CalibrateEvents-parallel-jobs 4000 \ --cf.CalibrateEvents-retries 1 \ - --cf.CalibrateEvents-tasks-per-job 2 \ + --cf.CalibrateEvents-tasks-per-job 1 \ --cf.CalibrateEvents-job-workers 1 \ --cf.BundleRepo-custom-checksum $(checksum) \ $@ @@ -55,7 +55,7 @@ hbw_reduction(){ --cf.ReduceEvents-no-poll \ --cf.ReduceEvents-parallel-jobs 4000 \ --cf.ReduceEvents-retries 1 \ - --cf.ReduceEvents-tasks-per-job 2 \ + --cf.ReduceEvents-tasks-per-job 1 \ --cf.ReduceEvents-job-workers 1 \ --cf.BundleRepo-custom-checksum $(checksum) \ $@ @@ -71,7 +71,7 @@ hbw_merge_reduction(){ --cf.ReduceEvents-pilot \ --cf.ReduceEvents-parallel-jobs 4000 \ --cf.ReduceEvents-retries 1 \ - --cf.ReduceEvents-tasks-per-job 2 \ + --cf.ReduceEvents-tasks-per-job 1 \ --cf.ReduceEvents-job-workers 1 \ --cf.BundleRepo-custom-checksum $(checksum) \ $@ diff --git a/hbw/scripts/test_config.py b/hbw/scripts/test_config.py index b81fc515..0a40ff1e 100644 --- a/hbw/scripts/test_config.py +++ b/hbw/scripts/test_config.py @@ -9,7 +9,7 @@ default_analysis = law.config.get_expanded("analysis", "default_analysis") default_config = law.config.get_expanded("analysis", "default_config") - +default_config = "c22uhhpost" analysis_inst = ana = AnalysisTask.get_analysis_inst(default_analysis) config_inst = cfg = ana.get_config(default_config) @@ -110,11 +110,11 @@ print("Direction:", shift_inst.direction) print("Aliases:", shift_inst.x.column_aliases) -# get some exemplary aux (all 3 methods get you the same result) -default_selector = cfg.get_aux("default_selector") -default_selector = cfg.aux["default_selector"] -default_selector = cfg.x.default_selector -print("================= default selector:", default_selector, "=======") +# # get some exemplary aux (all 3 methods get you the same result) +# default_selector = cfg.get_aux("default_selector") +# default_selector = cfg.aux["default_selector"] +# default_selector = cfg.x.default_selector +# print("================= default selector:", default_selector, "=======") # set some exemplary aux youself cfg.set_aux("example", "test") From 1d057d5c2433d9a6a105163a0a13b99d5e6b10be Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Wed, 4 Dec 2024 09:04:56 +0100 Subject: [PATCH 02/28] use correct btag reweighting and add vjets weight_producer --- hbw/weight/default.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/hbw/weight/default.py b/hbw/weight/default.py index 501ef584..2b472e03 100644 --- a/hbw/weight/default.py +++ b/hbw/weight/default.py @@ -96,6 +96,10 @@ def base_init(self: WeightProducer) -> None: # remove dependency towards top pt weights self.weight_columns.pop("top_pt_weight", None) + if not self.dataset_inst.has_tag("is_v_jets"): + # remove dependency towards vjets weights + self.weight_columns.pop("vjets_weight", None) + self.shifts = set() # when jec sources are known btag SF source, then propagate the shift to the WeightProducer @@ -152,7 +156,14 @@ def base_init(self: WeightProducer) -> None: **default_correction_weights, } 
default_weight_producer = base.derive("default", cls_dict={"weight_columns": default_weight_columns}) -base.derive("unstitched", cls_dict={"weight_columns": {**default_correction_weights, "normalization_weight": []}}) +with_vjets_weight = default_weight_producer.derive("with_vjets_weight", cls_dict={"weight_columns": { + **default_correction_weights, + "vjets_weight": [], # TODO: corrections/shift missing + "stitched_normalization_weight": [], +}}) +base.derive("unstitched", cls_dict={"weight_columns": { + **default_correction_weights, "normalization_weight": [], +}}) weight_columns_execpt_btag = default_weight_columns.copy() weight_columns_execpt_btag.pop("normalized_ht_njet_nhf_btag_weight") From a274649d9a77ca278df6c5909fd678fad6e72a44 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Thu, 5 Dec 2024 12:20:21 +0100 Subject: [PATCH 03/28] switch vjets pt reweighting json and apply NLO EW weights --- hbw/config/config_run2.py | 32 ++++++++++++++++++++++++-------- hbw/production/gen_v.py | 8 +++++--- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/hbw/config/config_run2.py b/hbw/config/config_run2.py index cce3738e..854705b5 100644 --- a/hbw/config/config_run2.py +++ b/hbw/config/config_run2.py @@ -357,14 +357,28 @@ def if_era( # V+jets reweighting cfg.x.vjets_reweighting = DotDict.wrap({ - "w": { - "value": "wjets_kfactor_value", - "error": "wjets_kfactor_error", - }, "z": { - "value": "zjets_kfactor_value", - "error": "zjets_kfactor_error", + "value": "eej_pTV_kappa_NLO_EW", + "ew": "eej_pTV_kappa_NLO_EW", + "error": "eej_pTV_d1kappa_EW", # NOTE: not sure if this is correct to use as error (d2,d3?) + "d2": "eej_pTV_d2kappa_EW", + "d3": "eej_pTV_d3kappa_EW", + }, + "w": { + "value": "aj_pTV_kappa_NLO_EW", + "ew": "aj_pTV_kappa_NLO_EW", + "error": "aj_pTV_d1kappa_EW", # NOTE: not sure if this is correct to use as error (d2,d3?) 
+ "d2": "aj_pTV_d2kappa_EW", + "d3": "aj_pTV_d3kappa_EW", }, + # "w": { + # "value": "wjets_kfactor_value", + # "error": "wjets_kfactor_error", + # }, + # "z": { + # "value": "zjets_kfactor_value", + # "error": "zjets_kfactor_error", + # }, }) ################################################################################################ @@ -587,8 +601,10 @@ def add_external(name, value): add_external("muon_sf", (f"{json_mirror}/POG/MUO/{corr_tag}/muon_Z.json.gz", "v1")) # btag scale factor add_external("btag_sf_corr", (f"{json_mirror}/POG/BTV/{corr_tag}/btagging.json.gz", "v1")) - # V+jets reweighting (still unused and not centrally produced) - add_external("vjets_reweighting", f"{json_mirror}/data/json/vjets_reweighting.json.gz") + # V+jets reweighting (derived for 13 TeV, custom json converted from ROOT, not centrally produced) + # ROOT files (eej.root and aj.root) taken from here: + # https://github.com/UHH2/2HDM/tree/ultra_legacy/data/ScaleFactors/VJetsCorrections + add_external("vjets_reweighting", (f"{json_mirror}/data/json/vjets_pt.json.gz", "v1")) if cfg.x.run == 2: # met phi corrector (still unused and missing in Run3) add_external("met_phi_corr", (f"{json_mirror}/POG/JME/{corr_tag}/met.json.gz", "v1")) diff --git a/hbw/production/gen_v.py b/hbw/production/gen_v.py index a564359f..fb87f611 100644 --- a/hbw/production/gen_v.py +++ b/hbw/production/gen_v.py @@ -178,9 +178,10 @@ def get_kfactor(obj_name, key, obj): kfactor[key] = get_kfactor(boson, key, events.GenVBoson) weights = { - "nominal": kfactor.value, - "up": kfactor.value + kfactor.error, - "down": kfactor.value - kfactor.error, + # NOTE: 1-kfactor for "ew" correction + "nominal": 1 - kfactor.value, + "up": 1 - kfactor.value + kfactor.error, + "down": 1 - kfactor.value - kfactor.error, } # save the weights @@ -232,6 +233,7 @@ def vjets_weight_setup(self: Producer, reqs: dict, inputs: dict, reader_targets: self.get_vjets_reweighting_file(bundle.files).load(formatter="gzip").decode("utf-8"), ) corrections = self.get_vjets_reweighting_config() + self.vjets_reweighting_evaluators = { obj_name: { key: correction_set[correction_name] From 14f95cb668f82b0fff31983b3969f823f14cbdd4 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Tue, 10 Dec 2024 11:24:05 +0100 Subject: [PATCH 04/28] add simple producer for simple normalization weights --- hbw/production/dataset_normalization.py | 79 +++++++++++++++++++++++++ hbw/production/process_ids.py | 3 +- hbw/production/weights.py | 16 ++--- 3 files changed, 90 insertions(+), 8 deletions(-) create mode 100644 hbw/production/dataset_normalization.py diff --git a/hbw/production/dataset_normalization.py b/hbw/production/dataset_normalization.py new file mode 100644 index 00000000..eb92353d --- /dev/null +++ b/hbw/production/dataset_normalization.py @@ -0,0 +1,79 @@ +# coding: utf-8 + +""" +Column production methods related to sample normalization event weights. 
+""" + +from columnflow.production import Producer, producer +from columnflow.util import maybe_import, InsertableDict +from columnflow.columnar_util import set_ak_column + +np = maybe_import("numpy") +sp = maybe_import("scipy") +maybe_import("scipy.sparse") +ak = maybe_import("awkward") + + +@producer( + uses={"mc_weight"}, + produces={"dataset_normalization_weight"}, + # only run on mc + mc_only=True, +) +def dataset_normalization_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array: + """ + Uses luminosity information of internal py:attr:`config_inst`, the cross section of a process + obtained from the dataset inst and the sum of event weights from the + py:attr:`selection_stats` attribute to assign each event a normalization weight + independent of the sub-processes of the dataset. + Can only be used when there is a one-to-one mapping between datasets and processes. + """ + # get the lumi + lumi = self.config_inst.x.luminosity.nominal + + # compute the weight and store it + norm_weight = events.mc_weight * lumi * self.xs / self.sum_weights + events = set_ak_column(events, "dataset_normalization_weight", norm_weight, value_type=np.float32) + + return events + + +@dataset_normalization_weight.requires +def dataset_normalization_weight_requires(self: Producer, reqs: dict) -> None: + """ + Adds the requirements needed by the underlying py:attr:`task` to access selection stats into + *reqs*. + """ + # TODO: for actual sample stitching, we don't need the selection stats for that dataset, but + # rather the one merged for either all datasets, or the "stitching group" + # (i.e. all datasets that might contain any of the sub processes found in a dataset) + from columnflow.tasks.selection import MergeSelectionStats + reqs["selection_stats"] = MergeSelectionStats.req( + self.task, + tree_index=0, + branch=-1, + _exclude=MergeSelectionStats.exclude_params_forest_merge, + ) + + +@dataset_normalization_weight.setup +def dataset_normalization_weight_setup( + self: Producer, + reqs: dict, + inputs: dict, + reader_targets: InsertableDict, +) -> None: + """ + Load inclusive selection stats and cross sections for the normalization weight calculation. 
+ """ + # load the selection stats + selection_stats = inputs["selection_stats"]["collection"][0]["stats"].load(formatter="json") + + process_inst = self.dataset_inst.processes.get_first() + + xs = process_inst.xsecs.get(self.config_inst.campaign.ecm, None) + if not xs: + raise Exception(f"no cross section found for process {process_inst.name}") + + self.xs = xs.nominal + self.sum_weights = selection_stats["sum_mc_weight"] diff --git a/hbw/production/process_ids.py b/hbw/production/process_ids.py index 2d5315d8..25f78b8b 100644 --- a/hbw/production/process_ids.py +++ b/hbw/production/process_ids.py @@ -62,7 +62,8 @@ def hbw_process_ids_init(self: Producer) -> None: if self.dataset_inst.has_tag("is_hbv"): self.process_producer = hh_bbvv_process_producer - elif "dy" in self.dataset_inst.name: + elif "dy" in self.dataset_inst.name and "amcatnlo" in self.dataset_inst.name: + # stitching of DY NLO samples self.process_producer = dy_nlo_process_producer elif len(self.dataset_inst.processes) == 1: self.process_producer = process_ids diff --git a/hbw/production/weights.py b/hbw/production/weights.py index 5e038c3c..07301324 100644 --- a/hbw/production/weights.py +++ b/hbw/production/weights.py @@ -5,6 +5,7 @@ """ import functools +import law from columnflow.util import maybe_import from columnflow.columnar_util import set_ak_column @@ -25,6 +26,7 @@ from hbw.production.gen_v import gen_v_boson, vjets_weight from hbw.production.normalized_weights import normalized_weight_factory from hbw.production.normalized_btag import normalized_btag_weights +from hbw.production.dataset_normalization import dataset_normalization_weight from hbw.util import has_tag @@ -224,11 +226,11 @@ def combined_normalization_weights(self: Producer, events: ak.Array, **kwargs) - when stitching our signal samples, but we want to calculate the BRs ourselved for other types of sample stitching (e.g. DY). 
""" - # NOTE: I would like to produce the unstitched normalization weights for cross checks, - # but for DY, this is not possible at the moment, since we assign processes (hf/lf) for which no - # xsecs are available - # events = self[normalization_weights](events, **kwargs) events = self[self.norm_weights_producer](events, **kwargs) + + # very simple Producer that creates normalization weight without any stitching + # (can only be used when there is a one-to-one mapping between datasets and processes) + events = self[dataset_normalization_weight](events, **kwargs) return events @@ -244,8 +246,8 @@ def combined_normalization_weights_init(self: Producer) -> None: self.norm_weights_producer.weight_name = "stitched_normalization_weight" - self.uses |= {self.norm_weights_producer} - self.produces |= {self.norm_weights_producer} + self.uses |= {self.norm_weights_producer, dataset_normalization_weight} + self.produces |= {self.norm_weights_producer, dataset_normalization_weight} @producer( @@ -262,7 +264,7 @@ def combined_normalization_weights_init(self: Producer) -> None: normalized_pu_weights, }, mc_only=True, - version=1, + version=law.config.get_expanded("analysis", "event_weights_version", 1), ) def event_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Array: """ From 8d245141937605c5044ab20dbfe06fc63ab73f76 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Tue, 10 Dec 2024 11:29:40 +0100 Subject: [PATCH 05/28] add task for campaign creation and consistency checks --- hbw/analysis/create_analysis.py | 59 +++++------ hbw/config/datasets.py | 9 +- hbw/tasks/campaigns.py | 177 ++++++++++++++++++++++++++++++++ hbw/tasks/inspection.py | 34 +++++- hbw/util.py | 104 +++++++++++++++++-- law.cfg | 2 +- 6 files changed, 346 insertions(+), 39 deletions(-) create mode 100644 hbw/tasks/campaigns.py diff --git a/hbw/analysis/create_analysis.py b/hbw/analysis/create_analysis.py index 68b8ca9c..d2a49c1d 100644 --- a/hbw/analysis/create_analysis.py +++ b/hbw/analysis/create_analysis.py @@ -7,7 +7,6 @@ from __future__ import annotations import os -import importlib import law import order as od @@ -24,6 +23,8 @@ ml_inputs_producer, ) +from hbw.tasks.campaigns import BuildCampaignSummary + @timeit_multiple def create_hbw_analysis( @@ -82,7 +83,6 @@ def create_hbw_analysis( from hbw.config.config_run2 import add_config def add_lazy_config( - campaigns: dict[str, str], config_name: str, config_id: int, **kwargs, @@ -101,22 +101,22 @@ def create_factory( ): @timeit_multiple def analysis_factory(configs: od.UniqueObjectIndex): - hbw_campaign_inst = None - - for mod, campaign in campaigns.items(): - # import the campaign - mod = importlib.import_module(mod) - if not hbw_campaign_inst: - # copy the main campaign - hbw_campaign_inst = getattr(mod, campaign).copy() - else: - # add datasets to the main campaign - campaign_inst = getattr(mod, campaign).copy() - for dataset in list(campaign_inst.datasets): - dataset.x.campaign = campaign - if not hbw_campaign_inst.has_dataset(dataset.name): - hbw_campaign_inst.add_dataset(dataset) - + cpn_task = BuildCampaignSummary( + config=config_name, + ) + if cpn_task.complete(): + logger.warning( + f"Using pickled campaign for config {config_name}; to re-initialize, run:\n" + f"law run {cpn_task.task_family} --config {config_name} --remove-output 0,a,y", + ) + else: + logger.warning( + f"Campaign used for {config_name} has been changed since last initialization." 
+ "Difference: \n", + ) + cpn_task.run() + + hbw_campaign_inst = cpn_task.output()["hbw_campaign_inst"].load(formatter="pickle") return add_config( analysis_inst, hbw_campaign_inst, @@ -139,29 +139,30 @@ def analysis_factory(configs: od.UniqueObjectIndex): # 2017 add_lazy_config( - { - "cmsdb.campaigns.run2_2017_nano_v9": "campaign_run2_2017_nano_v9", - }, + # { + # "cmsdb.campaigns.run2_2017_nano_v9": "campaign_run2_2017_nano_v9", + # }, "c17", 1700, ) # 2022 preEE add_lazy_config( - { - "cmsdb.campaigns.run3_2022_preEE_nano_v12": "campaign_run3_2022_preEE_nano_v12", - "cmsdb.campaigns.run3_2022_preEE_nano_v13": "campaign_run3_2022_preEE_nano_v13", - }, + # { + # "cmsdb.campaigns.run3_2022_preEE_nano_v12": "campaign_run3_2022_preEE_nano_v12", + # "cmsdb.campaigns.run3_2022_preEE_nano_v13": "campaign_run3_2022_preEE_nano_v13", + # }, "c22pre", 2200, ) # 2022 postEE add_lazy_config( - { - "cmsdb.campaigns.run3_2022_postEE_nano_v12": "campaign_run3_2022_postEE_nano_v12", - "cmsdb.campaigns.run3_2022_postEE_nano_v13": "campaign_run3_2022_postEE_nano_v13", - }, + # { + # "cmsdb.campaigns.run3_2022_postEE_nano_v12": "campaign_run3_2022_postEE_nano_v12", + # "cmsdb.campaigns.run3_2022_postEE_nano_v13": "campaign_run3_2022_postEE_nano_v13", + # "cmsdb.campaigns.run3_2022_postEE_nano_uhh_v12": "campaign_run3_2022_postEE_nano_uhh_v12", + # }, "c22post", 2210, ) diff --git a/hbw/config/datasets.py b/hbw/config/datasets.py index 278a8e80..431a1a32 100644 --- a/hbw/config/datasets.py +++ b/hbw/config/datasets.py @@ -83,6 +83,7 @@ def hbw_dataset_names(config: od.Config, as_list: bool = False) -> DotDict[str: "dy_m50toinf_ht2500toinf_madgraph", ]), *config.x.if_era(run=3, values=[ + # NLO samples "dy_m50toinf_amcatnlo", "dy_m10to50_amcatnlo", "dy_m4to10_amcatnlo", @@ -113,7 +114,13 @@ def hbw_dataset_names(config: od.Config, as_list: bool = False) -> DotDict[str: "zz_pythia", ]), ], - "ttv": [], # empty for now + "ttv": [ + "ttw_wlnu_amcatnlo", + "ttz_zll_m4to50_amcatnlo", + "ttz_zll_m50toinf_amcatnlo", + "ttz_znunu_amcatnlo", + "ttz_zqq_amcatnlo", + ], "h": [ *config.x.if_era(run=3, values=[ # TODO: remove whatever is not really necessary diff --git a/hbw/tasks/campaigns.py b/hbw/tasks/campaigns.py new file mode 100644 index 00000000..c7132cb9 --- /dev/null +++ b/hbw/tasks/campaigns.py @@ -0,0 +1,177 @@ +# coding: utf-8 + +""" +Custom tasks for creating and managing campaigns. +""" + +from collections import defaultdict +from functools import cached_property +import importlib + +import law +import luigi + +from columnflow.tasks.framework.base import AnalysisTask +from hbw.tasks.base import HBWTask + + +logger = law.logger.get_logger(__name__) + + +campaign_map = { + "c17": { + "cmsdb.campaigns.run2_2017_nano_v9": "campaign_run2_2017_nano_v9", + }, + "c22pre": { + "cmsdb.campaigns.run3_2022_preEE_nano_v12": "campaign_run3_2022_preEE_nano_v12", + "cmsdb.campaigns.run3_2022_preEE_nano_v13": "campaign_run3_2022_preEE_nano_v13", + }, + "c22post": { + "cmsdb.campaigns.run3_2022_postEE_nano_v12": "campaign_run3_2022_postEE_nano_v12", + "cmsdb.campaigns.run3_2022_postEE_nano_v13": "campaign_run3_2022_postEE_nano_v13", + "cmsdb.campaigns.run3_2022_postEE_nano_uhh_v12": "campaign_run3_2022_postEE_nano_uhh_v12", + }, +} + + +class BuildCampaignSummary( + HBWTask, + AnalysisTask, +): + + config = luigi.Parameter() + # TODO: set campaigns as part of this function instead of configuring in the config? 
+ + recreate_backup_summary = luigi.BoolParameter(default=False) + + def requires(self): + return {} + + def store_parts(self): + parts = super().store_parts() + + # add the config name + parts.insert_after("task_family", "config", self.config) + + return parts + + @cached_property + def campaigns(self): + if self.config not in campaign_map: + raise ValueError(f"Unknown config {self.config}") + return campaign_map[self.config] + + @cached_property + def campaign_insts(self): + return [ + getattr(importlib.import_module(mod), campaign).copy() + for mod, campaign in self.campaigns.items() + ] + + dataset_from_uhh_identifier = { + # TODO: use DY from uhh campaign + # "dy_m10to50_amcatnlo", + # "dy_m4to10_amcatnlo", + "ttw_", + "ttz_", + } + + def get_dataset_prio(self, dataset_name, campaign): + """ + If dataset should be overwritten from this campaign, return True. + Otherwise, return False. + """ + if "uhh" in campaign.name and any( + dataset_identifier in dataset_name + for dataset_identifier in self.dataset_from_uhh_identifier + ): + return True + + return False + + def output(self): + output = { + "dataset_summary": self.target("dataset_summary.yaml"), + "campaign_summary": self.target("campaign_summary.yaml"), + "hbw_campaign_inst": self.target("hbw_campaign_inst.pickle"), + } + return output + + @cached_property + def dataset_summary(self): + dataset_summary = defaultdict(dict) + used_datasets = set() + # create campaign summary with one key per dataset (to fulfill dataset uniqueness) + for campaign in self.campaign_insts: + for dataset in campaign.datasets: + if dataset.name not in used_datasets or self.get_dataset_prio(dataset.name, campaign): + dataset_summary[dataset.name] = { + "campaign": campaign.name, + "n_events": dataset.n_events, + "n_files": dataset.n_files, + } + used_datasets.add(dataset.name) + + return dict(dataset_summary) + + @cached_property + def campaign_summary(self,): + campaign_summary = { + campaign.name: {} for campaign in self.campaign_insts + } + + for dataset, dataset_info in self.dataset_summary.items(): + campaign_summary[dataset_info["campaign"]][dataset] = { + "n_events": dataset_info["n_events"], + "n_files": dataset_info["n_files"], + } + return campaign_summary + + def get_custom_campaign(self): + hbw_campaign_inst = self.campaign_insts[0].copy() + hbw_campaign_inst.clear_datasets() + for campaign_inst in self.campaign_insts: + campaign_info = self.campaign_summary[campaign_inst.name] + for dataset in campaign_info.keys(): + dataset_inst = campaign_inst.get_dataset(dataset) + dataset_inst.x.campaign = campaign_inst.name + hbw_campaign_inst.add_dataset(dataset_inst) + + return hbw_campaign_inst + + from hbw.util import timeit_multiple + + @timeit_multiple + def run(self): + output = self.output() + + # cross check if the dataset summary did change + backup_dataset_summary = self.target("backup_dataset_summary.yaml") + if backup_dataset_summary.exists(): + backup_dataset_summary = backup_dataset_summary.load(formatter="yaml") + if backup_dataset_summary != self.dataset_summary: + from hbw.util import gather_dict_diff + logger.warning( + "Backup dataset summary does not match the current one \n" + f"{gather_dict_diff(backup_dataset_summary, self.dataset_summary)}", + ) + if self.recreate_backup_summary: + logger.warning("Recreating backup dataset summary") + backup_dataset_summary.dump(self.dataset_summary, formatter="yaml") + else: + logger.warning( + "Run the following command to recreate the backup dataset summary:\n" + f"law run {self.task_family} 
--recreate_backup_summary --config {self.config} --remove-output 0,a,y", # noqa + ) + else: + logger.warning("No backup dataset summary found, creating one now") + backup_dataset_summary.dump(self.dataset_summary, formatter="yaml") + + output["dataset_summary"].dump(self.dataset_summary, formatter="yaml") + output["campaign_summary"].dump(self.campaign_summary, formatter="yaml") + + import sys + orig_rec_limit = sys.getrecursionlimit() + sys.setrecursionlimit(max(orig_rec_limit, 100000)) + output["hbw_campaign_inst"].dump(self.get_custom_campaign(), formatter="pickle") + sys.setrecursionlimit(orig_rec_limit) diff --git a/hbw/tasks/inspection.py b/hbw/tasks/inspection.py index 950c8ee1..be49ac7e 100644 --- a/hbw/tasks/inspection.py +++ b/hbw/tasks/inspection.py @@ -4,15 +4,16 @@ Custom tasks for inspecting the configuration or certain task outputs. """ -# from functools import cached_property +from collections import defaultdict import law import luigi + from columnflow.tasks.framework.mixins import ( ProducersMixin, MLModelsMixin, ) -from columnflow.tasks.framework.base import ConfigTask, Requirements +from columnflow.tasks.framework.base import MultiConfigTask, ConfigTask, Requirements from columnflow.tasks.framework.mixins import DatasetsProcessesMixin, SelectorMixin, CalibratorsMixin from columnflow.tasks.framework.parameters import SettingsParameter from columnflow.tasks.reduction import ReducedEventsUser @@ -287,6 +288,35 @@ def run(self): debugger() +class DatasetSummary( + HBWTask, + MultiConfigTask, +): + def requires(self): + return {} + + def output(self): + output = { + "dataset_summary": self.target("dataset_summary.yaml"), + } + return output + + def run(self): + multi_config_dataset_summary = {} + for config in self.config_insts: + dataset_summary = defaultdict(dict) + cpn_name = config.campaign.name + for dataset in config.datasets: + dataset_campaign = dataset.x("campaign", cpn_name) + dataset_summary[dataset_campaign][dataset.name] = { + "n_events": dataset.n_events, + "n_files": dataset.n_files, + } + multi_config_dataset_summary[config.name] = dict(dataset_summary) + + self.output()["dataset_summary"].dump(multi_config_dataset_summary, formatter="yaml") + + class CheckColumns( ColumnsBaseTask, law.LocalWorkflow, diff --git a/hbw/util.py b/hbw/util.py index 14154431..36e6c4dd 100644 --- a/hbw/util.py +++ b/hbw/util.py @@ -230,31 +230,108 @@ def traceback_function(depth: int = 1): def make_dict_hashable(d: dict, deep: bool = True): - """ small helper that converts dict into hashable dict""" + """Small helper that converts dict into a hashable representation.""" d_out = d.copy() for key, value in d.items(): if isinstance(value, Hashable): - # skip values that are already hashable + # Skip values that are already hashable continue elif isinstance(value, dict): - # convert dictionary items to hashable and use items of resulting dict + # Convert nested dictionaries to a hashable form if deep: value = make_dict_hashable(value) d_out[key] = tuple(value) else: - # hopefully, everything else can be cast to a tuple + # Convert other types to tuples d_out[key] = law.util.make_tuple(value) return d_out.items() def dict_diff(dict1: dict, dict2: dict): + """Return the differences between two dictionaries.""" set1 = set(make_dict_hashable(dict1)) set2 = set(make_dict_hashable(dict2)) return set1 ^ set2 +def filter_unchanged_keys(d1: dict, d2: dict): + """Recursively remove unchanged keys from nested dictionaries and return modified values.""" + if not isinstance(d1, dict) or not 
isinstance(d2, dict): + return {"old": d1, "new": d2} if d1 != d2 else None + + filtered = {} + all_keys = set(d1.keys()).union(set(d2.keys())) + + for key in all_keys: + val1 = d1.get(key) + val2 = d2.get(key) + + if isinstance(val1, dict) and isinstance(val2, dict): + # Recur for nested dictionaries + nested_diff = filter_unchanged_keys(val1, val2) + if nested_diff: + filtered[key] = nested_diff + elif val1 != val2: + # Value changed or key added/removed + filtered[key] = {"old": val1, "new": val2} + + return filtered if filtered else None + + +def dict_diff_filtered(dict1: dict, dict2: dict): + """Return the differences between two dictionaries with nested filtering of unchanged keys.""" + diff = {} + + # Check keys present in either dict + all_keys = set(dict1.keys()).union(set(dict2.keys())) + + for key in all_keys: + if key in dict1 and key in dict2: + if isinstance(dict1[key], dict) and isinstance(dict2[key], dict): + # Recur for nested dictionaries and get filtered diff + nested_diff = filter_unchanged_keys(dict1[key], dict2[key]) + if nested_diff: + diff[key] = nested_diff + elif dict1[key] != dict2[key]: + diff[key] = {"old": dict1[key], "new": dict2[key]} + elif key in dict1: + diff[key] = {"old": dict1[key], "new": None} + else: + diff[key] = {"old": None, "new": dict2[key]} + + return diff + + +def gather_dict_diff(dict1: dict, dict2: dict) -> str: + """Gather the differences between two dictionaries and return them as a formatted string.""" + diff = filter_unchanged_keys(dict1, dict2) + lines = [] + + if not diff: + return "✅ No differences found." + + def process_diff(diff, indent=0): + indentation = " " * indent + for key, value in diff.items(): + if isinstance(value, dict) and "old" in value and "new" in value: + if value["old"] is None: + lines.append(f"{indentation}🔹 Added: {key}: {value['new']}") + elif value["new"] is None: + lines.append(f"{indentation}🔻 Removed: {key}: {value['old']}") + else: + lines.append(f"{indentation}🔄 Modified: {key}:") + lines.append(f"{indentation} - Old: {value['old']}") + lines.append(f"{indentation} - New: {value['new']}") + elif isinstance(value, dict): + lines.append(f"{indentation}🔄 Modified: {key}:") + process_diff(value, indent + 1) + + process_diff(diff) + return "\n".join(lines) + + def four_vec( collections: str | Iterable[str], columns: str | Iterable[str] | None = None, @@ -333,7 +410,9 @@ def inner(config, *args, **kwargs): def timeit(func): - """ Simple wrapper to measure execution time of a function """ + """ + Simple wrapper to measure execution time of a function. 
+ """ @wraps(func) def timeit_wrapper(*args, **kwargs): start_time = time.perf_counter() @@ -347,16 +426,29 @@ def timeit_wrapper(*args, **kwargs): def timeit_multiple(func): """ Wrapper to measure the number of execution calls and the added execution time of a function """ + log_method = "info" + log_func = getattr(_logger, log_method) + @wraps(func) def timeit_wrapper(*args, **kwargs): func.total_calls = getattr(func, "total_calls", 0) + 1 + _repr = func.__name__ + if len(args) >= 1 and hasattr(args[0], "__name__"): + _repr = f"{args[0].__name__}.{_repr}" + + if len(args) >= 3 and isinstance(args[2], dict): + for param in ("branch", "dataset"): + if param in args[2]: + _repr = f"{_repr} ({param} {args[2][param]})" + start_time = time.perf_counter() result = func(*args, **kwargs) end_time = time.perf_counter() total_time = end_time - start_time func.total_time = getattr(func, "total_time", 0) + total_time - _logger.info(f"{func.__name__} has been run {func.total_calls} times ({round_sig(func.total_time)} seconds)") + log_func(f"{_repr} has been run {func.total_calls} times ({round_sig(func.total_time)} seconds)") return result + return timeit_wrapper diff --git a/law.cfg b/law.cfg index 75d32b48..b57cbda9 100644 --- a/law.cfg +++ b/law.cfg @@ -8,7 +8,7 @@ inherit: $CF_BASE/law.cfg columnflow.tasks.cms.external columnflow.tasks.cms.inference -hbw.tasks.{inspection,ml,inference,postfit_plots,plotting,wrapper,union,optimization,corrections} +hbw.tasks.{inspection,campaigns,ml,inference,postfit_plots,plotting,wrapper,union,optimization,corrections} From 87dd19c8a371cb6c22dd53fa312853ce07e18a72 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Tue, 10 Dec 2024 11:52:46 +0100 Subject: [PATCH 06/28] cleanup and tests --- hbw/analysis/create_analysis.py | 3 +- hbw/scripts/test_config.py | 1 - hbw/tasks/campaigns.py | 10 +-- hbw/util.py | 24 +++---- tests/test_util.py | 122 +++++++++++++++++++++++++++++++- 5 files changed, 138 insertions(+), 22 deletions(-) diff --git a/hbw/analysis/create_analysis.py b/hbw/analysis/create_analysis.py index d2a49c1d..382bea48 100644 --- a/hbw/analysis/create_analysis.py +++ b/hbw/analysis/create_analysis.py @@ -111,8 +111,7 @@ def analysis_factory(configs: od.UniqueObjectIndex): ) else: logger.warning( - f"Campaign used for {config_name} has been changed since last initialization." 
- "Difference: \n", + f"Campaign used for {config_name} is being reinitialized: \n", ) cpn_task.run() diff --git a/hbw/scripts/test_config.py b/hbw/scripts/test_config.py index 0a40ff1e..a5763e97 100644 --- a/hbw/scripts/test_config.py +++ b/hbw/scripts/test_config.py @@ -9,7 +9,6 @@ default_analysis = law.config.get_expanded("analysis", "default_analysis") default_config = law.config.get_expanded("analysis", "default_config") -default_config = "c22uhhpost" analysis_inst = ana = AnalysisTask.get_analysis_inst(default_analysis) config_inst = cfg = ana.get_config(default_config) diff --git a/hbw/tasks/campaigns.py b/hbw/tasks/campaigns.py index c7132cb9..cccb0f14 100644 --- a/hbw/tasks/campaigns.py +++ b/hbw/tasks/campaigns.py @@ -146,9 +146,9 @@ def run(self): output = self.output() # cross check if the dataset summary did change - backup_dataset_summary = self.target("backup_dataset_summary.yaml") - if backup_dataset_summary.exists(): - backup_dataset_summary = backup_dataset_summary.load(formatter="yaml") + backup_target = self.target("backup_dataset_summary.yaml") + if backup_target.exists(): + backup_dataset_summary = backup_target.load(formatter="yaml") if backup_dataset_summary != self.dataset_summary: from hbw.util import gather_dict_diff logger.warning( @@ -157,7 +157,7 @@ def run(self): ) if self.recreate_backup_summary: logger.warning("Recreating backup dataset summary") - backup_dataset_summary.dump(self.dataset_summary, formatter="yaml") + backup_target.dump(self.dataset_summary, formatter="yaml") else: logger.warning( "Run the following command to recreate the backup dataset summary:\n" @@ -165,7 +165,7 @@ def run(self): ) else: logger.warning("No backup dataset summary found, creating one now") - backup_dataset_summary.dump(self.dataset_summary, formatter="yaml") + backup_target.dump(self.dataset_summary, formatter="yaml") output["dataset_summary"].dump(self.dataset_summary, formatter="yaml") output["campaign_summary"].dump(self.campaign_summary, formatter="yaml") diff --git a/hbw/util.py b/hbw/util.py index 36e6c4dd..bdb998dc 100644 --- a/hbw/util.py +++ b/hbw/util.py @@ -280,33 +280,33 @@ def filter_unchanged_keys(d1: dict, d2: dict): return filtered if filtered else None -def dict_diff_filtered(dict1: dict, dict2: dict): +def dict_diff_filtered(old_dict: dict, new_dict: dict): """Return the differences between two dictionaries with nested filtering of unchanged keys.""" diff = {} # Check keys present in either dict - all_keys = set(dict1.keys()).union(set(dict2.keys())) + all_keys = set(old_dict.keys()).union(set(new_dict.keys())) for key in all_keys: - if key in dict1 and key in dict2: - if isinstance(dict1[key], dict) and isinstance(dict2[key], dict): + if key in old_dict and key in new_dict: + if isinstance(old_dict[key], dict) and isinstance(new_dict[key], dict): # Recur for nested dictionaries and get filtered diff - nested_diff = filter_unchanged_keys(dict1[key], dict2[key]) + nested_diff = filter_unchanged_keys(old_dict[key], new_dict[key]) if nested_diff: diff[key] = nested_diff - elif dict1[key] != dict2[key]: - diff[key] = {"old": dict1[key], "new": dict2[key]} - elif key in dict1: - diff[key] = {"old": dict1[key], "new": None} + elif old_dict[key] != new_dict[key]: + diff[key] = {"old": old_dict[key], "new": new_dict[key]} + elif key in old_dict: + diff[key] = {"old": old_dict[key], "new": None} else: - diff[key] = {"old": None, "new": dict2[key]} + diff[key] = {"old": None, "new": new_dict[key]} return diff -def gather_dict_diff(dict1: dict, dict2: dict) -> str: 
+def gather_dict_diff(old_dict: dict, new_dict: dict) -> str: """Gather the differences between two dictionaries and return them as a formatted string.""" - diff = filter_unchanged_keys(dict1, dict2) + diff = filter_unchanged_keys(old_dict, new_dict) lines = [] if not diff: diff --git a/tests/test_util.py b/tests/test_util.py index 537a42bb..85defa9c 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -8,7 +8,7 @@ from columnflow.util import maybe_import -from hbw.util import build_param_product, round_sig, dict_diff, four_vec, call_once_on_config +from hbw.util import build_param_product, round_sig, dict_diff, four_vec, call_once_on_config, gather_dict_diff import order as od @@ -16,7 +16,121 @@ ak = maybe_import("awkward") -class HbwUtilTest(unittest.TestCase): +class TestDictDiff(unittest.TestCase): + def test_no_difference(self): + dict1 = {"name": "Alice", "age": 25} + dict2 = {"name": "Alice", "age": 25} + result = gather_dict_diff(dict1, dict2) + self.assertEqual(result, "✅ No differences found.") + + def test_simple_modification(self): + dict1 = {"name": "Alice", "age": 25} + dict2 = {"name": "Alice", "age": 26} + result = gather_dict_diff(dict1, dict2) + expected_output = ( + "🔄 Modified: age:\n" + " - Old: 25\n" + " - New: 26" + ) + self.assertEqual(result, expected_output) + + def test_addition(self): + dict1 = {"name": "Alice"} + dict2 = {"name": "Alice", "hobby": "cycling"} + result = gather_dict_diff(dict1, dict2) + expected_output = "🔹 Added: hobby: cycling" + self.assertEqual(result, expected_output) + + def test_removal(self): + dict1 = {"name": "Alice", "hobby": "cycling"} + dict2 = {"name": "Alice"} + result = gather_dict_diff(dict1, dict2) + expected_output = "🔻 Removed: hobby: cycling" + self.assertEqual(result, expected_output) + + def test_nested_modification(self): + dict1 = { + "name": "Alice", + "skills": { + "python": "intermediate", + "sql": "beginner" + } + } + dict2 = { + "name": "Alice", + "skills": { + "python": "advanced", + "sql": "beginner" + } + } + result = gather_dict_diff(dict1, dict2) + expected_output = ( + "🔄 Modified: skills:\n" + " 🔄 Modified: python:\n" + " - Old: intermediate\n" + " - New: advanced" + ) + self.assertEqual(result, expected_output) + + def test_nested_addition(self): + dict1 = { + "name": "Alice", + "skills": { + "python": "intermediate" + } + } + dict2 = { + "name": "Alice", + "skills": { + "python": "intermediate", + "docker": "beginner" + } + } + result = gather_dict_diff(dict1, dict2) + expected_output = ( + "🔄 Modified: skills:\n" + " 🔹 Added: docker: beginner" + ) + self.assertEqual(result, expected_output) + + def test_complex_diff(self): + dict1 = { + "name": "Alice", + "age": 25, + "skills": { + "python": "intermediate", + "sql": "beginner", + }, + } + dict2 = { + "name": "Alice", + "age": 26, + "skills": { + "python": "advanced", + "sql": "beginner", + "docker": "beginner", + }, + "hobby": "cycling", + } + result = gather_dict_diff(dict1, dict2) + expected_output = ( + "🔄 Modified: age:\n" + " - Old: 25\n" + " - New: 26\n" + "🔄 Modified: skills:\n" + " 🔄 Modified: python:\n" + " - Old: intermediate\n" + " - New: advanced\n" + " 🔹 Added: docker: beginner\n" + "🔹 Added: hobby: cycling" + ) + self.assertEqual(result, expected_output) + + +class HbwUtilTest( + TestDictDiff, + unittest.TestCase, +): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -97,3 +211,7 @@ def some_config_function(config: od.Config) -> str: # on second call, function should not be called -> returns None 
self.assertEqual(some_config_function(self.config_inst), None) + + +if __name__ == "__main__": + unittest.main() From 260bb915f6dc0ba07185e754448318b99911f2d7 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Tue, 10 Dec 2024 14:11:27 +0100 Subject: [PATCH 07/28] require existence of campaign before running analysis --- hbw/analysis/create_analysis.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hbw/analysis/create_analysis.py b/hbw/analysis/create_analysis.py index 382bea48..4c7d19fc 100644 --- a/hbw/analysis/create_analysis.py +++ b/hbw/analysis/create_analysis.py @@ -110,10 +110,11 @@ def analysis_factory(configs: od.UniqueObjectIndex): f"law run {cpn_task.task_family} --config {config_name} --remove-output 0,a,y", ) else: - logger.warning( - f"Campaign used for {config_name} is being reinitialized: \n", + raise ValueError( + f"Campaign used for {config_name} is not yet initialized; to initialize, run: \n", + f"law run {cpn_task.task_family} --config {config_name} --remove-output 0,a,y", ) - cpn_task.run() + # cpn_task.run() hbw_campaign_inst = cpn_task.output()["hbw_campaign_inst"].load(formatter="pickle") return add_config( From 07aa1b627f7131d772845a891d6680f49b4555e6 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Tue, 10 Dec 2024 14:12:23 +0100 Subject: [PATCH 08/28] enable usage of uhh campaigns --- hbw/config/datasets.py | 43 ++++++++++++++++++++++++++++++++++++++++++ hbw/config/styling.py | 2 +- hbw/tasks/campaigns.py | 2 ++ law.cfg | 36 ++++++++++++++++++++++++++++++++++- 4 files changed, 81 insertions(+), 2 deletions(-) diff --git a/hbw/config/datasets.py b/hbw/config/datasets.py index 431a1a32..a9193479 100644 --- a/hbw/config/datasets.py +++ b/hbw/config/datasets.py @@ -397,6 +397,9 @@ def configure_hbw_datasets( limit_dataset_files: int | None = None, add_dataset_extensions: bool = False, ): + # allow usage of UHH campaign + enable_uhh_campaign_usage(config) + for dataset in config.datasets: if add_dataset_extensions: add_dataset_extension_to_nominal(dataset) @@ -541,3 +544,43 @@ def get_dataset_lfns_2017( lfn_base.child(basename, type="f").path for basename in lfn_base.listdir(pattern="*.root") ] + + +def enable_uhh_campaign_usage(cfg: od.Config) -> None: + # custom lfn retrieval method in case the underlying campaign is custom uhh + def get_dataset_lfns_uhh( + dataset_inst: od.Dataset, + shift_inst: od.Shift, + dataset_key: str, + ) -> list[str]: + if "uhh" not in dataset_inst.x("campaign", ""): + # for non-uhh datasets, use default GetDatasetLFNs method + return GetDatasetLFNs.get_dataset_lfns_dasgoclient( + GetDatasetLFNs, dataset_inst=dataset_inst, shift_inst=shift_inst, dataset_key=dataset_key, + ) + cpn_name = dataset_inst.x.campaign + # destructure dataset_key into parts and create the lfn base directory + dataset_id, full_campaign, tier = dataset_key.split("/")[1:] + main_campaign, sub_campaign = full_campaign.split("-", 1) + lfn_base = law.wlcg.WLCGDirectoryTarget( + f"/store/{dataset_inst.data_source}/{main_campaign}/{dataset_id}/{tier}/{sub_campaign}/0", + # fs=f"wlcg_fs_{cfg.campaign.x.custom['name']}", + fs=f"wlcg_fs_{cpn_name}", + ) + + # loop though files and interpret paths as lfns + return [ + lfn_base.child(basename, type="f").path + for basename in lfn_base.listdir(pattern="*.root") + ] + + if any("uhh" in cpn_name for cpn_name in cfg.campaign.x("campaigns", [])): + # define the lfn retrieval function + cfg.x.get_dataset_lfns = get_dataset_lfns_uhh + + # define custom remote fs's to look at + cfg.x.get_dataset_lfns_remote_fs 
= lambda dataset_inst: ( + None if "uhh" not in dataset_inst.x("campaign", "") else [ + f"local_fs_{dataset_inst.x.campaign}", + f"wlcg_fs_{dataset_inst.x.campaign}", + ]) diff --git a/hbw/config/styling.py b/hbw/config/styling.py index 318f7db4..9075979f 100644 --- a/hbw/config/styling.py +++ b/hbw/config/styling.py @@ -82,7 +82,7 @@ "dy_m50toinf": color_palette["yellow"], "dy_m10to50": color_palette["brown"], "dy_m4to10": color_palette["darkgrey"], - "ttV": color_palette["brown"], + "ttv": color_palette["brown"], "vv": color_palette["blue"], "other": color_palette["grey"], "hh_ggf_hbb_htt": color_palette["grey"], diff --git a/hbw/tasks/campaigns.py b/hbw/tasks/campaigns.py index cccb0f14..8c9b9ca7 100644 --- a/hbw/tasks/campaigns.py +++ b/hbw/tasks/campaigns.py @@ -137,6 +137,8 @@ def get_custom_campaign(self): dataset_inst.x.campaign = campaign_inst.name hbw_campaign_inst.add_dataset(dataset_inst) + hbw_campaign_inst.x.campaigns = list(self.campaigns) + return hbw_campaign_inst from hbw.util import timeit_multiple diff --git a/law.cfg b/law.cfg index b57cbda9..ef9ce8c0 100644 --- a/law.cfg +++ b/law.cfg @@ -73,7 +73,7 @@ check_overlapping_inputs: None [outputs] # list of all used file systems -wlcg_file_systems: wlcg_fs, wlcg_fs_desy, wlcg_fs_cernbox, wlcg_fs_desy_store, wlcg_fs_infn_redirector, wlcg_fs_global_redirector +wlcg_file_systems: wlcg_fs, wlcg_fs_desy, wlcg_fs_cernbox, wlcg_fs_desy_store, wlcg_fs_infn_redirector, wlcg_fs_global_redirector, wlcg_fs_run3_2022_preEE_nano_uhh_v12, wlcg_fs_run3_2022_postEE_nano_uhh_v12 # list of file systems used by columnflow.tasks.external.GetDatasetLFNs.iter_nano_files to # look for the correct fs per nano input file (in that order) @@ -259,6 +259,40 @@ gsiftp_base: gsiftp://dcache-door-cms04.desy.de:2811/pnfs/desy.de/cms/tier2/stor base: &::gsiftp_base +[wlcg_fs_run3_2022_preEE_nano_uhh_v12] + +webdav_base: davs://dcache-cms-webdav-wan.desy.de:2880/pnfs/desy.de/cms/tier2/store/user/nprouvos/nanogen_store/MergeNano/config_22pre_v12/prod3 +gsiftp_base: gsiftp://dcache-door-cms04.desy.de:2811/pnfs/desy.de/cms/tier2/store/user/nprouvos/nanogen_store/MergeNano/config_22pre_v12/prod3 +xrootd_base: root://dcache-cms-xrootd.desy.de:1094/pnfs/desy.de/cms/tier2/store/user/nprouvos/nanogen_store/MergeNano/config_22pre_v12/prod3 +base: &::xrootd_base +use_cache: $CF_WLCG_USE_CACHE +cache_root: $CF_WLCG_CACHE_ROOT +cache_cleanup: $CF_WLCG_CACHE_CLEANUP +cache_max_size: 15GB +cache_global_lock: True + +[local_fs_run3_2022_preEE_nano_uhh_v12] + +base: file:///pnfs/desy.de/cms/tier2/store/user/nprouvos/nanogen_store/MergeNano/config_22pre_v12/prod3 + + +[wlcg_fs_run3_2022_postEE_nano_uhh_v12] + +webdav_base: davs://dcache-cms-webdav-wan.desy.de:2880/pnfs/desy.de/cms/tier2/store/user/aalvesan/nanogen_store/MergeNano/config_22post_v12/prod1 +gsiftp_base: gsiftp://dcache-door-cms04.desy.de:2811/pnfs/desy.de/cms/tier2/store/user/aalvesan/nanogen_store/MergeNano/config_22post_v12/prod1 +xrootd_base: root://dcache-cms-xrootd.desy.de:1094/pnfs/desy.de/cms/tier2/store/user/aalvesan/nanogen_store/MergeNano/config_22post_v12/prod1 +base: &::xrootd_base +use_cache: $CF_WLCG_USE_CACHE +cache_root: $CF_WLCG_CACHE_ROOT +cache_cleanup: $CF_WLCG_CACHE_CLEANUP +cache_max_size: 15GB +cache_global_lock: True + +[local_fs_run3_2022_postEE_nano_uhh_v12] + +base: file:///pnfs/desy.de/cms/tier2/store/user/aalvesan/nanogen_store/MergeNano/config_22post_v12/prod1 + + [luigi_resources] From 580e9b4408ee79ffb3289cbccfc5845d2560aaa5 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Thu, 
12 Dec 2024 09:27:18 +0100 Subject: [PATCH 09/28] allow string version for CSPs --- hbw/columnflow_patches.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hbw/columnflow_patches.py b/hbw/columnflow_patches.py index 1724d0ee..06ba7c5d 100644 --- a/hbw/columnflow_patches.py +++ b/hbw/columnflow_patches.py @@ -86,8 +86,8 @@ def patch_csp_versioning(): def TaskArrayFunction_str(self): version = self.version() if callable(getattr(self, "version", None)) else getattr(self, "version", None) - if version and not isinstance(version, int): - raise Exception(f"version must be an integer, but is {version}") + if version and not isinstance(version, (int, str)): + raise Exception(f"version must be an integer or string, but is {version} ({type(version)})") version_str = f"V{version}" if version is not None else "" return f"{self.cls_name}{version_str}" From 65b26bd9f59857a4d7cdf52401d4c0d745a0db93 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Thu, 12 Dec 2024 10:44:21 +0100 Subject: [PATCH 10/28] cleanup in variables (start count with 0) --- hbw/config/styling.py | 6 +- hbw/config/variables.py | 196 ++++++++++++++++++++++++---------------- 2 files changed, 120 insertions(+), 82 deletions(-) diff --git a/hbw/config/styling.py b/hbw/config/styling.py index 9075979f..4f248c91 100644 --- a/hbw/config/styling.py +++ b/hbw/config/styling.py @@ -82,7 +82,7 @@ "dy_m50toinf": color_palette["yellow"], "dy_m10to50": color_palette["brown"], "dy_m4to10": color_palette["darkgrey"], - "ttv": color_palette["brown"], + "ttv": color_palette["turqoise"], "vv": color_palette["blue"], "other": color_palette["grey"], "hh_ggf_hbb_htt": color_palette["grey"], @@ -292,10 +292,10 @@ def quick_addvar(config: od.Config, obj: str, i: int, var: str): object (starting at 1) and `var` is the variable of interest; example: cf_loosejet1_pt """ config.add_variable( - name=name.format(obj=obj, i=i + 1, var=var).lower(), + name=name.format(obj=obj, i=i, var=var).lower(), expression=expr.format(obj=obj, i=i, var=var), null_value=EMPTY_FLOAT, binning=default_var_binning[var], unit=default_var_unit.get(var, "1"), - x_title=x_title_base.format(obj=obj, i=i + 1) + default_var_title_format.get(var, var), + x_title=x_title_base.format(obj=obj, i=i) + default_var_title_format.get(var, var), ) diff --git a/hbw/config/variables.py b/hbw/config/variables.py index 454e7888..34ea1dca 100644 --- a/hbw/config/variables.py +++ b/hbw/config/variables.py @@ -119,11 +119,11 @@ def add_feature_variables(config: od.Config) -> None: # FatJet features for i in range(2): config.add_variable( - name=f"fatjet{i+1}_tau21", + name=f"fatjet{i}_tau21", expression=f"FatJet.tau21[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), - x_title=r"FatJet %i $\tau_{21}$" % (i + 1), + x_title=r"FatJet %i $\tau_{21}$" % i, ) @@ -375,31 +375,33 @@ def add_variables(config: od.Config) -> None: x_title="Number of jets", discrete_x=True, ) - deepjet_wps = config.x.btag_working_points.deepjet - config.add_variable( - name="n_deepjet_loose", - expression=lambda events: ak.sum(events.Jet.btagDeepFlavB > deepjet_wps.loose, axis=1), - aux={"inputs": {"Jet.pt", "Jet.btagDeepFlavB"}}, - binning=(7, -0.5, 6.5), - x_title="Number of deepjets (loose WP)", - discrete_x=True, - ) - config.add_variable( - name="n_deepjet_medium", - expression=lambda events: ak.sum(events.Jet.btagDeepFlavB > deepjet_wps.medium, axis=1), - aux={"inputs": {"Jet.pt", "Jet.btagDeepFlavB"}}, - binning=(7, -0.5, 6.5), - x_title="Number of deepjets (medium WP)", - discrete_x=True, - ) - 
config.add_variable( - name="n_deepjet_tight", - expression=lambda events: ak.sum(events.Jet.btagDeepFlavB > deepjet_wps.tight, axis=1), - aux={"inputs": {"Jet.pt", "Jet.btagDeepFlavB"}}, - binning=(7, -0.5, 6.5), - x_title="Number of deepjets (tight WP)", - discrete_x=True, - ) + + if config.x.run == 2: + deepjet_wps = config.x.btag_working_points.deepjet + config.add_variable( + name="n_deepjet_loose", + expression=lambda events: ak.sum(events.Jet.btagDeepFlavB > deepjet_wps.loose, axis=1), + aux={"inputs": {"Jet.pt", "Jet.btagDeepFlavB"}}, + binning=(7, -0.5, 6.5), + x_title="Number of deepjets (loose WP)", + discrete_x=True, + ) + config.add_variable( + name="n_deepjet_medium", + expression=lambda events: ak.sum(events.Jet.btagDeepFlavB > deepjet_wps.medium, axis=1), + aux={"inputs": {"Jet.pt", "Jet.btagDeepFlavB"}}, + binning=(7, -0.5, 6.5), + x_title="Number of deepjets (medium WP)", + discrete_x=True, + ) + config.add_variable( + name="n_deepjet_tight", + expression=lambda events: ak.sum(events.Jet.btagDeepFlavB > deepjet_wps.tight, axis=1), + aux={"inputs": {"Jet.pt", "Jet.btagDeepFlavB"}}, + binning=(7, -0.5, 6.5), + x_title="Number of deepjets (tight WP)", + discrete_x=True, + ) if config.x.run == 3: particlenet_wps = config.x.btag_working_points.particlenet config.add_variable( @@ -508,169 +510,205 @@ def add_variables(config: od.Config) -> None: # Jets (4 pt-leading jets) for i in range(4): config.add_variable( - name=f"jet{i+1}_pt", + name=f"jet{i}_pt", expression=f"Jet.pt[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0., 400.), unit="GeV", - x_title=r"Jet %i $p_{T}$" % (i + 1), + x_title=r"Jet %i $p_{T}$" % i, ) config.add_variable( - name=f"jet{i+1}_eta", + name=f"jet{i}_eta", expression=f"Jet.eta[:,{i}]", null_value=EMPTY_FLOAT, binning=(50, -2.5, 2.5), - x_title=r"Jet %i $\eta$" % (i + 1), + x_title=r"Jet %i $\eta$" % i, ) config.add_variable( - name=f"jet{i+1}_phi", + name=f"jet{i}_phi", expression=f"Jet.phi[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, -3.2, 3.2), - x_title=r"Jet %i $\phi$" % (i + 1), + x_title=r"Jet %i $\phi$" % i, ) config.add_variable( - name=f"jet{i+1}_mass", + name=f"jet{i}_mass", expression=f"Jet.mass[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 200), unit="GeV", - x_title=r"Jet %i mass" % (i + 1), - ) - # config.add_variable( - # name=f"jet{i+1}_btagDeepB", - # expression=f"Jet.btagDeepB[:,{i}]", - # null_value=EMPTY_FLOAT, - # binning=(40, 0, 1), - # x_title=r"Jet %i DeepCSV b+bb tag" % (i + 1), - # ) - config.add_variable( - name=f"jet{i+1}_btagDeepFlavB", - expression=f"Jet.btagDeepFlavB[:,{i}]", - null_value=EMPTY_FLOAT, - binning=(40, 0, 1), - x_title=r"Jet %i DeepFlavour b+bb+lepb tag" % (i + 1), + x_title=r"Jet %i mass" % i, ) + if config.x.run == 2: + config.add_variable( + name=f"jet{i}_btagDeepFlavB", + expression=f"Jet.btagDeepFlavB[:,{i}]", + null_value=EMPTY_FLOAT, + binning=(40, 0, 1), + x_title=r"Jet %i DeepFlavour b+bb+lepb tag" % i, + ) if config.x.run == 3: config.add_variable( - name=f"jet{i+1}_btagPNetB", + name=f"jet{i}_btagPNetB", expression=f"Jet.btagPNetB[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), - x_title=r"Jet %i ParticleNet score" % (i + 1), + x_title=r"Jet %i ParticleNet score" % i, ) # Bjets (2 b-score leading jets) and Lightjets (2 non-b pt-leading jets) for i in range(2): for obj in ["Bjet", "Lightjet"]: config.add_variable( - name=f"{obj}{i+1}_pt".lower(), + name=f"{obj}{i}_pt".lower(), expression=f"{obj}.pt[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0., 300.), unit="GeV", - x_title=obj + r" %i 
$p_{T}$" % (i + 1), + x_title=obj + r" %i $p_{T}$" % i, ) config.add_variable( - name=f"{obj}{i+1}_eta".lower(), + name=f"{obj}{i}_eta".lower(), expression=f"{obj}.eta[:,{i}]", null_value=EMPTY_FLOAT, binning=(50, -2.5, 2.5), - x_title=obj + r" %i $\eta$" % (i + 1), + x_title=obj + r" %i $\eta$" % i, ) config.add_variable( - name=f"{obj}{i+1}_phi".lower(), + name=f"{obj}{i}_phi".lower(), expression=f"{obj}.phi[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, -3.2, 3.2), - x_title=obj + r" %i $\phi$" % (i + 1), + x_title=obj + r" %i $\phi$" % i, ) config.add_variable( - name=f"{obj}{i+1}_mass".lower(), + name=f"{obj}{i}_mass".lower(), expression=f"{obj}.mass[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 200), - x_title=obj + r" %i mass" % (i + 1), + x_title=obj + r" %i mass" % i, ) if config.x.run == 3: config.add_variable( - name=f"{obj}{i+1}_btagPNetB", + name=f"{obj}{i}_btagPNetB", expression=f"{obj}.btagPNetB[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), - x_title=obj + r" %i ParticleNet score" % (i + 1), + x_title=obj + r" %i ParticleNet score" % i, ) # FatJets (2 pt-leading fatjets) for i in range(2): config.add_variable( - name=f"fatjet{i+1}_pt", + name=f"fatjet{i}_pt", expression=f"FatJet.pt[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 170., 500.), unit="GeV", - x_title=r"FatJet %i $p_{T}$" % (i + 1), + x_title=r"FatJet %i $p_{T}$" % i, ) config.add_variable( - name=f"fatjet{i+1}_eta", + name=f"fatjet{i}_eta", expression=f"FatJet.eta[:,{i}]", null_value=EMPTY_FLOAT, binning=(50, -2.5, 2.5), - x_title=r"FatJet %i $\eta$" % (i + 1), + x_title=r"FatJet %i $\eta$" % i, ) config.add_variable( - name=f"fatjet{i+1}_phi", + name=f"fatjet{i}_phi", expression=f"FatJet.phi[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, -3.2, 3.2), - x_title=r"FatJet %i $\phi$" % (i + 1), + x_title=r"FatJet %i $\phi$" % i, ) config.add_variable( - name=f"fatjet{i+1}_mass", + name=f"fatjet{i}_mass", expression=f"FatJet.mass[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 250), unit="GeV", - x_title=r"FatJet %i mass" % (i + 1), + x_title=r"FatJet %i mass" % i, ) config.add_variable( - name=f"fatjet{i+1}_msoftdrop", + name=f"fatjet{i}_msoftdrop", expression=f"FatJet.msoftdrop[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 250), unit="GeV", - x_title=r"FatJet %i softdrop mass" % (i + 1), + x_title=r"FatJet %i softdrop mass" % i, ) config.add_variable( - name=f"fatjet{i+1}_tau1", + name=f"fatjet{i}_tau1", expression=f"FatJet.tau1[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), - x_title=r"FatJet %i $\tau_1$" % (i + 1), + x_title=r"FatJet %i $\tau_1$" % i, ) config.add_variable( - name=f"fatjet{i+1}_tau2", + name=f"fatjet{i}_tau2", expression=f"FatJet.tau2[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), - x_title=r"FatJet %i $\tau_2$" % (i + 1), + x_title=r"FatJet %i $\tau_2$" % i, ) config.add_variable( - name=f"fatjet{i+1}_btagHbb", + name=f"fatjet{i}_btagHbb", expression=f"FatJet.btagHbb[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), - x_title=r"FatJet %i btagHbb" % (i + 1), + x_title=r"FatJet %i btagHbb" % i, ) config.add_variable( - name=f"fatjet{i+1}_deepTagMD_HbbvsQCD", + name=f"fatjet{i}_deepTagMD_HbbvsQCD", expression=f"FatJet.deepTagMD_HbbvsQCD[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), - x_title=r"FatJet %i deepTagMD_HbbvsQCD " % (i + 1), + x_title=r"FatJet %i deepTagMD_HbbvsQCD " % i, ) # Leptons + for i in range(2): + config.add_variable( + name=f"lepton{i}_pt", + expression="Lepton[:, i].pt", + aux=dict( + inputs={"{Electron,Muon}.{pt,eta,phi,mass}"}, + ), + 
binning=(40, 0., 400.), + unit="GeV", + null_value=EMPTY_FLOAT, + ) + config.add_variable( + name=f"lepton{i}_eta", + expression="Lepton[:, i].eta", + aux=dict( + inputs={"{Electron,Muon}.{pt,eta,phi,mass}"}, + ), + binning=(40, -3.2, 3.2), + unit="GeV", + null_value=EMPTY_FLOAT, + ) + config.add_variable( + name=f"lepton{i}_phi", + expression="Lepton[:, i].phi", + aux=dict( + inputs={"{Electron,Muon}.{pt,eta,phi,mass}"}, + ), + binning=(50, -2.5, 2.5), + unit="GeV", + null_value=EMPTY_FLOAT, + ) + config.add_variable( + name=f"lepton{i}_mass", + expression="Lepton[:, i].mass", + aux=dict( + inputs={"{Electron,Muon}.{pt,eta,phi,mass}"}, + ), + binning=(40, 0., 400.), + unit="GeV", + null_value=EMPTY_FLOAT, + ) + for obj in ["Electron", "Muon"]: config.add_variable( name=f"{obj.lower()}_pt", From f15ccd38fcc093edc19ae4a68962834bab4ea7a7 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Thu, 12 Dec 2024 12:39:12 +0100 Subject: [PATCH 11/28] update variable and process groups --- hbw/config/defaults_and_groups.py | 19 ++++++++++++++----- hbw/util.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/hbw/config/defaults_and_groups.py b/hbw/config/defaults_and_groups.py index d3351dc9..7852e34e 100644 --- a/hbw/config/defaults_and_groups.py +++ b/hbw/config/defaults_and_groups.py @@ -4,6 +4,7 @@ from columnflow.inference import InferenceModel from columnflow.tasks.framework.base import RESOLVE_DEFAULT +from hbw.util import bracket_expansion def default_calibrator(container): @@ -134,9 +135,10 @@ def set_config_defaults_and_groups(config_inst): "much": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "qcd", "st", "dy", "vv", "w_lnu", "h"], # noqa: E501 "ech": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "qcd", "st", "dy", "vv", "w_lnu", "h"], # noqa: E501 "dl": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "dy", "st", "vv", "w_lnu", "h"], # noqa: E501 - "dl1": [default_signal_process, "tt", "dy", "st", "vv", "w_lnu", "h"], - "dl2": [default_signal_process, "tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "vv", "w_lnu", "h"], # noqa: E501 - "dlbkg": ["tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "vv", "w_lnu", "h"], + "dl1": [default_signal_process, "tt", "dy", "st", "ttv", "vv", "w_lnu", "h"], + "dl2": [default_signal_process, "tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "ttv", "vv", "w_lnu", "h"], # noqa: E501 + "dl3": [default_signal_process, "tt", "dy_m10to50", "dy_m50toinf", "st", "ttv", "vv", "w_lnu", "h"], # noqa: E501 + "dlbkg": ["tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "ttv", "vv", "w_lnu", "h"], "dlmajor": [default_signal_process, "tt", "dy", "st"], "2much": [default_signal_process, "tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "vv", "w_lnu", "h"], "2ech": [default_signal_process, "tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "vv", "w_lnu", "h"], @@ -186,7 +188,7 @@ def set_config_defaults_and_groups(config_inst): remove_generator = lambda x: x.replace("_powheg", "").replace("_madgraph", "").replace("_amcatnlo", "").replace("_pythia8", "").replace("4f_", "") # noqa: E501 config_inst.x.process_groups[f"datasets_{proc}"] = [remove_generator(dataset) for dataset in datasets] - for group in ("dl2", "dl1", "dl", "much", "2much", "ech", "2ech", "emuch"): + for group in ("dl3", "dl2", "dl1", "dl", "much", "2much", "ech", "2ech", "emuch"): # thanks to double counting removal, we can (and should) now use all datasets in each channel 
config_inst.x.process_groups[f"d{group}"] = ["data"] + config_inst.x.process_groups[group]
 
@@ -299,7 +301,14 @@ def set_config_defaults_and_groups(config_inst):
         "sl": ["n_*", "electron_*", "muon_*", "met_*", "jet*", "bjet*", "ht"],
         "sl_resolved": ["n_*", "electron_*", "muon_*", "met_*", "jet*", "bjet*", "ht"],
         "sl_boosted": ["n_*", "electron_*", "muon_*", "met_*", "fatjet_*"],
-        "dl": ["n_*", "electron_*", "muon_*", "met_*", "jet*", "bjet*", "ht", "lt", "mll", "ptll"],
+        "dl": bracket_expansion([
+            "n_{jet,bjet,electron,muon,fatjet,hbbjet}",
+            "lepton{0,1}_{pt,eta,phi}",
+            "met_{pt,phi}",
+            "jet{0,1,2,3}_{pt,eta,phi,mass,btagPNetB}",
+            "bjet{0,1}_{pt,eta,phi,mass,btagPNetB}",
+            "ht", "lt", "mll", "ptll",
+        ]),
         "dl_resolved": ["n_*", "electron_*", "muon_*", "met_*", "jet*", "bjet*", "ht", "lt", "mll", "ptll"],
         "dl_boosted": ["n_*", "electron_*", "muon_*", "met_*", "fatjet_*", "lt", "mll", "ptll"],
         "default": ["n_jet", "n_muon", "n_electron", "ht", "m_bb", "deltaR_bb", "jet1_pt"],  # n_deepjet, ....

diff --git a/hbw/util.py b/hbw/util.py
index bdb998dc..990f4da9 100644
--- a/hbw/util.py
+++ b/hbw/util.py
@@ -6,6 +6,8 @@
 
 from __future__ import annotations
 
+import re
+import itertools
 import time
 from typing import Hashable, Iterable, Callable
 from functools import wraps, reduce
@@ -378,6 +380,34 @@ def four_vec(
     return outp
 
 
+def bracket_expansion(inputs: list):
+    """
+    Expands a list of strings with bracket notation into all possible combinations.
+
+    Example:
+        bracket_expansion(["{Jet,Muon}.{pt,eta}", "{Electron,Photon}.{phi}"]) -->
+        ["Electron.phi", "Jet.eta", "Jet.pt", "Muon.eta", "Muon.pt", "Photon.phi"]
+
+    NOTE: similar implementation might be somewhere in columnflow.
+    """
+    pattern = re.compile(r'\{([^{}]+)\}')
+    outp = set()
+
+    for inp in inputs:
+        # Find all bracketed groups and extract options by splitting on ','
+        matches = pattern.findall(inp)
+        options = [match.split(',') for match in matches]
+
+        # Replace each bracketed group with a placeholder '{}'
+        template = pattern.sub('{}', inp)
+
+        # Generate all possible combinations and add to the output set
+        combinations = itertools.product(*options)
+        outp.update(template.format(*combo) for combo in combinations)
+
+    return sorted(outp)
+
+
 def has_four_vec(
     events: ak.Array,
     collection_name: str,

From 04ef6ad007eb885bd9c311ef33357714b959ad32 Mon Sep 17 00:00:00 2001
From: Mathis Frahm
Date: Thu, 12 Dec 2024 12:40:34 +0100
Subject: [PATCH 12/28] cleanup in ml setup function

---
 hbw/ml/base.py       | 30 ++++++++++++++++++++--------
 hbw/ml/derived/dl.py | 33 +++------------------------------
 2 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/hbw/ml/base.py b/hbw/ml/base.py
index c1774524..90dfe5a6 100644
--- a/hbw/ml/base.py
+++ b/hbw/ml/base.py
@@ -21,6 +21,7 @@
 from hbw.util import log_memory
 from hbw.ml.data_loader import MLDatasetLoader, MLProcessData, input_features_sanity_checks
+from hbw.config.processes import create_combined_proc_forML
 
 from hbw.tasks.ml import MLPreTraining
@@ -186,24 +187,37 @@ def parameters_repr(self):
         self._parameters_repr = parameters_repr
         return self._parameters_repr
 
-    def setup(self):
+    def setup(self) -> None:
         """
         function that is run as part of the setup phase.
Most likely overwritten by subclasses """ - logger.info( - f"Setting up MLModel {self.cls_name} (parameter hash: {self.parameters_repr})" + logger.debug( + f"Setting up MLModel {self.cls_name} (parameter hash: {self.parameters_repr}), " f"parameters: \n{self.parameters}", ) - # dynamically add variables for the quantities produced by this model - # NOTE: since these variables are only used in ConfigTasks, - # we do not need to add these variables to all configs + # dynamically add processes and variables for the quantities produced by this model + # NOTE: this function might not be called for all configs when the requested configs + # between MLTraining and the requested task are different + for proc in self.combine_processes: + if proc not in self.config_inst.processes: + proc_name = str(proc) + proc_dict = DotDict(self.combine_processes[proc]) + create_combined_proc_forML(self.config_inst, proc_name, proc_dict) + for proc in self.processes: for config_inst in self.config_insts: if f"mlscore.{proc}" not in config_inst.variables: config_inst.add_variable( name=f"mlscore.{proc}", + expression=f"mlscore.{proc}", null_value=-1, binning=(1000, 0., 1.), x_title=f"DNN output score {config_inst.get_process(proc).x.ml_label}", - aux={"rebin": 25}, # automatically rebin to 40 bins for plotting tasks + aux={ + "rebin": 25, + "rebin_config": { + "processes": [proc], + "n_bins": 4, + } + }, # automatically rebin to 40 bins for plotting tasks ) def preparation_producer(self: MLModel, analysis_inst: od.Analysis): @@ -295,6 +309,7 @@ def output(self, task: law.Task) -> dict[str, law.FileSystemTarget]: outp = { "mlmodel": target, "plots": target.child("plots", type="d", optional=True), + # "dummy": target.child("dummy", type="d", optional=True), "checkpoint": target.child("checkpoint", type="d", optional=True), } @@ -303,7 +318,6 @@ def output(self, task: law.Task) -> dict[str, law.FileSystemTarget]: target.child(fname, type="f") for fname in ("saved_model.pb", "keras_metadata.pb", "fingerprint.pb", "parameters.yaml", "input_features.pkl") ] - return outp def open_model(self, target: law.LocalDirectoryTarget) -> dict[str, Any]: diff --git a/hbw/ml/derived/dl.py b/hbw/ml/derived/dl.py index 59b4a631..05791fe7 100644 --- a/hbw/ml/derived/dl.py +++ b/hbw/ml/derived/dl.py @@ -10,12 +10,11 @@ import law -from columnflow.util import maybe_import, DotDict +from columnflow.util import maybe_import from hbw.ml.base import MLClassifierBase from hbw.ml.mixins import DenseModelMixin, ModelFitMixin -from hbw.config.processes import create_combined_proc_forML np = maybe_import("numpy") ak = maybe_import("awkward") @@ -135,34 +134,8 @@ def __init__( def cast_ml_param_values(self): super().cast_ml_param_values() - def setup(self): - # dynamically add variables for the quantities produced by this model - # NOTE: since these variables are only used in ConfigTasks, - # we do not need to add these variables to all configs - for proc in self.combine_processes: - if proc not in self.config_inst.processes: - proc_name = str(proc) - proc_dict = DotDict(self.combine_processes[proc]) - create_combined_proc_forML(self.config_inst, proc_name, proc_dict) - - for proc in self.processes: - for config_inst in self.config_insts: - if f"mlscore.{proc}" not in config_inst.variables: - config_inst.add_variable( - name=f"mlscore.{proc}", - expression=f"mlscore.{proc}", - null_value=-1, - binning=(1000, 0., 1.), - x_title=f"DNN output score {config_inst.get_process(proc).x('ml_label', '')}", - aux={"rebin": 40}, - ) - 
config_inst.add_variable( - name=f"mlscore40.{proc}", - expression=f"mlscore.{proc}", - null_value=-1, - binning=(40, 0., 1.), - x_title=f"DNN output score {config_inst.get_process(proc).x('ml_label', '')}", - ) + def setup(self) -> None: + super().setup() # From 0bceba32c32866b4ef9de04b8085dc272eaff623 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Fri, 13 Dec 2024 09:09:59 +0100 Subject: [PATCH 13/28] fix variables --- hbw/config/variables.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hbw/config/variables.py b/hbw/config/variables.py index 34ea1dca..60d5071e 100644 --- a/hbw/config/variables.py +++ b/hbw/config/variables.py @@ -670,7 +670,7 @@ def add_variables(config: od.Config) -> None: for i in range(2): config.add_variable( name=f"lepton{i}_pt", - expression="Lepton[:, i].pt", + expression=f"Lepton[:, {i}].pt", aux=dict( inputs={"{Electron,Muon}.{pt,eta,phi,mass}"}, ), @@ -680,7 +680,7 @@ def add_variables(config: od.Config) -> None: ) config.add_variable( name=f"lepton{i}_eta", - expression="Lepton[:, i].eta", + expression=f"Lepton[:, {i}].eta", aux=dict( inputs={"{Electron,Muon}.{pt,eta,phi,mass}"}, ), @@ -690,7 +690,7 @@ def add_variables(config: od.Config) -> None: ) config.add_variable( name=f"lepton{i}_phi", - expression="Lepton[:, i].phi", + expression=f"Lepton[:, {i}].phi", aux=dict( inputs={"{Electron,Muon}.{pt,eta,phi,mass}"}, ), @@ -700,7 +700,7 @@ def add_variables(config: od.Config) -> None: ) config.add_variable( name=f"lepton{i}_mass", - expression="Lepton[:, i].mass", + expression=f"Lepton[:, {i}].mass", aux=dict( inputs={"{Electron,Muon}.{pt,eta,phi,mass}"}, ), From 6ddf5515f6c28bc102e43d996628650e1db464b4 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Fri, 13 Dec 2024 09:10:36 +0100 Subject: [PATCH 14/28] extend timeit wrapper --- hbw/util.py | 49 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/hbw/util.py b/hbw/util.py index 990f4da9..93b00cb7 100644 --- a/hbw/util.py +++ b/hbw/util.py @@ -463,13 +463,56 @@ def timeit_multiple(func): def timeit_wrapper(*args, **kwargs): func.total_calls = getattr(func, "total_calls", 0) + 1 _repr = func.__name__ + if len(args) >= 1 and hasattr(args[0], "__name__"): + # some classmethod _repr = f"{args[0].__name__}.{_repr}" - if len(args) >= 3 and isinstance(args[2], dict): + if len(args) >= 2 and isinstance(args[1], dict): + params = args[1] + elif len(args) >= 3 and isinstance(args[2], dict): + params = args[2] + else: + params = {} + for param in ("branch", "dataset"): - if param in args[2]: - _repr = f"{_repr} ({param} {args[2][param]})" + if param in params: + _repr = f"{_repr} ({param} {params[param]})" + + elif len(args) >= 1 and hasattr(args[0], "cls_name"): + # probably a CSP function + inst = args[0] + params = {} + _repr = f"{inst.cls_name}.{_repr}" + if hasattr(inst, "config_inst"): + _repr = f"{_repr} ({inst.config_inst.name})" + if hasattr(inst, "dataset_inst"): + _repr = f"{_repr} ({inst.dataset_inst.name})" + if hasattr(inst, "shift_inst"): + _repr = f"{_repr} ({inst.shift_inst.name})" + + start_time = time.perf_counter() + result = func(*args, **kwargs) + end_time = time.perf_counter() + total_time = end_time - start_time + func.total_time = getattr(func, "total_time", 0) + total_time + log_func(f"{_repr} has been run {func.total_calls} times ({round_sig(func.total_time)} seconds)") + return result + + return timeit_wrapper + + +def timeit_multiple_plain(func): + """ Wrapper to measure the number of execution calls 
and the added execution time of a function """
+    log_method = "info"
+    log_func = getattr(_logger, log_method)
+
+    @wraps(func)
+    def timeit_wrapper(*args, **kwargs):
+        func.total_calls = getattr(func, "total_calls", 0) + 1
+        _repr = func.__name__
+        if len(args) >= 1 and hasattr(args[0], "__name__"):
+            _repr = f"{args[0].__name__}.{_repr}"
 
         start_time = time.perf_counter()
         result = func(*args, **kwargs)

From 0feb3c6bbf9eb64d5faaf80adc0fd47f78648b6d Mon Sep 17 00:00:00 2001
From: Mathis Frahm
Date: Fri, 13 Dec 2024 09:10:55 +0100
Subject: [PATCH 15/28] add category group

---
 hbw/config/defaults_and_groups.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hbw/config/defaults_and_groups.py b/hbw/config/defaults_and_groups.py
index 7852e34e..d2d8897c 100644
--- a/hbw/config/defaults_and_groups.py
+++ b/hbw/config/defaults_and_groups.py
@@ -225,6 +225,7 @@ def set_config_defaults_and_groups(config_inst):
         "sl_much_boosted": ["sr__1mu__boosted"],
         "sl_ech_boosted": ["sr__1e__boosted"],
         "dl": ["sr", "dycr", "ttcr", "sr__1b", "sr__2b", "dycr__1b", "dycr__2b", "ttcr__1b", "ttcr__2b"],
+        "dl_preml": bracket_expansion(["incl", "{sr,ttcr,dycr}{,__2e,__2mu,__emu}{,__1b,__2b}"]),
         "dl_ttcr": ["ttcr", "ttcr__1b", "ttcr__2b", "ttcr__2e", "ttcr__2mu", "ttcr__emu"],
         "dl_dycr": ["dycr", "dycr__1b", "dycr__2b", "dycr__2e", "dycr__2mu", "dycr__emu"],
         "dl_sr": ["sr", "sr__1b", "sr__2b", "sr__2e", "sr__2mu", "sr__emu"],

From 8c85933ce077e31b44ba52ebeaa99e8043ad076a Mon Sep 17 00:00:00 2001
From: Mathis Frahm
Date: Fri, 13 Dec 2024 09:14:33 +0100
Subject: [PATCH 16/28] lint

---
 hbw/ml/base.py         |  2 +-
 hbw/ml/data_loader.py  |  2 +-
 hbw/tasks/campaigns.py |  2 +-
 hbw/util.py            | 10 +++++-----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/hbw/ml/base.py b/hbw/ml/base.py
index 90dfe5a6..e03c3bd7 100644
--- a/hbw/ml/base.py
+++ b/hbw/ml/base.py
@@ -216,7 +216,7 @@ def setup(self) -> None:
                     "rebin_config": {
                         "processes": [proc],
                         "n_bins": 4,
-                    }
+                    },
                 },  # automatically rebin to 40 bins for plotting tasks
             )

diff --git a/hbw/ml/data_loader.py b/hbw/ml/data_loader.py
index 445ccaba..9b6a1457 100644
--- a/hbw/ml/data_loader.py
+++ b/hbw/ml/data_loader.py
@@ -27,7 +27,7 @@ def get_proc_mask(
     """
    Creates the mask selecting events belonging to the process *proc* and a list of all ids belonging to this process.
 
-    :param events: Event array 
+    :param events: Event array
    :param proc: Either string or process instance.
    :param config_inst: An instance of the Config, can be None if Process instance is given.
    :return: process mask and the corresponding process ids

diff --git a/hbw/tasks/campaigns.py b/hbw/tasks/campaigns.py
index 8c9b9ca7..62c36e74 100644
--- a/hbw/tasks/campaigns.py
+++ b/hbw/tasks/campaigns.py
@@ -115,7 +115,7 @@ def dataset_summary(self):
         return dict(dataset_summary)
 
     @cached_property
-    def campaign_summary(self,):
+    def campaign_summary(self):
         campaign_summary = {
             campaign.name: {} for campaign in self.campaign_insts
         }

diff --git a/hbw/util.py b/hbw/util.py
index 93b00cb7..67082e94 100644
--- a/hbw/util.py
+++ b/hbw/util.py
@@ -390,16 +390,16 @@ def bracket_expansion(inputs: list):
 
     NOTE: similar implementation might be somewhere in columnflow.
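 
     For illustration (a minimal sketch, not part of the original patch), the call
 
         bracket_expansion(["lepton{0,1}_{pt,eta}"])
 
     yields ["lepton0_eta", "lepton0_pt", "lepton1_eta", "lepton1_pt"], i.e. a
     sorted, de-duplicated list.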
""" - pattern = re.compile(r'\{([^{}]+)\}') + pattern = re.compile(r"\{([^{}]+)\}") outp = set() for inp in inputs: - # Find all bracketed groups and extract options by splitting on ',' + # Find all bracketed groups and extract options by splitting on "," matches = pattern.findall(inp) - options = [match.split(',') for match in matches] + options = [match.split(",") for match in matches] - # Replace each bracketed group with a placeholder '{}' - template = pattern.sub('{}', inp) + # Replace each bracketed group with a placeholder "{}" + template = pattern.sub("{}", inp) # Generate all possible combinations and add to the output set combinations = itertools.product(*options) From d496971b00f26ef6f031c4126212048703cd5436 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Fri, 13 Dec 2024 09:27:37 +0100 Subject: [PATCH 17/28] lint again --- tests/test_util.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_util.py b/tests/test_util.py index 85defa9c..d7a860c8 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -53,15 +53,15 @@ def test_nested_modification(self): "name": "Alice", "skills": { "python": "intermediate", - "sql": "beginner" - } + "sql": "beginner", + }, } dict2 = { "name": "Alice", "skills": { "python": "advanced", - "sql": "beginner" - } + "sql": "beginner", + }, } result = gather_dict_diff(dict1, dict2) expected_output = ( @@ -76,15 +76,15 @@ def test_nested_addition(self): dict1 = { "name": "Alice", "skills": { - "python": "intermediate" - } + "python": "intermediate", + }, } dict2 = { "name": "Alice", "skills": { "python": "intermediate", - "docker": "beginner" - } + "docker": "beginner", + }, } result = gather_dict_diff(dict1, dict2) expected_output = ( From 1b57fed9e682eb3f6164ef5f001c4e0d2322aaf7 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Fri, 13 Dec 2024 09:37:44 +0100 Subject: [PATCH 18/28] loop over produced_columns instead of produces --- hbw/config/defaults_and_groups.py | 4 ++-- hbw/config/variables.py | 10 +++++----- hbw/production/features.py | 6 ++++-- hbw/production/normalized_btag.py | 6 ++++-- hbw/production/resonant_features.py | 3 ++- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/hbw/config/defaults_and_groups.py b/hbw/config/defaults_and_groups.py index d2d8897c..0443d24b 100644 --- a/hbw/config/defaults_and_groups.py +++ b/hbw/config/defaults_and_groups.py @@ -306,8 +306,8 @@ def set_config_defaults_and_groups(config_inst): "n_{jet,bjet,electron,muon,fatjet,hbbjet}", "lepton{0,1}_{pt,eta,phi}", "met_{pt,phi}", - "jet{0,1,2,3}_{pt,eta,phi,mass,btagPNetB}", - "bjet{0,1}_{pt,eta,phi,mass,btagPNetB}", + "jet{0,1,2,3}_{pt,eta,phi,mass,btagpnetb}", + "bjet{0,1}_{pt,eta,phi,mass,btagpnetb}", "ht", "lt", "mll", "ptll", ]), "dl_resolved": ["n_*", "electron_*", "muon_*", "met_*", "jet*", "bjet*", "ht", "lt", "mll", "ptll"], diff --git a/hbw/config/variables.py b/hbw/config/variables.py index 60d5071e..7fa7b1f8 100644 --- a/hbw/config/variables.py +++ b/hbw/config/variables.py @@ -541,7 +541,7 @@ def add_variables(config: od.Config) -> None: ) if config.x.run == 2: config.add_variable( - name=f"jet{i}_btagDeepFlavB", + name=f"jet{i}_btagDeepFlavB".lower(), expression=f"Jet.btagDeepFlavB[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), @@ -549,7 +549,7 @@ def add_variables(config: od.Config) -> None: ) if config.x.run == 3: config.add_variable( - name=f"jet{i}_btagPNetB", + name=f"jet{i}_btagPNetB".lower(), expression=f"Jet.btagPNetB[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), @@ 
-590,7 +590,7 @@ def add_variables(config: od.Config) -> None: ) if config.x.run == 3: config.add_variable( - name=f"{obj}{i}_btagPNetB", + name=f"{obj}{i}_btagPNetB".lower(), expression=f"{obj}.btagPNetB[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), @@ -652,14 +652,14 @@ def add_variables(config: od.Config) -> None: x_title=r"FatJet %i $\tau_2$" % i, ) config.add_variable( - name=f"fatjet{i}_btagHbb", + name=f"fatjet{i}_btagHbb".lower(), expression=f"FatJet.btagHbb[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), x_title=r"FatJet %i btagHbb" % i, ) config.add_variable( - name=f"fatjet{i}_deepTagMD_HbbvsQCD", + name=f"fatjet{i}_deepTagMD_HbbvsQCD".lower(), expression=f"FatJet.deepTagMD_HbbvsQCD[:,{i}]", null_value=EMPTY_FLOAT, binning=(40, 0, 1), diff --git a/hbw/production/features.py b/hbw/production/features.py index 7a23858b..f41dd275 100644 --- a/hbw/production/features.py +++ b/hbw/production/features.py @@ -37,7 +37,8 @@ def jj_features(self: Producer, events: ak.Array, **kwargs) -> ak.Array: events = set_ak_column_f32(events, "deltaR_jj", deltaR_jj) # fill none values - for col in self.produces: + for route in self.produced_columns: + col = route.string_column events = set_ak_column_f32(events, col, ak.fill_none(events[col], EMPTY_FLOAT)) return events @@ -62,7 +63,8 @@ def bb_features(self: Producer, events: ak.Array, **kwargs) -> ak.Array: events = set_ak_column_f32(events, "m_bb_combined", m_bb_combined) # fill none values - for col in self.produces: + for route in self.produced_columns: + col = route.string_column events = set_ak_column_f32(events, col, ak.fill_none(events[col], EMPTY_FLOAT)) return events diff --git a/hbw/production/normalized_btag.py b/hbw/production/normalized_btag.py index 24743367..7c6bbad7 100644 --- a/hbw/production/normalized_btag.py +++ b/hbw/production/normalized_btag.py @@ -46,7 +46,8 @@ def normalized_btag_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Ar raise NotImplementedError( f"Normalization mode {mode} not implemented (see hbw.tasks.corrections.GetBtagNormalizationSF)", ) - for weight_name in self[btag_weights].produces: + for weight_route in self[btag_weights].produced_columns: + weight_name = weight_route.string_column if not weight_name.startswith("btag_weight"): continue @@ -66,7 +67,8 @@ def normalized_btag_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Ar @normalized_btag_weights.init def normalized_btag_weights_init(self: Producer) -> None: - for weight_name in self[btag_weights].produces: + for weight_route in self[btag_weights].produced_columns: + weight_name = weight_route.string_column if not weight_name.startswith("btag_weight"): continue for mode in self.modes: diff --git a/hbw/production/resonant_features.py b/hbw/production/resonant_features.py index 9fc5a169..d8aafe51 100644 --- a/hbw/production/resonant_features.py +++ b/hbw/production/resonant_features.py @@ -98,7 +98,8 @@ def resonant_features(self: Producer, events: ak.Array, **kwargs) -> ak.Array: events = set_ak_column_f32(events, "m_Heavy_Higgs", events.Heavy_Higgs.mass) events = set_ak_column_f32(events, "eta_Heavy_Higgs", events.Heavy_Higgs.eta) events = set_ak_column_f32(events, "phi_Heavy_Higgs", events.Heavy_Higgs.phi) - for col in self.produces: + for route in self.produced_columns: + col = route.string_column events = set_ak_column(events, col, ak.fill_none(ak.nan_to_none(events[col]), EMPTY_FLOAT)) # undo object padding From 6e7e68852c4cf9ef523344bac39b3f6b77df035e Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Fri, 13 Dec 
2024 09:53:00 +0100 Subject: [PATCH 19/28] remove features producer --- hbw/config/variables.py | 111 ------------------- hbw/production/features.py | 214 ------------------------------------- law.cfg | 2 +- 3 files changed, 1 insertion(+), 326 deletions(-) delete mode 100644 hbw/production/features.py diff --git a/hbw/config/variables.py b/hbw/config/variables.py index 7fa7b1f8..20fbe348 100644 --- a/hbw/config/variables.py +++ b/hbw/config/variables.py @@ -16,117 +16,6 @@ from hbw.config.styling import default_var_binning, default_var_unit -@call_once_on_config() -def add_feature_variables(config: od.Config) -> None: - """ - Adds variables to a *config* that are produced as part of the `features` producer. - """ - - # Event properties - config.add_variable( - name="features_n_jet", - expression=lambda events: ak.num(events.Jet.pt, axis=1), - binning=(12, -0.5, 11.5), - x_title="Number of jets", - aux={"inputs": {"Jet.pt"}}, - discrete_x=True, - ) - config.add_variable( - name="features_n_deepjet", - binning=(11, -0.5, 10.5), - x_title="Number of deepjets", - discrete_x=True, - ) - config.add_variable( - name="features_n_fatjet", - binning=(7, -0.5, 6.5), - x_title="Number of fatjets", - discrete_x=True, - ) - config.add_variable( - name="features_n_hbbjet", - binning=(4, -0.5, 3.5), - x_title="Number of hbbjets", - discrete_x=True, - ) - config.add_variable( - name="features_n_electron", - binning=(4, -0.5, 3.5), - x_title="Number of electrons", - discrete_x=True, - ) - config.add_variable( - name="features_n_muon", - binning=(4, -0.5, 3.5), - x_title="Number of muons", - discrete_x=True, - ) - config.add_variable( - name="features_n_bjet", - binning=(4, -0.5, 3.5), - x_title="Number of bjets", - discrete_x=True, - ) - config.add_variable( - name="features_ht", - binning=(40, 0, 1500), - x_title="HT", - ) - - # bb features - config.add_variable( - name="m_bb", - binning=(40, 0., 400.), - unit="GeV", - x_title=r"$m_{bb}$", - ) - config.add_variable( - name="m_bb_combined", - binning=(40, 0., 400.), - unit="GeV", - x_title=r"$m_{bb}$ combined", - ) - config.add_variable( - name="bb_pt", - binning=(40, 0., 350), - x_title=r"$p_T^{bb}$", - unit="GeV", - ) - config.add_variable( - name="deltaR_bb", - binning=(40, 0, 5), - x_title=r"$\Delta R(b,b)$", - ) - # jj features - config.add_variable( - name="m_jj", - binning=(40, 0., 400.), - unit="GeV", - x_title=r"$m_{jj}$", - ) - config.add_variable( - name="jj_pt", - binning=(40, 0., 350), - x_title=r"$p_T^{jj}$", - unit="GeV", - ) - config.add_variable( - name="deltaR_jj", - binning=(40, 0, 5), - x_title=r"$\Delta R(j_{1},j_{2})$", - ) - - # FatJet features - for i in range(2): - config.add_variable( - name=f"fatjet{i}_tau21", - expression=f"FatJet.tau21[:,{i}]", - null_value=EMPTY_FLOAT, - binning=(40, 0, 1), - x_title=r"FatJet %i $\tau_{21}$" % i, - ) - - @call_once_on_config() def add_neutrino_variables(config: od.Config) -> None: """ diff --git a/hbw/production/features.py b/hbw/production/features.py deleted file mode 100644 index f41dd275..00000000 --- a/hbw/production/features.py +++ /dev/null @@ -1,214 +0,0 @@ -# coding: utf-8 - -""" -Column production methods related to higher-level features. 
-""" - -import functools - -from columnflow.production import Producer, producer -from columnflow.util import maybe_import -from columnflow.columnar_util import set_ak_column, EMPTY_FLOAT - -from hbw.production.prepare_objects import prepare_objects -from hbw.config.variables import add_feature_variables -from hbw.config.dl.variables import add_dl_variables - -np = maybe_import("numpy") -ak = maybe_import("awkward") -coffea = maybe_import("coffea") -maybe_import("coffea.nanoevents.methods.nanoaod") -# from coffea.nanoevents.methods.nanoaod import behavior - -# helper -set_ak_column_f32 = functools.partial(set_ak_column, value_type=np.float32) - - -@producer( - uses={"Jet.{pt,eta,phi,mass}"}, - produces={"m_jj", "jj_pt", "deltaR_jj"}, -) -def jj_features(self: Producer, events: ak.Array, **kwargs) -> ak.Array: - # create jj features - jj = (events.Jet[:, 0] + events.Jet[:, 1]) - deltaR_jj = events.Jet[:, 0].delta_r(events.Jet[:, 1]) - events = set_ak_column_f32(events, "m_jj", jj.mass) - events = set_ak_column_f32(events, "jj_pt", jj.pt) - events = set_ak_column_f32(events, "deltaR_jj", deltaR_jj) - - # fill none values - for route in self.produced_columns: - col = route.string_column - events = set_ak_column_f32(events, col, ak.fill_none(events[col], EMPTY_FLOAT)) - return events - - -@producer( - uses={ - "HbbJet.msoftdrop", "Jet.{pt,eta,phi,mass}", - }, - produces={"m_bb", "bb_pt", "deltaR_bb", "m_bb_combined"}, -) -def bb_features(self: Producer, events: ak.Array, **kwargs) -> ak.Array: - # create bb features - bb = (events.Bjet[:, 0] + events.Bjet[:, 1]) - events = set_ak_column_f32(events, "m_bb", bb.mass) - events = set_ak_column_f32(events, "bb_pt", bb.pt) - - deltaR_bb = events.Bjet[:, 0].delta_r(events.Bjet[:, 1]) - events = set_ak_column_f32(events, "deltaR_bb", deltaR_bb) - - # combination of resolved and boosted bb mass - m_bb_combined = ak.where(ak.num(events.HbbJet) > 0, events.HbbJet[:, 0].msoftdrop, bb.mass) - events = set_ak_column_f32(events, "m_bb_combined", m_bb_combined) - - # fill none values - for route in self.produced_columns: - col = route.string_column - events = set_ak_column_f32(events, col, ak.fill_none(events[col], EMPTY_FLOAT)) - - return events - - -@producer( - uses={ - prepare_objects, - bb_features, jj_features, - "Electron.pt", "Electron.eta", "Muon.pt", "Muon.eta", - "Muon.charge", "Electron.charge", - "Jet.pt", "Jet.eta", "Jet.btagDeepFlavB", "Jet.btagPNetB", - "Bjet.pt", - "HbbJet.pt", - "FatJet.pt", "FatJet.tau1", "FatJet.tau2", - }, - produces={ - bb_features, jj_features, - "ht", "n_jet", "n_electron", "n_muon", "n_deepjet", "n_fatjet", "n_hbbjet", - "FatJet.tau21", "n_bjet", - }, -) -def features(self: Producer, events: ak.Array, **kwargs) -> ak.Array: - - # add behavior and define new collections (e.g. 
Lepton) - events = self[prepare_objects](events, **kwargs) - - # object padding - events = set_ak_column(events, "Jet", ak.pad_none(events.Jet, 2)) - events = set_ak_column(events, "Bjet", ak.pad_none(events.Bjet, 2)) - events = set_ak_column(events, "FatJet", ak.pad_none(events.FatJet, 1)) - events = set_ak_column(events, "HbbJet", ak.pad_none(events.HbbJet, 1)) - - # ht and number of objects (safe for None entries) - events = set_ak_column_f32(events, "ht", ak.sum(events.Jet.pt, axis=1)) - events = set_ak_column(events, "n_jet", ak.sum(events.Jet.pt > 0, axis=1)) - events = set_ak_column(events, "n_bjet", ak.sum(events.Bjet.pt > 0, axis=1)) - events = set_ak_column(events, "n_electron", ak.sum(events.Electron.pt > 0, axis=1)) - events = set_ak_column(events, "n_muon", ak.sum(events.Muon.pt > 0, axis=1)) - wp_med_deepjet = self.config_inst.x.btag_working_points.deepjet.medium - events = set_ak_column(events, "n_deepjet", ak.sum(events.Jet.btagDeepFlavB > wp_med_deepjet, axis=1)) - wp_med_particlenet = self.config_inst.x.btag_working_points.particlenet.medium - events = set_ak_column(events, "n_particlenet", ak.sum(events.Jet.btagPNetB > wp_med_particlenet, axis=1)) - events = set_ak_column(events, "n_fatjet", ak.sum(events.FatJet.pt > 0, axis=1)) - events = set_ak_column(events, "n_hbbjet", ak.sum(events.HbbJet.pt > 0, axis=1)) - - # Subjettiness - events = set_ak_column_f32(events, "FatJet.tau21", events.FatJet.tau2 / events.FatJet.tau1) - - # bb and jj features - events = self[bb_features](events, **kwargs) - events = self[jj_features](events, **kwargs) - - # undo object padding (remove None entries) - for obj in ["Jet", "Bjet", "FatJet"]: - events = set_ak_column(events, obj, events[obj][~ak.is_none(events[obj], axis=1)]) - - return events - - -@features.init -def features_init(self: Producer) -> None: - # add variable instances to config - add_feature_variables(self.config_inst) - - -@producer( - uses={ - "{Electron,Muon,Bjet}.{pt,eta,phi,mass}", "MET.{pt,phi}", - features, - "Electron.charge", "Muon.charge", - }, - produces={ - features, - "deltaR_ll", "ll_pt", "m_bb", "deltaR_bb", "bb_pt", - "MT", "min_dr_lljj", "delta_Phi", "m_lljjMET", - "m_ll_check", "E_miss", "charge", "wp_score", - }, -) -def dl_features(self: Producer, events: ak.Array, **kwargs) -> ak.Array: - - # Inherit common features and prepares Object Lepton. Bjet, etc. 
- events = self[features](events, **kwargs) - - # create ll object and ll variables - ll = (events.Lepton[:, 0] + events.Lepton[:, 1]) - deltaR_ll = events.Lepton[:, 0].delta_r(events.Lepton[:, 1]) - events = set_ak_column_f32(events, "ll_pt", ll.pt) - events = set_ak_column_f32(events, "m_ll_check", ll.mass) - events = set_ak_column_f32(events, "deltaR_ll", deltaR_ll) - - # minimum deltaR between lep and jet - lljj_pairs = ak.cartesian([events.Lepton, events.Bjet], axis=1) - lep, jet = ak.unzip(lljj_pairs) - min_dr_lljj = (ak.min(lep.delta_r(jet), axis=-1)) - events = set_ak_column_f32(events, "min_dr_lljj", min_dr_lljj) - - # Transverse mass - MT = (2 * events.MET.pt * ll.pt * (1 - np.cos(ll.delta_phi(events.MET)))) ** 0.5 - events = set_ak_column_f32(events, "MT", MT) - - # delta Phi between ll and bb object - bb = (events.Bjet[:, 0] + events.Bjet[:, 1]) - events = set_ak_column_f32(events, "delta_Phi", abs(ll.delta_phi(bb))) - - # invariant mass of all decay products - m_lljjMET = (events.Bjet[:, 0] + events.Bjet[:, 1] + events.Lepton[:, 0] + events.Lepton[:, 1] + events.MET[:]).mass - events = set_ak_column(events, "m_lljjMET", m_lljjMET) - - # Lepton charge - events = set_ak_column(events, "charge", (events.Lepton.charge)) - - # fill none values for dl variables - dl_variable_list = [ - "m_bb", "bb_pt", "deltaR_bb", "ll_pt", "m_ll_check", "deltaR_ll", "min_dr_lljj", - "charge", "MT", "delta_Phi", "E_miss", "m_lljjMET", - ] - for var in dl_variable_list: - events = set_ak_column_f32(events, var, ak.fill_none(events[var], EMPTY_FLOAT)) - - return events - - -@dl_features.init -def dl_features_init(self: Producer) -> None: - # add variable instances to config - add_dl_variables(self.config_inst) - - -from hbw.production.resonant_features import resonant_features - - -@producer( - uses={ - features, resonant_features, - }, - produces={ - features, resonant_features, - }, -) -def sl_res_features(self: Producer, events: ak.Array, **kwargs) -> ak.Array: - - # Inherit common features and prepares Object Lepton. Bjet, etc. 
- events = self[features](events, **kwargs) - events = self[resonant_features](events, **kwargs) - - return events diff --git a/law.cfg b/law.cfg index ef9ce8c0..bd75a910 100644 --- a/law.cfg +++ b/law.cfg @@ -32,7 +32,7 @@ default_version: prod3 default_common_version: common3 -production_modules: hbw.production.{weights,features,ml_inputs,categories,gen_hbw_decay,neutrino,synchronization}, hbw.ml.stats +production_modules: hbw.production.{weights,ml_inputs,categories,gen_hbw_decay,neutrino,synchronization}, hbw.ml.stats calibration_modules: columnflow.calibration.jets, hbw.calibration.default selection_modules: hbw.selection.{jet,common,sl_remastered,dl_remastered} categorization_modules: hbw.categorization.categories From 2d45359206b8016fc0a167b90cca2f63656328e6 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Tue, 17 Dec 2024 12:52:03 +0100 Subject: [PATCH 20/28] fix dataset-dependent shift resolving --- hbw/weight/default.py | 49 ++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/hbw/weight/default.py b/hbw/weight/default.py index 2b472e03..2179df8f 100644 --- a/hbw/weight/default.py +++ b/hbw/weight/default.py @@ -6,7 +6,7 @@ import law -from columnflow.util import maybe_import +from columnflow.util import maybe_import, InsertableDict from columnflow.weight import WeightProducer, weight_producer from columnflow.config_util import get_shifts_from_sources from columnflow.columnar_util import Route @@ -59,46 +59,65 @@ def base(self: WeightProducer, events: ak.Array, **kwargs) -> ak.Array: # build the full event weight weight = ak.Array(np.ones(len(events), dtype=np.float32)) - for column in self.weight_columns.keys(): + for column in self.local_weight_columns.keys(): weight = weight * Route(column).apply(events) return events, weight +@base.setup +def base_setup( + self: WeightProducer, + reqs: dict, + inputs: dict, + reader_targets: InsertableDict, +) -> None: + logger.info( + f"WeightProducer '{self.cls_name}' (dataset {self.dataset_inst}) uses weight columns: \n" + f"{', '.join(self.weight_columns.keys())}", + ) + + @base.init def base_init(self: WeightProducer) -> None: # NOTE: this might be called multiple times, might be quite inefficient - if not getattr(self, "config_inst", None) or not getattr(self, "dataset_inst", None): + # if not getattr(self, "config_inst", None) or not getattr(self, "dataset_inst", None): + # return + + if not getattr(self, "config_inst"): return - if self.dataset_inst.is_data: + dataset_inst = getattr(self, "dataset_inst", None) + + if dataset_inst and dataset_inst.is_data: return year = self.config_inst.campaign.x.year if not self.weight_columns: raise Exception("weight_columns not set") + self.local_weight_columns = self.weight_columns.copy() - if self.dataset_inst.has_tag("skip_scale"): + if dataset_inst and dataset_inst.has_tag("skip_scale"): # remove dependency towards mur/muf weights for column in [ "normalized_mur_weight", "normalized_muf_weight", "normalized_murmuf_envelope_weight", "mur_weight", "muf_weight", "murmuf_envelope_weight", ]: - self.weight_columns.pop(column, None) + self.local_weight_columns.pop(column, None) - if self.dataset_inst.has_tag("skip_pdf"): + if dataset_inst and dataset_inst.has_tag("skip_pdf"): # remove dependency towards pdf weights for column in ["pdf_weight", "normalized_pdf_weight"]: - self.weight_columns.pop(column, None) + self.local_weight_columns.pop(column, None) - if not self.dataset_inst.has_tag("is_ttbar"): + if dataset_inst and not 
dataset_inst.has_tag("is_ttbar"): # remove dependency towards top pt weights - self.weight_columns.pop("top_pt_weight", None) + self.local_weight_columns.pop("top_pt_weight", None) - if not self.dataset_inst.has_tag("is_v_jets"): + if dataset_inst and not dataset_inst.has_tag("is_v_jets"): # remove dependency towards vjets weights - self.weight_columns.pop("vjets_weight", None) + self.local_weight_columns.pop("vjets_weight", None) self.shifts = set() @@ -106,14 +125,14 @@ def base_init(self: WeightProducer) -> None: # TODO: we should do this somewhere centrally btag_sf_jec_sources = ( (set(self.config_inst.x.btag_sf_jec_sources) | {"Total"}) & - set(self.config_inst.x.jec["uncertainty_sources"]) + set(self.config_inst.x.jec.Jet["uncertainty_sources"]) ) self.shifts |= set(get_shifts_from_sources( self.config_inst, *[f"jec_{jec_source}" for jec_source in btag_sf_jec_sources], )) - for weight_column, shift_sources in self.weight_columns.items(): + for weight_column, shift_sources in self.local_weight_columns.items(): shift_sources = law.util.make_list(shift_sources) shift_sources = [s.format(year=year) for s in shift_sources] shifts = get_shifts_from_sources(self.config_inst, *shift_sources) @@ -129,7 +148,7 @@ def base_init(self: WeightProducer) -> None: self.shifts |= set(shifts) # store column names referring to weights to multiply - self.uses |= self.weight_columns.keys() + self.uses |= self.local_weight_columns.keys() btag_uncs = [ From 87bb2127f46b40c6e8652cc25295a9f3cb53771f Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Tue, 17 Dec 2024 12:53:19 +0100 Subject: [PATCH 21/28] fix tests --- tests/__init__.py | 3 ++- tests/run_test | 8 ++++---- tests/test_util.py | 8 ++++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index e008438a..71049fac 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -16,4 +16,5 @@ import hbw # noqa # import all tests -# ... +from .test_util import * +from .test_workflow import * diff --git a/tests/run_test b/tests/run_test index ec92cc0f..b354f8a1 100755 --- a/tests/run_test +++ b/tests/run_test @@ -21,14 +21,14 @@ action() { if [ -z "${sandbox}" ]; then echo "testing ${mod} ..." ( - cd "${cf_dir}" && \ - python -m unittest "tests.${mod}" + cd "${this_dir}" && \ + python -m unittest "${mod}" ) else echo "testing ${mod} ..." 
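        # same module as above, but executed through cf_sandbox inside the requested sandbox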
( - cd "${cf_dir}" && \ - cf_sandbox "${sandbox}" "python -m unittest tests.${mod}" + cd "${this_dir}" && \ + cf_sandbox "${sandbox}" "python -m unittest ${mod}" ) fi } diff --git a/tests/test_util.py b/tests/test_util.py index d7a860c8..3bd06276 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -114,15 +114,15 @@ def test_complex_diff(self): } result = gather_dict_diff(dict1, dict2) expected_output = ( - "🔄 Modified: age:\n" - " - Old: 25\n" - " - New: 26\n" "🔄 Modified: skills:\n" + " 🔹 Added: docker: beginner\n" " 🔄 Modified: python:\n" " - Old: intermediate\n" " - New: advanced\n" - " 🔹 Added: docker: beginner\n" "🔹 Added: hobby: cycling" + "🔄 Modified: age:\n" + " - Old: 25\n" + " - New: 26\n" ) self.assertEqual(result, expected_output) From c113c5af666b9a6c2cf51956d3753e0c93fa3dae Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Tue, 17 Dec 2024 15:27:39 +0100 Subject: [PATCH 22/28] 'fix' tests --- tests/test_util.py | 114 +-------------------------------------------- 1 file changed, 1 insertion(+), 113 deletions(-) diff --git a/tests/test_util.py b/tests/test_util.py index 3bd06276..1f3289de 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -8,7 +8,7 @@ from columnflow.util import maybe_import -from hbw.util import build_param_product, round_sig, dict_diff, four_vec, call_once_on_config, gather_dict_diff +from hbw.util import build_param_product, round_sig, dict_diff, four_vec, call_once_on_config import order as od @@ -16,119 +16,7 @@ ak = maybe_import("awkward") -class TestDictDiff(unittest.TestCase): - def test_no_difference(self): - dict1 = {"name": "Alice", "age": 25} - dict2 = {"name": "Alice", "age": 25} - result = gather_dict_diff(dict1, dict2) - self.assertEqual(result, "✅ No differences found.") - - def test_simple_modification(self): - dict1 = {"name": "Alice", "age": 25} - dict2 = {"name": "Alice", "age": 26} - result = gather_dict_diff(dict1, dict2) - expected_output = ( - "🔄 Modified: age:\n" - " - Old: 25\n" - " - New: 26" - ) - self.assertEqual(result, expected_output) - - def test_addition(self): - dict1 = {"name": "Alice"} - dict2 = {"name": "Alice", "hobby": "cycling"} - result = gather_dict_diff(dict1, dict2) - expected_output = "🔹 Added: hobby: cycling" - self.assertEqual(result, expected_output) - - def test_removal(self): - dict1 = {"name": "Alice", "hobby": "cycling"} - dict2 = {"name": "Alice"} - result = gather_dict_diff(dict1, dict2) - expected_output = "🔻 Removed: hobby: cycling" - self.assertEqual(result, expected_output) - - def test_nested_modification(self): - dict1 = { - "name": "Alice", - "skills": { - "python": "intermediate", - "sql": "beginner", - }, - } - dict2 = { - "name": "Alice", - "skills": { - "python": "advanced", - "sql": "beginner", - }, - } - result = gather_dict_diff(dict1, dict2) - expected_output = ( - "🔄 Modified: skills:\n" - " 🔄 Modified: python:\n" - " - Old: intermediate\n" - " - New: advanced" - ) - self.assertEqual(result, expected_output) - - def test_nested_addition(self): - dict1 = { - "name": "Alice", - "skills": { - "python": "intermediate", - }, - } - dict2 = { - "name": "Alice", - "skills": { - "python": "intermediate", - "docker": "beginner", - }, - } - result = gather_dict_diff(dict1, dict2) - expected_output = ( - "🔄 Modified: skills:\n" - " 🔹 Added: docker: beginner" - ) - self.assertEqual(result, expected_output) - - def test_complex_diff(self): - dict1 = { - "name": "Alice", - "age": 25, - "skills": { - "python": "intermediate", - "sql": "beginner", - }, - } - dict2 = { - "name": "Alice", - 
"age": 26, - "skills": { - "python": "advanced", - "sql": "beginner", - "docker": "beginner", - }, - "hobby": "cycling", - } - result = gather_dict_diff(dict1, dict2) - expected_output = ( - "🔄 Modified: skills:\n" - " 🔹 Added: docker: beginner\n" - " 🔄 Modified: python:\n" - " - Old: intermediate\n" - " - New: advanced\n" - "🔹 Added: hobby: cycling" - "🔄 Modified: age:\n" - " - Old: 25\n" - " - New: 26\n" - ) - self.assertEqual(result, expected_output) - - class HbwUtilTest( - TestDictDiff, unittest.TestCase, ): From b45aa6597b6ba9cd43816e3044e0b512054def51 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Wed, 18 Dec 2024 13:37:40 +0100 Subject: [PATCH 23/28] update cf + required changes --- hbw/analysis/create_analysis.py | 2 ++ hbw/config/config_run2.py | 10 +++++----- hbw/config/defaults_and_groups.py | 4 ++-- modules/columnflow | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/hbw/analysis/create_analysis.py b/hbw/analysis/create_analysis.py index 4c7d19fc..5879523f 100644 --- a/hbw/analysis/create_analysis.py +++ b/hbw/analysis/create_analysis.py @@ -75,6 +75,8 @@ def create_hbw_analysis( analysis_inst.x.default_weight_producer = "default" analysis_inst.x.ml_inputs_producer = ml_inputs_producer(analysis_inst) analysis_inst.x.default_ml_model = default_ml_model + analysis_inst.x.default_variables = ["jet0_pt", "mll", "n_jet", "ptll", "lepton0_pt", "lepton1_pt"] + analysis_inst.x.default_categories = ["incl", "sr", "ttcr", "dycr"] # # define configs diff --git a/hbw/config/config_run2.py b/hbw/config/config_run2.py index 854705b5..4c07c2a5 100644 --- a/hbw/config/config_run2.py +++ b/hbw/config/config_run2.py @@ -196,7 +196,7 @@ def if_era( jerc_campaign = f"Summer{year2}{jerc_postfix}_22Sep2023" jet_type = "AK4PFPuppi" - cfg.x.jec = DotDict.wrap({ + cfg.x.jec = DotDict.wrap({"Jet": { "campaign": jerc_campaign, "version": {2016: "V7", 2017: "V5", 2018: "V5", 2022: "V2"}[year], "jet_type": jet_type, @@ -260,15 +260,15 @@ def if_era( "CorrelationGroupFlavor", "CorrelationGroupUncorrelated", ], - }) + }}) # JER # https://twiki.cern.ch/twiki/bin/view/CMS/JetResolution?rev=107 - cfg.x.jer = DotDict.wrap({ + cfg.x.jer = DotDict.wrap({"Jet": { "campaign": jerc_campaign, "version": {2016: "JRV3", 2017: "JRV2", 2018: "JRV2", 2022: "JRV1"}[year], "jet_type": jet_type, - }) + }}) # JEC uncertainty sources propagated to btag scale factors # (names derived from contents in BTV correctionlib file) @@ -524,7 +524,7 @@ def if_era( with open(os.path.join(thisdir, "jec_sources.yaml"), "r") as f: all_jec_sources = yaml.load(f, yaml.Loader)["names"] - for jec_source in cfg.x.jec["uncertainty_sources"]: + for jec_source in cfg.x.jec.Jet["uncertainty_sources"]: idx = all_jec_sources.index(jec_source) cfg.add_shift( name=f"jec_{jec_source}_up", diff --git a/hbw/config/defaults_and_groups.py b/hbw/config/defaults_and_groups.py index 0443d24b..4bb544b3 100644 --- a/hbw/config/defaults_and_groups.py +++ b/hbw/config/defaults_and_groups.py @@ -119,8 +119,8 @@ def set_config_defaults_and_groups(config_inst): # # config_inst.x.default_weight_producer = "btag_not_normalized" # config_inst.x.default_ml_model = default_ml_model config_inst.x.default_inference_model = "default" if year == 2017 else "sl_22" - config_inst.x.default_categories = ["incl"] - config_inst.x.default_variables = ["jet1_pt"] + # config_inst.x.default_categories = ["incl"] + # config_inst.x.default_variables = ["jet1_pt"] # # Groups diff --git a/modules/columnflow b/modules/columnflow index 26426673..ed1be2f7 160000 --- 
a/modules/columnflow
+++ b/modules/columnflow
@@ -1 +1 @@
-Subproject commit 264266731d11c6b652a7bcf1bb56ad1ebffea595
+Subproject commit ed1be2f7473d16875e744496445085d0bfda1a3a

From ea39ece4eeab927cbf82556d5c47c8e487f2a661 Mon Sep 17 00:00:00 2001
From: Mathis Frahm
Date: Thu, 19 Dec 2024 09:16:37 +0100
Subject: [PATCH 24/28] add uhh campaign in 22preEE

---
 hbw/tasks/campaigns.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hbw/tasks/campaigns.py b/hbw/tasks/campaigns.py
index 62c36e74..71cc3d17 100644
--- a/hbw/tasks/campaigns.py
+++ b/hbw/tasks/campaigns.py
@@ -25,6 +25,7 @@
     "c22pre": {
         "cmsdb.campaigns.run3_2022_preEE_nano_v12": "campaign_run3_2022_preEE_nano_v12",
         "cmsdb.campaigns.run3_2022_preEE_nano_v13": "campaign_run3_2022_preEE_nano_v13",
+        "cmsdb.campaigns.run3_2022_preEE_nano_uhh_v12": "campaign_run3_2022_preEE_nano_uhh_v12",
     },
     "c22post": {
         "cmsdb.campaigns.run3_2022_postEE_nano_v12": "campaign_run3_2022_postEE_nano_v12",
@@ -163,7 +164,7 @@ def run(self):
             else:
                 logger.warning(
                     "Run the following command to recreate the backup dataset summary:\n"
-                    f"law run {self.task_family} --recreate_backup_summary --config {self.config} --remove-output 0,a,y",  # noqa
+                    f"law run {self.task_family} --recreate-backup-summary --config {self.config} --remove-output 0,a,y",  # noqa
                 )
         else:
             logger.warning("No backup dataset summary found, creating one now")

From e5c0635ebc5c9c9d246b42c4a47c072ec4854791 Mon Sep 17 00:00:00 2001
From: Mathis Frahm
Date: Thu, 19 Dec 2024 10:36:02 +0100
Subject: [PATCH 25/28] update cf (needs reprocessing)

---
 hbw/tasks/plotting.py |  8 ++++----
 law.cfg               | 40 ++++++++++++++++++++++++++++----------
 modules/columnflow    |  2 +-
 3 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/hbw/tasks/plotting.py b/hbw/tasks/plotting.py
index afa58bbb..a4d88e06 100644
--- a/hbw/tasks/plotting.py
+++ b/hbw/tasks/plotting.py
@@ -242,14 +242,14 @@ def run(self):
                 if p.id in h.axes["process"]
             ],
             "category": [
-                hist.loc(c.id)
+                hist.loc(c.name)
                 for c in leaf_category_insts
-                if c.id in h.axes["category"]
+                if c.name in h.axes["category"]
             ],
             "shift": [
-                hist.loc(s.id)
+                hist.loc(s.name)
                 for s in plot_shifts
-                if s.id in h.axes["shift"]
+                if s.name in h.axes["shift"]
             ],
         }]

diff --git a/law.cfg b/law.cfg
index bd75a910..edebd04e 100644
--- a/law.cfg
+++ b/law.cfg
@@ -99,19 +99,39 @@ lfn_sources: local_desy_dcache, wlcg_fs_desy_store, wlcg_fs_infn_redirector, wlc
 # cf.MLTraining: wlcg
 # cf.MLEvaluation: wlcg
 
+
+[versions]
+
+# NOTE: in hbw, we also define default versions via the analysis section
+# default versions of specific tasks to pin
+# the key can consist of multiple underscore-separated parts, each of which can be a pattern or regex
+# these parts are used for the lookup from within tasks and can contain (e.g.) the analysis name,
+# the config name, the task family, the dataset name, or the shift name
+# (see AnalysisTask.get_config_lookup_keys() - and subclasses - for the exact order)
+# note:
+# this lookup is skipped if the lookup based on the config instance's auxiliary data succeeded
+# example:
+; c22post__cf.CalibrateEvents__nomin*: common3
+; cf.SelectEvents: prod3
+
+[resources]
+
+# default resources of remote workflows
+# keys can have the same format as described above in [versions] to pinpoint specific tasks
+# values should be comma-separated strings in the form "RESOURCE=VALUE", where RESOURCE should refer
+# to a valid task parameter (e.g. max_runtime, htcondor_memory, etc.)
so that VALUE can be parsed +# by the respective parameter instance at runtime +# same as for [versions], the order of options is important as it defines the resolution order +# example: +; c22post__cf.CalibrateEvents__nomin*: htcondor_memory=5GB +; cf.MLTraining: htcondor_memory=10GB, htcondor_gpus=1 + + +[luigi_cf.DummyTask] # To set defaults on a per-task basis # NOTE: this does override defaults defined in the config, but it does not overwrite parameters # when the parameter has already been set e.g. by another task requiring this task - -# TODO: to share some outputs over multiple analyses -# [luigi_cf.GetDatasetLFNs] - -# analysis: hbw.analysis.hbw_merged.hbw_merged - - -# [luigi_cf.CalibrateEvents] - -# analysis: hbw.analysis.hbw_merged.hbw_merged +dummy_param: dummy_value [luigi_cf.MergeReductionStats] diff --git a/modules/columnflow b/modules/columnflow index ed1be2f7..312bd050 160000 --- a/modules/columnflow +++ b/modules/columnflow @@ -1 +1 @@ -Subproject commit ed1be2f7473d16875e744496445085d0bfda1a3a +Subproject commit 312bd05015de0f6edfea656353cd60ff02d8c608 From 356d18dd79e5b7b2f5b792c3912244fee88050dc Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Thu, 19 Dec 2024 11:18:23 +0100 Subject: [PATCH 26/28] remove hist_util duplicate --- hbw/hist_util.py | 99 ------------------------------------------ hbw/selection/hists.py | 8 ++-- 2 files changed, 4 insertions(+), 103 deletions(-) delete mode 100644 hbw/hist_util.py diff --git a/hbw/hist_util.py b/hbw/hist_util.py deleted file mode 100644 index 819c45c2..00000000 --- a/hbw/hist_util.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding: utf-8 - -""" -Task to produce and merge histograms. -""" - -from __future__ import annotations - -import law -import order as od - -from columnflow.util import maybe_import - -hist = maybe_import("hist") -np = maybe_import("numpy") -ak = maybe_import("awkward") - -logger = law.logger.get_logger(__name__) - - -def add_axis(histogram: hist.Hist, variable_inst: od.Variable) -> hist.Hist: - """ - Add an axis to a histogram based on a variable instance. The axis_type is chosen - based on the variable instance's "axis_type" auxiliary. - - :param histogram: The histogram to add the axis to. - :param variable_inst: The variable instance to use for the axis. - :return: The histogram with the added axis. 
- """ - default_kwargs = { - "name": variable_inst.name, - "label": variable_inst.get_full_x_title(), - } - - axis_kwargs = law.util.merge_dicts( - default_kwargs, - variable_inst.x("axis_kwargs", {}), - deep=True, - ) - - default_axis_type = "integer" if variable_inst.discrete_x else "variable" - axis_type = variable_inst.x("axis_type", default_axis_type).lower() - - if axis_type == "variable" or axis_type == "var": - return histogram.Var( - variable_inst.bin_edges, - **axis_kwargs, - ) - elif axis_type == "integer" or axis_type == "int": - return histogram.Integer( - int(variable_inst.bin_edges[0]), - int(variable_inst.bin_edges[-1]), - **axis_kwargs, - ) - elif axis_type == "boolean" or axis_type == "bool": - return histogram.Boolean( - **axis_kwargs, - ) - elif axis_type == "intcategory" or axis_type == "intcat": - binning = [int(b) for b in variable_inst.binning] if isinstance(variable_inst.binning, list) else [] - return histogram.IntCat( - binning, - growth=True, - **axis_kwargs, - ) - elif axis_type == "strcategory" or axis_type == "strcat": - return histogram.StrCat( - [], - growth=True, - **axis_kwargs, - ) - elif axis_type == "regular" or axis_type == "reg": - return histogram.Regular( - variable_inst.nbins, - variable_inst.bin_edges[0], - variable_inst.bin_edges[-1], - **axis_kwargs, - ) - - -def create_columnflow_hist( - *variable_insts, - add_default_axes: bool = False, -) -> hist.Hist: - histogram = hist.Hist.new - - # default axes - if add_default_axes: - histogram = histogram.IntCat([], name="category", growth=True) - histogram = histogram.IntCat([], name="process", growth=True) - histogram = histogram.IntCat([], name="shift", growth=True) - - # requested axes - for variable_inst in variable_insts: - histogram = add_axis(histogram, variable_inst) - - histogram = histogram.Weight() - - return histogram diff --git a/hbw/selection/hists.py b/hbw/selection/hists.py index bfd0698c..4bed747d 100644 --- a/hbw/selection/hists.py +++ b/hbw/selection/hists.py @@ -14,7 +14,7 @@ from columnflow.util import maybe_import from hbw.util import has_tag, IF_MC -from hbw.hist_util import create_columnflow_hist +from columnflow.hist_util import create_hist_from_variables np = maybe_import("numpy") ak = maybe_import("awkward") @@ -85,10 +85,10 @@ def hbw_selection_hists( if getattr(self, "first_chunk", True): for key, weight in weight_map.items(): if "btag_weight" not in key: - hists[key] = create_columnflow_hist(self.steps_variable) - hists[f"{key}_per_process"] = create_columnflow_hist(self.steps_variable, self.process_variable) + hists[key] = create_hist_from_variables(self.steps_variable) + hists[f"{key}_per_process"] = create_hist_from_variables(self.steps_variable, self.process_variable) if key == "sum_mc_weight" or "btag_weight" in key: - hists[f"{key}_per_process_ht_njet_nhf"] = create_columnflow_hist( + hists[f"{key}_per_process_ht_njet_nhf"] = create_hist_from_variables( self.steps_variable, self.process_variable, self.ht_variable, From 2f21fdb2556b3e9906411bae996e0b753ca35032 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Thu, 19 Dec 2024 15:52:15 +0100 Subject: [PATCH 27/28] fix derived muon weight producers --- hbw/production/weights.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hbw/production/weights.py b/hbw/production/weights.py index 07301324..4fa4b49b 100644 --- a/hbw/production/weights.py +++ b/hbw/production/weights.py @@ -18,7 +18,7 @@ stitched_normalization_weights_brs_from_processes, ) from columnflow.production.cms.electron import 
electron_weights
-from columnflow.production.cms.muon import muon_weights
+from columnflow.production.cms.muon import muon_weights, MuonSFConfig
 from columnflow.production.cms.btag import btag_weights
 from columnflow.production.cms.scale import murmuf_weights, murmuf_envelope_weights
 from columnflow.production.cms.pdf import pdf_weights
@@ -123,15 +123,15 @@ def event_weights_to_normalize_init(self) -> None:
 
 muon_id_weights = muon_weights.derive("muon_id_weights", cls_dict={
     "weight_name": "muon_id_weight",
-    "get_muon_config": (lambda self: self.config_inst.x.muon_iso_sf_names),
+    "get_muon_config": (lambda self: MuonSFConfig.new(self.config_inst.x.muon_iso_sf_names)),
 })
 muon_iso_weights = muon_weights.derive("muon_iso_weights", cls_dict={
     "weight_name": "muon_iso_weight",
-    "get_muon_config": (lambda self: self.config_inst.x.muon_id_sf_names),
+    "get_muon_config": (lambda self: MuonSFConfig.new(self.config_inst.x.muon_id_sf_names)),
 })
 muon_trigger_weights = muon_weights.derive("muon_trigger_weights", cls_dict={
     "weight_name": "muon_trigger_weight",
-    "get_muon_config": (lambda self: self.config_inst.x.muon_trigger_sf_names),
+    "get_muon_config": (lambda self: MuonSFConfig.new(self.config_inst.x.muon_trigger_sf_names)),
 })

From cc3b6d66e47ffc28af5003b163b8b4b340230912 Mon Sep 17 00:00:00 2001
From: Mathis Frahm
Date: Fri, 20 Dec 2024 11:30:09 +0100
Subject: [PATCH 28/28] add dummy shift for MultiConfig tests

---
 hbw/config/config_run2.py | 12 +++++++++++-
 hbw/weight/default.py     | 13 ++++++++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/hbw/config/config_run2.py b/hbw/config/config_run2.py
index 4c07c2a5..7fac5476 100644
--- a/hbw/config/config_run2.py
+++ b/hbw/config/config_run2.py
@@ -521,6 +521,16 @@ def if_era(
         },
     )
 
+    cfg.add_shift(name=f"dummy_{cfg.x.cpn_tag}_up", id=209, type="shape")
+    cfg.add_shift(name=f"dummy_{cfg.x.cpn_tag}_down", id=210, type="shape")
+    add_shift_aliases(
+        cfg,
+        f"dummy_{cfg.x.cpn_tag}",
+        {
+            "dummy_weight": f"dummy_{cfg.x.cpn_tag}_weight_" + "{direction}",
+        },
+    )
+
     with open(os.path.join(thisdir, "jec_sources.yaml"), "r") as f:
         all_jec_sources = yaml.load(f, yaml.Loader)["names"]

diff --git a/hbw/weight/default.py b/hbw/weight/default.py
index 2179df8f..ef5db3ca 100644
--- a/hbw/weight/default.py
+++ b/hbw/weight/default.py
@@ -62,6 +62,12 @@ def base(self: WeightProducer, events: ak.Array, **kwargs) -> ak.Array:
     for column in self.local_weight_columns.keys():
         weight = weight * Route(column).apply(events)
 
+    # implement dummy shift by varying the weight by a factor of 2
+    if "dummy" in self.local_shift_inst.name:
+        logger.warning("Applying dummy weight shift (should never be used for real analysis)")
+        variation = self.local_shift_inst.name.split("_")[-1]
+        weight = weight * {"up": 2.0, "down": 0.5}[variation]
+
     return events, weight
 
 
@@ -93,6 +99,7 @@ def base_init(self: WeightProducer) -> None:
         return
 
     year = self.config_inst.campaign.x.year
+    cpn_tag = self.config_inst.x.cpn_tag
 
     if not self.weight_columns:
         raise Exception("weight_columns not set")
@@ -134,7 +141,7 @@ def base_init(self: WeightProducer) -> None:
     for weight_column, shift_sources in self.local_weight_columns.items():
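        # for illustration: entries may be templated, e.g. "dummy_{cpn_tag}" is resolved
        # below to the dummy shift source registered for the campaign in config_run2.py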
shift_sources = law.util.make_list(shift_sources) - shift_sources = [s.format(year=year) for s in shift_sources] + shift_sources = [s.format(year=year, cpn_tag=cpn_tag) for s in shift_sources] shifts = get_shifts_from_sources(self.config_inst, *shift_sources) for shift in shifts: if weight_column not in shift.x("column_aliases").keys(): @@ -147,6 +154,9 @@ def base_init(self: WeightProducer) -> None: # declare shifts that the produced event weight depends on self.shifts |= set(shifts) + # remove dummy column from weight columns and uses + self.local_weight_columns.pop("dummy_weight") + # store column names referring to weights to multiply self.uses |= self.local_weight_columns.keys() @@ -158,6 +168,7 @@ def base_init(self: WeightProducer) -> None: default_correction_weights = { + "dummy_weight": ["dummy_{cpn_tag}"], "normalized_pu_weight": ["minbias_xs"], "muon_id_weight": ["mu_id_sf"], "muon_iso_weight": ["mu_iso_sf"],