Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/v boson pt #98

Merged
merged 28 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
860cdd4
cleanup in scripts
mafrahm Dec 4, 2024
1d057d5
use correct btag reweighting and add vjets weight_producer
mafrahm Dec 4, 2024
a274649
switch vjets pt reweighting json and apply NLO EW weights
mafrahm Dec 5, 2024
14f95cb
add simple producer for simple normalization weights
mafrahm Dec 10, 2024
8d24514
add task for campaign creation and consistency checks
mafrahm Dec 10, 2024
87dd19c
cleanup and tests
mafrahm Dec 10, 2024
260bb91
require existence of campaign before running analysis
mafrahm Dec 10, 2024
07aa1b6
enable usage of uhh campaigns
mafrahm Dec 10, 2024
580e9b4
allow string version for CSPs
mafrahm Dec 12, 2024
65b26bd
cleanup in variables (start count with 0)
mafrahm Dec 12, 2024
f15ccd3
update variable and process groups
mafrahm Dec 12, 2024
04ef6ad
cleanup in ml setup function
mafrahm Dec 12, 2024
0bceba3
fix variables
mafrahm Dec 13, 2024
6ddf551
extend timeit wrapper
mafrahm Dec 13, 2024
0feb3c6
add category group
mafrahm Dec 13, 2024
8c85933
lint
mafrahm Dec 13, 2024
d496971
lint again
mafrahm Dec 13, 2024
1b57fed
loop over produced_columns instead of produces
mafrahm Dec 13, 2024
6e7e688
remove features producer
mafrahm Dec 13, 2024
2d45359
fix dataset-dependent shift resolving
mafrahm Dec 17, 2024
87bb212
fix tests
mafrahm Dec 17, 2024
c113c5a
'fix' tests
mafrahm Dec 17, 2024
b45aa65
update cf + required changes
mafrahm Dec 18, 2024
ea39ece
add uhh campaign in 22preEE
mafrahm Dec 19, 2024
e5c0635
update cf (needs reprocessing)
mafrahm Dec 19, 2024
356d18d
remove hist_util duplicate
mafrahm Dec 19, 2024
2f21fdb
fix derived muon weight producers
mafrahm Dec 19, 2024
cc3b6d6
add dummy shift for MultiConfig tests
mafrahm Dec 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 32 additions & 29 deletions hbw/analysis/create_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from __future__ import annotations

import os
import importlib

import law
import order as od
Expand All @@ -24,6 +23,8 @@
ml_inputs_producer,
)

from hbw.tasks.campaigns import BuildCampaignSummary


@timeit_multiple
def create_hbw_analysis(
Expand Down Expand Up @@ -74,6 +75,8 @@ def create_hbw_analysis(
analysis_inst.x.default_weight_producer = "default"
analysis_inst.x.ml_inputs_producer = ml_inputs_producer(analysis_inst)
analysis_inst.x.default_ml_model = default_ml_model
analysis_inst.x.default_variables = ["jet0_pt", "mll", "n_jet", "ptll", "lepton0_pt", "lepton1_pt"]
analysis_inst.x.default_categories = ["incl", "sr", "ttcr", "dycr"]

#
# define configs
Expand All @@ -82,7 +85,6 @@ def create_hbw_analysis(
from hbw.config.config_run2 import add_config

def add_lazy_config(
campaigns: dict[str, str],
config_name: str,
config_id: int,
**kwargs,
Expand All @@ -101,22 +103,22 @@ def create_factory(
):
@timeit_multiple
def analysis_factory(configs: od.UniqueObjectIndex):
hbw_campaign_inst = None

for mod, campaign in campaigns.items():
# import the campaign
mod = importlib.import_module(mod)
if not hbw_campaign_inst:
# copy the main campaign
hbw_campaign_inst = getattr(mod, campaign).copy()
else:
# add datasets to the main campaign
campaign_inst = getattr(mod, campaign).copy()
for dataset in list(campaign_inst.datasets):
dataset.x.campaign = campaign
if not hbw_campaign_inst.has_dataset(dataset.name):
hbw_campaign_inst.add_dataset(dataset)

cpn_task = BuildCampaignSummary(
config=config_name,
)
if cpn_task.complete():
logger.warning(
f"Using pickled campaign for config {config_name}; to re-initialize, run:\n"
f"law run {cpn_task.task_family} --config {config_name} --remove-output 0,a,y",
)
else:
raise ValueError(
f"Campaign used for {config_name} is not yet initialized; to initialize, run: \n",
f"law run {cpn_task.task_family} --config {config_name} --remove-output 0,a,y",
)
# cpn_task.run()

hbw_campaign_inst = cpn_task.output()["hbw_campaign_inst"].load(formatter="pickle")
return add_config(
analysis_inst,
hbw_campaign_inst,
Expand All @@ -139,29 +141,30 @@ def analysis_factory(configs: od.UniqueObjectIndex):

# 2017
add_lazy_config(
{
"cmsdb.campaigns.run2_2017_nano_v9": "campaign_run2_2017_nano_v9",
},
# {
# "cmsdb.campaigns.run2_2017_nano_v9": "campaign_run2_2017_nano_v9",
# },
"c17",
1700,
)

# 2022 preEE
add_lazy_config(
{
"cmsdb.campaigns.run3_2022_preEE_nano_v12": "campaign_run3_2022_preEE_nano_v12",
"cmsdb.campaigns.run3_2022_preEE_nano_v13": "campaign_run3_2022_preEE_nano_v13",
},
# {
# "cmsdb.campaigns.run3_2022_preEE_nano_v12": "campaign_run3_2022_preEE_nano_v12",
# "cmsdb.campaigns.run3_2022_preEE_nano_v13": "campaign_run3_2022_preEE_nano_v13",
# },
"c22pre",
2200,
)

# 2022 postEE
add_lazy_config(
{
"cmsdb.campaigns.run3_2022_postEE_nano_v12": "campaign_run3_2022_postEE_nano_v12",
"cmsdb.campaigns.run3_2022_postEE_nano_v13": "campaign_run3_2022_postEE_nano_v13",
},
# {
# "cmsdb.campaigns.run3_2022_postEE_nano_v12": "campaign_run3_2022_postEE_nano_v12",
# "cmsdb.campaigns.run3_2022_postEE_nano_v13": "campaign_run3_2022_postEE_nano_v13",
# "cmsdb.campaigns.run3_2022_postEE_nano_uhh_v12": "campaign_run3_2022_postEE_nano_uhh_v12",
# },
"c22post",
2210,
)
Expand Down
4 changes: 2 additions & 2 deletions hbw/columnflow_patches.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ def patch_csp_versioning():

def TaskArrayFunction_str(self):
version = self.version() if callable(getattr(self, "version", None)) else getattr(self, "version", None)
if version and not isinstance(version, int):
raise Exception(f"version must be an integer, but is {version}")
if version and not isinstance(version, (int, str)):
raise Exception(f"version must be an integer or string, but is {version} ({type(version)})")
version_str = f"V{version}" if version is not None else ""
return f"{self.cls_name}{version_str}"

Expand Down
54 changes: 40 additions & 14 deletions hbw/config/config_run2.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def if_era(
jerc_campaign = f"Summer{year2}{jerc_postfix}_22Sep2023"
jet_type = "AK4PFPuppi"

cfg.x.jec = DotDict.wrap({
cfg.x.jec = DotDict.wrap({"Jet": {
"campaign": jerc_campaign,
"version": {2016: "V7", 2017: "V5", 2018: "V5", 2022: "V2"}[year],
"jet_type": jet_type,
Expand Down Expand Up @@ -260,15 +260,15 @@ def if_era(
"CorrelationGroupFlavor",
"CorrelationGroupUncorrelated",
],
})
}})

# JER
# https://twiki.cern.ch/twiki/bin/view/CMS/JetResolution?rev=107
cfg.x.jer = DotDict.wrap({
cfg.x.jer = DotDict.wrap({"Jet": {
"campaign": jerc_campaign,
"version": {2016: "JRV3", 2017: "JRV2", 2018: "JRV2", 2022: "JRV1"}[year],
"jet_type": jet_type,
})
}})

# JEC uncertainty sources propagated to btag scale factors
# (names derived from contents in BTV correctionlib file)
Expand Down Expand Up @@ -357,14 +357,28 @@ def if_era(

# V+jets reweighting
cfg.x.vjets_reweighting = DotDict.wrap({
"w": {
"value": "wjets_kfactor_value",
"error": "wjets_kfactor_error",
},
"z": {
"value": "zjets_kfactor_value",
"error": "zjets_kfactor_error",
"value": "eej_pTV_kappa_NLO_EW",
"ew": "eej_pTV_kappa_NLO_EW",
"error": "eej_pTV_d1kappa_EW", # NOTE: not sure if this is correct to use as error (d2,d3?)
"d2": "eej_pTV_d2kappa_EW",
"d3": "eej_pTV_d3kappa_EW",
},
"w": {
"value": "aj_pTV_kappa_NLO_EW",
"ew": "aj_pTV_kappa_NLO_EW",
"error": "aj_pTV_d1kappa_EW", # NOTE: not sure if this is correct to use as error (d2,d3?)
"d2": "aj_pTV_d2kappa_EW",
"d3": "aj_pTV_d3kappa_EW",
},
# "w": {
# "value": "wjets_kfactor_value",
# "error": "wjets_kfactor_error",
# },
# "z": {
# "value": "zjets_kfactor_value",
# "error": "zjets_kfactor_error",
# },
})

################################################################################################
Expand Down Expand Up @@ -507,10 +521,20 @@ def if_era(
},
)

cfg.add_shift(name=f"dummy_{cfg.x.cpn_tag}_up", id=209, type="shape")
cfg.add_shift(name=f"dummy_{cfg.x.cpn_tag}_down", id=210, type="shape")
add_shift_aliases(
cfg,
f"dummy_{cfg.x.cpn_tag}",
{
"dummy_weight": f"dummy_{cfg.x.cpn_tag}_weight_" + "{direction}",
},
)

with open(os.path.join(thisdir, "jec_sources.yaml"), "r") as f:
all_jec_sources = yaml.load(f, yaml.Loader)["names"]

for jec_source in cfg.x.jec["uncertainty_sources"]:
for jec_source in cfg.x.jec.Jet["uncertainty_sources"]:
idx = all_jec_sources.index(jec_source)
cfg.add_shift(
name=f"jec_{jec_source}_up",
Expand Down Expand Up @@ -587,8 +611,10 @@ def add_external(name, value):
add_external("muon_sf", (f"{json_mirror}/POG/MUO/{corr_tag}/muon_Z.json.gz", "v1"))
# btag scale factor
add_external("btag_sf_corr", (f"{json_mirror}/POG/BTV/{corr_tag}/btagging.json.gz", "v1"))
# V+jets reweighting (still unused and not centrally produced)
add_external("vjets_reweighting", f"{json_mirror}/data/json/vjets_reweighting.json.gz")
# V+jets reweighting (derived for 13 TeV, custom json converted from ROOT, not centrally produced)
# ROOT files (eej.root and aj.root) taken from here:
# https://github.com/UHH2/2HDM/tree/ultra_legacy/data/ScaleFactors/VJetsCorrections
add_external("vjets_reweighting", (f"{json_mirror}/data/json/vjets_pt.json.gz", "v1"))
if cfg.x.run == 2:
# met phi corrector (still unused and missing in Run3)
add_external("met_phi_corr", (f"{json_mirror}/POG/JME/{corr_tag}/met.json.gz", "v1"))
Expand Down Expand Up @@ -684,7 +710,7 @@ def add_external(name, value):
"{Electron,Muon}.{pt,eta,phi,mass,charge,pdgId,jetRelIso,is_tight,dxy,dz}",
"Electron.deltaEtaSC", "mll",
# MET
"MET.{pt,phi}",
"{MET,PuppiMET}.{pt,phi}",
# all columns added during selection using a ColumnCollection flag, but skip cutflow ones
ColumnCollection.ALL_FROM_SELECTOR,
skip_column("cutflow.*"),
Expand Down
52 changes: 51 additions & 1 deletion hbw/config/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def hbw_dataset_names(config: od.Config, as_list: bool = False) -> DotDict[str:
"dy_m50toinf_ht2500toinf_madgraph",
]),
*config.x.if_era(run=3, values=[
# NLO samples
"dy_m50toinf_amcatnlo",
"dy_m10to50_amcatnlo",
"dy_m4to10_amcatnlo",
Expand Down Expand Up @@ -113,7 +114,13 @@ def hbw_dataset_names(config: od.Config, as_list: bool = False) -> DotDict[str:
"zz_pythia",
]),
],
"ttv": [], # empty for now
"ttv": [
"ttw_wlnu_amcatnlo",
"ttz_zll_m4to50_amcatnlo",
"ttz_zll_m50toinf_amcatnlo",
"ttz_znunu_amcatnlo",
"ttz_zqq_amcatnlo",
],
"h": [
*config.x.if_era(run=3, values=[
# TODO: remove whatever is not really necessary
Expand Down Expand Up @@ -390,6 +397,9 @@ def configure_hbw_datasets(
limit_dataset_files: int | None = None,
add_dataset_extensions: bool = False,
):
# allow usage of UHH campaign
enable_uhh_campaign_usage(config)

for dataset in config.datasets:
if add_dataset_extensions:
add_dataset_extension_to_nominal(dataset)
Expand Down Expand Up @@ -534,3 +544,43 @@ def get_dataset_lfns_2017(
lfn_base.child(basename, type="f").path
for basename in lfn_base.listdir(pattern="*.root")
]


def enable_uhh_campaign_usage(cfg: od.Config) -> None:
    """
    Hook custom LFN resolution for UHH-produced campaigns into *cfg*.

    If any campaign name attached to ``cfg.campaign`` (auxiliary key ``campaigns``)
    contains the substring ``"uhh"``, this installs:

    - ``cfg.x.get_dataset_lfns``: an LFN lookup that, for datasets tagged with a
      uhh campaign, lists ROOT files directly from a WLCG directory instead of
      querying DAS;
    - ``cfg.x.get_dataset_lfns_remote_fs``: a per-dataset list of file systems
      (local first, then WLCG) to try for uhh datasets, ``None`` otherwise.

    Datasets without a uhh campaign tag keep the default dasgoclient-based lookup.
    """
    # custom lfn retrieval method in case the underlying campaign is custom uhh
    def get_dataset_lfns_uhh(
        dataset_inst: od.Dataset,
        shift_inst: od.Shift,
        dataset_key: str,
    ) -> list[str]:
        if "uhh" not in dataset_inst.x("campaign", ""):
            # for non-uhh datasets, use default GetDatasetLFNs method
            # NOTE(review): called unbound with the class itself as first argument —
            # presumably the method does not rely on task instance state; confirm upstream.
            return GetDatasetLFNs.get_dataset_lfns_dasgoclient(
                GetDatasetLFNs, dataset_inst=dataset_inst, shift_inst=shift_inst, dataset_key=dataset_key,
            )
        # name of the uhh campaign this dataset belongs to; used to select the fs below
        cpn_name = dataset_inst.x.campaign
        # destructure dataset_key into parts and create the lfn base directory
        # (expected key shape: "/<dataset_id>/<main_campaign>-<sub_campaign>/<tier>")
        dataset_id, full_campaign, tier = dataset_key.split("/")[1:]
        main_campaign, sub_campaign = full_campaign.split("-", 1)
        lfn_base = law.wlcg.WLCGDirectoryTarget(
            f"/store/{dataset_inst.data_source}/{main_campaign}/{dataset_id}/{tier}/{sub_campaign}/0",
            # fs=f"wlcg_fs_{cfg.campaign.x.custom['name']}",
            fs=f"wlcg_fs_{cpn_name}",
        )

        # loop through files and interpret paths as lfns
        return [
            lfn_base.child(basename, type="f").path
            for basename in lfn_base.listdir(pattern="*.root")
        ]

    # only activate the custom lookup when at least one attached campaign is a uhh one
    if any("uhh" in cpn_name for cpn_name in cfg.campaign.x("campaigns", [])):
        # define the lfn retrieval function
        cfg.x.get_dataset_lfns = get_dataset_lfns_uhh

        # define custom remote fs's to look at
        cfg.x.get_dataset_lfns_remote_fs = lambda dataset_inst: (
            None if "uhh" not in dataset_inst.x("campaign", "") else [
                f"local_fs_{dataset_inst.x.campaign}",
                f"wlcg_fs_{dataset_inst.x.campaign}",
            ])
24 changes: 17 additions & 7 deletions hbw/config/defaults_and_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from columnflow.inference import InferenceModel
from columnflow.tasks.framework.base import RESOLVE_DEFAULT
from hbw.util import bracket_expansion


def default_calibrator(container):
Expand Down Expand Up @@ -118,8 +119,8 @@ def set_config_defaults_and_groups(config_inst):
# # config_inst.x.default_weight_producer = "btag_not_normalized"
# config_inst.x.default_ml_model = default_ml_model
config_inst.x.default_inference_model = "default" if year == 2017 else "sl_22"
config_inst.x.default_categories = ["incl"]
config_inst.x.default_variables = ["jet1_pt"]
# config_inst.x.default_categories = ["incl"]
# config_inst.x.default_variables = ["jet1_pt"]

#
# Groups
Expand All @@ -134,9 +135,10 @@ def set_config_defaults_and_groups(config_inst):
"much": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "qcd", "st", "dy", "vv", "w_lnu", "h"], # noqa: E501
"ech": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "qcd", "st", "dy", "vv", "w_lnu", "h"], # noqa: E501
"dl": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "dy", "st", "vv", "w_lnu", "h"], # noqa: E501
"dl1": [default_signal_process, "tt", "dy", "st", "vv", "w_lnu", "h"],
"dl2": [default_signal_process, "tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "vv", "w_lnu", "h"], # noqa: E501
"dlbkg": ["tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "vv", "w_lnu", "h"],
"dl1": [default_signal_process, "tt", "dy", "st", "ttv", "vv", "w_lnu", "h"],
"dl2": [default_signal_process, "tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "ttv", "vv", "w_lnu", "h"], # noqa: E501
"dl3": [default_signal_process, "tt", "dy_m10to50", "dy_m50toinf", "st", "ttv", "vv", "w_lnu", "h"], # noqa: E501
"dlbkg": ["tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "ttv", "vv", "w_lnu", "h"],
"dlmajor": [default_signal_process, "tt", "dy", "st"],
"2much": [default_signal_process, "tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "vv", "w_lnu", "h"],
"2ech": [default_signal_process, "tt", "dy_m4to10", "dy_m10to50", "dy_m50toinf", "st", "vv", "w_lnu", "h"],
Expand Down Expand Up @@ -186,7 +188,7 @@ def set_config_defaults_and_groups(config_inst):
remove_generator = lambda x: x.replace("_powheg", "").replace("_madgraph", "").replace("_amcatnlo", "").replace("_pythia8", "").replace("4f_", "") # noqa: E501
config_inst.x.process_groups[f"datasets_{proc}"] = [remove_generator(dataset) for dataset in datasets]

for group in ("dl2", "dl1", "dl", "much", "2much", "ech", "2ech", "emuch"):
for group in ("dl3", "dl2", "dl1", "dl", "much", "2much", "ech", "2ech", "emuch"):
# thanks to double counting removal, we can (and should) now use all datasets in each channel
config_inst.x.process_groups[f"d{group}"] = ["data"] + config_inst.x.process_groups[group]

Expand Down Expand Up @@ -223,6 +225,7 @@ def set_config_defaults_and_groups(config_inst):
"sl_much_boosted": ["sr__1mu__boosted"],
"sl_ech_boosted": ["sr__1e__boosted"],
"dl": ["sr", "dycr", "ttcr", "sr__1b", "sr__2b", "dycr__1b", "dycr__2b", "ttcr__1b", "ttcr__2b"],
"dl_preml": bracket_expansion(["incl", "{sr,ttcr,dycr}{,__2e,__2mu,__emu}{,__1b,__2b}"]),
"dl_ttcr": ["ttcr", "ttcr__1b", "ttcr__2b", "ttcr__2e", "ttcr__2mu", "ttcr__emu"],
"dl_dycr": ["dycr", "dycr__1b", "dycr__2b", "dycr__2e", "dycr__2mu", "dycr__emu"],
"dl_sr": ["sr", "sr__1b", "sr__2b", "sr__2e", "sr__2mu", "sr__emu"],
Expand Down Expand Up @@ -299,7 +302,14 @@ def set_config_defaults_and_groups(config_inst):
"sl": ["n_*", "electron_*", "muon_*", "met_*", "jet*", "bjet*", "ht"],
"sl_resolved": ["n_*", "electron_*", "muon_*", "met_*", "jet*", "bjet*", "ht"],
"sl_boosted": ["n_*", "electron_*", "muon_*", "met_*", "fatjet_*"],
"dl": ["n_*", "electron_*", "muon_*", "met_*", "jet*", "bjet*", "ht", "lt", "mll", "ptll"],
"dl": bracket_expansion([
"n_{jet,bjet,electron,muon,fatjet,hbbjet}",
"lepton{0,1}_{pt,eta,phi}",
"met_{pt,phi}",
"jet{0,1,2,3}_{pt,eta,phi,mass,btagpnetb}",
"bjet{0,1}_{pt,eta,phi,mass,btagpnetb}",
"ht", "lt", "mll", "ptll",
]),
"dl_resolved": ["n_*", "electron_*", "muon_*", "met_*", "jet*", "bjet*", "ht", "lt", "mll", "ptll"],
"dl_boosted": ["n_*", "electron_*", "muon_*", "met_*", "fatjet_*", "lt", "mll", "ptll"],
"default": ["n_jet", "n_muon", "n_electron", "ht", "m_bb", "deltaR_bb", "jet1_pt"], # n_deepjet, ....
Expand Down
6 changes: 3 additions & 3 deletions hbw/config/styling.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
"dy_m50toinf": color_palette["yellow"],
"dy_m10to50": color_palette["brown"],
"dy_m4to10": color_palette["darkgrey"],
"ttV": color_palette["brown"],
"ttv": color_palette["turqoise"],
"vv": color_palette["blue"],
"other": color_palette["grey"],
"hh_ggf_hbb_htt": color_palette["grey"],
Expand Down Expand Up @@ -292,10 +292,10 @@ def quick_addvar(config: od.Config, obj: str, i: int, var: str):
object (starting at 1) and `var` is the variable of interest; example: cf_loosejet1_pt
"""
config.add_variable(
name=name.format(obj=obj, i=i + 1, var=var).lower(),
name=name.format(obj=obj, i=i, var=var).lower(),
expression=expr.format(obj=obj, i=i, var=var),
null_value=EMPTY_FLOAT,
binning=default_var_binning[var],
unit=default_var_unit.get(var, "1"),
x_title=x_title_base.format(obj=obj, i=i + 1) + default_var_title_format.get(var, var),
x_title=x_title_base.format(obj=obj, i=i) + default_var_title_format.get(var, var),
)
Loading
Loading