diff --git a/hbw/categorization/categories.py b/hbw/categorization/categories.py index bfde28f3..24857d7c 100644 --- a/hbw/categorization/categories.py +++ b/hbw/categorization/categories.py @@ -274,6 +274,7 @@ def catid_2b(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, a # TODO: not hard-coded -> use config? ml_processes = [ "signal_ggf", "signal_ggf2", "signal_vbf", "signal_vbf2", + "signal_ggf4", "signal_ggf5", "signal_vbf4", "signal_vbf5", "hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "hh_ggf_hbb_hvvqqlnu_kl1_kt1", "hh_vbf_hbb_hvvqqlnu_kv1_k2v1_kl1", "hh_ggf_hbb_hvv2l2nu_kl1_kt1", "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", diff --git a/hbw/config/defaults_and_groups.py b/hbw/config/defaults_and_groups.py index 5c9c9ce2..392b4515 100644 --- a/hbw/config/defaults_and_groups.py +++ b/hbw/config/defaults_and_groups.py @@ -133,6 +133,38 @@ def set_config_defaults_and_groups(config_inst): # process groups for conveniently looping over certain processs # (used in wrapper_factory and during plotting) config_inst.x.process_groups = { + "ml_study": [ + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy_m4to10", "dy_m10to50", "dy_m50toinf", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "test_postfit": [ + # "hh_vbf_hbb_hww2l2nu", + "hh_ggf_hbb_hww2l2nu", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h", + ], "all": ["*"], "default": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "dy", "st", "vv", "w_lnu", "h"], # noqa: E501 "sl": ["hh_ggf_hbb_hvv_kl1_kt1", "hh_vbf_hbb_hvv_kv1_k2v1_kl1", "tt", "qcd", "st", "dy", "vv", "w_lnu", "h"], # noqa: E501 @@ -282,10 +314,15 @@ def set_config_defaults_and_groups(config_inst): ), # Dilepton "SR_dl": ( + "sr__1b__ml_signal_ggf", "sr__1b__ml_signal_ggf2", "sr__2b__ml_signal_ggf", "sr__2b__ml_signal_ggf2", + "sr__1b__ml_signal_vbf", "sr__1b__ml_signal_vbf2", "sr__2b__ml_signal_vbf", "sr__2b__ml_signal_vbf2", + "sr__1b__ml_signal_ggf4", "sr__1b__ml_signal_ggf5", "sr__2b__ml_signal_ggf4", "sr__2b__ml_signal_ggf5", + "sr__1b__ml_signal_vbf4", "sr__1b__ml_signal_vbf5", "sr__2b__ml_signal_vbf4", "sr__2b__ml_signal_vbf5", "sr__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2mu__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2mu__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2e__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__2e__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__emu__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", "sr__emu__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "sr__1b", "sr__2b", ), "vbfSR_dl": ( "sr__1b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", "sr__2b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", @@ -372,10 +409,8 @@ def set_config_defaults_and_groups(config_inst): for proc, _, _ in config_inst.walk_processes() if proc.has_tag("is_signal") }, "dilep": { - "hh_ggf_hbb_hvv2l2nu_kl0_kt1": {"scale": 10000, "unstack": True}, - "hh_ggf_hbb_hvv2l2nu_kl1_kt1": {"scale": 10000, "unstack": True}, - "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1": 
{"scale": 10000, "unstack": True}, - "hh_ggf_hbb_hvv2l2nu_kl5_kt1": {"scale": 10000, "unstack": True}, + "hh_vbf_hbb_hww2l2nu": {"scale": 90000, "unstack": True}, + "hh_ggf_hbb_hww2l2nu": {"scale": 10000, "unstack": True}, }, "dileptest": { "hh_ggf_hbb_hvv2l2nu_kl1_kt1": {"scale": 10000, "unstack": True}, @@ -478,4 +513,5 @@ def set_config_defaults_and_groups(config_inst): "vbfSR_dl_resolved": is_signal_sm, "vbfSR_dl_boosted": is_signal_sm, "BR_dl": is_background, + } diff --git a/hbw/inference/base.py b/hbw/inference/base.py index 5ee3d013..22f1c17b 100644 --- a/hbw/inference/base.py +++ b/hbw/inference/base.py @@ -89,7 +89,7 @@ def config_variable(self: InferenceModel, config_cat_inst: od.Config): dnn_proc = dnn_cat.replace("ml_", "") return f"mlscore.{dnn_proc}" else: - return "mli_mbb" + return "mli_lep_pt" def customize_category(self: InferenceModel, cat_inst: DotDict, config_cat_inst: od.Config): """ Function to allow customizing the inference category """ diff --git a/hbw/inference/dl.py b/hbw/inference/dl.py index 9bde981f..5a4f42fc 100644 --- a/hbw/inference/dl.py +++ b/hbw/inference/dl.py @@ -153,8 +153,94 @@ # "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1", # "hh_ggf_hbb_hvv2l2nu_kl5_kt1", +dl.derive("dl_ml_study_5", cls_dict={ + "ml_model_name": "dl_22post_ml_study_5", + "config_categories": [ + "sr__1b__ml_signal_ggf5", + "sr__1b__ml_signal_vbf5", + "sr__1b__ml_tt", + "sr__1b__ml_st", + "sr__1b__ml_dy", + "sr__1b__ml_h", + "sr__2b__ml_signal_ggf5", + "sr__2b__ml_signal_vbf5", + "sr__2b__ml_tt", + "sr__2b__ml_st", + "sr__2b__ml_dy", + "sr__2b__ml_h", + ], + "processes": [ + # qqHH_CV_m0p012_C2V_0p03_kl_10p2 + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "systematics": rate_systematics, +}) -dl.derive("dl_ml_study_1", cls_dict={ +dl.derive("dl_ml_study_4", cls_dict={ + "ml_model_name": "dl_22post_ml_study_4", + "config_categories": [ + "sr__1b__ml_signal_ggf4", + "sr__1b__ml_signal_vbf4", + "sr__1b__ml_tt", + "sr__1b__ml_st", + "sr__1b__ml_dy", + "sr__1b__ml_h", + "sr__2b__ml_signal_ggf4", + "sr__2b__ml_signal_vbf4", + "sr__2b__ml_tt", + "sr__2b__ml_st", + "sr__2b__ml_dy", + "sr__2b__ml_h", + ], + "processes": [ + # qqHH_CV_m0p012_C2V_0p03_kl_10p2 + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + 
"tt", + "dy", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "systematics": rate_systematics, +}) + + +dl_ml_study_1 = dl.derive("dl_ml_study_1", cls_dict={ "ml_model_name": "dl_22post_ml_study_1", "config_categories": [ "sr__1b__ml_signal_ggf", @@ -171,16 +257,69 @@ "sr__2b__ml_h", ], "processes": [ - "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", - "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1", - "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43", - "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94", - "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36", - "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", - "hh_ggf_hbb_hvv2l2nu_kl0_kt1", - "hh_ggf_hbb_hvv2l2nu_kl1_kt1", - "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1", - "hh_ggf_hbb_hvv2l2nu_kl5_kt1", + # qqHH_CV_m0p012_C2V_0p03_kl_10p2 + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "systematics": rate_systematics, +}) + +dl_ml_study_1.derive("dl_ml_study_1_handle", cls_dict={ + "ml_model_name": "dl_22post_ml_study_1_handle", +}) + +dl_ml_study_3 = dl.derive("dl_ml_study_3", cls_dict={ + "ml_model_name": "dl_22_procs1_w0", + "config_categories": [ + "sr__1b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "sr__1b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", + "sr__1b__ml_tt", + "sr__1b__ml_st", + "sr__1b__ml_dy", + "sr__1b__ml_h", + "sr__2b__ml_hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "sr__2b__ml_hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", + "sr__2b__ml_tt", + "sr__2b__ml_st", + "sr__2b__ml_dy", + "sr__2b__ml_h", + ], + "processes": [ + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", "tt", "dy", "w_lnu", @@ -190,7 +329,11 @@ "systematics": rate_systematics, }) -dl.derive("dl_ml_study_2", cls_dict={ +dl_ml_study_3.derive("dl_ml_study_3_handle", cls_dict={ + "ml_model_name": "dl_22_procs1_w0_handle", +}) + +dl_ml_study_2 = dl.derive("dl_ml_study_2", cls_dict={ "ml_model_name": "dl_22post_ml_study_2", "config_categories": [ "sr__1b__ml_signal_ggf2", @@ -207,16 +350,22 @@ "sr__2b__ml_h", ], "processes": [ - "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", - "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1", - "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43", - "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94", - "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36", - "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", - "hh_ggf_hbb_hvv2l2nu_kl0_kt1", - 
"hh_ggf_hbb_hvv2l2nu_kl1_kt1", - "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1", - "hh_ggf_hbb_hvv2l2nu_kl5_kt1", + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", "tt", "dy", "w_lnu", @@ -226,6 +375,14 @@ "systematics": rate_systematics, }) +dl_ml_study_2.derive("dl_ml_study_2_handle", cls_dict={ + "ml_model_name": "dl_22post_ml_study_2_handle", +}) + +dl_ml_study_2.derive("dl_ml_study_2_ignore", cls_dict={ + "ml_model_name": "dl_22post_ml_study_2", +}) + dl.derive("dl_hww_and_hzz", cls_dict={ "processes": [ "hh_ggf_hbb_hww_kl0_kt1", @@ -390,3 +547,35 @@ "systematics": rate_systematics}, ) dl.derive("dl_rates_only", cls_dict={"systematics": rate_systematics}) + +dl.derive("dl_postfit_test", cls_dict={ + "ml_model_name": None, + "config_categories": [ + "sr__1b", + "sr__2b", + ], + "processes": [ + # "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kv1p74_k2v1p37_kl14p4", + "hh_vbf_hbb_hww2l2nu_kvm0p758_k2v1p44_klm19p3", + "hh_vbf_hbb_hww2l2nu_kvm0p012_k2v0p03_kl10p2", + "hh_vbf_hbb_hww2l2nu_kvm2p12_k2v3p87_klm5p96", + "hh_vbf_hbb_hww2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hww2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hww2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hww2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hww2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hww2l2nu_kvm1p83_k2v3p57_klm3p39", + "hh_ggf_hbb_hww2l2nu_kl0_kt1", + "hh_ggf_hbb_hww2l2nu_kl1_kt1", + "hh_ggf_hbb_hww2l2nu_kl2p45_kt1", + "hh_ggf_hbb_hww2l2nu_kl5_kt1", + "st", + "tt", + "dy", + "w_lnu", + "vv", + "h_ggf", "h_vbf", "zh", "wh", "zh_gg", "tth", + ], + "systematics": rate_systematics, +}) diff --git a/hbw/ml/base.py b/hbw/ml/base.py index dfb134a3..7644f7a2 100644 --- a/hbw/ml/base.py +++ b/hbw/ml/base.py @@ -336,6 +336,7 @@ def open_model(self, target: law.LocalDirectoryTarget) -> dict[str, Any]: # custom loss needed due to output layer changes for negative weights from hbw.ml.tf_util import cumulated_crossentropy + models["model"] = tf.keras.models.load_model( target["mlmodel"].path, custom_objects={cumulated_crossentropy.__name__: cumulated_crossentropy}, ) @@ -373,7 +374,6 @@ def load_data( # load into memory validation.load_all log_memory("loading validation data") - # store input features as an output output["mlmodel"].child("input_features.pkl", type="f").dump(self.input_features_ordered, formatter="pickle") @@ -414,7 +414,6 @@ def train( # # training # - self.fit_ml_model(task, model, train, validation, output) log_memory("training") # save the model and history; TODO: use formatter @@ -454,7 +453,7 @@ def evaluate( """ Evaluation function that is run as part of the MLEvaluation task """ - use_best_model = False + use_best_model = False # TODO ML, hier auf True setzen? if len(events) == 0: logger.warning(f"Dataset {task.dataset} is empty. 
diff --git a/hbw/ml/data_loader.py b/hbw/ml/data_loader.py
index 9b6a1457..87826ee9 100644
--- a/hbw/ml/data_loader.py
+++ b/hbw/ml/data_loader.py
@@ -97,7 +97,14 @@ class MLDatasetLoader:
     input_arrays: tuple = ("features", "weights", "train_weights", "equal_weights")
     evaluation_arrays: tuple = ("prediction",)
 
-    def __init__(self, ml_model_inst: MLModel, process: "str", events: ak.Array, stats: dict | None = None):
+    def __init__(
+        self,
+        ml_model_inst: MLModel,
+        process: "str",
+        events: ak.Array,
+        stats: dict | None = None,
+        skip_mask: bool = False,
+    ):
         """
         Initializes the MLDatasetLoader with the given parameters.
@@ -112,10 +119,15 @@ def __init__(self, ml_model_inst: MLModel, process: "str", events: ak.Array, sta
         """
         self._ml_model_inst = ml_model_inst
         self._process = process
+        self._skip_mask = skip_mask
 
         proc_mask, _ = get_proc_mask(events, process, ml_model_inst.config_inst)
         self._stats = stats
-        self._events = events[proc_mask]
+        if not skip_mask:
+            # apply the process mask first, then drop events with negative event weight
+            self._events = events[proc_mask]
+            self._events = self._events[self._events.event_weight >= 0.0]
+        else:
+            self._events = events
 
     def __repr__(self):
         return f"{self.__class__.__name__}({self.ml_model_inst.cls_name}, {self.process})"
@@ -143,6 +155,10 @@ def parameters(self):
         }
         return self._parameters
 
+    @property
+    def skip_mask(self):
+        return self._skip_mask
+
     @property
     def ml_model_inst(self):
         return self._ml_model_inst
@@ -221,6 +237,13 @@ def shuffle_indices(self) -> np.ndarray:
             self._shuffle_indices = np.random.permutation(self.n_events)
         return self._shuffle_indices
 
+    @property
+    def num_event_per_process(self) -> str:
+        if not self.skip_mask:
+            return "num_events_pos_weights_per_process"
+        else:
+            return "num_events_per_process"
+
     def get_xsec_train_weights(self) -> np.ndarray:
         """
         Weighting such that each event has roughly the same weight,
@@ -233,10 +256,20 @@ def get_xsec_train_weights(self) -> np.ndarray:
             raise Exception("cannot determine train weights without stats")
 
         _, sub_id = get_proc_mask(self._events, self.process, self.ml_model_inst.config_inst)
-        sum_abs_weights = np.sum([self.stats[self.process]["sum_abs_weights_per_process"][str(id)] for id in sub_id])
-        num_events = np.sum([self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id])
+        sum_weights = np.sum([self.stats[self.process]["sum_pos_weights_per_process"][str(id)] for id in sub_id])
+        num_events = np.sum(
+            [self.stats[self.process][self.num_event_per_process][str(id)] for id in sub_id],
+        )
 
-        xsec_train_weights = self.weights / sum_abs_weights * num_events
+        xsec_train_weights = self.weights / sum_weights * num_events
 
         return xsec_train_weights
@@ -252,7 +285,15 @@ def get_equal_train_weights(self) -> np.ndarray:
         combined_proc_inst = self.ml_model_inst.config_inst.get_process(self.process)
         _, sub_id_proc = get_proc_mask(self._events, self.process, self.ml_model_inst.config_inst)
-        num_events = np.sum([self.stats[self.process]["num_events_per_process"][str(id)] for id in sub_id_proc])
+        num_events = np.sum(
+            [self.stats[self.process][self.num_event_per_process][str(id)] for id in sub_id_proc],
+        )
         targeted_sum_of_weights_per_process = (
             num_events / len(combined_proc_inst.x.ml_config.sub_processes)
         )
@@ -685,8 +726,10 @@ def prediction(self) -> np.ndarray:
             self._prediction = self.load_data("prediction")
         else:
             # calculate prediction if needed
-            if not hasattr(self._ml_model_inst, "trained_model"):
+            if not hasattr(self._ml_model_inst, "best_model"):
+                # if not hasattr(self._ml_model_inst, "trained_model"):
                 raise Exception("No trained model found in the MLModel instance. 
Cannot calculate prediction.") - self._prediction = predict_numpy_on_batch(self._ml_model_inst.trained_model, self.features) + # self._prediction = predict_numpy_on_batch(self._ml_model_inst.trained_model, self.features) + self._prediction = predict_numpy_on_batch(self._ml_model_inst.best_model, self.features) - return self._prediction + return self._prediction # TODO ML best model diff --git a/hbw/ml/derived/dl.py b/hbw/ml/derived/dl.py index 05791fe7..1a4563a6 100644 --- a/hbw/ml/derived/dl.py +++ b/hbw/ml/derived/dl.py @@ -80,7 +80,7 @@ class DenseClassifierDL(DenseModelMixin, ModelFitMixin, MLClassifierBase): store_name: str = "inputs_inclusive" folds: int = 5 - negative_weights: str = "handle" + negative_weights: str = "ignore" # overwriting DenseModelMixin parameters activation: str = "relu" @@ -242,8 +242,83 @@ def setup(self) -> None: "processes": ["signal", "top"], }) +dl_22post_ml_study_5 = dl_22post.derive("dl_22post_ml_study_5", cls_dict={ + "training_configs": lambda self, requested_configs: ["c22post"], + "combine_processes": { + "signal_ggf5": { + # "name": "tt_and_st", + "label": "Signal GGF", + "sub_processes": [ + "hh_ggf_hbb_hvv2l2nu_kl0_kt1", + "hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "hh_ggf_hbb_hvv2l2nu_kl5_kt1", + ], + "weighting": "equal", + }, + "signal_vbf5": { + # "name": "tt_and_st", + "label": "Signal VBF", + "sub_processes": [ + "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", + ], + "weighting": "equal", + }, + }, + "processes": [ + "signal_ggf5", + "signal_vbf5", + "tt", + "st", + "dy", + "h", + ], +}) + +dl_22post_ml_study_4 = dl_22post.derive("dl_22post_ml_study_4", cls_dict={ + "training_configs": lambda self, requested_configs: ["c22post"], + "combine_processes": { + "signal_ggf4": { + # "name": "tt_and_st", + "label": "Signal GGF", + "sub_processes": [ + "hh_ggf_hbb_hvv2l2nu_kl0_kt1", + "hh_ggf_hbb_hvv2l2nu_kl1_kt1", + "hh_ggf_hbb_hvv2l2nu_kl2p45_kt1", + ], + "weighting": "xsec", + }, + "signal_vbf4": { + # "name": "tt_and_st", + "label": "Signal VBF", + "sub_processes": [ + "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", + "hh_vbf_hbb_hvv2l2nu_kv1_k2v0_kl1", + "hh_vbf_hbb_hvv2l2nu_kvm0p962_k2v0p959_klm1p43", + "hh_vbf_hbb_hvv2l2nu_kvm1p21_k2v1p94_klm0p94", + "hh_vbf_hbb_hvv2l2nu_kvm1p6_k2v2p72_klm1p36", + "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", + ], + "weighting": "xsec", + }, + }, + "processes": [ + "signal_ggf4", + "signal_vbf4", + "tt", + "st", + "dy", + "h", + ], +}) + dl_22post_ml_study_1 = dl_22post.derive("dl_22post_ml_study_1", cls_dict={ "training_configs": lambda self, requested_configs: ["c22post"], + "negative_weights": "ignore", "combine_processes": { "signal_ggf": { # "name": "tt_and_st", @@ -280,8 +355,13 @@ def setup(self) -> None: ], }) +dl_22post_ml_study_1_handle = dl_22post_ml_study_1.derive("dl_22post_ml_study_1_handle", cls_dict={ + "negative_weights": "handle", +}) + dl_22post_ml_study_2 = dl_22post.derive("dl_22post_ml_study_2", cls_dict={ "training_configs": lambda self, requested_configs: ["c22post"], + "negative_weights": "ignore", "combine_processes": { "signal_ggf2": { # "name": "tt_and_st", @@ -306,7 +386,7 @@ def setup(self) -> None: "hh_vbf_hbb_hvv2l2nu_kvm1p83_k2v3p57_klm3p39", ], - "weighting": "xsec", + "weighting": "equal", }, }, "processes": [ @@ -318,6 +398,11 @@ def setup(self) -> None: "h", ], }) + 
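Note: the derive calls used throughout this file build subclasses whose class attributes are overridden by cls_dict, so a "_handle" variant differs from its parent only in the negative-weights strategy. A minimal sketch of the mechanism — illustrative only; the real derive comes from the order/columnflow base classes and performs additional bookkeeping:

    def derive_sketch(base_cls: type, name: str, cls_dict: dict) -> type:
        # build a subclass that overrides only the given class attributes
        return type(name, (base_cls,), cls_dict)

    # usage, mirroring the derive below (class names are placeholders):
    # Study2Handle = derive_sketch(Study2, "dl_22post_ml_study_2_handle",
    #                              {"negative_weights": "handle"})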
+dl_22post_ml_study_2_handle = dl_22post_ml_study_2.derive("dl_22post_ml_study_2_handle", cls_dict={ + "negative_weights": "handle", +}) + # # setups with different processes (0: baseline, 1: add SM vbf + single H, 2: add SL+all HH variations) # NOTE: we should decide which signal processes exactly to use: @@ -332,16 +417,23 @@ def setup(self) -> None: "training_configs": lambda self, requested_configs: ["c22post", "c22pre"], "processes": ["hh_ggf_hbb_hvv2l2nu_kl1_kt1", "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1", "tt", "st", "dy", "h"], }) -dl_22_procs1_w0 = dl_22_procs1.derive("dl_22_procs1_w1", cls_dict={ +dl_22_procs1_w0 = dl_22_procs1.derive("dl_22_procs1_w0", cls_dict={ + "training_configs": lambda self, requested_configs: ["c22post"], + "negative_weights": "ignore", "ml_process_weights": { "hh_ggf_hbb_hvv2l2nu_kl1_kt1": 1, "hh_vbf_hbb_hvv2l2nu_kv1_k2v1_kl1": 1, "tt": 2, "st": 2, "dy": 2, - "h": 2, + "h": 1, }, }) + +dl_22_procs1_w0_handle = dl_22_procs1_w0.derive("dl_22_procs1_w0_handle", cls_dict={ + "negative_weights": "handle", +}) + dl_22_procs1_w1 = dl_22_procs1.derive("dl_22_procs1_w1", cls_dict={ "ml_process_weights": { "hh_ggf_hbb_hvv2l2nu_kl1_kt1": 1, diff --git a/hbw/ml/mixins.py b/hbw/ml/mixins.py index 38c632e2..3a423119 100644 --- a/hbw/ml/mixins.py +++ b/hbw/ml/mixins.py @@ -97,16 +97,13 @@ def prepare_ml_model( epsilon=1e-6, amsgrad=False, ) - model.compile( - loss=cumulated_crossentropy, - # NOTE: we'd preferrably use the Keras CCE, but it does not work when assigning one event - # to multiple classes (target with multiple entries != 0) - # loss ="categorical_crossentropy", - # loss=tf.keras.losses.CategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE), - optimizer=optimizer, - metrics=["categorical_accuracy"], - weighted_metrics=["categorical_accuracy"], - ) + model_compile_kwargs = { + "loss": "categorical_crossentropy" if self.negative_weights == "ignore" else cumulated_crossentropy, + "optimizer": optimizer, + "metrics": ["categorical_accuracy"], + "weighted_metrics": ["categorical_accuracy"], + } + model.compile(**model_compile_kwargs) return model diff --git a/hbw/ml/stats.py b/hbw/ml/stats.py index 1a4b0a88..9a68c8e6 100644 --- a/hbw/ml/stats.py +++ b/hbw/ml/stats.py @@ -76,15 +76,16 @@ def ml_preparation( events = set_ak_column_f32(events, "event_weight", weight) stats["sum_weights"] += float(ak.sum(weight, axis=0)) weight_map["sum_weights"] = weight - weight_map["sum_abs_weights"] = (weight, weight > 0) - weight_map["sum_pos_weights"] = np.abs(weight) + weight_map["sum_pos_weights"] = (weight, weight > 0) + weight_map["sum_abs_weights"] = np.abs(weight) + weight_map["num_events_pos_weights"] = weight > 0 # normalization weight only norm_weight = events["stitched_normalization_weight"] stats["sum_norm_weights"] += float(ak.sum(norm_weight, axis=0)) weight_map["sum_norm_weights"] = norm_weight - weight_map["sum_abs_norm_weights"] = (norm_weight, norm_weight > 0) - weight_map["sum_pos_norm_weights"] = np.abs(norm_weight) + weight_map["sum_pos_norm_weights"] = (norm_weight, norm_weight > 0) + weight_map["sum_abs_norm_weights"] = np.abs(norm_weight) group_map = { "process": { diff --git a/hbw/tasks/inference.py b/hbw/tasks/inference.py index 98b76320..f291765d 100644 --- a/hbw/tasks/inference.py +++ b/hbw/tasks/inference.py @@ -8,11 +8,10 @@ from typing import Callable -# import luigi import law import order as od -from columnflow.tasks.framework.base import Requirements, RESOLVE_DEFAULT, AnalysisTask +from columnflow.tasks.framework.base import Requirements, 
RESOLVE_DEFAULT, AnalysisTask, ConfigTask
 from columnflow.tasks.framework.parameters import SettingsParameter
 from columnflow.tasks.framework.mixins import (
     CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, InferenceModelMixin,
@@ -22,14 +21,72 @@
 from columnflow.util import dev_sandbox, maybe_import
 from hbw.tasks.base import HBWTask
 
-# import hbw.inference.constants as const
-
 array = maybe_import("array")
+uproot = maybe_import("uproot")
+np = maybe_import("numpy")
+hist = maybe_import("hist")
 
 logger = law.logger.get_logger(__name__)
 
 
+# Function copied from Mathis' hist hook commit
+# TODO: define once at a central place (hist_util.py)
+def apply_rebinning_edges(h: hist.Histogram, axis_name: str, edges: list):
+    """
+    Generalized rebinning of a single axis from a hist.Histogram, using predefined edges.
+
+    :param h: histogram to rebin
+    :param axis_name: string representing the axis to rebin
+    :param edges: list of floats representing the new bin edges. Must be a subset of the original edges.
+    :return: rebinned histogram
+    """
+    if isinstance(edges, int):
+        return h[{axis_name: hist.rebin(edges)}]
+
+    ax = h.axes[axis_name]
+    ax_idx = [a.name for a in h.axes].index(axis_name)
+    if not all(np.isclose(x, ax.edges).any() for x in edges):
+        raise ValueError(
+            f"Cannot rebin histogram due to incompatible edges for axis '{ax.name}'\n"
+            f"Edges of histogram are {ax.edges}, requested rebinning to {edges}",
+        )
+
+    # if rebinning to a subset of the initial range, keep the overflow and underflow
+    overflow = ax.traits.overflow or (edges[-1] < ax.edges[-1] and not np.isclose(edges[-1], ax.edges[-1]))
+    underflow = ax.traits.underflow or (edges[0] > ax.edges[0] and not np.isclose(edges[0], ax.edges[0]))
+    flow = overflow or underflow
+    new_ax = hist.axis.Variable(edges, name=ax.name, overflow=overflow, underflow=underflow)
+    axes = list(h.axes)
+    axes[ax_idx] = new_ax
+
+    hnew = hist.Hist(*axes, name=h.name, storage=h._storage_type())
+
+    # offset from the bin edge to avoid numeric issues
+    offset = 0.5 * np.min(ax.edges[1:] - ax.edges[:-1])
+    edges_eval = np.asarray(edges) + offset  # asarray: a plain list would raise on "+ offset"
+    edge_idx = ax.index(edges_eval)
+    # avoid going outside the range, reduceat will add the last index anyway
+    if edge_idx[-1] == ax.size + ax.traits.overflow:
+        edge_idx = edge_idx[:-1]
+
+    if underflow:
+        # only if the original axis had an underflow should you offset
+        if ax.traits.underflow:
+            edge_idx += 1
+        edge_idx = np.insert(edge_idx, 0, 0)
+
+    # take is used because reduceat sums i:len(array) for the last entry; in the case
+    # where the final bin isn't the same between the initial and rebinned histogram,
+    # this value must be dropped
+    hnew.values(flow=flow)[...] = np.add.reduceat(
+        h.values(flow=flow), edge_idx, axis=ax_idx,
+    ).take(indices=range(new_ax.size + underflow + overflow), axis=ax_idx)
+    if hnew._storage_type() == hist.storage.Weight():
+        hnew.variances(flow=flow)[...] = np.add.reduceat(
+            h.variances(flow=flow), edge_idx, axis=ax_idx,
+        ).take(indices=range(new_ax.size + underflow + overflow), axis=ax_idx)
+    return hnew
+
+
 def get_hist_name(cat_name: str, proc_name: str, syst_name: str | None = None) -> str:
     hist_name = f"{cat_name}/{proc_name}"
     if syst_name:
@@ -68,41 +125,34 @@ def get_rebin_values(hist, N_bins_final: int = 10):
     Function that determines how to rebin a hist to *N_bins_final* bins such that
     the resulting histogram is flat
     """
-    N_bins_input = hist.GetNbinsX()
-
-    # # replace empty bin values (TODO: remove as soon as we can disable the empty bin filling)
-    # EMPTY_BIN_VALUE = 1e-5
-    # for i in range(1, N_bins_input + 1):
-    #     if hist.GetBinContent(i) == EMPTY_BIN_VALUE:
-    #         hist.SetBinContent(i, 0)
+    N_bins_input = hist.axes[0].size
 
     # determine events per bin the final histogram should have
-    events_per_bin = hist.Integral() / N_bins_final
+    events_per_bin = hist.sum().value / N_bins_final
     logger.info(f"============ {round(events_per_bin, 3)} events per bin")
 
     # bookkeeping number of bins and number of events
    bin_count = 1
     N_events = 0
-    rebin_values = [hist.GetBinLowEdge(1)]
+    rebin_values = [float(hist.axes[0].edges[0])]
 
-    # starting loop at 1 to exclude underflow
-    # ending at N_bins_input + 1 excludes overflow
-    for i in range(1, N_bins_input + 1):
+    # hist.view() and hist.axes[0].edges contain no flow bins,
+    # so all input bins are iterated with 0-based indices
+    for i in range(N_bins_input):
         if bin_count == N_bins_final:
             # break as soon as N-1 bin edges have been determined --> final bin is x_max
             break
-
-        N_events += hist.GetBinContent(i)
+        N_events += hist.view()["value"][i]
         if i % 100 == 0:
             logger.info(f"========== Bin {i} of {N_bins_input}, {N_events} events")
         if N_events >= events_per_bin * bin_count:
             # when *N_events* surpasses threshold, append the upper edge of the current bin and count
-            logger.info(f"++++++++++ Append bin edge {bin_count} of {N_bins_final} at edge {hist.GetBinLowEdge(i)}")
-            rebin_values.append(hist.GetBinLowEdge(i + 1))
+            logger.info(f"++++++++++ Append bin edge {bin_count} of {N_bins_final} at edge {hist.axes[0].edges[i]}")
+            rebin_values.append(hist.axes[0].edges[i + 1])
             bin_count += 1
 
     # final bin is x_max
-    x_max = hist.GetBinLowEdge(N_bins_input + 1)
+    x_max = hist.axes[0].edges[N_bins_input]
     rebin_values.append(x_max)
     logger.info(f"final bin edges: {rebin_values}")
     return rebin_values
@@ -111,7 +161,7 @@ def get_rebin_values(hist, N_bins_final: int = 10):
 def apply_binning(hist, rebin_values: list):
     N_bins = len(rebin_values) - 1
     rebin_values_ptr = array.array("d", rebin_values)
-
+    # rebin_values_ptr = np.array(rebin_values, dtype="float64")
     h_out = hist.Rebin(N_bins, hist.GetName(), rebin_values_ptr)
     return h_out
@@ -202,10 +252,12 @@ class ModifyDatacardsFlatRebin(
     ProducersMixin,
     SelectorStepsMixin,
     CalibratorsMixin,
+    ConfigTask,
     law.LocalWorkflow,
     RemoteWorkflow,
 ):
-    sandbox = dev_sandbox("bash::$CF_BASE/sandboxes/cmssw_default.sh")
+
+    sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox"))
 
     bins_per_category = SettingsParameter(
         default=(RESOLVE_DEFAULT,),
@@ -268,7 +320,6 @@ def get_rebin_processes(self):
         """
         config_category = self.branch_data.config_category
         processes = self.branch_data.processes.copy()
-        # inf_proc_names = [self.inference_model_inst.inf_proc(proc.name) for proc in self.branch_data.processes]
 
         rebin_process_condition = self.inference_category_rebin_processes.get(config_category, None)
         if not rebin_process_condition:
@@ -321,11 +372,6 @@ def output(self):
         }
 
     def run(self):
-        import uproot
-        from ROOT import TH1
-
-        # config_inst = self.config_inst
-        # inference_model = self.inference_model_inst
 
         inputs = self.input()
         outputs = self.output()
@@ -339,7 +385,6 @@ def run(self):
         outputs["card"].dump(datacard, formatter="text")
 
         with uproot.open(inp_shapes.fn) as file:
-            # logger.info(f"File keys: {file.keys()}")
             # determine which histograms are present
             cat_names, proc_names, syst_names = get_cat_proc_syst_names(file)
@@ -355,7 +400,8 @@ def run(self):
 
             # get all nominal histograms
             nominal_hists = {
-                proc_name: file[get_hist_name(cat_name, proc_name)].to_pyroot()
+                proc_name: file[get_hist_name(cat_name, proc_name)].to_hist()
+                # proc_name: file[get_hist_name(cat_name, proc_name)].to_pyroot()
                 for proc_name in proc_names
             }
@@ -379,22 +425,23 @@ def run(self):
 
             # apply rebinning on all histograms and store resulting hists in a ROOT file
             out_file = uproot.recreate(outputs["shapes"].fn)
+            rebinned_histograms = {}
             for key, h in file.items():
-                # remove ";1" appendix
+
                 key = key.split(";")[0]
-                try:
-                    # convert histograms to pyroot
-                    h = h.to_pyroot()
-                except AttributeError:
-                    # skip non-histograms
-                    continue
-                if not isinstance(h, TH1):
-                    raise Exception(f"{h} is not a TH1 histogram")
+                if "TH1" in str(h):
+                    try:
+                        h = file[key].to_hist()
+                    except AttributeError:
+                        continue
+                    h_rebin = apply_rebinning_edges(h, h.axes[0].name, rebin_values)
+                    # problematic_bin_count = check_empty_bins(h_rebin)
+                    # TODO: reimplement check_empty_bins, checking for all backgrounds rather than per process
+                    rebinned_histograms[key] = h_rebin
 
-                h_rebin = apply_binning(h, rebin_values)
-                problematic_bin_count = check_empty_bins(h_rebin)  # noqa
-                logger.info(f"Inserting histogram with name {key}")
-                out_file[key] = uproot.from_pyroot(h_rebin)
+                    logger.info(f"Inserting histogram with name {key}")
+                    out_file[key] = h_rebin
 
 
 class PrepareInferenceTaskCalls(
@@ -404,6 +451,7 @@ class PrepareInferenceTaskCalls(
     ProducersMixin,
     SelectorStepsMixin,
     CalibratorsMixin,
+    ConfigTask,
 ):
     """
     Simple task that produces strings to run certain tasks in Inference
@@ -457,6 +505,9 @@ def run(self):
         # combine category names with card fn to a single string
         datacards = ",".join([f"{cat_name}=$CARDS_PATH/{card_fn}" for cat_name, card_fn in zip(cat_names, card_fns)])
 
+        # # name of the output root file that contains the Pre+Postfit shapes
+        # output_file = ""
+
         base_cmd = f"export CARDS_PATH={cards_path}" + "\n"
         print("\n\n")
@@ -490,4 +541,11 @@ def run(self):
             f"--order-by-impact"
         )
         print(cmd, "\n\n")
+
+        # run PreAndPostFitShapes for pre- and postfit plots
+        cmd = base_cmd + (
+            f"law run PreAndPostFitShapes --version {identifier} --datacards {datacards} "
+            # f"--output-name {output_file}"
+        )
+        print(cmd, "\n\n")
 
         output["FitDiagnostics"].dump(cmd, formatter="text")
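Note: a short usage sketch for apply_rebinning_edges as defined above, on a synthetic weighted histogram; the new edges must line up with existing ones, otherwise the function raises:

    import hist
    import numpy as np

    h = hist.Hist(
        hist.axis.Regular(10, 0.0, 1.0, name="score"),
        storage=hist.storage.Weight(),
    )
    h.fill(score=np.random.default_rng(0).uniform(size=1_000))
    # merge the ten uniform bins into three bins with the given edges
    h_rebinned = apply_rebinning_edges(h, "score", [0.0, 0.3, 0.7, 1.0])
    print(h_rebinned.axes["score"].edges)  # [0.  0.3 0.7 1. ]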
diff --git a/hbw/tasks/ml.py b/hbw/tasks/ml.py
index ac2eb876..44b8ffa2 100644
--- a/hbw/tasks/ml.py
+++ b/hbw/tasks/ml.py
@@ -330,6 +330,8 @@ def output(self):
         # the stats dict is created per process+fold, but should always be identical, therefore we store it only once
         outputs["stats"] = self.target("stats.json")
 
+        outputs["cross_check_weights"] = self.target(f"cross_check_weights_{process}_{self.fold}.yaml")
+
         outputs["parameters"] = self.target("parameters.yaml")
 
         return outputs
@@ -398,23 +400,36 @@ def run(self):
         outputs["stats"].dump(stats, formatter="json")
 
         # initialize the DatasetLoader
-        ml_dataset = self.ml_model_inst.data_loader(self.ml_model_inst, process, events, stats)
-        # NOTE: Almost closure
+        if self.ml_model_inst.negative_weights == "ignore":
+            ml_dataset = self.ml_model_inst.data_loader(self.ml_model_inst, process, events, stats)
+            logger.info(
+                f"{self.ml_model_inst.negative_weights} method chosen to handle negative weights: "
+                "all negative weights will be removed from training.",
+            )
+        else:
+            ml_dataset = self.ml_model_inst.data_loader(self.ml_model_inst, process, events, stats, skip_mask=True)
 
         sum_train_weights = np.sum(ml_dataset.train_weights)
         n_events_per_fold = len(ml_dataset.train_weights)
-        logger.info(f"Sum of traing weights is: {sum_train_weights} for {n_events_per_fold} {process} events")
-
-        # check that equal weighting works as intended
+        logger.info(f"Sum of training weights is: {sum_train_weights} for {n_events_per_fold} {process} events")
 
+        xcheck = {}
         if self.ml_model_inst.config_inst.get_process(process).x("ml_config", None):
+            xcheck[process] = {}
             if self.ml_model_inst.config_inst.get_process(process).x.ml_config.weighting == "equal":
                 for sub_proc in self.ml_model_inst.config_inst.get_process(process).x.ml_config.sub_processes:
-                    proc_mask, sub_id = get_proc_mask(events, sub_proc, self.ml_model_inst.config_inst)
+                    proc_mask, _ = get_proc_mask(ml_dataset._events, sub_proc, self.ml_model_inst.config_inst)
                     xcheck_weight_sum = np.sum(ml_dataset.train_weights[proc_mask])
                     xcheck_n_events = len(ml_dataset.train_weights[proc_mask])
                     logger.info(
                         f"For the equal weighting method the sum of weights for {sub_proc} is {xcheck_weight_sum} "
                         f"(No. of events: {xcheck_n_events})",
                     )
+                    if sub_proc not in xcheck[process]:
+                        xcheck[process][sub_proc] = {}
+                    xcheck[process][sub_proc]["weight_sum"] = float(xcheck_weight_sum)
+                    xcheck[process][sub_proc]["num_events"] = xcheck_n_events
+
+        outputs["cross_check_weights"].dump(xcheck, formatter="yaml")
 
         for input_array in self.ml_model_inst.data_loader.input_arrays:
             logger.info(f"loading data for input array {input_array}")
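Note: the cross-check dumped above records, per sub-process, the sum of training weights and the event count; with the "equal" weighting the sums should roughly agree across sub-processes. A toy illustration of that invariant (all numbers made up):

    import numpy as np

    train_weights = np.array([0.5, 1.5, 0.8, 1.2])
    masks = {
        "hh_ggf_kl0": np.array([True, True, False, False]),
        "hh_ggf_kl1": np.array([False, False, True, True]),
    }
    for sub_proc, mask in masks.items():
        # both sums come out at ~2.0 if the equal weighting worked
        print(sub_proc, train_weights[mask].sum(), int(mask.sum()))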
diff --git a/hbw/tasks/postfit_plots.py b/hbw/tasks/postfit_plots.py
index 9df3745a..4069b211 100644
--- a/hbw/tasks/postfit_plots.py
+++ b/hbw/tasks/postfit_plots.py
@@ -6,54 +6,101 @@
 
 from __future__ import annotations
 
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 
 import law
 import luigi
 import order as od
-
 from columnflow.tasks.framework.base import ConfigTask
 from columnflow.tasks.framework.mixins import (
     InferenceModelMixin, MLModelsMixin, ProducersMixin, SelectorStepsMixin,
-    CalibratorsMixin,
+    CalibratorsMixin, DatasetsProcessesMixin,
 )
 from columnflow.tasks.framework.plotting import (
-    PlotBase, PlotBase1D,
+    PlotBase, PlotBase1D, ProcessPlotSettingMixin,
     # VariablesPlotSettingMixin,
 )
 from columnflow.tasks.framework.decorators import view_output_plots
 from hbw.tasks.base import HBWTask
 from columnflow.util import dev_sandbox, DotDict, maybe_import
 
+# from hbw.inference.base import inf_proc
+
 uproot = maybe_import("uproot")
+hist = maybe_import("hist")
 
 logger = law.logger.get_logger(__name__)
 
 
-def load_hists_uproot(fit_diagnostics_path):
+def load_hists_uproot(fit_diagnostics_path, fit_type):
     """ Helper to load histograms from a fit_diagnostics file """
+    with uproot.open(fit_diagnostics_path) as tfile:
+        if any("shapes_fit_s" in _k for _k in tfile.keys()):
+            if fit_type == "postfit":
+                fit_type = "fit_s"
+            hists = get_hists_from_fit_diagnostics(tfile)[f"shapes_{fit_type}"]
+        else:
+            hists = get_hists_from_multidimfit(tfile)[f"{fit_type}"]
+
+    return hists
+
+
+def get_hists_from_fit_diagnostics(tfile):
+    """ Helper function to load histograms from a root file created by FitDiagnostics """
+    # prepare output dict
     hists = DotDict()
-    with uproot.open(fit_diagnostics_path) as tfile:
-        keys = [key.split("/") for key in tfile.keys()]
-        for key in keys:
-            if len(key) != 3:
-                continue
-            # get the histogram from the tfile
-            h_in = tfile["/".join(key)]
+    keys = [key.split("/") for key in tfile.keys()]
+    for key in keys:
+        if len(key) != 3:
+            continue
 
-            # unpack key
-            fit, channel, process = key
-            process = process.split(";")[0]
+        # get the histogram from the tfile
+        h_in = tfile["/".join(key)]
 
-            if "data" not in process:
-                # transform TH1F to hist
-                h_in = h_in.to_hist()
+        # unpack key
+        fit, channel, process = key
+        process = process.split(";")[0]
 
-            # set the histogram in a deep dictionary
-            hists = law.util.merge_dicts(hists, DotDict.wrap({fit: {channel: {process: h_in}}}), deep=True)
+        if "data" in process or "total" in process:
+            continue
+        else:
+            # transform TH1F to hist
+            h_in = h_in.to_hist()
+
+        # set the histogram in a deep dictionary
+        hists = law.util.merge_dicts(hists, DotDict.wrap({fit: {channel: {process: h_in}}}), deep=True)
+    return hists
+
+
+def get_hists_from_multidimfit(tfile):
+    """ Helper function to load histograms from a root file created by MultiDimFit """
+    # prepare output dict
+    hists = DotDict()
+    keys = [key.split("/") for key in tfile.keys()]
+    for key in keys:
+        if len(key) != 2:
+            continue
+        # get the histogram from the tfile
+        h_in = tfile["/".join(key)]
+
+        # unpack key
+        fit_and_channel, process = key
+        fit = fit_and_channel.split("_")[-1]
+        channel = fit_and_channel.replace(f"_{fit}", "")
+        process = process.split(";")[0]
+
+        if "data" in process or "Total" in process:
+            continue
+        else:
+            # transform TH1F to hist
+            h_in = h_in.to_hist()
+
+        # set the histogram in a deep dictionary
+        hists = law.util.merge_dicts(hists, DotDict.wrap({fit: {channel: {process: h_in}}}), deep=True)
     return hists
 
@@ -67,11 +114,12 @@ def load_hists_uproot(fit_diagnostics_path):
 
 from columnflow.plotting.plot_util import (
     prepare_plot_config,
     prepare_style_config,
+    apply_process_settings,
 )
 
 
 def plot_postfit_shapes(
-    hists: OrderedDict,
+    hists: OrderedDict[od.Process, hist.Hist],
     config_inst: od.Config,
     category_inst: od.Category,
     variable_insts: list[od.Variable],
@@ -85,7 +133,7 @@ def plot_postfit_shapes(
     **kwargs,
 ) -> tuple(plt.Figure, tuple(plt.Axes)):
     variable_inst = law.util.make_tuple(variable_insts)[0]
-
+    hists = apply_process_settings(hists, process_settings)
     plot_config = prepare_plot_config(
         hists,
         shape_norm=shape_norm,
@@ -111,6 +159,8 @@ class PlotPostfitShapes(
     # NOTE: mixins might be wrong and could (should?) be extended to MultiConfigTask
     HBWTask,
     PlotBase1D,
+    ProcessPlotSettingMixin,
+    DatasetsProcessesMixin,
     # to correctly setup our InferenceModel, we need all these mixins, but hopefully, all these
     # parameters are automatically resolved correctly
     InferenceModelMixin,
@@ -131,9 +181,10 @@ class PlotPostfitShapes(
     """
 
     sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox"))
+    datasets = None
 
     plot_function = PlotBase.plot_function.copy(
-        default="hbw.tasks.plotting.plot_postfit_shapes",
+        default="hbw.tasks.postfit_plots.plot_postfit_shapes",
         add_default_to_description=True,
     )
@@ -147,11 +198,60 @@ class PlotPostfitShapes(
         description="Whether to do prefit or postfit plots; defaults to False",
     )
 
+    @property
+    def fit_type(self) -> str:
+        if self.prefit:
+            return "prefit"
+        else:
+            return "postfit"
+
     def requires(self):
         return {}
 
     def output(self):
-        return {"plots": self.target("plots", dir=True)}
+        return {"plots": self.target(f"plots_{self.fit_type}", dir=True)}
+
+    # map processes in the root shapes to the process instances used for plotting
+    def prepare_hist_map(self, hist_processes: set, process_insts: list) -> defaultdict:
+
+        hist_map = defaultdict(list)
+
+        for proc_key in hist_processes:
+            proc_inst = None
+            # try getting the config process via InferenceModel
+            # NOTE: only the first category is taken to get the config process instance,
+            # assuming they are the same for all categories
+            channel = self.inference_model_inst.get_categories_with_process(proc_key)[0]
+            has_category = self.inference_model_inst.has_category(channel)
+            if has_category:
+                inference_process = self.inference_model_inst.get_process(proc_key, channel)
+                proc_inst = self.config_inst.get_process(inference_process.config_process)
+            else:
+                # try getting the process inst directly via config
+                proc_inst = self.config_inst.get_process(proc_key, default=None)
+
+            # replace string keys with process instances,
+            # mapping HH inference processes to process instances for plotting
+            if proc_inst:
+                plot_proc = [
+                    proc for proc in process_insts if proc.has_process(proc_inst) or proc.name == proc_inst.name
+                ]
+                if len(plot_proc) > 1:
+                    logger.warning(
+                        f"{proc_key} was assigned to ({','.join([p.name for p in plot_proc])}),"
+                        f" but {plot_proc[0].name} was chosen",
+                    )
+                elif len(plot_proc) == 0:
+                    logger.warning(f"{proc_key} in root file, but won't be plotted.")
+                    continue
+                plot_proc = plot_proc[0]
+
+                hist_map[plot_proc].append(proc_key)
+
+        return hist_map
 
     @view_output_plots
     def run(self):
         logger.warning(
            f"Note! 
It is important that the requested inference_model {self.inference_model} "
             "is identical to the one that has been used to create the datacards",
         )
-        all_hists = load_hists_uproot(self.fit_diagnostics_file)
 
         outp = self.output()
 
-        if self.prefit:
-            fit_type = "prefit"
-        else:
-            fit_type = "fit_s"
 
-        all_hists = all_hists[f"shapes_{fit_type}"]
+        # load all required histograms corresponding to the fit_type from the root input file
+        all_hists = load_hists_uproot(self.fit_diagnostics_file, self.fit_type)
+
+        # get the list of all process instances required for plotting
+        process_insts = list(map(self.config_inst.get_process, self.processes))
+
+        # map processes in the root shapes to the corresponding process instances used for plotting
+        hist_processes = {key for _, h_in in all_hists.items() for key in h_in.keys()}
+        hist_map = self.prepare_hist_map(hist_processes, process_insts)
 
-        for channel, hists in all_hists.items():
+        # plot pre/postfit distributions for each channel
+        for channel, h_in in all_hists.items():
+
+            # check for coherence between inference and pre/postfit categories
             has_category = self.inference_model_inst.has_category(channel)
             if not has_category:
                 logger.warning(f"Category {channel} is not part of the inference model {self.inference_model}")
+                continue
 
-            for proc_key in list(hists.keys()):
-                # remove unnecessary histograms
-                if "data" in proc_key or "total" in proc_key:
-                    hists.pop(proc_key)
+            # create the histograms
+            hists = OrderedDict()
+            for proc in hist_map:
+                plot_proc = proc.copy()  # NOTE: copy, so the actual process inst is not modified by process settings
+                if hist_map[proc][0] not in h_in:
+                    logger.warning(f"No histograms for {proc.name} found in {channel}.")
                     continue
+                hists[plot_proc] = h_in[hist_map[proc][0]]
+                for p in hist_map[proc][1:]:
+                    hists[plot_proc] += h_in[p]
 
-                proc_inst = None
-                # try getting the config process via InferenceModel
-                if has_category:
-                    # TODO: process customization based on inference process? e.g. scale
-                    inference_process = self.inference_model_inst.get_process(proc_key, channel)
-                    proc_inst = self.config_inst.get_process(inference_process.config_process)
-                else:
-                    # try getting proc inst directly via config
-                    proc_inst = self.config_inst.get_process(proc_key, default=None)
-
-                # replace string keys with process instances
-                if proc_inst:
-                    hists[proc_inst] = hists[proc_key]
-                    hists.pop(proc_key)
-
-            # try getting the config category and variable via InferenceModel
             if has_category:
-                # TODO: category/variable customization based on inference model?
                 inference_category = self.inference_model_inst.get_category(channel)
                 config_category = self.config_inst.get_category(inference_category.config_category)
                 variable_inst = self.config_inst.get_variable(inference_category.config_variable)
@@ -207,13 +302,14 @@ def run(self):
                 variable_inst = od.Variable("dummy")
 
             # call the plot function
+            h = hists.copy()  # NOTE: shallow copy, so the per-channel mapping itself is not modified downstream
             fig, _ = self.call_plot_func(
                 self.plot_function,
-                hists=hists,
+                hists=h,
                 config_inst=self.config_inst,
                 category_inst=config_category,
                 variable_insts=variable_inst,
                 **self.get_plot_parameters(),
             )
-            outp["plots"].child(f"{channel}_{fit_type}.pdf", type="f").dump(fig, formatter="mpl")
+            outp["plots"].child(f"{channel}_{self.fit_type}.pdf", type="f").dump(fig, formatter="mpl")
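Note: for reference, these are the two file layouts that load_hists_uproot distinguishes above. FitDiagnostics nests histograms as shapes_{prefit,fit_s}/<channel>/<process>, while the MultiDimFit-based PreAndPostFitShapes output uses <channel>_{prefit,postfit}/<process>. A reading sketch — file names and key names are illustrative only:

    import uproot

    # FitDiagnostics layout
    with uproot.open("fitDiagnostics.root") as f:
        h_tt = f["shapes_fit_s/sr__1b/tt"].to_hist()

    # MultiDimFit / PreAndPostFitShapes layout
    with uproot.open("shapes.root") as f:
        h_tt = f["sr__1b_postfit/tt"].to_hist()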