From 8d481f8e2bbfa5403787b861d764282b9c486404 Mon Sep 17 00:00:00 2001
From: Aashwin Basnet
Date: Thu, 30 Jan 2025 21:32:56 -0500
Subject: [PATCH] Add nonprompt photon estimation

---
 analysis/topeft_run2/run_analysis.py    |  17 +--
 topeft/modules/dataDrivenEstimation.py  |  81 +++++++++++++-
 .../nonprompt_unc_propagation_helper.py | 100 ++++++++++++++++++
 3 files changed, 188 insertions(+), 10 deletions(-)
 create mode 100644 topeft/modules/nonprompt_unc_propagation_helper.py

diff --git a/analysis/topeft_run2/run_analysis.py b/analysis/topeft_run2/run_analysis.py
index 5a46e10b..b0b1c29d 100644
--- a/analysis/topeft_run2/run_analysis.py
+++ b/analysis/topeft_run2/run_analysis.py
@@ -52,6 +52,7 @@
     parser.add_argument('--skip-sr', action='store_true', help = 'Skip all signal region categories')
     parser.add_argument('--skip-cr', action='store_true', help = 'Skip all control region categories')
     parser.add_argument('--do-np' , action='store_true', help = 'Perform nonprompt estimation on the output hist, and save a new hist with the np contribution included. Note that signal, background and data samples should all be processed together in order for this option to make sense.')
+    parser.add_argument('--do-np-ph', action='store_true', help = 'Perform the nonprompt photon estimation as well (only has an effect together with --do-np)')
     parser.add_argument('--do-renormfact-envelope', action='store_true', help = 'Perform renorm/fact envelope calculation on the output hist (saves the modified histogram with the same name as the original).')
     parser.add_argument('--wc-list', action='extend', nargs='+', help = 'Specify a list of Wilson coefficients to use in filling histograms.')
     parser.add_argument('--hist-list', action='extend', nargs='+', help = 'Specify a list of histograms to fill.')
@@ -77,6 +78,7 @@
     skip_sr  = args.skip_sr
     skip_cr  = args.skip_cr
     do_np    = args.do_np
+    do_np_ph = args.do_np_ph
     do_renormfact_envelope = args.do_renormfact_envelope
     wc_lst = args.wc_list if args.wc_list is not None else []
@@ -118,7 +120,7 @@
         # Here we hardcode a list of hists used for the analysis
         hist_lst = ["njets","lj0pt","ptz"]
     elif args.hist_list == ["photon"]:
-        hist_lst = ['njet_bjet','photon_pt','invmass','l0pt','l1pt','njets','nbjetsm']
+        hist_lst = ['photon_pt','photon_eta','photon_eta2','photon_pt2','photon_pt_eta']
     elif args.hist_list == ["cr"]:
         # Here we hardcode a list of hists used for the CRs
         hist_lst = ["lj0pt", "ptz", "met", "ljptsum", "l0pt", "l0eta", "l1pt", "l1eta", "j0pt", "j0eta", "njets", "nbtagsl", "invmass"]
@@ -336,9 +338,9 @@ def LoadJsonToSampleName(jsonFile, prefix):
     if executor == "work_queue":
         print('Processed {} events in {} seconds ({:.2f} evts/sec).'.format(nevts_total,dt,nevts_total/dt))
-    #nbins = sum(sum(arr.size for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
-    #nfilled = sum(sum(np.sum(arr > 0) for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
-    #print("Filled %.0f bins, nonzero bins: %1.1f %%" % (nbins, 100*nfilled/nbins,))
+    nbins = sum(sum(arr.size for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
+    nfilled = sum(sum(np.sum(arr > 0) for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
+    print("Filled %.0f bins, nonzero bins: %1.1f %%" % (nbins, 100*nfilled/nbins,))

     if executor == "futures":
         print("Processing time: %1.2f s with %i workers (%.2f s cpu overall)" % (dt, nworkers, dt*nworkers, ))
@@ -353,9 +355,12 @@ def LoadJsonToSampleName(jsonFile, prefix):
     # Run the data driven estimation, save the output
     if do_np:
-        print("\nDoing the nonprompt estimation...")
+        if not do_np_ph:
+            print("\nDoing the nonprompt lepton estimation...")
+        else:
+            print("\nDoing the nonprompt lepton and photon estimation...")
         out_pkl_file_name_np = os.path.join(outpath,outname+"_np.pkl.gz")
-        ddp = DataDrivenProducer(out_pkl_file,out_pkl_file_name_np)
+        ddp = DataDrivenProducer(out_pkl_file,out_pkl_file_name_np, do_np_ph=do_np_ph)
         print(f"Saving output in {out_pkl_file_name_np}...")
         ddp.dumpToPickle()
         print("Done!")
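For reference, a hypothetical invocation that exercises the new flag (the cfg path is illustrative; the flags and the 'photon' hist-list shortcut are the ones defined above):

    # Fill the photon histogram list, then run the data-driven nonprompt lepton
    # and photon estimations on the output; --do-np-ph only takes effect
    # together with --do-np.
    python run_analysis.py input_samples/cfgs/photon_samples.cfg --hist-list photon --do-np --do-np-ph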
diff --git a/topeft/modules/dataDrivenEstimation.py b/topeft/modules/dataDrivenEstimation.py
index 577e15d1..cdd5f601 100644
--- a/topeft/modules/dataDrivenEstimation.py
+++ b/topeft/modules/dataDrivenEstimation.py
@@ -7,10 +7,11 @@
 from topeft.modules.paths import topeft_path
 from topcoffea.modules.get_param_from_jsons import GetParam
+from topeft.modules.nonprompt_unc_propagation_helper import modify_NP_photon_pt_eta_variance, modify_photon_pt_variance
 get_te_param = GetParam(topeft_path("params/params.json"))

 class DataDrivenProducer:
-    def __init__(self, inputHist, outputName):
+    def __init__(self, inputHist, outputName, do_np_ph=False):
         if isinstance(inputHist, str) and inputHist.endswith('.pkl.gz'): # we are plugging a pickle file
             self.inhist=utils.get_hist_from_pkl(inputHist)
         else: # we already have the histogram
@@ -19,7 +20,10 @@ def __init__(self, inputHist, outputName):
         self.verbose=False
         self.dataName='data'
         self.outHist=None
+        self.closure=False # whether we are running the ABCD closure test for the nonprompt photon estimation
+        self.do_np_ph=do_np_ph # controls whether the nonprompt photon estimation is run at all
         self.promptSubtractionSamples=get_te_param('prompt_subtraction_samples')
+        self.promptPhSubtractionSamples=get_te_param('prompt_photon_subtraction_samples')
         self.DDFakes()

     def DDFakes(self):
@@ -28,8 +32,11 @@ def DDFakes(self):
         self.outHist={}

-        for key,histo in self.inhist.items():
+        # This dictionary collects the photon_pt_eta and photon_pt_eta_sumw2 hists needed to propagate the FR/kMC statistical uncertainties
+        required_hists_for_nonprompt_ph={}
+        np_uncertainty_propagation_done = False
+        for key,histo in self.inhist.items():
             if histo.empty(): # histo is empty, so we just integrate over appl and keep an empty histo
                 print(f'[W]: Histogram {key} is empty, returning an empty histo')
                 self.outHist[key]=histo.integrate('appl')
@@ -92,10 +99,11 @@
                 newhist += hFlips

-            else:
+            elif ident == "isAR_2lOS":
                 # if we are in the nonprompt application region, we also integrate the application region axis
                 # and construct the new process 'nonprompt'
                 # we look at data only, and rename it to fakes
+                print(f"\n\nWe are inside the {ident} appl axis")
                 newNameDictData=defaultdict(list); newNameDictNoData=defaultdict(list)
                 for process in hAR.axes['process']:
                     match = pattern.search(process)
@@ -132,7 +140,72 @@
                     else:
                         newhist += hFakes

-            self.outHist[key]=newhist
+            # isAR_B_ABCD is the regular AR from which we estimate the nonprompt photon yield in our signal region A
+            # isAR_R_LRCD is the "AR" corresponding to the "SR" L in the ABCD closure test
+            elif ident in ["isAR_R_LRCD", "isAR_B_ABCD"] and self.do_np_ph:
+                print(f"\n\nWe are inside the {ident} appl axis")
+                newDataDict=defaultdict(list); newNonDataDict=defaultdict(list)
+                for process in hAR.axes['process']:
+                    match = pattern.search(process)
+                    sampleName=match.group('process')
+                    year=match.group('year')
+                    nonPromptPhName='nonpromptPhUL%s'%year
+                    if self.dataName==sampleName:
+                        newDataDict[nonPromptPhName].append(process)
+                    elif sampleName in self.promptPhSubtractionSamples:
+                        newNonDataDict[nonPromptPhName].append(process)
+                    elif sampleName.startswith(("ZGToLLGISR", "ZGToLLGFSR")):
+                        newNonDataDict[nonPromptPhName].append(process)
+                    else:
+                        print(f"We won't consider {sampleName} for the prompt photon subtraction in the appl. region")
+                hPhFakes=hAR.group('process', newDataDict)
+                # now we take everything that is not data in the AR for the prompt photon subtraction and assign it to nonprompt
+                hPromptPhSub=hAR.group('process', newNonDataDict)
+
+                # remove the up/down variations (if any) from the prompt photon subtraction histo,
+                # but keep nonpromptPhUp and nonpromptPhDown, as these are the nonprompt photon up and down variations
+                syst_var_idet_rm_lst = []
+                syst_var_idet_lst = list(hPromptPhSub.axes["systematic"])
+                for syst_var_idet in syst_var_idet_lst:
+                    if (syst_var_idet != "nominal") and (not syst_var_idet.startswith("nonpromptPh")):
+                        syst_var_idet_rm_lst.append(syst_var_idet)
+                hPromptPhSub = hPromptPhSub.remove("systematic", syst_var_idet_rm_lst)
+
+                # now we actually make the subtraction; no sign flip for the sumw2 histograms, since variances add
+                if not key.endswith("_sumw2"):
+                    hPromptPhSub.scale(-1)
+                hPhFakes += hPromptPhSub
+                if key == "photon_pt_eta":
+                    required_hists_for_nonprompt_ph["photon_pt_eta"] = hPhFakes
+                elif key == "photon_pt_eta_sumw2":
+                    required_hists_for_nonprompt_ph["photon_pt_eta_sumw2"] = hPhFakes
+                if not np_uncertainty_propagation_done and "photon_pt_eta" in required_hists_for_nonprompt_ph and "photon_pt_eta_sumw2" in required_hists_for_nonprompt_ph:
+                    modify_NP_photon_pt_eta_variance(required_hists_for_nonprompt_ph, closure=self.closure)
+                    np_uncertainty_propagation_done = True
+                    # Update newhist with the modified histogram. We only need the sumw2 histogram!
+                    if newhist is None:
+                        newhist = required_hists_for_nonprompt_ph["photon_pt_eta_sumw2"]
+                    else:
+                        newhist += required_hists_for_nonprompt_ph["photon_pt_eta_sumw2"]
+
+                # now add the fakes to the list of processes; we cannot add the sumw2 histogram yet:
+                if key not in ["photon_pt_eta_sumw2"]:
+                    if newhist is None:
+                        newhist=hPhFakes
+                    else:
+                        newhist += hPhFakes
+
+            # For the 2D sumw2 histogram, only dump it to the outHist dict once the nonprompt uncertainty propagation has been done
+            if self.do_np_ph and key == "photon_pt_eta_sumw2":
+                if np_uncertainty_propagation_done:
+                    self.outHist[key]=newhist
+            else:
+                self.outHist[key]=newhist
+
+        # This is where we modify the photon_pt2 histogram's variance for the nonpromptPhUL{year} contribution
+        if self.do_np_ph:
+            for year in ['16','16APV','17','18']:
+                modify_photon_pt_variance(self.outHist, year)

     def dumpToPickle(self):
         if not self.outputName.endswith(".pkl.gz"):
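Since the producer can also be driven directly, here is a minimal sketch of standalone usage (the file names are illustrative; the constructor signature is the one introduced above):

import topeft.modules.utils as utils  # only needed if you want to inspect the result afterwards
from topeft.modules.dataDrivenEstimation import DataDrivenProducer

# Hypothetical file names; any pickled histogram output of run_analysis.py works.
# do_np_ph=True enables the isAR_B_ABCD / isAR_R_LRCD branch added above.
ddp = DataDrivenProducer("histos/plotsTopEFT.pkl.gz", "histos/plotsTopEFT_np.pkl.gz", do_np_ph=True)
ddp.dumpToPickle()  # writes histos/plotsTopEFT_np.pkl.gz with the nonprompt contributions included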
diff --git a/topeft/modules/nonprompt_unc_propagation_helper.py b/topeft/modules/nonprompt_unc_propagation_helper.py
new file mode 100644
index 00000000..fda47433
--- /dev/null
+++ b/topeft/modules/nonprompt_unc_propagation_helper.py
@@ -0,0 +1,100 @@
+#Where is it used?: This module is used by the dataDrivenEstimation script.
+#What is the purpose?: To correctly handle the nonprompt photon uncertainty in the photon_pt histograms, i.e. to make sure the FR and kMC statistical uncertainties are propagated into them.
+#What does it take?: It takes the photon_pt_eta and photon_pt_eta_sumw2 histograms and uses them to modify the values in the photon_pt2_sumw2 histogram.
+#Future modifications: Potential pitfalls are marked with a "CAUTION" tag.
+
+import re
+import numpy as np
+
+from topeft.modules.paths import topeft_path
+
+#With overflow bins, the photon_pt_eta (and sumw2) 2D arrays have an original shape of (N_photon_pt_bins + 1) x (N_photon_eta_bins + 1).
+#Similarly, with overflow bins, photon_pt (and photon_pt_sumw2) have an original shape of (N_photon_pt_bins + 1) x 3, which is the shape of "target_arr".
+#We take the 2D sumw2 histogram and sum over the eta bins, which gives us "original_arr". This array has a shape of 1 x (N_photon_pt_bins + 1).
+def get_desired_array_shape(original_arr, target_arr):
+    #first make it a column vector
+    original_arr = original_arr.T
+
+    #create an array of all zeros with the shape of target_arr
+    desired_arr = np.zeros_like(target_arr)
+
+    #fill the middle (i.e. real data) column with the summed values
+    desired_arr[:,1] = original_arr.flatten()
+
+    return desired_arr
+
+def load_numpy_files(file_path):
+    f = np.load(file_path)
+
+    val = f[f.files[0]]
+    err = f[f.files[1]] #be careful here because this could be either the error or the variance
+
+    return val, err
+
+#This function takes a dictionary with two histograms, photon_pt_eta and photon_pt_eta_sumw2. At this point both histograms should only have a single process axis, "nonpromptPhUL", and the yields are those of the nonprompt photon estimation, i.e. Data - prompt MC in region B (or R for the closure test).
+#CAUTION: The fr_file_path and kmc_file_path are hardcoded right now.
+def modify_NP_photon_pt_eta_variance(dict_of_hists_for_NP_uncertainty, closure=False):
+    print("Inside NP variance calculation block")
+    photon_pt_eta = dict_of_hists_for_NP_uncertainty["photon_pt_eta"]
+    photon_pt_eta_sumw2 = dict_of_hists_for_NP_uncertainty["photon_pt_eta_sumw2"]
+
+    for keys in list(photon_pt_eta.view(flow=True).keys()):
+        for part in keys:
+            if "nonpromptPhUL" in str(part):
+                match = re.search(r'nonpromptPhUL(\d{2}APV|\d{2})', str(part)) # Check the exact structure
+                if not match:
+                    continue
+                year = match.group(1)
+
+                #We need to load the fake-rate and kMC files inside the loop because they depend on the year!
+                fr_file_path = topeft_path("data/photon_fakerates_gyR6uGhvfy/")+f"fr_ph_UL{year}.npz"
+
+                #The kMC file changes depending on whether we are doing the closure test or not
+                if closure:
+                    kmc_file_path = topeft_path("data/photon_fakerates_jeJHI2cDh5/")+f"kmc_ph_UL{year}.npz"
+                else:
+                    kmc_file_path = topeft_path("data/photon_fakerates_gB29WFMqFb/")+f"kmc_ph_UL{year}.npz"
+
+                fr_val, fr_err = load_numpy_files(fr_file_path)
+                kmc_val, kmc_err = load_numpy_files(kmc_file_path)
+
+                #fake factor ff = FR * kMC, with the relative errors added in quadrature
+                ff_val = fr_val*kmc_val
+                ff_err = ff_val * np.sqrt(pow((fr_err/fr_val),2) + pow((kmc_err/kmc_val),2))
+
+                #Pad zeros as a first row and first column so the fake factor arrays have the same shape as the sumw and sumw2 arrays (which include the overflows)
+                #CAUTION: These two lines can be avoided entirely in the future if the FR and kMC files are remade with scikit-hep hists. The files loaded above were made with the old coffea hist.
+                ff_val_padded = np.pad(ff_val, pad_width=((1, 0), (1, 0)), mode='constant', constant_values=0)
+                ff_err_padded = np.pad(ff_err, pad_width=((1, 0), (1, 0)), mode='constant', constant_values=0)
+
+                if any(str(part) == f"nonpromptPhUL{year}" for part in keys):
+                    sumw2_pt_eta_hist = photon_pt_eta_sumw2.view(flow=True)[keys]
+                    sumw_pt_eta_hist = photon_pt_eta.view(flow=True)[keys]
+
+                    #This is the master equation for the final NP variance: var(ff*N) = ff^2 * var(N) + N^2 * var(ff)
+                    np_var = (sumw2_pt_eta_hist*pow(ff_val_padded,2)) + pow(sumw_pt_eta_hist*ff_err_padded,2)
+                    np_var = np.nan_to_num(np_var, nan=0.0)
+
+                    #this is how we modify the values of sumw2 and sumw in place; after this point the photon_pt_eta and photon_pt_eta_sumw2 hists hold the new values
+                    #CAUTION: the variance above uses the unscaled sumw, so these two assignments must stay in this order
+                    sumw2_pt_eta_hist[:] = np_var
+                    sumw_pt_eta_hist[:] = ff_val_padded*sumw_pt_eta_hist
+
+#This function takes the output histogram dictionary called "outHist" in the dataDrivenEstimation script, which has a key called "photon_pt2_sumw2". That histogram will also have the nonprompt estimation done, so it will have a process called "nonpromptPhUL{year}", and we need to modify the values associated with that key. The photon_pt2_sumw2 histogram has the same binning as the FR/kMC files.
+#Also note that at this point the photon_pt_eta_sumw2 histogram already holds the modified values described above.
+#CAUTION: Revisit this in the future because we will eventually have just one photon_pt histogram, and the name will need to be changed accordingly.
+def modify_photon_pt_variance(outputHist, year):
+    for k in list(outputHist['photon_pt2_sumw2'].view(flow=True).keys()):
+        if any(str(part) == f"nonpromptPhUL{year}" for part in k):
+            #First, load the values of the photon_pt2_sumw2 hist
+            photon_pt2_sumw2 = outputHist['photon_pt2_sumw2'].view(flow=True)[k]
+            #Second, load the modified values from the photon_pt_eta_sumw2 hist
+            np_var = outputHist['photon_pt_eta_sumw2'].view(flow=True)[k]
+
+            #We first have to sum over the eta bins to get the uncertainties associated with the pt bins
+            new_sumw2_summed_over_eta = np.sum(np_var,1) #At this point this array has a shape of 1 x (N_photon_pt_bins+1)
+
+            #We need to bring new_sumw2_summed_over_eta to the same shape as the photon_pt2_sumw2 array
+            desired_sumw2_summed_over_eta = get_desired_array_shape(new_sumw2_summed_over_eta, np.zeros((new_sumw2_summed_over_eta.shape[0], 3)))
+
+            #We are now ready to modify the values of the photon_pt2_sumw2 histogram
+            photon_pt2_sumw2[:] = desired_sumw2_summed_over_eta
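To make the error propagation above concrete, here is a self-contained toy sketch of the master equation outside the histogram machinery. All numbers are invented for illustration; only the formulas mirror modify_NP_photon_pt_eta_variance and modify_photon_pt_variance:

import numpy as np

# Toy 2x2 (pt x eta) inputs standing in for the AR data after prompt subtraction
sumw  = np.array([[40.0, 25.0], [18.0,  9.0]])  # summed event weights per bin
sumw2 = np.array([[42.0, 26.0], [19.0, 10.0]])  # summed squared weights, i.e. var(sumw)

# Toy fake rate and kMC correction with their per-bin errors
fr_val  = np.array([[0.40, 0.30], [0.25, 0.20]])
fr_err  = np.array([[0.04, 0.03], [0.03, 0.02]])
kmc_val = np.array([[0.50, 0.50], [0.48, 0.45]])
kmc_err = np.array([[0.05, 0.05], [0.05, 0.04]])

# Fake factor with the relative errors added in quadrature, as in the helper
ff_val = fr_val * kmc_val
ff_err = ff_val * np.sqrt((fr_err / fr_val)**2 + (kmc_err / kmc_val)**2)

# Master equation: for N_np = ff * N, var(N_np) = ff^2 * var(N) + N^2 * var(ff)
np_est = ff_val * sumw
np_var = sumw2 * ff_val**2 + (sumw * ff_err)**2

# Projection onto pt, as in modify_photon_pt_variance: variances add, so sum over eta
np_var_pt = np_var.sum(axis=1)
print(np_est, np_var, np_var_pt, sep="\n\n")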