Commit 8d481f8

added modifications to perform nonprompt estimation of photons

abasnet97 committed Jan 31, 2025
1 parent ddad937 commit 8d481f8

Showing 3 changed files with 188 additions and 10 deletions.
17 changes: 11 additions & 6 deletions analysis/topeft_run2/run_analysis.py
@@ -52,6 +52,7 @@
parser.add_argument('--skip-sr', action='store_true', help = 'Skip all signal region categories')
parser.add_argument('--skip-cr', action='store_true', help = 'Skip all control region categories')
parser.add_argument('--do-np' , action='store_true', help = 'Perform nonprompt estimation on the output hist, and save a new hist with the np contribution included. Note that signal, background and data samples should all be processed together in order for this option to make sense.')
parser.add_argument('--do_np_ph', action='store_true', help='Perform photon non-prompt estimation')
parser.add_argument('--do-renormfact-envelope', action='store_true', help = 'Perform renorm/fact envelope calculation on the output hist (saves the modified hist with the same name as the original).')
parser.add_argument('--wc-list', action='extend', nargs='+', help = 'Specify a list of Wilson coefficients to use in filling histograms.')
parser.add_argument('--hist-list', action='extend', nargs='+', help = 'Specify a list of histograms to fill.')
@@ -77,6 +78,7 @@
skip_sr = args.skip_sr
skip_cr = args.skip_cr
do_np = args.do_np
do_np_ph = args.do_np_ph
do_renormfact_envelope = args.do_renormfact_envelope
wc_lst = args.wc_list if args.wc_list is not None else []

@@ -118,7 +120,7 @@
# Here we hardcode a list of hists used for the analysis
hist_lst = ["njets","lj0pt","ptz"]
elif args.hist_list == ["photon"]:
hist_lst = ['njet_bjet','photon_pt','invmass','l0pt','l1pt','njets','nbjetsm']
hist_lst = ['photon_pt','photon_eta','photon_eta2','photon_pt2','photon_pt_eta']
elif args.hist_list == ["cr"]:
# Here we hardcode a list of hists used for the CRs
hist_lst = ["lj0pt", "ptz", "met", "ljptsum", "l0pt", "l0eta", "l1pt", "l1eta", "j0pt", "j0eta", "njets", "nbtagsl", "invmass"]
@@ -336,9 +338,9 @@ def LoadJsonToSampleName(jsonFile, prefix):
if executor == "work_queue":
print('Processed {} events in {} seconds ({:.2f} evts/sec).'.format(nevts_total,dt,nevts_total/dt))

#nbins = sum(sum(arr.size for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
#nfilled = sum(sum(np.sum(arr > 0) for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
#print("Filled %.0f bins, nonzero bins: %1.1f %%" % (nbins, 100*nfilled/nbins,))
nbins = sum(sum(arr.size for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
nfilled = sum(sum(np.sum(arr > 0) for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
print("Filled %.0f bins, nonzero bins: %1.1f %%" % (nbins, 100*nfilled/nbins,))

if executor == "futures":
print("Processing time: %1.2f s with %i workers (%.2f s cpu overall)" % (dt, nworkers, dt*nworkers, ))
@@ -353,9 +355,12 @@ def LoadJsonToSampleName(jsonFile, prefix):

# Run the data driven estimation, save the output
if do_np:
print("\nDoing the nonprompt estimation...")
if not do_np_ph:
print("\nDoing the nonprompt lepton estimation...")
else:
print("\nDoing the nonprompt estimation of lepton and photon......")
out_pkl_file_name_np = os.path.join(outpath,outname+"_np.pkl.gz")
ddp = DataDrivenProducer(out_pkl_file,out_pkl_file_name_np)
ddp = DataDrivenProducer(out_pkl_file,out_pkl_file_name_np, do_np_ph=args.do_np_ph)
print(f"Saving output in {out_pkl_file_name_np}...")
ddp.dumpToPickle()
print("Done!")
81 changes: 77 additions & 4 deletions topeft/modules/dataDrivenEstimation.py
@@ -7,10 +7,11 @@

from topeft.modules.paths import topeft_path
from topcoffea.modules.get_param_from_jsons import GetParam
from topeft.modules.nonprompt_unc_propagation_helper import modify_NP_photon_pt_eta_variance,modify_photon_pt_variance
get_te_param = GetParam(topeft_path("params/params.json"))

class DataDrivenProducer:
def __init__(self, inputHist, outputName):
def __init__(self, inputHist, outputName, do_np_ph=False):
if isinstance(inputHist, str) and inputHist.endswith('.pkl.gz'): # we are plugging a pickle file
self.inhist=utils.get_hist_from_pkl(inputHist)
else: # we already have the histogram
@@ -19,7 +20,10 @@ def __init__(self, inputHist, outputName):
self.verbose=False
self.dataName='data'
self.outHist=None
self.closure=False #a boolean indicating whether we are doing a closure test for the nonprompt photon estimation
self.do_np_ph=do_np_ph #this controls whether we do the non-prompt photon estimation or not
self.promptSubtractionSamples=get_te_param('prompt_subtraction_samples')
self.promptPhSubtractionSamples=get_te_param('prompt_photon_subtraction_samples')
self.DDFakes()

def DDFakes(self):
@@ -28,8 +32,11 @@ def DDFakes(self):

self.outHist={}

for key,histo in self.inhist.items():
#This dictionary collects photon_pt_eta and photon_pt_eta_sumw2 hists needed for FR/kMC stat uncertainty propagation
required_hists_for_nonprompt_ph={}
np_uncertainty_propagation_done = False

for key,histo in self.inhist.items():
if histo.empty(): # histo is empty, so we just integrate over appl and keep an empty histo
print(f'[W]: Histogram {key} is empty, returning an empty histo')
self.outHist[key]=histo.integrate('appl')
@@ -92,10 +99,11 @@ def DDFakes(self):
newhist += hFlips


else:
elif "isAR_2lOS"==ident:
# if we are in the nonprompt application region, we also integrate the application region axis
# and construct the new process 'nonprompt'
# we look at data only, and rename it to fakes
print(f"\n\nWe are inside {ident} appl axis")
newNameDictData=defaultdict(list); newNameDictNoData=defaultdict(list)
for process in hAR.axes['process']:
match = pattern.search(process)
@@ -132,7 +140,72 @@ def DDFakes(self):
else:
newhist += hFakes

self.outHist[key]=newhist
#isAR_B_ABCD is the regular AR, which we use to estimate the non-prompt photon contribution in our signal region A
#isAR_R_LRCD is the "AR" corresponding to the "SR" L in the ABCD closure test
elif ident in ["isAR_R_LRCD", "isAR_B_ABCD"] and self.do_np_ph:
print(f"\n\nWe are inside {ident} appl axis")
newDataDict=defaultdict(list); newNonDataDict=defaultdict(list)
for process in hAR.axes['process']:
match = pattern.search(process)
sampleName=match.group('process')
year=match.group('year')
nonPromptPhName='nonpromptPhUL%s'%year
if self.dataName==sampleName:
newDataDict[nonPromptPhName].append(process)
elif sampleName in self.promptPhSubtractionSamples:
newNonDataDict[nonPromptPhName].append(process)
elif sampleName.startswith(("ZGToLLGISR", "ZGToLLGFSR")):
newNonDataDict[nonPromptPhName].append(process)
else:
print(f"We won't consider {sampleName} for the prompt photon subtraction in the appl. region")
hPhFakes=hAR.group('process', newDataDict)
# now we take all the stuff that is not data in the AR to make the prompt photon subtraction and assign them to nonprompt.
hPromptPhSub=hAR.group('process', newNonDataDict)

# remove the up/down variations (if any) from the prompt photon subtraction histo
# but keep nonpromptPhUp and nonpromptPhDown, as these are the nonprompt photon up and down variations
syst_var_idet_rm_lst = []
syst_var_idet_lst = list(hPromptPhSub.axes["systematic"])
for syst_var_idet in syst_var_idet_lst:
if (syst_var_idet != "nominal") and (not syst_var_idet.startswith("nonpromptPh")):
syst_var_idet_rm_lst.append(syst_var_idet)
hPromptPhSub = hPromptPhSub.remove("systematic", syst_var_idet_rm_lst)

# now we actually make the subtraction
if not key.endswith("_sumw2"):
hPromptPhSub.scale(-1)
hPhFakes += hPromptPhSub
if key == "photon_pt_eta":
required_hists_for_nonprompt_ph["photon_pt_eta"] = hPhFakes
elif key == "photon_pt_eta_sumw2":
required_hists_for_nonprompt_ph["photon_pt_eta_sumw2"] = hPhFakes
if not np_uncertainty_propagation_done and "photon_pt_eta" in required_hists_for_nonprompt_ph and "photon_pt_eta_sumw2" in required_hists_for_nonprompt_ph:
modify_NP_photon_pt_eta_variance(required_hists_for_nonprompt_ph,closure=self.closure)
np_uncertainty_propagation_done = True
# Update newhist with the modified histogram. We only need the sumw2 histogram!
if newhist is None:
newhist = required_hists_for_nonprompt_ph["photon_pt_eta_sumw2"]
else:
newhist += required_hists_for_nonprompt_ph["photon_pt_eta_sumw2"]

# now adding them to the list of processes. We cannot add the sumw2 histogram yet:
if key not in ["photon_pt_eta_sumw2"]:
if newhist==None:
newhist=hPhFakes
else:
newhist += hPhFakes

#For the sumw2 2D histogram, only dump it to the outHist dict if we have done the non-prompt uncertainty propagation
if self.do_np_ph and key == "photon_pt_eta_sumw2":
if np_uncertainty_propagation_done:
self.outHist[key]=newhist
else:
self.outHist[key]=newhist

#This is where we modify the photon_pt2 histogram's variance for the nonpromptPhUL{year} contribution
if self.do_np_ph:
for year in ['16','16APV','17','18']:
modify_photon_pt_variance(self.outHist,year)
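
The new elif branch above leans on a regex named 'pattern' (defined elsewhere in this module, not shown in this diff) that splits a process label into named groups 'process' and 'year'. A standalone sketch of that grouping flow, using a hypothetical pattern and sample labels (the module's real pattern and process names may differ):

import re
from collections import defaultdict

# hypothetical stand-in for the module-level 'pattern'
pattern = re.compile(r'(?P<process>.+?)UL(?P<year>\d{2}APV|\d{2})')

newDataDict = defaultdict(list)
newNonDataDict = defaultdict(list)
for process in ['dataUL17', 'ZGToLLGISRUL17']:
    match = pattern.search(process)
    sampleName, year = match.group('process'), match.group('year')
    if sampleName == 'data':
        newDataDict['nonpromptPhUL%s' % year].append(process)
    else:
        newNonDataDict['nonpromptPhUL%s' % year].append(process)
print(dict(newDataDict))     # {'nonpromptPhUL17': ['dataUL17']}
print(dict(newNonDataDict))  # {'nonpromptPhUL17': ['ZGToLLGISRUL17']}
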

def dumpToPickle(self):
if not self.outputName.endswith(".pkl.gz"):
100 changes: 100 additions & 0 deletions topeft/modules/nonprompt_unc_propagation_helper.py
@@ -0,0 +1,100 @@
#Where is it used?: This script is used in the dataDrivenEstimation script.
#What is the purpose?: The main purpose is to correctly handle the non-prompt photon uncertainty in the photon_pt histogram. This script makes sure that the FR and kMC uncertainties are included in the photon_pt histograms.
#What does it take?: It takes the photon_pt_eta and photon_pt_eta_sumw2 histograms and uses them to modify the values in the photon_pt2_sumw2 histogram.
#Future modifications: Potential pitfalls are commented with a "CAUTION" tag.

import re
import numpy as np
from coffea import hist

from topeft.modules.paths import topeft_path

#With overflow bins, the photon_pt_eta (and sumw2) 2D array has an original shape of (N_photon_pt_bins + 1) x (N_photon_eta_bins + 1)
#Similarly, with overflow bins, the photon_pt (and photon_pt_sumw2) arrays have an original shape of (N_photon_pt_bins + 1) x 3, which is the shape of "target_arr"
#We take the 2D sumw2 histogram and sum over the eta bins, which gives us the "original_arr". This array has a shape of 1 x (N_photon_pt_bins + 1)
def get_desired_array_shape(original_arr, target_arr):
#first make it a column matrix
original_arr = original_arr.T

#create an array of all zeros with shape of target_arr
desired_arr = np.zeros_like(target_arr)

desired_arr[:,1] = original_arr.flatten()

return desired_arr
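
A quick toy check of this helper, assuming 5 photon pt bins plus overflow (6 entries) and the 3-column target shape described above (sizes here are illustrative):

summed = np.arange(6.0)                        # eta-summed sumw2, shape (6,)
target = np.zeros((6, 3))                      # photon_pt-style target shape
out = get_desired_array_shape(summed, target)  # out[:, 1] now carries 'summed'; other columns stay zero
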

def load_numpy_files(file_path):
f = np.load(file_path)

val = f[f.files[0]]
err = f[f.files[1]] #be careful here because this could be either an error or a variance

return val, err
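
A hedged illustration of the .npz layout this loader assumes: the first stored array is the value map, the second its uncertainty (whether that second array is an error or a variance is exactly the ambiguity flagged in the comment above). The file name and shapes here are made up:

np.savez('fr_ph_demo.npz', val=np.ones((5, 4)), err=0.1*np.ones((5, 4)))
fr_val, fr_err = load_numpy_files('fr_ph_demo.npz')  # relies on f.files preserving the saved order
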

#This function takes a dictionary that has two histograms: photon_pt_eta and photon_pt_eta_sumw2. At this point, both of these histograms should only have a single process axis "nonpromptPhUL<year>", with the non-prompt photon estimation already applied, i.e. Data - prompt MC in region B or R
#CAUTION: The fr_file_path and kmc_file_path are hardcoded right now.
def modify_NP_photon_pt_eta_variance(dict_of_hists_for_NP_uncertainty, closure=False):
print("Inside NP variance calculation block")
photon_pt_eta = dict_of_hists_for_NP_uncertainty["photon_pt_eta"]
photon_pt_eta_sumw2 = dict_of_hists_for_NP_uncertainty["photon_pt_eta_sumw2"]

for keys in list(photon_pt_eta.view(flow=True).keys()):
for part in keys:
if "nonpromptPhUL" in str(part):
match = re.search(r'nonpromptPhUL(\d{2}APV|\d{2})', str(part)) # Check the exact structure
if match: year = match.group(1)

#We need to load the fake-rate and kMC files inside the loop because they depend on the year!
fr_file_path = topeft_path("data/photon_fakerates_gyR6uGhvfy/")+f"fr_ph_UL{year}.npz"

#Depending on whether we are doing closure test or not, the kMC file changes
if closure:
kmc_file_path = topeft_path("data/photon_fakerates_jeJHI2cDh5/")+f"kmc_ph_UL{year}.npz"

else:
kmc_file_path = topeft_path("data/photon_fakerates_gB29WFMqFb/")+f"kmc_ph_UL{year}.npz"


fr_val, fr_err = load_numpy_files(fr_file_path)
kmc_val, kmc_err = load_numpy_files(kmc_file_path)

ff_val = fr_val*kmc_val
ff_err = ff_val * np.sqrt(pow((fr_err/fr_val),2) + pow((kmc_err/kmc_val),2))

#Let's pad zeros as a first row and first column to make the fake factor arrays have same shape as the sumw and sumw2 arrays (which have overflows included)
#CAUTION: In the future, these two lines can be entirely avoided if I use scikit-hep hists to make the FR and kMC files. The files loaded above were made with old coffea hist.
ff_val_padded = np.pad(ff_val, pad_width=((1, 0), (1, 0)), mode='constant', constant_values=0)
ff_err_padded = np.pad(ff_err, pad_width=((1, 0), (1, 0)), mode='constant', constant_values=0)

if any(str(part) == f"nonpromptPhUL{year}" for part in keys):
sumw2_pt_eta_hist = photon_pt_eta_sumw2.view(flow=True)[keys]
sumw_pt_eta_hist = photon_pt_eta.view(flow=True)[keys]

#This is the master equation to calculate the final NP variance
np_var = ((sumw2_pt_eta_hist*pow(ff_val_padded,2)))+(pow(sumw_pt_eta_hist*ff_err_padded,2))
np_var = np.nan_to_num(np_var, nan=0.0)

#this is how we modify the values of sumw2 and sumw. After this point the photon_pt_eta and photon_pt_eta_sumw2 hists will hold the new values
sumw2_pt_eta_hist[:] = np_var
sumw_pt_eta_hist[:] = ff_val_padded*sumw_pt_eta_hist
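
The master equation above is standard error propagation for scaling a weighted count N by a fake factor f with uncertainty sigma_f: Var(f*N) = f**2*Var(N) + N**2*sigma_f**2. A tiny numpy check with toy values (using the numpy import at the top of this module):

sumw = np.full((2, 2), 10.0)                 # weighted counts N
sumw2 = np.full((2, 2), 10.0)                # their variances Var(N)
ff_val = np.full((2, 2), 0.5)                # fake factor f
ff_err = np.full((2, 2), 0.05)               # its uncertainty sigma_f
np_var = sumw2*ff_val**2 + (sumw*ff_err)**2  # -> 2.75 in every bin
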


#This function takes the output histogram dictionary called "outHist" in the dataDrivenEstimation script, where we will have a key called "photon_pt2_sumw2". This histogram will also have the non-prompt estimation done, so it will have a key called "nonpromptPhUL<year>", and we need to modify the values associated with that key. The photon_pt2_sumw2 histogram has the same binning as FR/kMC
#Also note that at this point, the photon_pt_eta_sumw2 histogram will already have its values modified as described above
#CAUTION: Revisit this in the future because we will have just one photon_pt histogram and the name will need to be changed accordingly
def modify_photon_pt_variance(outputHist,year):
for k in list(outputHist['photon_pt2_sumw2'].view(flow=True).keys()):
if any(str(part) == f"nonpromptPhUL{year}" for part in k):
#First, load the values of photon_pt2_sumw2 hist
photon_pt2_sumw2 = outputHist['photon_pt2_sumw2'].view(flow=True)[k]
#Second, load the modified values from photon_pt_eta_sumw2 hist
np_var = outputHist['photon_pt_eta_sumw2'].view(flow=True)[k]

#We first have to sum over the eta bins to get the uncertainties associated with the pt bins
new_sumw2_summed_over_eta = np.sum(np_var,1) #At this point this array has a shape of (N_photon_pt_bins + 1,)

#We need to get new_sumw2_summed_over_eta array to be the same shape as photon_pt2_sumw2 array
desired_sumw2_summed_over_eta = get_desired_array_shape(new_sumw2_summed_over_eta, np.zeros((new_sumw2_summed_over_eta.shape[0], 3)))

#We are now ready to modify the values of the photon_pt2_sumw2 histogram
photon_pt2_sumw2[:] = desired_sumw2_summed_over_eta
