From 8d481f8e2bbfa5403787b861d764282b9c486404 Mon Sep 17 00:00:00 2001
From: Aashwin Basnet
Date: Thu, 30 Jan 2025 21:32:56 -0500
Subject: [PATCH] Add nonprompt photon estimation

---
 analysis/topeft_run2/run_analysis.py    |  17 +--
 topeft/modules/dataDrivenEstimation.py  |  81 +++++++++++++-
 .../nonprompt_unc_propagation_helper.py | 100 ++++++++++++++++++
 3 files changed, 188 insertions(+), 10 deletions(-)
 create mode 100644 topeft/modules/nonprompt_unc_propagation_helper.py

diff --git a/analysis/topeft_run2/run_analysis.py b/analysis/topeft_run2/run_analysis.py
index 5a46e10b..b0b1c29d 100644
--- a/analysis/topeft_run2/run_analysis.py
+++ b/analysis/topeft_run2/run_analysis.py
@@ -52,6 +52,7 @@
     parser.add_argument('--skip-sr', action='store_true', help = 'Skip all signal region categories')
     parser.add_argument('--skip-cr', action='store_true', help = 'Skip all control region categories')
     parser.add_argument('--do-np' , action='store_true', help = 'Perform nonprompt estimation on the output hist, and save a new hist with the np contribution included. Note that signal, background and data samples should all be processed together in order for this option to make sense.')
+    parser.add_argument('--do-np-ph', action='store_true', help = 'Perform the nonprompt photon estimation as well (only has an effect together with --do-np)')
     parser.add_argument('--do-renormfact-envelope', action='store_true', help = 'Perform renorm/fact envelope calculation on the output hist (saves the modified histogram with the same name as the original).')
     parser.add_argument('--wc-list', action='extend', nargs='+', help = 'Specify a list of Wilson coefficients to use in filling histograms.')
     parser.add_argument('--hist-list', action='extend', nargs='+', help = 'Specify a list of histograms to fill.')
@@ -77,6 +78,7 @@
     skip_sr  = args.skip_sr
     skip_cr  = args.skip_cr
     do_np    = args.do_np
+    do_np_ph = args.do_np_ph
     do_renormfact_envelope = args.do_renormfact_envelope
     wc_lst = args.wc_list if args.wc_list is not None else []
@@ -118,7 +120,7 @@
         # Here we hardcode a list of hists used for the analysis
         hist_lst = ["njets","lj0pt","ptz"]
     elif args.hist_list == ["photon"]:
-        hist_lst = ['njet_bjet','photon_pt','invmass','l0pt','l1pt','njets','nbjetsm']
+        hist_lst = ['photon_pt','photon_eta','photon_eta2','photon_pt2','photon_pt_eta']
     elif args.hist_list == ["cr"]:
         # Here we hardcode a list of hists used for the CRs
         hist_lst = ["lj0pt", "ptz", "met", "ljptsum", "l0pt", "l0eta", "l1pt", "l1eta", "j0pt", "j0eta", "njets", "nbtagsl", "invmass"]
@@ -336,9 +338,9 @@ def LoadJsonToSampleName(jsonFile, prefix):
     if executor == "work_queue":
         print('Processed {} events in {} seconds ({:.2f} evts/sec).'.format(nevts_total,dt,nevts_total/dt))
-    #nbins = sum(sum(arr.size for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
-    #nfilled = sum(sum(np.sum(arr > 0) for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
-    #print("Filled %.0f bins, nonzero bins: %1.1f %%" % (nbins, 100*nfilled/nbins,))
+    nbins = sum(sum(arr.size for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
+    nfilled = sum(sum(np.sum(arr > 0) for arr in h.eval({}).values()) for h in output.values() if isinstance(h, hist.Hist))
+    print("Filled %.0f bins, nonzero bins: %1.1f %%" % (nbins, 100*nfilled/nbins,))

     if executor == "futures":
         print("Processing time: %1.2f s with %i workers (%.2f s cpu overall)" % (dt, nworkers, dt*nworkers, ))
@@ -353,9 +355,12 @@ def LoadJsonToSampleName(jsonFile, prefix):
     # Run the data driven estimation, save the output
     if do_np:
-        print("\nDoing the nonprompt estimation...")
+        if not do_np_ph:
+            print("\nDoing the nonprompt lepton estimation...")
+        else:
+            print("\nDoing the nonprompt lepton and photon estimation...")
         out_pkl_file_name_np = os.path.join(outpath,outname+"_np.pkl.gz")
-        ddp = DataDrivenProducer(out_pkl_file,out_pkl_file_name_np)
+        ddp = DataDrivenProducer(out_pkl_file,out_pkl_file_name_np, do_np_ph=do_np_ph)
         print(f"Saving output in {out_pkl_file_name_np}...")
         ddp.dumpToPickle()
         print("Done!")
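For reference, a hypothetical invocation that exercises the new flag (the cfg path is illustrative; the flags and the 'photon' hist-list shortcut are the ones defined above):

    # Fill the photon histogram list, then run the data-driven nonprompt lepton
    # and photon estimations on the output; --do-np-ph only takes effect
    # together with --do-np.
    python run_analysis.py input_samples/cfgs/photon_samples.cfg --hist-list photon --do-np --do-np-ph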
diff --git a/topeft/modules/dataDrivenEstimation.py b/topeft/modules/dataDrivenEstimation.py
index 577e15d1..cdd5f601 100644
--- a/topeft/modules/dataDrivenEstimation.py
+++ b/topeft/modules/dataDrivenEstimation.py
@@ -7,10 +7,11 @@
 from topeft.modules.paths import topeft_path
 from topcoffea.modules.get_param_from_jsons import GetParam
+from topeft.modules.nonprompt_unc_propagation_helper import modify_NP_photon_pt_eta_variance, modify_photon_pt_variance
 get_te_param = GetParam(topeft_path("params/params.json"))

 class DataDrivenProducer:
-    def __init__(self, inputHist, outputName):
+    def __init__(self, inputHist, outputName, do_np_ph=False):
         if isinstance(inputHist, str) and inputHist.endswith('.pkl.gz'): # we are plugging a pickle file
             self.inhist=utils.get_hist_from_pkl(inputHist)
         else: # we already have the histogram
@@ -19,7 +20,10 @@ def __init__(self, inputHist, outputName):
         self.verbose=False
         self.dataName='data'
         self.outHist=None
+        self.closure=False # whether we are running the ABCD closure test for the nonprompt photon estimation
+        self.do_np_ph=do_np_ph # controls whether the nonprompt photon estimation is run at all
         self.promptSubtractionSamples=get_te_param('prompt_subtraction_samples')
+        self.promptPhSubtractionSamples=get_te_param('prompt_photon_subtraction_samples')
         self.DDFakes()

     def DDFakes(self):
@@ -28,8 +32,11 @@ def DDFakes(self):
         self.outHist={}

-        for key,histo in self.inhist.items():
+        # This dictionary collects the photon_pt_eta and photon_pt_eta_sumw2 hists needed to propagate the FR/kMC statistical uncertainties
+        required_hists_for_nonprompt_ph={}
+        np_uncertainty_propagation_done = False
+        for key,histo in self.inhist.items():
             if histo.empty(): # histo is empty, so we just integrate over appl and keep an empty histo
                 print(f'[W]: Histogram {key} is empty, returning an empty histo')
                 self.outHist[key]=histo.integrate('appl')
@@ -92,10 +99,11 @@
                 newhist += hFlips

-            else:
+            elif ident == "isAR_2lOS":
                 # if we are in the nonprompt application region, we also integrate the application region axis
                 # and construct the new process 'nonprompt'
                 # we look at data only, and rename it to fakes
+                print(f"\n\nWe are inside the {ident} appl axis")
                 newNameDictData=defaultdict(list); newNameDictNoData=defaultdict(list)
                 for process in hAR.axes['process']:
                     match = pattern.search(process)
@@ -132,7 +140,72 @@
                     else:
                         newhist += hFakes

-            self.outHist[key]=newhist
+            # isAR_B_ABCD is the regular AR from which we estimate the nonprompt photon yield in our signal region A
+            # isAR_R_LRCD is the "AR" corresponding to the "SR" L in the ABCD closure test
+            elif ident in ["isAR_R_LRCD", "isAR_B_ABCD"] and self.do_np_ph:
+                print(f"\n\nWe are inside the {ident} appl axis")
+                newDataDict=defaultdict(list); newNonDataDict=defaultdict(list)
+                for process in hAR.axes['process']:
+                    match = pattern.search(process)
+                    sampleName=match.group('process')
+                    year=match.group('year')
+                    nonPromptPhName='nonpromptPhUL%s'%year
+                    if self.dataName==sampleName:
+                        newDataDict[nonPromptPhName].append(process)
+                    elif sampleName in self.promptPhSubtractionSamples:
+                        newNonDataDict[nonPromptPhName].append(process)
+                    elif sampleName.startswith(("ZGToLLGISR", "ZGToLLGFSR")):
+                        newNonDataDict[nonPromptPhName].append(process)
+                    else:
+                        print(f"We won't consider {sampleName} for the prompt photon subtraction in the appl. region")
+                hPhFakes=hAR.group('process', newDataDict)
+                # now we take everything that is not data in the AR for the prompt photon subtraction and assign it to nonprompt
+                hPromptPhSub=hAR.group('process', newNonDataDict)
+
+                # remove the up/down variations (if any) from the prompt photon subtraction histo,
+                # but keep nonpromptPhUp and nonpromptPhDown, as these are the nonprompt photon up and down variations
+                syst_var_idet_rm_lst = []
+                syst_var_idet_lst = list(hPromptPhSub.axes["systematic"])
+                for syst_var_idet in syst_var_idet_lst:
+                    if (syst_var_idet != "nominal") and (not syst_var_idet.startswith("nonpromptPh")):
+                        syst_var_idet_rm_lst.append(syst_var_idet)
+                hPromptPhSub = hPromptPhSub.remove("systematic", syst_var_idet_rm_lst)
+
+                # now we actually make the subtraction; no sign flip for the sumw2 histograms, since variances add
+                if not key.endswith("_sumw2"):
+                    hPromptPhSub.scale(-1)
+                hPhFakes += hPromptPhSub
+                if key == "photon_pt_eta":
+                    required_hists_for_nonprompt_ph["photon_pt_eta"] = hPhFakes
+                elif key == "photon_pt_eta_sumw2":
+                    required_hists_for_nonprompt_ph["photon_pt_eta_sumw2"] = hPhFakes
+                if not np_uncertainty_propagation_done and "photon_pt_eta" in required_hists_for_nonprompt_ph and "photon_pt_eta_sumw2" in required_hists_for_nonprompt_ph:
+                    modify_NP_photon_pt_eta_variance(required_hists_for_nonprompt_ph, closure=self.closure)
+                    np_uncertainty_propagation_done = True
+                    # Update newhist with the modified histogram. We only need the sumw2 histogram!
+                    if newhist is None:
+                        newhist = required_hists_for_nonprompt_ph["photon_pt_eta_sumw2"]
+                    else:
+                        newhist += required_hists_for_nonprompt_ph["photon_pt_eta_sumw2"]
+
+                # now add the fakes to the list of processes; we cannot add the sumw2 histogram yet:
+                if key not in ["photon_pt_eta_sumw2"]:
+                    if newhist is None:
+                        newhist=hPhFakes
+                    else:
+                        newhist += hPhFakes
+
+            # For the 2D sumw2 histogram, only dump it to the outHist dict once the nonprompt uncertainty propagation has been done
+            if self.do_np_ph and key == "photon_pt_eta_sumw2":
+                if np_uncertainty_propagation_done:
+                    self.outHist[key]=newhist
+            else:
+                self.outHist[key]=newhist
+
+        # This is where we modify the photon_pt2 histogram's variance for the nonpromptPhUL{year} contribution
+        if self.do_np_ph:
+            for year in ['16','16APV','17','18']:
+                modify_photon_pt_variance(self.outHist, year)

     def dumpToPickle(self):
         if not self.outputName.endswith(".pkl.gz"):
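Since the producer can also be driven directly, here is a minimal sketch of standalone usage (the file names are illustrative; the constructor signature is the one introduced above):

import topeft.modules.utils as utils  # only needed if you want to inspect the result afterwards
from topeft.modules.dataDrivenEstimation import DataDrivenProducer

# Hypothetical file names; any pickled histogram output of run_analysis.py works.
# do_np_ph=True enables the isAR_B_ABCD / isAR_R_LRCD branch added above.
ddp = DataDrivenProducer("histos/plotsTopEFT.pkl.gz", "histos/plotsTopEFT_np.pkl.gz", do_np_ph=True)
ddp.dumpToPickle()  # writes histos/plotsTopEFT_np.pkl.gz with the nonprompt contributions included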
diff --git a/topeft/modules/nonprompt_unc_propagation_helper.py b/topeft/modules/nonprompt_unc_propagation_helper.py
new file mode 100644
index 00000000..fda47433
--- /dev/null
+++ b/topeft/modules/nonprompt_unc_propagation_helper.py
@@ -0,0 +1,100 @@
+#Where is it used?: This module is used by the dataDrivenEstimation script.
+#What is the purpose?: To correctly handle the nonprompt photon uncertainty in the photon_pt histograms, i.e. to make sure the FR and kMC statistical uncertainties are propagated into them.
+#What does it take?: It takes the photon_pt_eta and photon_pt_eta_sumw2 histograms and uses them to modify the values in the photon_pt2_sumw2 histogram.
+#Future modifications: Potential pitfalls are marked with a "CAUTION" tag.
+
+import re
+import numpy as np
+
+from topeft.modules.paths import topeft_path
+
+#With overflow bins, the photon_pt_eta (and sumw2) 2D arrays have an original shape of (N_photon_pt_bins + 1) x (N_photon_eta_bins + 1).
+#Similarly, with overflow bins, photon_pt (and photon_pt_sumw2) have an original shape of (N_photon_pt_bins + 1) x 3, which is the shape of "target_arr".
+#We take the 2D sumw2 histogram and sum over the eta bins, which gives us "original_arr". This array has a shape of 1 x (N_photon_pt_bins + 1).
+def get_desired_array_shape(original_arr, target_arr):
+    #first make it a column vector
+    original_arr = original_arr.T
+
+    #create an array of all zeros with the shape of target_arr
+    desired_arr = np.zeros_like(target_arr)
+
+    #fill the middle (i.e. real data) column with the summed values
+    desired_arr[:,1] = original_arr.flatten()
+
+    return desired_arr
+
+def load_numpy_files(file_path):
+    f = np.load(file_path)
+
+    val = f[f.files[0]]
+    err = f[f.files[1]] #be careful here because this could be either the error or the variance
+
+    return val, err
+
+#This function takes a dictionary with two histograms, photon_pt_eta and photon_pt_eta_sumw2. At this point both histograms should only have a single process axis, "nonpromptPhUL", and the yields are those of the nonprompt photon estimation, i.e. Data - prompt MC in region B (or R for the closure test).
+#CAUTION: The fr_file_path and kmc_file_path are hardcoded right now.
+def modify_NP_photon_pt_eta_variance(dict_of_hists_for_NP_uncertainty, closure=False):
+    print("Inside NP variance calculation block")
+    photon_pt_eta = dict_of_hists_for_NP_uncertainty["photon_pt_eta"]
+    photon_pt_eta_sumw2 = dict_of_hists_for_NP_uncertainty["photon_pt_eta_sumw2"]
+
+    for keys in list(photon_pt_eta.view(flow=True).keys()):
+        for part in keys:
+            if "nonpromptPhUL" in str(part):
+                match = re.search(r'nonpromptPhUL(\d{2}APV|\d{2})', str(part)) # Check the exact structure
+                if not match:
+                    continue
+                year = match.group(1)
+
+                #We need to load the fake-rate and kMC files inside the loop because they depend on the year!
+                fr_file_path = topeft_path("data/photon_fakerates_gyR6uGhvfy/")+f"fr_ph_UL{year}.npz"
+
+                #The kMC file changes depending on whether we are doing the closure test or not
+                if closure:
+                    kmc_file_path = topeft_path("data/photon_fakerates_jeJHI2cDh5/")+f"kmc_ph_UL{year}.npz"
+                else:
+                    kmc_file_path = topeft_path("data/photon_fakerates_gB29WFMqFb/")+f"kmc_ph_UL{year}.npz"
+
+                fr_val, fr_err = load_numpy_files(fr_file_path)
+                kmc_val, kmc_err = load_numpy_files(kmc_file_path)
+
+                #fake factor ff = FR * kMC, with the relative errors added in quadrature
+                ff_val = fr_val*kmc_val
+                ff_err = ff_val * np.sqrt(pow((fr_err/fr_val),2) + pow((kmc_err/kmc_val),2))
+
+                #Pad zeros as a first row and first column so the fake factor arrays have the same shape as the sumw and sumw2 arrays (which include the overflows)
+                #CAUTION: These two lines can be avoided entirely in the future if the FR and kMC files are remade with scikit-hep hists. The files loaded above were made with the old coffea hist.
+                ff_val_padded = np.pad(ff_val, pad_width=((1, 0), (1, 0)), mode='constant', constant_values=0)
+                ff_err_padded = np.pad(ff_err, pad_width=((1, 0), (1, 0)), mode='constant', constant_values=0)
+
+                if any(str(part) == f"nonpromptPhUL{year}" for part in keys):
+                    sumw2_pt_eta_hist = photon_pt_eta_sumw2.view(flow=True)[keys]
+                    sumw_pt_eta_hist = photon_pt_eta.view(flow=True)[keys]
+
+                    #This is the master equation for the final NP variance: var(ff*N) = ff^2 * var(N) + N^2 * var(ff)
+                    np_var = (sumw2_pt_eta_hist*pow(ff_val_padded,2)) + pow(sumw_pt_eta_hist*ff_err_padded,2)
+                    np_var = np.nan_to_num(np_var, nan=0.0)
+
+                    #this is how we modify the values of sumw2 and sumw in place; after this point the photon_pt_eta and photon_pt_eta_sumw2 hists hold the new values
+                    #CAUTION: the variance above uses the unscaled sumw, so these two assignments must stay in this order
+                    sumw2_pt_eta_hist[:] = np_var
+                    sumw_pt_eta_hist[:] = ff_val_padded*sumw_pt_eta_hist
+
+#This function takes the output histogram dictionary called "outHist" in the dataDrivenEstimation script, which has a key called "photon_pt2_sumw2". That histogram will also have the nonprompt estimation done, so it will have a process called "nonpromptPhUL{year}", and we need to modify the values associated with that key. The photon_pt2_sumw2 histogram has the same binning as the FR/kMC files.
+#Also note that at this point the photon_pt_eta_sumw2 histogram already holds the modified values described above.
+#CAUTION: Revisit this in the future because we will eventually have just one photon_pt histogram, and the name will need to be changed accordingly.
+def modify_photon_pt_variance(outputHist, year):
+    for k in list(outputHist['photon_pt2_sumw2'].view(flow=True).keys()):
+        if any(str(part) == f"nonpromptPhUL{year}" for part in k):
+            #First, load the values of the photon_pt2_sumw2 hist
+            photon_pt2_sumw2 = outputHist['photon_pt2_sumw2'].view(flow=True)[k]
+            #Second, load the modified values from the photon_pt_eta_sumw2 hist
+            np_var = outputHist['photon_pt_eta_sumw2'].view(flow=True)[k]
+
+            #We first have to sum over the eta bins to get the uncertainties associated with the pt bins
+            new_sumw2_summed_over_eta = np.sum(np_var,1) #At this point this array has a shape of 1 x (N_photon_pt_bins+1)
+
+            #We need to bring new_sumw2_summed_over_eta to the same shape as the photon_pt2_sumw2 array
+            desired_sumw2_summed_over_eta = get_desired_array_shape(new_sumw2_summed_over_eta, np.zeros((new_sumw2_summed_over_eta.shape[0], 3)))
+
+            #We are now ready to modify the values of the photon_pt2_sumw2 histogram
+            photon_pt2_sumw2[:] = desired_sumw2_summed_over_eta
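To make the error propagation above concrete, here is a self-contained toy sketch of the master equation outside the histogram machinery. All numbers are invented for illustration; only the formulas mirror modify_NP_photon_pt_eta_variance and modify_photon_pt_variance:

import numpy as np

# Toy 2x2 (pt x eta) inputs standing in for the AR data after prompt subtraction
sumw  = np.array([[40.0, 25.0], [18.0,  9.0]])  # summed event weights per bin
sumw2 = np.array([[42.0, 26.0], [19.0, 10.0]])  # summed squared weights, i.e. var(sumw)

# Toy fake rate and kMC correction with their per-bin errors
fr_val  = np.array([[0.40, 0.30], [0.25, 0.20]])
fr_err  = np.array([[0.04, 0.03], [0.03, 0.02]])
kmc_val = np.array([[0.50, 0.50], [0.48, 0.45]])
kmc_err = np.array([[0.05, 0.05], [0.05, 0.04]])

# Fake factor with the relative errors added in quadrature, as in the helper
ff_val = fr_val * kmc_val
ff_err = ff_val * np.sqrt((fr_err / fr_val)**2 + (kmc_err / kmc_val)**2)

# Master equation: for N_np = ff * N, var(N_np) = ff^2 * var(N) + N^2 * var(ff)
np_est = ff_val * sumw
np_var = sumw2 * ff_val**2 + (sumw * ff_err)**2

# Projection onto pt, as in modify_photon_pt_variance: variances add, so sum over eta
np_var_pt = np_var.sum(axis=1)
print(np_est, np_var, np_var_pt, sep="\n\n")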