From d3fac0af0efe62b5beabadfc43f578de9de09f47 Mon Sep 17 00:00:00 2001
From: Mikael Mieskolainen <mikael.mieskolainen@cern.ch>
Date: Sat, 2 Nov 2024 08:09:08 +0000
Subject: [PATCH] extend Poisson loss 2

---
 icefit/icepeak.py     | 48 ++++++++++++++++++++++++++++---------------
 icefit/peakfit.py     | 10 ++++++---
 tests/test_peakfit.sh | 11 ++++++++++
 3 files changed, 49 insertions(+), 20 deletions(-)
 create mode 100644 tests/test_peakfit.sh

diff --git a/icefit/icepeak.py b/icefit/icepeak.py
index a2785aeb..c6de805b 100644
--- a/icefit/icepeak.py
+++ b/icefit/icepeak.py
@@ -515,7 +515,7 @@ def binned_1D_fit(hist: dict, param: dict, fitfunc: dict, techno: dict, par_fixe
     ### [Chi2 loss function]
     def chi2_loss(par):
 
-        total = 0
+        tot = 0
         
         # Over histograms
         for key in counts.keys():
@@ -529,14 +529,14 @@ def chi2_loss(par):
             
             residual = (y_pred[fmask] - counts[key][rmask][fmask]) / errors[key][rmask][fmask]
 
-            total += np.sum(residual**2)
+            tot += np.sum(residual**2)
 
-        return total
+        return tot
 
     ### [Huber loss function]
     def huber_loss(par):
 
-        total = 0
+        tot = 0
         delta = techno['huber_delta']
         
         # Over histograms
@@ -552,14 +552,15 @@ def huber_loss(par):
             T = huber_lossfunc(y_true=counts[key][rmask][fmask], y_pred=y_pred[fmask],
                                sigma=errors[key][rmask][fmask], delta=delta)
             
-            total += T
+            tot += T
 
-        return total
+        return tot
 
-    ### [Poissonian negative log-likelihood loss function]
+    ### [Poissonian negative delta log-likelihood loss function]
     def poiss_nll_loss(par):
         
-        total = 0
+        EPS = 1e-9
+        nll = 0
         
         # Over histograms
         for key in counts.keys():
@@ -571,16 +572,27 @@ def poiss_nll_loss(par):
             # ** Note use x=cbins[range_mask] here, due to trapz integral in fitfunc ! **
             y_pred = fitfunc[key](cbins[key][rmask], par, par_fixed)
             
-            valid  = (y_pred > 0) & fmask
+            valid  = (y_pred > 0)
             if np.sum(valid) == 0: return 1e9
             
-            T1 = counts[key][rmask][valid] * np.log(y_pred[valid])
-            T2 = y_pred[valid]
-
-            total += (-1)*(np.sum(T1) - np.sum(T2))
-
-        return total
-
+            y_pred[~valid] = EPS
+            
+            # Bohm-Zech scale transform for weighted events (https://arxiv.org/abs/1309.1287)
+            # https://scikit-hep.org/iminuit/notebooks/weighted_histograms.html
+            s  = counts[key][rmask][fmask] / (errors[key][rmask][fmask]**2) # Per bin
+            
+            n  = counts[key][rmask][fmask] # Observed
+            mu = y_pred[fmask]             # Predicted
+            
+            # Factor 2 x taken into account by setting `errordef = iminuit.Minuit.LIKELIHOOD`
+            
+            # Delta log-likelihood with Bohm-Zech scale
+            nll += np.sum( s * (n * (np.log(n) - np.log(mu)) - (n - mu)) )
+            
+            # Simple log-likelihood (NB. x -1)
+            #nll += (-1) * np.sum(n * np.log(mu) - mu)
+        
+        return nll
     
     # --------------------------------------------------------------------
     loss_type = techno['loss_type']
@@ -739,9 +751,11 @@ def optimizer_execute(trials, loss, param, techno):
         m1.fixed[k] = param['fixed'][k]
     # --------------------------------------------------------------------
     
-    if   techno['loss_type'] == 'nll':
+    if   'nll' in techno['loss_type']:
+        cprint('Setting errordef "LIKELIHOOD" ', 'yellow')
         m1.errordef = iminuit.Minuit.LIKELIHOOD
     else:
+        cprint('Setting errordef "LEAST_SQUARES" ', 'yellow')
         m1.errordef = iminuit.Minuit.LEAST_SQUARES
     
     # Set parameter bounds
diff --git a/icefit/peakfit.py b/icefit/peakfit.py
index a4f1cd24..a5eb5432 100644
--- a/icefit/peakfit.py
+++ b/icefit/peakfit.py
@@ -6,8 +6,12 @@
 # - Keep all pdf functions normalized in the steering yml (norm: True),
 #   otherwise fit stability problems and uncertainty estimation is not consistent.
 #
-# - Use 'chi2' loss if using weighted event histograms (either MC or data)
-#   Use 'nll' for unweighted Poisson count histograms (both MC and data)
+# - Use 'chi2' or 'huber' loss if using weighted event histograms (either MC or data)
+#   Use 'nll' for unweighted Poisson count histograms
+#             and a weighted count histograms via a scale transform (experimental)
+# 
+# - For different fit types see: /docs/pdf/peakfit.pdf 
+# 
 # 
 # Run with: python icefit/peakfit.py --analyze --group (--test_mode)
 # 
@@ -34,7 +38,7 @@
 
 import ray
 
-__VERSION__ = 0.02
+__VERSION__ = 0.03
 __AUTHOR__  = 'm.mieskolainen@imperial.ac.uk'
 
 # ========================================================================
diff --git a/tests/test_peakfit.sh b/tests/test_peakfit.sh
new file mode 100644
index 00000000..5f5c8f96
--- /dev/null
+++ b/tests/test_peakfit.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+#
+# Test peakfit
+
+git clone https://github.com/mieskolainen/actions-stash.git
+
+for L in chi2 huber nll; do
+    for T in single dual dual-unitary-I dual-unitary-II; do
+        python icefit/peakfit.py --num_cpus 1 --test_mode --analyze --group --loss_type "$L" --fit_type "$T" --output_name "test_${L}_${T}" > "peakfit_${L}_${T}.log"
+    done
+done