From 408a4acd5d4ca7f0e9df66b87a014d0caba3ffec Mon Sep 17 00:00:00 2001
From: Mikael Mieskolainen
Date: Sun, 5 May 2024 17:23:15 +0100
Subject: [PATCH] float64 precision update; template for icezee

---
 analysis/_icepaths_.py  |   1 +
 analysis/zee.py         |  19 +++
 configs/zee/__init__.py |   3 +
 configs/zee/cuts.py     |  15 ++++
 configs/zee/filter.py   |  15 ++++
 configs/zee/models.yml  |  99 +++++++++++++++++++++
 configs/zee/mvavars.py  |  25 ++++++
 configs/zee/plots.yml   |  74 +++++++++++++++++
 configs/zee/raytune.yml |  20 +++++
 configs/zee/tune0.yml   | 152 +++++++++++++++++++++++++++++
 icedqcd/common.py       |  12 ++-
 icenet/__init__.py      |   2 +-
 icenet/deep/iceboost.py |  46 ++++++++---
 icenet/tools/aux.py     |  12 ++-
 icenet/tools/iceroot.py |   7 +-
 icezee/__init__.py      |   1 +
 icezee/common.py        | 173 ++++++++++++++++++++++++++++++++++
 tests/runme_zee.sh      |  15 ++++
 18 files changed, 671 insertions(+), 20 deletions(-)
 create mode 100644 analysis/zee.py
 create mode 100644 configs/zee/__init__.py
 create mode 100644 configs/zee/cuts.py
 create mode 100644 configs/zee/filter.py
 create mode 100644 configs/zee/models.yml
 create mode 100644 configs/zee/mvavars.py
 create mode 100644 configs/zee/plots.yml
 create mode 100644 configs/zee/raytune.yml
 create mode 100644 configs/zee/tune0.yml
 create mode 100644 icezee/__init__.py
 create mode 100644 icezee/common.py
 create mode 100644 tests/runme_zee.sh

diff --git a/analysis/_icepaths_.py b/analysis/_icepaths_.py
index 771fc827..dddb45a9 100644
--- a/analysis/_icepaths_.py
+++ b/analysis/_icepaths_.py
@@ -10,6 +10,7 @@
     '/iceid/',
     '/icefit/',
     '/icebrem/',
+    '/icezee/'
 ]
 
 for p in paths:
diff --git a/analysis/zee.py b/analysis/zee.py
new file mode 100644
index 00000000..4b85c7b5
--- /dev/null
+++ b/analysis/zee.py
@@ -0,0 +1,19 @@
+# Zee steering code
+#
+# m.mieskolainen@imperial.ac.uk, 2024
+
+import sys
+sys.path.append(".")
+
+# Configure plotting backend
+import matplotlib
+matplotlib.use('Agg')
+
+from icenet.tools import process
+from icezee import common
+
+def main():
+    args = process.generic_flow(rootname='zee', func_loader=common.load_root_file, func_factor=common.splitfactor)
+
+if __name__ == '__main__' :
+    main()
diff --git a/configs/zee/__init__.py b/configs/zee/__init__.py
new file mode 100644
index 00000000..407eb4e3
--- /dev/null
+++ b/configs/zee/__init__.py
@@ -0,0 +1,3 @@
+#
+#
+#
diff --git a/configs/zee/cuts.py b/configs/zee/cuts.py
new file mode 100644
index 00000000..f7a1d44e
--- /dev/null
+++ b/configs/zee/cuts.py
@@ -0,0 +1,15 @@
+# Basic kinematic fiducial cuts, use only variables available in real data.
+#
+# m.mieskolainen@imperial.ac.uk, 2024
+
+import numpy as np
+import numba
+import matplotlib.pyplot as plt
+
+from icenet.tools import stx
+
+
+def cut_nocut(X, ids, isMC, xcorr_flow=False):
+    """ No cuts """
+    return np.ones(X.shape[0], dtype=np.bool_) # Note datatype np.bool_
+
diff --git a/configs/zee/filter.py b/configs/zee/filter.py
new file mode 100644
index 00000000..c1a6a251
--- /dev/null
+++ b/configs/zee/filter.py
@@ -0,0 +1,15 @@
+# Data filtering rules
+#
+# Note! Physics observable (fiducial / kinematic) cuts are defined in cuts.py, not here.
+#
+# m.mieskolainen@imperial.ac.uk, 2024
+
+import numpy as np
+import numba
+
+from icenet.tools import stx
+
+
+def filter_nofilter(X, ids, isMC, xcorr_flow=False):
+    """ All pass """
+    return np.ones(X.shape[0], dtype=np.bool_) # Note datatype np.bool_
diff --git a/configs/zee/models.yml b/configs/zee/models.yml
new file mode 100644
index 00000000..9fea0e18
--- /dev/null
+++ b/configs/zee/models.yml
@@ -0,0 +1,99 @@
+## MVA models
+
+# XGBoost
+# https://xgboost.readthedocs.io/en/latest/parameter.html
+xgb0:
+  train: 'xgb'
+  predict: 'xgb_logistic'
+  label: 'XGB'
+  raytune: xgb_trial_0
+
+  # ** Custom set of variables **
+  #include_MVA_vars: ['.*']
+  #exclude_MVA_vars: ['.*']
+
+  # booster parameters
+  model_param:
+    num_boost_round: 200 # number of epochs (equal to the number of trees!)
+
+    booster: 'gbtree' # 'gbtree' (default), 'dart' (dropout boosting)
+    tree_method: 'hist'
+    device: 'auto' # 'auto', 'cpu', 'cuda'
+
+    learning_rate: 0.1
+    gamma: 1.67
+    max_depth: 10
+    min_child_weight: 1.0
+    max_delta_step: 1
+    subsample: 1
+
+    colsample_bytree: 0.86
+    colsample_bylevel: 0.6
+    colsample_bynode: 0.8
+
+    reg_lambda: 1.0 # L2 regularization
+    reg_alpha: 0.05 # L1 regularization
+
+    # learning task parameters
+    objective: 'custom:binary_cross_entropy' # Note that 'multi:softprob' does not work with distillation
+    eval_metric: ['custom'] # for custom losses, otherwise 'logloss', 'mlogloss' ...
+
+  # BCE loss domains
+  BCE_param:
+    main:
+      classes: [0,1]
+      beta: 1.0
+      #set_filter: *MAIN_DOMAIN_FILTER # Comment out for 'inclusive'
+
+  plot_trees: False
+
+  # Read/Write of epochs
+  savemode: 'all' # 'all', 'latest'
+  readmode: -1 # -1 is the last saved epoch
+
+
+# Deep MLP
+dmlp0:
+  train: 'torch_generic'
+  predict: 'torch_vector'
+  label: 'DMLP'
+  raytune: null
+
+  # ** Custom set of variables **
+  #include_MVA_vars: ['.*']
+  #exclude_MVA_vars: ['.*']
+
+  # Model
+  conv_type: 'dmlp'
+  model_param:
+    mlp_dim: [32, 32] # hidden layer dimensions
+    activation: 'relu'
+    batch_norm: False
+    dropout: 0.01
+
+  # Optimization
+  opt_param:
+    lossfunc: 'cross_entropy' # cross_entropy, focal_entropy, logit_norm_cross_entropy
+    gamma: 2 # focal_entropy exponent
+    temperature: 1 # logit norm temperature
+
+    optimizer: 'AdamW'
+    clip_norm: 1.0
+
+    epochs: 150
+    batch_size: 256
+    lr: 3.0e-4
+    weight_decay: 0.00001 # L2-regularization
+
+  # Scheduler
+  scheduler_param:
+    step_size: 250 # Number of epochs for drop
+    gamma: 0.1
+
+  device: 'auto' # alternative 'cpu:0', 'cuda:0'
+  num_workers: 4
+
+  # Read/Write of epochs
+  savemode: 'all' # 'all', 'latest'
+  readmode: -1 # -1 is the last saved epoch
+
diff --git a/configs/zee/mvavars.py b/configs/zee/mvavars.py
new file mode 100644
index 00000000..cbc4b9f6
--- /dev/null
+++ b/configs/zee/mvavars.py
@@ -0,0 +1,25 @@
+
+KINEMATIC_VARS = [
+    'probe_eta',
+    'probe_pt',
+    'fixedGridRhoAll'
+]
+
+MVA_SCALAR_VARS = [
+    'probe_sieie',
+    'probe_sieip',
+    'probe_s4',
+    'probe_r9',
+    'probe_pfChargedIsoWorstVtx',
+    'probe_esEnergyOverRawE',
+    'probe_esEffSigmaRR',
+    'probe_ecalPFClusterIso',
+    'probe_phiWidth',
+    'probe_etaWidth',
+    'probe_trkSumPtHollowConeDR03',
+    'probe_trkSumPtSolidConeDR04',
+    #'probe_pfChargedIso', # not found in data
+]
+
+LOAD_VARS = KINEMATIC_VARS + MVA_SCALAR_VARS
+
diff --git a/configs/zee/plots.yml b/configs/zee/plots.yml
new file mode 100644
index 00000000..4192da07
--- /dev/null
+++ b/configs/zee/plots.yml
@@ -0,0 +1,74 @@
+# Plot steering
+
+# -----------------------------------------------------------------------
+
+basic:
+  active: True
+  nbins: 70
+  percentile_range: [0.5, 99.5]
+  exclude_vals: [null, -999]
+  plot_unweighted: True
+
+corrmat:
+  active: false
+
+contours:
+  active: false
+
+ROC:
+  active: true
+  num_bootstrap: 200
+  xmin: 1.0E-4
+  #set_filter: *FINAL_STATE_FILTER
+
+
+## Binned ROC plots can be 1D or 2D (powerset filtering not supported here)
+ROC_binned:
+  active: false
+  num_bootstrap: 200
+  xmin: 1.0E-4
+
+  #plot[0]:
+  #  var:   ['x_hlt_pt']
+  #  edges: [4.0, 6, 8.0, 10.0, 12.0, 15.0, 10000]
+
+  #plot[1]:
+  #  var:   ['x_hlt_eta', 'x_hlt_pt']
+  #  edges: [[-1.5, -1.15, -0.75, 0.0, 0.75, 1.15, 1.5],
+  #          [4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 10000]]
+
+## MVA output density (1D)
+MVA_output:
+  active: true
+  edges: 80
+  #set_filter: *FINAL_STATE_FILTER
+
+## (MVA output x external variable) density (2D)
+# Set filter can be applied only per one plot[i] identifier!
+MVA_2D:
+  active: false
+
+  plot[0]:
+    var: ['tagger_score']
+    edges: [{'nbin': 50, 'q': [0.0001, 0.9999], 'space': 'linear'},
+            {'nbin': 50, 'minmax': [0.0, 1.0], 'space': 'linear'}]
+    density: True
+
+    #set_filter: *FINAL_STATE_FILTER
+
+    # -----------------------------
+    # Powerset correlation plot parameters
+    xlim:
+      # For each class [[lower, upper], ..., [lower, upper]]
+      pearson: [[-0.15, 0.30], [-0.15, 0.30]]
+      abs_pearson: [[0.0, 0.30], [0.0, 0.30]]
+      disco: [[0.0, 0.30], [0.0, 0.30]]
+      MI: [[0.0, 0.12], [0.0, 0.12]]
+    # -----------------------------
+
+    #plot[1]:
+    #  var:   ['.?hlt_pms2.?'] # RegExp supported
+    #  edges: [{'nbin': 50, 'minmax': [0.0, 1.0], 'space': 'linear'},
+    #          {'nbin': 50, 'q': [0.0, 0.95], 'space': 'log10'}]
+    #  density: True
+
diff --git a/configs/zee/raytune.yml b/configs/zee/raytune.yml
new file mode 100644
index 00000000..40f4d608
--- /dev/null
+++ b/configs/zee/raytune.yml
@@ -0,0 +1,20 @@
+param:
+
+  #active: ['xgb1']
+  active: [null]
+  num_samples: 10 # Trial count parameter
+
+
+setup:
+
+  xgb_trial_0:
+    search_algo: 'HyperOpt'
+
+    search_metric:
+      metric: 'AUC'
+      mode: 'max'
+
+    param:
+
+      num_boost_round:
+        type: "tune.randint(20, 300)"
diff --git a/configs/zee/tune0.yml b/configs/zee/tune0.yml
new file mode 100644
index 00000000..74f42351
--- /dev/null
+++ b/configs/zee/tune0.yml
@@ -0,0 +1,152 @@
+# ZEE tune0.yml
+#
+# -------------------------------------------------------------------
+
+rootname: 'zee'
+rngseed: 123456 # Fixed seed for training data mixing
+inputvars: 'mvavars' # Main file input description python file
+
+# ----------------------------------------------------
+mva_param: &MVA_INPUT_PARAM
+  use_conditional: false # Conditional (theory parametric) input
+  primary_classes: [0,1] # Primary class IDs in MVA (train, ROC etc.)
+  signal_class: 1 # Signal class ID
+  #DA_class: -2 # Domain Adaptation class
+
+  inputvar_scalar: 'MVA_SCALAR_VARS' # Input variables, implemented under mvavars.py
+  #inputvar_jagged: null # 'MVA_JAGGED_VARS'
+  #jagged_maxdim: 6
+
+  varnorm: null # Variable normalization: 'zscore', 'madscore', null
+  #varnorm_tensor: 'zscore' # Tensor variable normalization
+  #varnorm_graph: null # Not implemented yet
+
+  frac: [0.6, 0.1, 0.3] # Train vs validate/test split fraction
+
+  # Imputation
+  imputation_param:
+    active: true # True / False
+    var: 'MVA_SCALAR_VARS' # Array of variables to be imputed
+    algorithm: 'constant' # Algorithm type: 'constant', 'iterative' (vector), 'knn' (vector), 'mean' (scalar), 'median' (scalar)
+    fill_value: 0 # For constant imputation
+    knn_k: 8 # Number of nearest neighbours considered
+    values: null # Special values which indicate the need for imputation, if null, then only Inf/Nan
+
+  # # Graph object construction
+  # graph_param:
+  #   global_on: True
+  #   self_loops: True
+  #   directed: False
+  #   coord: 'pxpypze' # 'ptetaphim', 'pxpypze'
+
+  # # ** Image tensor object construction **
+  # image_param:
+
+  #   # See the corresponding construction under common.py
+  #   channels: 2 # 1,2,...
+
+  #   # bin-edges
+  #   eta_bins: []
+  #   phi_bins: []
+
+
+# ----------------------------------------------------
+genesis_runmode:
+
+  maxevents: null
+  inputmap: null #"mc_input.yml"
+  inputvars: 'mvavars' # Input description python file
+
+  mcfile: 'hepml_EEp_pteta_after_rho_train/MC.parquet'
+  datafile: 'files/Data_train_EEp.parquet'
+  tree_name: null # 'ntuplizer/tree'
+
+  targetfunc: null # Training target, implemented under mctargets.py
+  filterfunc: 'filter_nofilter' # Training filtering, implemented under mcfilter.py
+  cutfunc: 'cut_nocut' # Basic cuts, implemented under cuts.py
+
+  xcorr_flow: True # Full N-point correlations computed between cuts
+  pickle_size: 100000 # Number of entries (events) per pickle file
+
+
+# ----------------------------------------------------
+train_runmode:
+
+  <<: *MVA_INPUT_PARAM
+
+  maxevents: null
+  modeltag: null
+
+  ## Reweighting setup
+  reweight: true
+  reweight_mode: 'write' # 'write', 'load'
+  reweight_file: 'reweight_train.pkl' # differential reweighting model file
+
+  reweight_param: &REWEIGHT_PARAM
+
+    maxevents: 100000 # Maximum number of events for the PDF construction
+    equal_frac: true # Equalize integrated class fractions
+    differential: false # Differential reweighting
+    reference_class: 0
+
+    # ---------------------
+
+    # see /trg/tune0.yml
+
+
+  ## Outlier protection in the training
+  outlier_param:
+    algo: 'truncate' # algorithm: 'truncate', null
+    qmin: 0.01 # in [0,100]
+    qmax: 99.9 # in [0,100]
+
+  # ** Activate models here **
+  # Give all models some unique identifier and label
+  models: !include configs/zee/models.yml
+  active_models: &ACTIVE_MODELS
+
+    - xgb0
+    - dmlp0
+
+
+  raytune: !include configs/zee/raytune.yml
+
+  # Distillation training
+  # -- the order must be compatible with the causal order in 'active_models'
+  distillation:
+
+    # Big, sophisticated model
+    source:
+      #xgb0
+
+    # Simple, compressed models
+    drains:
+      #- xgb1
+      # - add more here
+
+  # Batched "deep" training
+  batch_train_param:
+    blocksize: 150000 # Maximum number of events simultaneously in RAM
+    epochs: 50 # Number of global epochs (1 epoch = one iteration over the full input dataset), same for all models
+    #num_cpu: null # Set null for auto, or an integer for manual.
+
+# ----------------------------------------------------
+eval_runmode:
+
+  <<: *MVA_INPUT_PARAM
+
+  maxevents: null
+  modeltag: null
+
+  reweight: true
+  reweight_mode: 'load' # 'write', 'load'
+  reweight_file: 'reweight_train.pkl'
+
+  reweight_param: *REWEIGHT_PARAM
+
+  models: !include configs/zee/models.yml
+  active_models: *ACTIVE_MODELS
+
+# ----------------------------------------------------
+plot_param: !include configs/zee/plots.yml
+
diff --git a/icedqcd/common.py b/icedqcd/common.py
index 2a299eeb..094d4bdc 100644
--- a/icedqcd/common.py
+++ b/icedqcd/common.py
@@ -66,10 +66,10 @@ def load_root_file(root_path, ids=None, entry_start=0, entry_stop=None, maxevent
 
     # *** BACKGROUND MC, SIGNAL MC, DOMAIN ADAPTATION ... ***
     for key in args["input"].keys(): # input from yamlgen generated yml
-        
+
         class_id = int(key.split("_")[1])
         proc = args["input"][key]
-        
+
         X[key], Y[key], W[key], _, INFO[key] = iceroot.read_multiple(class_id=class_id,
             process_func=process_root, processes=proc, root_path=root_path, param=param)
 
@@ -77,7 +77,11 @@ def load_root_file(root_path, ids=None, entry_start=0, entry_stop=None, maxevent
 
     # Sample conditional theory parameters as they are distributed in signal sample
     sig_class = f"class_1" # ** HARDCODED DEFINITION **
-    p = ak.to_numpy(W[sig_class] / ak.sum(W[sig_class])).squeeze() # probability per event entry
+
+    # Probability per event entry, float64 needed for precision
+    # See: https://stackoverflow.com/questions/46539431/np-random-choice-probabilities-do-not-sum-to-1
+    p = ak.to_numpy(W[sig_class]).squeeze().astype('float64')
+    p = p / np.sum(p)
 
     # We pick all variables at once
     # (so N-dim random variable N-tuplets are sampled 1-to-1 as in the signal class)
@@ -104,7 +108,7 @@ def load_root_file(root_path, ids=None, entry_start=0, entry_stop=None, maxevent
         # Set conditional variables 'MODEL_'
         for i in range(len(var)):
             X[key][var[i]] = new[:,i].squeeze().tolist()
-        
+
         # "Mirror" copy variables to 'GEN_', for ROC plots etc. diagnostics in the evaluation stage
         var_new = [s.replace('MODEL', 'GEN') for s in var]
         for i in range(len(var)):
diff --git a/icenet/__init__.py b/icenet/__init__.py
index 6c9e49c0..3fdbf27a 100644
--- a/icenet/__init__.py
+++ b/icenet/__init__.py
@@ -3,7 +3,7 @@
 import os
 import psutil
 
-__version__ = '0.1.0.4'
+__version__ = '0.1.0.5'
 __release__ = 'alpha'
 __date__ = '05/05/2024'
 __author__ = 'm.mieskolainen@imperial.ac.uk'
diff --git a/icenet/deep/iceboost.py b/icenet/deep/iceboost.py
index 7c9fe6ea..689f9bb7 100644
--- a/icenet/deep/iceboost.py
+++ b/icenet/deep/iceboost.py
@@ -49,16 +49,22 @@ def _binary_cross_entropy(preds: torch.Tensor, targets: torch.Tensor, weights: t
     """
     Custom binary cross entropy loss with
    domain adaptation (DA) and mutual information (MI) regularization
+
+    Negative weights are supported via 'out_weights' variable (update it for each train / eval)
     """
     global loss_mode
     global MI_x
     global BCE_param
     global MI_param
     global loss_history
+    global out_weights
 
     device = preds.device
 
-    if weights is None:
+    if out_weights is not None: # Feed in weights outside
+        w = torch.from_numpy(out_weights)
+        w = w / torch.sum(w)
+    elif weights is None:
         w = torch.ones(len(preds))
         w = w / torch.sum(w)
     else:
@@ -311,13 +317,15 @@ def train_xgb(config={'params': {}}, data_trn=None, data_val=None, y_soft=None,
     global MI_param
     global loss_mode
     global loss_history
+    global out_weights
 
-    MI_x         = None
-    BCE_param    = None
-    MI_param     = None
-    loss_mode    = None
-    loss_history = {}
+    MI_x         = None
+    BCE_param    = None
+    MI_param     = None
+    loss_mode    = None
+    out_weights  = None
+    loss_history = {}
 
     loss_history_train = {}
     loss_history_eval = {}
 
@@ -354,11 +362,23 @@ def train_xgb(config={'params': {}}, data_trn=None, data_val=None, y_soft=None,
     w_trn = data_trn.w / np.sum(data_trn.w) * data_trn.w.shape[0]
     w_val = data_val.w / np.sum(data_val.w) * data_val.w.shape[0]
 
+    # ---------------------------------------------------------
+    # Choose weight mode
+    if np.min(w_trn) < 0.0 or np.min(w_val) < 0.0:
+        print(__name__ + f'.train_xgb: Negative weight mode on -- handled via custom loss')
+        out_weights_on = True
+
+        if 'custom' not in param['model_param']['objective']:
+            raise Exception(__name__ + f'.train_xgb: Need to use custom with negative weights, e.g. "custom:binary_cross_entropy". Change your parameters.')
+    else:
+        out_weights_on = False
+    # ---------------------------------------------------------
+
     X_trn, ids_trn = aux.red(X=data_trn.x, ids=data_trn.ids, param=param, verbose=True) # variable reduction
-    dtrain = xgboost.DMatrix(data=X_trn, label = data_trn.y if y_soft is None else y_soft, weight = w_trn, feature_names=ids_trn)
+    dtrain = xgboost.DMatrix(data=X_trn, label = data_trn.y if y_soft is None else y_soft, weight = w_trn if not out_weights_on else None, feature_names=ids_trn)
 
     X_val, ids_val = aux.red(X=data_val.x, ids=data_val.ids, param=param, verbose=False) # variable reduction
-    deval = xgboost.DMatrix(data=X_val, label = data_val.y, weight = w_val, feature_names=ids_val)
+    deval = xgboost.DMatrix(data=X_val, label = data_val.y, weight = w_val if not out_weights_on else None, feature_names=ids_val)
 
     evallist = [(dtrain, 'train'), (deval, 'eval')]
     print(param)
@@ -418,12 +438,18 @@ def train_xgb(config={'params': {}}, data_trn=None, data_val=None, y_soft=None,
         if epoch > 0: # Continue from the previous epoch model
             a['xgb_model'] = model
 
-        # Train
+        # ** Train **
+        if out_weights_on:
+            out_weights = copy.deepcopy(w_trn)
+
         loss_mode = 'train'
         model = xgboost.train(**a)
         loss_history_train = copy.deepcopy(loss_history)
 
-        # Validate
+        # ** Validate **
+        if out_weights_on:
+            out_weights = copy.deepcopy(w_val)
+
         if 'custom' in model_param['objective']:
             loss_mode = 'eval'
             MI_x = copy.deepcopy(data_val_MI)
diff --git a/icenet/tools/aux.py b/icenet/tools/aux.py
index 9a7f7262..eff38c3c 100644
--- a/icenet/tools/aux.py
+++ b/icenet/tools/aux.py
@@ -18,6 +18,7 @@
 import sklearn
 from sklearn import metrics
 
+import scipy
 from scipy import stats
 import scipy.special as special
 from scipy import interpolate
@@ -1029,7 +1030,7 @@ def __init__(self, y_true, y_pred, weights=None, class_ids=[0,1], hist=True, val
         self.acc_bootstrap = (-1)*np.ones(num_bootstrap)
 
         for i in range(num_bootstrap):
-            
+
             # ------------------
             trials = 0
             max_trials = 10000
@@ -1077,7 +1078,14 @@ def compute_metrics(class_ids, y_true, y_pred, weights):
     try:
         if len(class_ids) == 2:
             fpr, tpr, thresholds = metrics.roc_curve(y_true=y_true, y_score=y_pred, sample_weight=weights)
-            auc = metrics.roc_auc_score(y_true=y_true, y_score=y_pred, sample_weight=weights)
+
+            # AUC via numerical integration (stable with negative weight events)
+            sorted_index = np.argsort(fpr)
+            fpr_sorted = np.array(fpr)[sorted_index]
+            tpr_sorted = np.array(tpr)[sorted_index]
+            auc = scipy.integrate.trapz(y=tpr_sorted, x=fpr_sorted)
+
+            #auc = metrics.roc_auc_score(y_true=y_true, y_score=y_pred, sample_weight=weights)
             acc = metrics.accuracy_score(y_true=y_true, y_pred=np.round(y_pred), sample_weight=weights)
         else:
             fpr, tpr, thresholds = None, None, None
diff --git a/icenet/tools/iceroot.py b/icenet/tools/iceroot.py
index 6db3253e..8dffe524 100644
--- a/icenet/tools/iceroot.py
+++ b/icenet/tools/iceroot.py
@@ -80,17 +80,18 @@ def read_single(process_func, process, root_path, param, class_id, dtype=None):
 
     # -------------------------------------------------
     # Visible cross-section weights
+    # (keep float64 to avoid possible precision problems -- conversion to float32 done later)
     if not force_xs:
         # Sum over W yields --> (eff_acc * xs)
-        W = (ak.Array(np.ones(N_after, dtype=np.float32)) / N_after) * (xs * eff_acc)
+        W = (ak.Array(np.ones(N_after, dtype=np.float64)) / N_after) * (xs * eff_acc)
 
     # 'force_xs' mode is useful e.g. in training with a mix of signals with different eff x acc, but one wants
     # to have equal proportions for e.g. theory conditional MVA training.
     # (then one uses the same input 'xs' value for each process in the steering .yaml file)
     else:
         # Sum over W yields --> xs
-        W = (ak.Array(np.ones(N_after, dtype=np.float32)) / N_after) * xs
-    
+        W = (ak.Array(np.ones(N_after, dtype=np.float64)) / N_after) * xs
+
     # Save statistics information
     info = {'yaml': process,
             'cut_stats': stats,
diff --git a/icezee/__init__.py b/icezee/__init__.py
new file mode 100644
index 00000000..792d6005
--- /dev/null
+++ b/icezee/__init__.py
@@ -0,0 +1 @@
+#
diff --git a/icezee/common.py b/icezee/common.py
new file mode 100644
index 00000000..e1ff4dcc
--- /dev/null
+++ b/icezee/common.py
@@ -0,0 +1,173 @@
+# Common input & data reading routines for ZEE
+#
+# m.mieskolainen@imperial.ac.uk, 2024
+
+import numpy as np
+import copy
+import pickle
+from importlib import import_module
+import pandas as pd
+
+from termcolor import colored, cprint
+
+from icenet.tools import io
+from icenet.tools import aux
+
+# GLOBALS
+#from configs.zee.cuts import *
+#from configs.zee.filter import *
+
+
+def load_root_file(root_path, ids=None, entry_start=0, entry_stop=None, maxevents=None, args=None):
+    """ Loads the root file.
+
+    Args:
+        root_path: paths to root files (list)
+
+    Returns:
+        X:    columnar data
+        Y:    class labels
+        W:    event weights
+        ids:  columnar variable string (list)
+        info: trigger and pre-selection acceptance x efficiency information (dict)
+    """
+    inputvars = import_module("configs." + args["rootname"] + "." + args["inputvars"])
+
+    if type(root_path) is list:
+        root_path = root_path[0] # Remove [] list, we expect only the path here
+
+    # -----------------------------------------------
+    #param = {
+    #    'entry_start': entry_start,
+    #    "entry_stop":  entry_stop,
+    #    "maxevents":   maxevents,
+    #    "args": args
+    #}
+
+    LOAD_VARS = inputvars.LOAD_VARS
+
+
+    # =================================================================
+    # *** MC ***
+
+    mc_file = f'{root_path}/{args["mcfile"]}'
+    print(__name__ + f'.load_root_file: {mc_file}')
+
+    frame_mc = pd.read_parquet(mc_file)
+
+    print(f'Total number of events in file: {len(frame_mc)}')
+    print(f'ids: {frame_mc.keys()}')
+
+    # Pre-computed weights (kinematic re-weight x gen event weight x ...)
+    W_MC = frame_mc[['rw_weights']].to_numpy().squeeze()
+    W_MC = W_MC / np.sum(W_MC) * len(W_MC)
+
+    X_MC = frame_mc[LOAD_VARS].to_numpy()
+    Y_MC = np.zeros(len(X_MC)).astype(int)
+
+    print(f'X_MC.shape = {X_MC.shape}')
+    print(f'W_MC.shape = {W_MC.shape}')
+
+
+    # =================================================================
+    # *** Data ***
+
+    data_file = f'{root_path}/{args["datafile"]}'
+    print(__name__ + f'.load_root_file: {data_file}')
+
+    frame_data = pd.read_parquet(data_file)
+
+    print(f'Total number of events in file: {len(frame_data)}')
+    print(f'ids: {frame_data.keys()}')
+
+    # Unit weighted events for data
+    X_data = frame_data[LOAD_VARS].to_numpy()
+    W_data = np.ones(len(X_data))
+    Y_data = np.ones(len(X_data)).astype(int)
+
+    print(f'X_data.shape = {X_data.shape}')
+    print(f'W_data.shape = {W_data.shape}')
+
+
+    # =================================================================
+    # Combine
+
+    X = np.vstack((X_MC, X_data))
+    W = np.concatenate((W_MC, W_data))
+    Y = np.concatenate((Y_MC, Y_data))
+    ids = LOAD_VARS
+
+    # =================================================================
+
+
+    ## Print some diagnostics
+    print(__name__ + f'.load_root_file: Number of events: {len(X)}')
+
+    for c in np.unique(Y):
+        print(__name__ + f'.load_root_file: class[{c}] mean(event_weight) = {np.mean(W[Y==c]):0.3E}, std(event_weight) = {np.std(W[Y==c]):0.3E}')
+
+    # ** Crucial -- randomize order to avoid problems with other functions **
+    rand = np.random.permutation(len(X))
+    X = X[rand].squeeze() # Squeeze removes additional [] dimension
+    Y = Y[rand].squeeze()
+    W = W[rand].squeeze()
+
+    # Apply maxevents cutoff
+    maxevents = np.min([args['maxevents'], len(X)])
+    print(__name__ + f'.load_root_file: Applying maxevents cutoff {maxevents}')
+    X, Y, W = X[0:maxevents], Y[0:maxevents], W[0:maxevents]
+
+    # TBD add cut statistics etc. here
+    info = {}
+
+    return {'X':X, 'Y':Y, 'W':W, 'ids':ids, 'info':info}
+
+
+def splitfactor(x, y, w, ids, args):
+    """
+    Transform data into different datatypes.
+
+    Args:
+        data: jagged arrays
+        args: arguments dictionary
+
+    Returns:
+        dictionary with different data representations
+    """
+    inputvars = import_module("configs." + args["rootname"] + "." + args["inputvars"])
+
+    data = io.IceXYW(x=x, y=y, w=w, ids=ids)
+
+    # -------------------------------------------------------------------------
+    ### Pick kinematic variables out
+    data_kin = None
+
+    if inputvars.KINEMATIC_VARS is not None:
+        vars = aux.process_regexp_ids(all_ids=data.ids, ids=inputvars.KINEMATIC_VARS)
+        data_kin = data[vars]
+        data_kin.x = data_kin.x.astype(np.float32)
+
+    # -------------------------------------------------------------------------
+    data_deps = None
+
+    # -------------------------------------------------------------------------
+    data_tensor = None
+
+    # -------------------------------------------------------------------------
+    data_graph = None
+
+    # -------------------------------------------------------------------------
+    # Mutual information regularization targets
+
+    #vars = aux.process_regexp_ids(all_ids=data.ids, ids=inputvars.MI_VARS)
+    #data_MI = data[vars].x.astype(np.float32)
+    data_MI = None
+
+    # --------------------------------------------------------------------
+    ### Finally pick active scalar variables out
+
+    vars = aux.process_regexp_ids(all_ids=data.ids, ids=eval('inputvars.' + args['inputvar_scalar']))
+    data = data[vars]
+    data.x = data.x.astype(np.float32)
+
+    return {'data': data, 'data_MI': data_MI, 'data_kin': data_kin, 'data_deps': data_deps, 'data_tensor': data_tensor, 'data_graph': data_graph}
diff --git a/tests/runme_zee.sh b/tests/runme_zee.sh
new file mode 100644
index 00000000..2e3441df
--- /dev/null
+++ b/tests/runme_zee.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+#
+# Execute training and evaluation for the ZEE
+#
+# Run with: source tests/runme_zee.sh
+
+CONFIG="tune0.yml"
+DATAPATH="/vols/cms/pfk18/phd/hgg/Jul23/NN21July/N/validations/outputs/Csplit_Jsamp"
+
+if [ ${maxevents+x} ]; then MAX="--maxevents $maxevents"; else MAX=""; fi
+
+# Pipe through 'tee' if you want to capture the screen output into a log file
+python analysis/zee.py --runmode "genesis" $MAX --config $CONFIG --datapath $DATAPATH
+#python analysis/zee.py --fastplot 1 --runmode "train" $MAX --config $CONFIG --datapath $DATAPATH
+python analysis/zee.py --fastplot 1 --runmode "eval" $MAX --config $CONFIG --datapath $DATAPATH
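---
Editor's appendix, not part of the patch: two self-contained Python sketches
illustrating the numerical behaviour this commit works around. The array sizes,
variable names, and toy values below are illustrative assumptions, not code
from the repository.

# Sketch 1: why icedqcd/common.py now casts to float64 *before* normalizing
# the per-event probabilities fed to np.random.choice. Normalizing a large
# float32 weight vector can leave sum(p) outside the ~1.5e-8 tolerance that
# np.random.choice checks, raising "probabilities do not sum to 1" (see the
# Stack Overflow link cited in the diff). Whether the check actually trips
# depends on the data and the NumPy version.

import numpy as np

rng = np.random.default_rng(0)
w32 = rng.random(10_000_000, dtype=np.float32)   # large float32 weight vector

p32 = w32 / np.sum(w32)                          # normalized in float32
print(abs(np.sum(p32, dtype=np.float64) - 1.0))  # may exceed the choice() tolerance

p64 = w32.astype('float64')                      # the patch's recipe:
p64 = p64 / np.sum(p64)                          # cast first, then normalize
idx = rng.choice(len(p64), size=5, p=p64)        # safe in float64

# Sketch 2: the AUC-via-trapezoid idea added to icenet/tools/aux.py. With
# negative MC event weights the weighted ROC curve need not be monotonic in
# FPR, and sklearn's roc_auc_score can refuse to integrate it ("x is neither
# increasing nor decreasing"); sorting by FPR and integrating numerically
# still returns a finite answer. The toy ROC construction below mimics what
# metrics.roc_curve would return.

y_true = np.array([0, 1, 0, 1, 1, 0])
y_pred = np.array([0.2, 0.8, 0.4, 0.6, 0.7, 0.1])
w      = np.array([1.0, 1.0, -0.3, 1.0, 0.8, 1.0])  # one negative event weight

order = np.argsort(-y_pred)                      # sweep threshold high -> low
tps   = np.cumsum(w[order] * (y_true[order] == 1))
fps   = np.cumsum(w[order] * (y_true[order] == 0))
tpr, fpr = tps / tps[-1], fps / fps[-1]          # fpr is non-monotonic here!

idx = np.argsort(fpr)                            # aux.py remedy: sort by FPR,
auc = np.trapz(y=tpr[idx], x=fpr[idx])           # then trapezoidal integration
print(f'AUC = {auc:.3f}')                        # (np.trapz mirrors scipy.integrate.trapz)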