From 408a4acd5d4ca7f0e9df66b87a014d0caba3ffec Mon Sep 17 00:00:00 2001
From: Mikael Mieskolainen
Date: Sun, 5 May 2024 17:23:15 +0100
Subject: [PATCH] float64 precision update; template for icezee

---
 analysis/_icepaths_.py  |   1 +
 analysis/zee.py         |  19 +++
 configs/zee/__init__.py |   3 +
 configs/zee/cuts.py     |  15 ++++
 configs/zee/filter.py   |  15 ++++
 configs/zee/models.yml  |  99 +++++++++++++++++++++
 configs/zee/mvavars.py  |  25 ++++++
 configs/zee/plots.yml   |  74 +++++++++++++++++
 configs/zee/raytune.yml |  20 +++++
 configs/zee/tune0.yml   | 152 +++++++++++++++++++++++++++++
 icedqcd/common.py       |  12 ++-
 icenet/__init__.py      |   2 +-
 icenet/deep/iceboost.py |  46 ++++++++---
 icenet/tools/aux.py     |  12 ++-
 icenet/tools/iceroot.py |   7 +-
 icezee/__init__.py      |   1 +
 icezee/common.py        | 173 ++++++++++++++++++++++++++++++++++
 tests/runme_zee.sh      |  15 ++++
 18 files changed, 671 insertions(+), 20 deletions(-)
 create mode 100644 analysis/zee.py
 create mode 100644 configs/zee/__init__.py
 create mode 100644 configs/zee/cuts.py
 create mode 100644 configs/zee/filter.py
 create mode 100644 configs/zee/models.yml
 create mode 100644 configs/zee/mvavars.py
 create mode 100644 configs/zee/plots.yml
 create mode 100644 configs/zee/raytune.yml
 create mode 100644 configs/zee/tune0.yml
 create mode 100644 icezee/__init__.py
 create mode 100644 icezee/common.py
 create mode 100644 tests/runme_zee.sh

diff --git a/analysis/_icepaths_.py b/analysis/_icepaths_.py
index 771fc827..dddb45a9 100644
--- a/analysis/_icepaths_.py
+++ b/analysis/_icepaths_.py
@@ -10,6 +10,7 @@
     '/iceid/',
     '/icefit/',
     '/icebrem/',
+    '/icezee/'
 ]
 
 for p in paths:
diff --git a/analysis/zee.py b/analysis/zee.py
new file mode 100644
index 00000000..4b85c7b5
--- /dev/null
+++ b/analysis/zee.py
@@ -0,0 +1,19 @@
+# Zee steering code
+#
+# m.mieskolainen@imperial.ac.uk, 2024
+
+import sys
+sys.path.append(".")
+
+# Configure plotting backend
+import matplotlib
+matplotlib.use('Agg')
+
+from icenet.tools import process
+from icezee import common
+
+def main():
+    args = process.generic_flow(rootname='zee', func_loader=common.load_root_file, func_factor=common.splitfactor)
+
+if __name__ == '__main__' :
+    main()
diff --git a/configs/zee/__init__.py b/configs/zee/__init__.py
new file mode 100644
index 00000000..407eb4e3
--- /dev/null
+++ b/configs/zee/__init__.py
@@ -0,0 +1,3 @@
+#
+#
+#
diff --git a/configs/zee/cuts.py b/configs/zee/cuts.py
new file mode 100644
index 00000000..f7a1d44e
--- /dev/null
+++ b/configs/zee/cuts.py
@@ -0,0 +1,15 @@
+# Basic kinematic fiducial cuts, use only variables available in real data.
+#
+# m.mieskolainen@imperial.ac.uk, 2024
+
+import numpy as np
+import numba
+import matplotlib.pyplot as plt
+
+from icenet.tools import stx
+
+
+def cut_nocut(X, ids, isMC, xcorr_flow=False):
+    """ No cuts """
+    return np.ones(X.shape[0], dtype=np.bool_) # Note datatype np.bool_
+
diff --git a/configs/zee/filter.py b/configs/zee/filter.py
new file mode 100644
index 00000000..c1a6a251
--- /dev/null
+++ b/configs/zee/filter.py
@@ -0,0 +1,15 @@
+# Data filtering rules
+#
+# Note! Physics observable (fiducial / kinematic) cuts are defined in cuts.py, not here.
+#
+# m.mieskolainen@imperial.ac.uk, 2024
+
+import numpy as np
+import numba
+
+from icenet.tools import stx
+
+
+def filter_nofilter(X, ids, isMC, xcorr_flow=False):
+    """ All pass """
+    return np.ones(X.shape[0], dtype=np.bool_) # Note datatype np.bool_
diff --git a/configs/zee/models.yml b/configs/zee/models.yml
new file mode 100644
index 00000000..9fea0e18
--- /dev/null
+++ b/configs/zee/models.yml
@@ -0,0 +1,99 @@
+## MVA models
+
+# XGBoost
+# https://xgboost.readthedocs.io/en/latest/parameter.html
+xgb0:
+  train: 'xgb'
+  predict: 'xgb_logistic'
+  label: 'XGB'
+  raytune: xgb_trial_0
+
+  # ** Custom set of variables **
+  #include_MVA_vars: ['.*']
+  #exclude_MVA_vars: ['.*']
+
+  # booster parameters
+  model_param:
+    num_boost_round: 200 # number of epochs (equal to the number of trees!)
+
+    booster: 'gbtree' # 'gbtree' (default), 'dart' (dropout boosting)
+    tree_method: 'hist'
+    device: 'auto' # 'auto', 'cpu', 'cuda'
+
+    learning_rate: 0.1
+    gamma: 1.67
+    max_depth: 10
+    min_child_weight: 1.0
+    max_delta_step: 1
+    subsample: 1
+
+    colsample_bytree: 0.86
+    colsample_bylevel: 0.6
+    colsample_bynode: 0.8
+
+    reg_lambda: 1.0 # L2 regularization
+    reg_alpha: 0.05 # L1 regularization
+
+    # learning task parameters
+    objective: 'custom:binary_cross_entropy' # Note that 'multi:softprob' does not work with distillation
+    eval_metric: ['custom'] # for custom losses, otherwise 'logloss', 'mlogloss' ...
+
+  # BCE loss domains
+  BCE_param:
+    main:
+      classes: [0,1]
+      beta: 1.0
+      #set_filter: *MAIN_DOMAIN_FILTER # Comment out for 'inclusive'
+
+  plot_trees: False
+
+  # Read/Write of epochs
+  savemode: 'all' # 'all', 'latest'
+  readmode: -1 # -1 is the last saved epoch
+
+
+# Deep MLP
+dmlp0:
+  train: 'torch_generic'
+  predict: 'torch_vector'
+  label: 'DMLP'
+  raytune: null
+
+  # ** Custom set of variables **
+  #include_MVA_vars: ['.*']
+  #exclude_MVA_vars: ['.*']
+
+  # Model
+  conv_type: 'dmlp'
+  model_param:
+    mlp_dim: [32, 32] # hidden layer dimensions
+    activation: 'relu'
+    batch_norm: False
+    dropout: 0.01
+
+  # Optimization
+  opt_param:
+    lossfunc: 'cross_entropy' # cross_entropy, focal_entropy, logit_norm_cross_entropy
+    gamma: 2 # focal_entropy exponent
+    temperature: 1 # logit norm temperature
+
+    optimizer: 'AdamW'
+    clip_norm: 1.0
+
+    epochs: 150
+    batch_size: 256
+    lr: 3.0e-4
+    weight_decay: 0.00001 # L2-regularization
+
+  # Scheduler
+  scheduler_param:
+    step_size: 250 # Number of epochs for drop
+    gamma: 0.1
+
+  device: 'auto' # alternative 'cpu:0', 'cuda:0'
+  num_workers: 4
+
+  # Read/Write of epochs
+  savemode: 'all' # 'all', 'latest'
+  readmode: -1 # -1 is the last saved epoch
+
diff --git a/configs/zee/mvavars.py b/configs/zee/mvavars.py
new file mode 100644
index 00000000..cbc4b9f6
--- /dev/null
+++ b/configs/zee/mvavars.py
@@ -0,0 +1,25 @@
+
+KINEMATIC_VARS = [
+    'probe_eta',
+    'probe_pt',
+    'fixedGridRhoAll'
+]
+
+MVA_SCALAR_VARS = [
+    'probe_sieie',
+    'probe_sieip',
+    'probe_s4',
+    'probe_r9',
+    'probe_pfChargedIsoWorstVtx',
+    'probe_esEnergyOverRawE',
+    'probe_esEffSigmaRR',
+    'probe_ecalPFClusterIso',
+    'probe_phiWidth',
+    'probe_etaWidth',
+    'probe_trkSumPtHollowConeDR03',
+    'probe_trkSumPtSolidConeDR04',
+    #'probe_pfChargedIso', # not found in data
+]
+
+LOAD_VARS = KINEMATIC_VARS + MVA_SCALAR_VARS
+
diff --git a/configs/zee/plots.yml b/configs/zee/plots.yml
new file mode 100644
index 00000000..4192da07
--- /dev/null
+++ b/configs/zee/plots.yml
@@ -0,0 +1,74 @@
+# Plot steering
+
+# -----------------------------------------------------------------------
+
+basic:
+  active: True
+  nbins: 70
+  percentile_range: [0.5, 99.5]
+  exclude_vals: [null, -999]
+  plot_unweighted: True
+
+corrmat:
+  active: false
+
+contours:
+  active: false
+
+ROC:
+  active: true
+  num_bootstrap: 200
+  xmin: 1.0E-4
+  #set_filter: *FINAL_STATE_FILTER
+
+
+## Binned ROC plots can be 1D or 2D (powerset filtering not supported here)
+ROC_binned:
+  active: false
+  num_bootstrap: 200
+  xmin: 1.0E-4
+
+  #plot[0]:
+  #  var:   ['x_hlt_pt']
+  #  edges: [4.0, 6, 8.0, 10.0, 12.0, 15.0, 10000]
+
+  #plot[1]:
+  #  var:   ['x_hlt_eta', 'x_hlt_pt']
+  #  edges: [[-1.5, -1.15, -0.75, 0.0, 0.75, 1.15, 1.5],
+  #          [4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 10000]]
+
+## MVA output density (1D)
+MVA_output:
+  active: true
+  edges: 80
+  #set_filter: *FINAL_STATE_FILTER
+
+## (MVA output x external variable) density (2D)
+# Set filter can be applied only per one plot[i] identifier!
+MVA_2D:
+  active: false
+
+  plot[0]:
+    var: ['tagger_score']
+    edges: [{'nbin': 50, 'q': [0.0001, 0.9999], 'space': 'linear'},
+            {'nbin': 50, 'minmax': [0.0, 1.0], 'space': 'linear'}]
+    density: True
+
+    #set_filter: *FINAL_STATE_FILTER
+
+    # -----------------------------
+    # Powerset correlation plot parameters
+    xlim:
+      # For each class [[lower, upper], ..., [lower, upper]]
+      pearson: [[-0.15, 0.30], [-0.15, 0.30]]
+      abs_pearson: [[0.0, 0.30], [0.0, 0.30]]
+      disco: [[0.0, 0.30], [0.0, 0.30]]
+      MI: [[0.0, 0.12], [0.0, 0.12]]
+    # -----------------------------
+
+    #plot[1]:
+    #  var:   ['.?hlt_pms2.?'] # RegExp supported
+    #  edges: [{'nbin': 50, 'minmax': [0.0, 1.0], 'space': 'linear'},
+    #          {'nbin': 50, 'q': [0.0, 0.95], 'space': 'log10'}]
+    #  density: True
+
diff --git a/configs/zee/raytune.yml b/configs/zee/raytune.yml
new file mode 100644
index 00000000..40f4d608
--- /dev/null
+++ b/configs/zee/raytune.yml
@@ -0,0 +1,20 @@
+param:
+
+  #active: ['xgb1']
+  active: [null]
+  num_samples: 10 # Trial count parameter
+
+
+setup:
+
+  xgb_trial_0:
+    search_algo: 'HyperOpt'
+
+    search_metric:
+      metric: 'AUC'
+      mode: 'max'
+
+    param:
+
+      num_boost_round:
+        type: "tune.randint(20, 300)"
diff --git a/configs/zee/tune0.yml b/configs/zee/tune0.yml
new file mode 100644
index 00000000..74f42351
--- /dev/null
+++ b/configs/zee/tune0.yml
@@ -0,0 +1,152 @@
+# ZEE tune0.yml
+#
+# -------------------------------------------------------------------
+
+rootname: 'zee'
+rngseed: 123456 # Fixed seed for training data mixing
+inputvars: 'mvavars' # Main file input description python file
+
+# ----------------------------------------------------
+mva_param: &MVA_INPUT_PARAM
+  use_conditional: false # Conditional (theory parametric) input
+  primary_classes: [0,1] # Primary class IDs in MVA (train, ROC etc.)
+  signal_class: 1 # Signal class ID
+  #DA_class: -2 # Domain Adaptation class
+
+  inputvar_scalar: 'MVA_SCALAR_VARS' # Input variables, implemented under mvavars.py
+  #inputvar_jagged: null # 'MVA_JAGGED_VARS'
+  #jagged_maxdim: 6
+
+  varnorm: null # Variable normalization: 'zscore', 'madscore', null
+  #varnorm_tensor: 'zscore' # Tensor variable normalization
+  #varnorm_graph: null # Not implemented yet
+
+  frac: [0.6, 0.1, 0.3] # Train vs validate/test split fraction
+
+  # Imputation
+  imputation_param:
+    active: true # True / False
+    var: 'MVA_SCALAR_VARS' # Array of variables to be imputed
+    algorithm: 'constant' # Algorithm type: 'constant', 'iterative' (vector), 'knn' (vector), 'mean' (scalar), 'median' (scalar)
+    fill_value: 0 # For constant imputation
+    knn_k: 8 # Number of nearest neighbours considered
+    values: null # Special values which indicate the need for imputation, if null, then only Inf/Nan
+
+  # # Graph object construction
+  # graph_param:
+  #   global_on: True
+  #   self_loops: True
+  #   directed: False
+  #   coord: 'pxpypze' # 'ptetaphim', 'pxpypze'
+
+  # # ** Image tensor object construction **
+  # image_param:
+
+  #   # See the corresponding construction under common.py
+  #   channels: 2 # 1,2,...
+
+  #   # bin-edges
+  #   eta_bins: []
+  #   phi_bins: []
+
+
+# ----------------------------------------------------
+genesis_runmode:
+
+  maxevents: null
+  inputmap: null #"mc_input.yml"
+  inputvars: 'mvavars' # Input description python file
+
+  mcfile: 'hepml_EEp_pteta_after_rho_train/MC.parquet'
+  datafile: 'files/Data_train_EEp.parquet'
+  tree_name: null # 'ntuplizer/tree'
+
+  targetfunc: null # Training target, implemented under mctargets.py
+  filterfunc: 'filter_nofilter' # Training filtering, implemented under mcfilter.py
+  cutfunc: 'cut_nocut' # Basic cuts, implemented under cuts.py
+
+  xcorr_flow: True # Full N-point correlations computed between cuts
+  pickle_size: 100000 # Number of entries (events) per pickle file
+
+
+# ----------------------------------------------------
+train_runmode:
+
+  <<: *MVA_INPUT_PARAM
+
+  maxevents: null
+  modeltag: null
+
+  ## Reweighting setup
+  reweight: true
+  reweight_mode: 'write' # 'write', 'load'
+  reweight_file: 'reweight_train.pkl' # differential reweighting model file
+
+  reweight_param: &REWEIGHT_PARAM
+
+    maxevents: 100000 # Maximum number of events for the PDF construction
+    equal_frac: true # Equalize integrated class fractions
+    differential: false # Differential reweighting
+    reference_class: 0
+
+    # ---------------------
+
+    # see /trg/tune0.yml
+
+
+  ## Outlier protection in the training
+  outlier_param:
+    algo: 'truncate' # algorithm: 'truncate', null
+    qmin: 0.01 # in [0,100]
+    qmax: 99.9 # in [0,100]
+
+  # ** Activate models here **
+  # Give all models some unique identifier and label
+  models: !include configs/zee/models.yml
+  active_models: &ACTIVE_MODELS
+
+    - xgb0
+    - dmlp0
+
+
+  raytune: !include configs/zee/raytune.yml
+
+  # Distillation training
+  # -- the order must be compatible with the causal order in 'active_models'
+  distillation:
+
+    # Big, sophisticated model
+    source:
+      #xgb0
+
+    # Simple, compressed models
+    drains:
+      #- xgb1
+      # - add more here
+
+  # Batched "deep" training
+  batch_train_param:
+    blocksize: 150000 # Maximum number of events simultaneously in RAM
+    epochs: 50 # Number of global epochs (1 epoch = one iteration over the full input dataset), same for all models
+    #num_cpu: null # Set null for auto, or an integer for manual.
+
+# ----------------------------------------------------
+eval_runmode:
+
+  <<: *MVA_INPUT_PARAM
+
+  maxevents: null
+  modeltag: null
+
+  reweight: true
+  reweight_mode: 'load' # 'write', 'load'
+  reweight_file: 'reweight_train.pkl'
+
+  reweight_param: *REWEIGHT_PARAM
+
+  models: !include configs/zee/models.yml
+  active_models: *ACTIVE_MODELS
+
+# ----------------------------------------------------
+plot_param: !include configs/zee/plots.yml
+
diff --git a/icedqcd/common.py b/icedqcd/common.py
index 2a299eeb..094d4bdc 100644
--- a/icedqcd/common.py
+++ b/icedqcd/common.py
@@ -66,10 +66,10 @@ def load_root_file(root_path, ids=None, entry_start=0, entry_stop=None, maxevent
 
     # *** BACKGROUND MC, SIGNAL MC, DOMAIN ADAPTATION ... ***
     for key in args["input"].keys(): # input from yamlgen generated yml
-        
+
         class_id = int(key.split("_")[1])
         proc = args["input"][key]
-        
+
         X[key], Y[key], W[key], _, INFO[key] = iceroot.read_multiple(class_id=class_id,
             process_func=process_root, processes=proc, root_path=root_path, param=param)
 
@@ -77,7 +77,11 @@ def load_root_file(root_path, ids=None, entry_start=0, entry_stop=None, maxevent
 
     # Sample conditional theory parameters as they are distributed in signal sample
     sig_class = f"class_1" # ** HARDCODED DEFINITION **
-    p = ak.to_numpy(W[sig_class] / ak.sum(W[sig_class])).squeeze() # probability per event entry
+
+    # Probability per event entry, float64 needed for precision
+    # See: https://stackoverflow.com/questions/46539431/np-random-choice-probabilities-do-not-sum-to-1
+    p = ak.to_numpy(W[sig_class]).squeeze().astype('float64')
+    p = p / np.sum(p)
 
     # We pick all variables at once
     # (so N-dim random variable N-tuplets are sampled 1-to-1 as in the signal class)
@@ -104,7 +108,7 @@ def load_root_file(root_path, ids=None, entry_start=0, entry_stop=None, maxevent
         # Set conditional variables 'MODEL_'
         for i in range(len(var)):
             X[key][var[i]] = new[:,i].squeeze().tolist()
-        
+
         # "Mirror" copy variables to 'GEN_', for ROC plots etc. diagnostics in the evaluation stage
         var_new = [s.replace('MODEL', 'GEN') for s in var]
         for i in range(len(var)):
diff --git a/icenet/__init__.py b/icenet/__init__.py
index 6c9e49c0..3fdbf27a 100644
--- a/icenet/__init__.py
+++ b/icenet/__init__.py
@@ -3,7 +3,7 @@
 import os
 import psutil
 
-__version__ = '0.1.0.4'
+__version__ = '0.1.0.5'
 __release__ = 'alpha'
 __date__ = '05/05/2024'
 __author__ = 'm.mieskolainen@imperial.ac.uk'
diff --git a/icenet/deep/iceboost.py b/icenet/deep/iceboost.py
index 7c9fe6ea..689f9bb7 100644
--- a/icenet/deep/iceboost.py
+++ b/icenet/deep/iceboost.py
@@ -49,16 +49,22 @@ def _binary_cross_entropy(preds: torch.Tensor, targets: torch.Tensor, weights: t
     """
     Custom binary cross entropy loss with
    domain adaptation (DA) and mutual information (MI) regularization
+
+    Negative weights are supported via 'out_weights' variable (update it for each train / eval)
     """
     global loss_mode
     global MI_x
     global BCE_param
     global MI_param
     global loss_history
+    global out_weights
 
     device = preds.device
 
-    if weights is None:
+    if out_weights is not None: # Feed in weights outside
+        w = torch.from_numpy(out_weights)
+        w = w / torch.sum(w)
+    elif weights is None:
         w = torch.ones(len(preds))
         w = w / torch.sum(w)
     else:
@@ -311,13 +317,15 @@ def train_xgb(config={'params': {}}, data_trn=None, data_val=None, y_soft=None,
     global MI_param
     global loss_mode
     global loss_history
+    global out_weights
 
-    MI_x         = None
-    BCE_param    = None
-    MI_param     = None
-    loss_mode    = None
-    loss_history = {}
+    MI_x         = None
+    BCE_param    = None
+    MI_param     = None
+    loss_mode    = None
+    out_weights  = None
+    loss_history = {}
 
     loss_history_train = {}
     loss_history_eval = {}
 
@@ -354,11 +362,23 @@ def train_xgb(config={'params': {}}, data_trn=None, data_val=None, y_soft=None,
     w_trn = data_trn.w / np.sum(data_trn.w) * data_trn.w.shape[0]
     w_val = data_val.w / np.sum(data_val.w) * data_val.w.shape[0]
 
+    # ---------------------------------------------------------
+    # Choose weight mode
+    if np.min(w_trn) < 0.0 or np.min(w_val) < 0.0:
+        print(__name__ + f'.train_xgb: Negative weight mode on -- handled via custom loss')
+        out_weights_on = True
+
+        if 'custom' not in param['model_param']['objective']:
+            raise Exception(__name__ + f'.train_xgb: Need to use custom with negative weights, e.g. "custom:binary_cross_entropy". Change your parameters.')
+    else:
+        out_weights_on = False
+    # ---------------------------------------------------------
+
     X_trn, ids_trn = aux.red(X=data_trn.x, ids=data_trn.ids, param=param, verbose=True) # variable reduction
-    dtrain = xgboost.DMatrix(data=X_trn, label = data_trn.y if y_soft is None else y_soft, weight = w_trn, feature_names=ids_trn)
+    dtrain = xgboost.DMatrix(data=X_trn, label = data_trn.y if y_soft is None else y_soft, weight = w_trn if not out_weights_on else None, feature_names=ids_trn)
 
     X_val, ids_val = aux.red(X=data_val.x, ids=data_val.ids, param=param, verbose=False) # variable reduction
-    deval = xgboost.DMatrix(data=X_val, label = data_val.y, weight = w_val, feature_names=ids_val)
+    deval = xgboost.DMatrix(data=X_val, label = data_val.y, weight = w_val if not out_weights_on else None, feature_names=ids_val)
 
     evallist = [(dtrain, 'train'), (deval, 'eval')]
     print(param)
@@ -418,12 +438,18 @@ def train_xgb(config={'params': {}}, data_trn=None, data_val=None, y_soft=None,
         if epoch > 0: # Continue from the previous epoch model
             a['xgb_model'] = model
 
-        # Train
+        # ** Train **
+        if out_weights_on:
+            out_weights = copy.deepcopy(w_trn)
+
         loss_mode = 'train'
         model = xgboost.train(**a)
         loss_history_train = copy.deepcopy(loss_history)
 
-        # Validate
+        # ** Validate **
+        if out_weights_on:
+            out_weights = copy.deepcopy(w_val)
+
         if 'custom' in model_param['objective']:
             loss_mode = 'eval'
             MI_x = copy.deepcopy(data_val_MI)
diff --git a/icenet/tools/aux.py b/icenet/tools/aux.py
index 9a7f7262..eff38c3c 100644
--- a/icenet/tools/aux.py
+++ b/icenet/tools/aux.py
@@ -18,6 +18,7 @@
 import sklearn
 from sklearn import metrics
 
+import scipy
 from scipy import stats
 import scipy.special as special
 from scipy import interpolate
@@ -1029,7 +1030,7 @@ def __init__(self, y_true, y_pred, weights=None, class_ids=[0,1], hist=True, val
         self.acc_bootstrap = (-1)*np.ones(num_bootstrap)
 
         for i in range(num_bootstrap):
-            
+
             # ------------------
             trials = 0
             max_trials = 10000
@@ -1077,7 +1078,14 @@ def compute_metrics(class_ids, y_true, y_pred, weights):
     try:
         if len(class_ids) == 2:
             fpr, tpr, thresholds = metrics.roc_curve(y_true=y_true, y_score=y_pred, sample_weight=weights)
-            auc = metrics.roc_auc_score(y_true=y_true, y_score=y_pred, sample_weight=weights)
+
+            # AUC via numerical integration (stable with negative weight events)
+            sorted_index = np.argsort(fpr)
+            fpr_sorted = np.array(fpr)[sorted_index]
+            tpr_sorted = np.array(tpr)[sorted_index]
+            auc = scipy.integrate.trapz(y=tpr_sorted, x=fpr_sorted)
+
+            #auc = metrics.roc_auc_score(y_true=y_true, y_score=y_pred, sample_weight=weights)
             acc = metrics.accuracy_score(y_true=y_true, y_pred=np.round(y_pred), sample_weight=weights)
         else:
             fpr, tpr, thresholds = None, None, None
diff --git a/icenet/tools/iceroot.py b/icenet/tools/iceroot.py
index 6db3253e..8dffe524 100644
--- a/icenet/tools/iceroot.py
+++ b/icenet/tools/iceroot.py
@@ -80,17 +80,18 @@ def read_single(process_func, process, root_path, param, class_id, dtype=None):
 
     # -------------------------------------------------
     # Visible cross-section weights
+    # (keep float64 to avoid possible precision problems -- conversion to float32 done later)
     if not force_xs:
         # Sum over W yields --> (eff_acc * xs)
-        W = (ak.Array(np.ones(N_after, dtype=np.float32)) / N_after) * (xs * eff_acc)
+        W = (ak.Array(np.ones(N_after, dtype=np.float64)) / N_after) * (xs * eff_acc)
 
     # 'force_xs' mode is useful e.g. in training with a mix of signals with different eff x acc, but one wants
     # to have equal proportions for e.g. theory conditional MVA training.
     # (then one uses the same input 'xs' value for each process in the steering .yaml file)
     else:
         # Sum over W yields --> xs
-        W = (ak.Array(np.ones(N_after, dtype=np.float32)) / N_after) * xs
-    
+        W = (ak.Array(np.ones(N_after, dtype=np.float64)) / N_after) * xs
+
     # Save statistics information
     info = {'yaml': process,
             'cut_stats': stats,
diff --git a/icezee/__init__.py b/icezee/__init__.py
new file mode 100644
index 00000000..792d6005
--- /dev/null
+++ b/icezee/__init__.py
@@ -0,0 +1 @@
+#
diff --git a/icezee/common.py b/icezee/common.py
new file mode 100644
index 00000000..e1ff4dcc
--- /dev/null
+++ b/icezee/common.py
@@ -0,0 +1,173 @@
+# Common input & data reading routines for ZEE
+#
+# m.mieskolainen@imperial.ac.uk, 2024
+
+import numpy as np
+import copy
+import pickle
+from importlib import import_module
+import pandas as pd
+
+from termcolor import colored, cprint
+
+from icenet.tools import io
+from icenet.tools import aux
+
+# GLOBALS
+#from configs.zee.cuts import *
+#from configs.zee.filter import *
+
+
+def load_root_file(root_path, ids=None, entry_start=0, entry_stop=None, maxevents=None, args=None):
+    """ Loads the root file.
+
+    Args:
+        root_path: paths to root files (list)
+
+    Returns:
+        X:    columnar data
+        Y:    class labels
+        W:    event weights
+        ids:  columnar variable string (list)
+        info: trigger and pre-selection acceptance x efficiency information (dict)
+    """
+    inputvars = import_module("configs." + args["rootname"] + "." + args["inputvars"])
+
+    if type(root_path) is list:
+        root_path = root_path[0] # Remove [] list, we expect only the path here
+
+    # -----------------------------------------------
+    #param = {
+    #    'entry_start': entry_start,
+    #    "entry_stop":  entry_stop,
+    #    "maxevents":   maxevents,
+    #    "args": args
+    #}
+
+    LOAD_VARS = inputvars.LOAD_VARS
+
+
+    # =================================================================
+    # *** MC ***
+
+    mc_file = f'{root_path}/{args["mcfile"]}'
+    print(__name__ + f'.load_root_file: {mc_file}')
+
+    frame_mc = pd.read_parquet(mc_file)
+
+    print(f'Total number of events in file: {len(frame_mc)}')
+    print(f'ids: {frame_mc.keys()}')
+
+    # Pre-computed weights (kinematic re-weight x gen event weight x ...)
+    W_MC = frame_mc[['rw_weights']].to_numpy().squeeze()
+    W_MC = W_MC / np.sum(W_MC) * len(W_MC)
+
+    X_MC = frame_mc[LOAD_VARS].to_numpy()
+    Y_MC = np.zeros(len(X_MC)).astype(int)
+
+    print(f'X_MC.shape = {X_MC.shape}')
+    print(f'W_MC.shape = {W_MC.shape}')
+
+
+    # =================================================================
+    # *** Data ***
+
+    data_file = f'{root_path}/{args["datafile"]}'
+    print(__name__ + f'.load_root_file: {data_file}')
+
+    frame_data = pd.read_parquet(data_file)
+
+    print(f'Total number of events in file: {len(frame_data)}')
+    print(f'ids: {frame_data.keys()}')
+
+    # Unit weighted events for data
+    X_data = frame_data[LOAD_VARS].to_numpy()
+    W_data = np.ones(len(X_data))
+    Y_data = np.ones(len(X_data)).astype(int)
+
+    print(f'X_data.shape = {X_data.shape}')
+    print(f'W_data.shape = {W_data.shape}')
+
+
+    # =================================================================
+    # Combine
+
+    X = np.vstack((X_MC, X_data))
+    W = np.concatenate((W_MC, W_data))
+    Y = np.concatenate((Y_MC, Y_data))
+    ids = LOAD_VARS
+
+    # =================================================================
+
+
+    ## Print some diagnostics
+    print(__name__ + f'.load_root_file: Number of events: {len(X)}')
+
+    for c in np.unique(Y):
+        print(__name__ + f'.load_root_file: class[{c}] mean(event_weight) = {np.mean(W[Y==c]):0.3E}, std(event_weight) = {np.std(W[Y==c]):0.3E}')
+
+    # ** Crucial -- randomize order to avoid problems with other functions **
+    rand = np.random.permutation(len(X))
+    X = X[rand].squeeze() # Squeeze removes additional [] dimension
+    Y = Y[rand].squeeze()
+    W = W[rand].squeeze()
+
+    # Apply maxevents cutoff
+    maxevents = np.min([args['maxevents'], len(X)])
+    print(__name__ + f'.load_root_file: Applying maxevents cutoff {maxevents}')
+    X, Y, W = X[0:maxevents], Y[0:maxevents], W[0:maxevents]
+
+    # TBD add cut statistics etc. here
+    info = {}
+
+    return {'X':X, 'Y':Y, 'W':W, 'ids':ids, 'info':info}
+
+
+def splitfactor(x, y, w, ids, args):
+    """
+    Transform data into different datatypes.
+
+    Args:
+        data: jagged arrays
+        args: arguments dictionary
+
+    Returns:
+        dictionary with different data representations
+    """
+    inputvars = import_module("configs." + args["rootname"] + "." + args["inputvars"])
+
+    data = io.IceXYW(x=x, y=y, w=w, ids=ids)
+
+    # -------------------------------------------------------------------------
+    ### Pick kinematic variables out
+    data_kin = None
+
+    if inputvars.KINEMATIC_VARS is not None:
+        vars = aux.process_regexp_ids(all_ids=data.ids, ids=inputvars.KINEMATIC_VARS)
+        data_kin = data[vars]
+        data_kin.x = data_kin.x.astype(np.float32)
+
+    # -------------------------------------------------------------------------
+    data_deps = None
+
+    # -------------------------------------------------------------------------
+    data_tensor = None
+
+    # -------------------------------------------------------------------------
+    data_graph = None
+
+    # -------------------------------------------------------------------------
+    # Mutual information regularization targets
+
+    #vars = aux.process_regexp_ids(all_ids=data.ids, ids=inputvars.MI_VARS)
+    #data_MI = data[vars].x.astype(np.float32)
+    data_MI = None
+
+    # --------------------------------------------------------------------
+    ### Finally pick active scalar variables out
+
+    vars = aux.process_regexp_ids(all_ids=data.ids, ids=eval('inputvars.' + args['inputvar_scalar']))
+    data = data[vars]
+    data.x = data.x.astype(np.float32)
+
+    return {'data': data, 'data_MI': data_MI, 'data_kin': data_kin, 'data_deps': data_deps, 'data_tensor': data_tensor, 'data_graph': data_graph}
diff --git a/tests/runme_zee.sh b/tests/runme_zee.sh
new file mode 100644
index 00000000..2e3441df
--- /dev/null
+++ b/tests/runme_zee.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+#
+# Execute training and evaluation for the ZEE
+#
+# Run with: source tests/runme_zee.sh
+
+CONFIG="tune0.yml"
+DATAPATH="/vols/cms/pfk18/phd/hgg/Jul23/NN21July/N/validations/outputs/Csplit_Jsamp"
+
+if [ ${maxevents+x} ]; then MAX="--maxevents $maxevents"; else MAX=""; fi
+
+# Pipe through 'tee' if you want to capture the screen output into a log file
+python analysis/zee.py --runmode "genesis" $MAX --config $CONFIG --datapath $DATAPATH
+#python analysis/zee.py --fastplot 1 --runmode "train" $MAX --config $CONFIG --datapath $DATAPATH
+python analysis/zee.py --fastplot 1 --runmode "eval" $MAX --config $CONFIG --datapath $DATAPATH
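---
Editor's appendix, not part of the patch: two self-contained Python sketches
illustrating the numerical behaviour this commit works around. The array sizes,
variable names, and toy values below are illustrative assumptions, not code
from the repository.

# Sketch 1: why icedqcd/common.py now casts to float64 *before* normalizing
# the per-event probabilities fed to np.random.choice. Normalizing a large
# float32 weight vector can leave sum(p) outside the ~1.5e-8 tolerance that
# np.random.choice checks, raising "probabilities do not sum to 1" (see the
# Stack Overflow link cited in the diff). Whether the check actually trips
# depends on the data and the NumPy version.

import numpy as np

rng = np.random.default_rng(0)
w32 = rng.random(10_000_000, dtype=np.float32)   # large float32 weight vector

p32 = w32 / np.sum(w32)                          # normalized in float32
print(abs(np.sum(p32, dtype=np.float64) - 1.0))  # may exceed the choice() tolerance

p64 = w32.astype('float64')                      # the patch's recipe:
p64 = p64 / np.sum(p64)                          # cast first, then normalize
idx = rng.choice(len(p64), size=5, p=p64)        # safe in float64

# Sketch 2: the AUC-via-trapezoid idea added to icenet/tools/aux.py. With
# negative MC event weights the weighted ROC curve need not be monotonic in
# FPR, and sklearn's roc_auc_score can refuse to integrate it ("x is neither
# increasing nor decreasing"); sorting by FPR and integrating numerically
# still returns a finite answer. The toy ROC construction below mimics what
# metrics.roc_curve would return.

y_true = np.array([0, 1, 0, 1, 1, 0])
y_pred = np.array([0.2, 0.8, 0.4, 0.6, 0.7, 0.1])
w      = np.array([1.0, 1.0, -0.3, 1.0, 0.8, 1.0])  # one negative event weight

order = np.argsort(-y_pred)                      # sweep threshold high -> low
tps   = np.cumsum(w[order] * (y_true[order] == 1))
fps   = np.cumsum(w[order] * (y_true[order] == 0))
tpr, fpr = tps / tps[-1], fps / fps[-1]          # fpr is non-monotonic here!

idx = np.argsort(fpr)                            # aux.py remedy: sort by FPR,
auc = np.trapz(y=tpr[idx], x=fpr[idx])           # then trapezoidal integration
print(f'AUC = {auc:.3f}')                        # (np.trapz mirrors scipy.integrate.trapz)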