From 8fd6f9287e4e48a4abfabf978949561adb16ed8f Mon Sep 17 00:00:00 2001 From: achamma Date: Fri, 29 Dec 2023 18:08:05 +0100 Subject: [PATCH] First draft: Add BBI --- README.md | 12 + hidimstat/BBI.py | 859 ++++++++++++++++++++++++++++++ hidimstat/Dnn_learner.py | 293 ++++++++++ hidimstat/Dnn_learner_single.py | 550 +++++++++++++++++++ hidimstat/RandomForestModified.py | 97 ++++ hidimstat/__init__.py | 4 + hidimstat/compute_importance.py | 688 ++++++++++++++++++++++++ hidimstat/utils.py | 605 +++++++++++++++++++++ 8 files changed, 3108 insertions(+) create mode 100644 hidimstat/BBI.py create mode 100644 hidimstat/Dnn_learner.py create mode 100644 hidimstat/Dnn_learner_single.py create mode 100644 hidimstat/RandomForestModified.py create mode 100644 hidimstat/compute_importance.py diff --git a/README.md b/README.md index 1d83e13..6bf7481 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,18 @@ Application to source localization (MEG/EEG data): * Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020). __Statistical control for spatio-temporal MEG/EEG source imaging with desparsified multi-task Lasso__. In _Proceedings of the 34th Conference on Neural Information Processing Systems (NeurIPS 2020)_, Vancouver, Canada. +Single/Group statistically validated importance using conditional permutations: + +* Chamma, A., Thirion, B., & Engemann, D. (2024). __Variable importance in + high-dimensional settings requires grouping__. In _Proceedings of + the 38th Conference of the Association for the Advancement of Artificial + Intelligence(AAAI 2024)_, Vancouver, Canada. + +* Chamma, A., Engemann, D., & Thirion, B. (2023). __Statistically Valid Variable + Importance Assessment through Conditional Permutations__. In _Proceedings of + the 37th Conference on Neural Information Processing Systems (NeurIPS 2023)_, + New Orleans, USA. + If you use our packages, we would appreciate citations to the relevant aforementioned papers. #### Other useful references: diff --git a/hidimstat/BBI.py b/hidimstat/BBI.py new file mode 100644 index 0000000..cb70a3e --- /dev/null +++ b/hidimstat/BBI.py @@ -0,0 +1,859 @@ +import itertools +import warnings +from copy import copy + +import numpy as np +import pandas as pd +from joblib import Parallel, delayed +from scipy.stats import norm +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import RidgeCV +from sklearn.metrics import ( + log_loss, + mean_absolute_error, + mean_squared_error, + roc_auc_score, + r2_score, +) +from sklearn.model_selection import KFold +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.utils.validation import check_is_fitted + +from .compute_importance import ( + joblib_compute_conditional, + joblib_compute_permutation, +) +from .Dnn_learner import DNN_learner +from .utils import convert_predict_proba, create_X_y, compute_imp_std + + +class BlockBasedImportance(BaseEstimator, TransformerMixin): + """This class implements the Block Based Importance (BBI), + it consists of the learner block (first block) + and the importance block (second block). + Parameters + ---------- + estimator: scikit-learn compatible estimator, default=None + The provided estimator for the prediction task (First block). + The default estimator is the DNN learner. Other options are (1) RF + for Random Forest. 
+ importance_estimator: {scikit-learn compatible estimator or string}, + default=Mod_RF + The provided estimator for the importance task (Second block). + Using "Mod_RF" will apply the modified version of the Random Forest as + the importance predictor. + do_hyper: bool, default=True + Tuning the hyperparameters of the provided estimator. + dict_hyper: dict, default=None + The dictionary of hyperparameters to tune. + prob_type: str, default='regression' + A classification or a regression problem. + bootstrap: bool, default=True + Application of bootstrap sampling for the training set. + split_perc: float, default=0.8 + The training/validation cut for the provided data. + conditional: bool, default=True + The permutation or the conditional sampling approach. + list_nominal: dict, default=None + The dictionary of binary, nominal and ordinal variables. + Perm: bool, default=False + The use of permutations or random sampling with CPI-DNN. + n_perm: int, default=50 + The number of permutations/random sampling for each column. + n_jobs: int, default=1 + The number of workers for parallel processing. + verbose: int, default=0 + If verbose > 0, the fitted iterations will be printed. + groups: dict, default=None + The knowledge-driven/data-driven grouping of the variables if provided. + group_stacking: bool, default=False + Apply the stacking-based method for the provided groups. + prop_out_subLayers: int, default=0. + If group_stacking is set to True, proportion of outputs for + the linear sub-layers per group. + index_i: int, default=None + The index of the current processed iteration. + random_state: int, default=2023 + Fixing the seeds of the random generator. + com_imp: boolean, default=True + Compute or not the importance scores. + Attributes + ---------- + ToDO + """ + + def __init__( + self, + estimator=None, + importance_estimator="Mod_RF", + coffeine_transformer=None, + do_hyper=True, + dict_hyper=None, + prob_type="regression", + bootstrap=True, + split_perc=0.8, + conditional=True, + list_nominal=None, + Perm=False, + n_perm=50, + n_jobs=1, + verbose=0, + groups=None, + group_stacking=False, + k_fold=2, + prop_out_subLayers=0, + index_i=None, + random_state=2023, + com_imp=True, + ): + self.estimator = estimator + self.importance_estimator = importance_estimator + self.coffeine_transformer = coffeine_transformer + self.do_hyper = do_hyper + self.dict_hyper = dict_hyper + self.prob_type = prob_type + self.bootstrap = bootstrap + self.split_perc = split_perc + self.conditional = conditional + self.list_nominal = list_nominal + self.Perm = Perm + self.n_perm = n_perm + self.n_jobs = n_jobs + self.verbose = verbose + self.groups = groups + self.group_stacking = group_stacking + self.k_fold = k_fold + self.prop_out_subLayers = prop_out_subLayers + self.index_i = index_i + self.random_state = random_state + self.X_test = [None] * max(self.k_fold, 1) + self.y_test = [None] * max(self.k_fold, 1) + self.org_pred = [None] * max(self.k_fold, 1) + self.pred_scores = [None] * max(self.k_fold, 1) + self.X_nominal = [None] * max(self.k_fold, 1) + self.type = None + self.list_estimators = [None] * max(self.k_fold, 1) + self.X_proc = [None] * max(self.k_fold, 1) + self.scaler_x = [None] * max(self.k_fold, 1) + self.scaler_y = [None] * max(self.k_fold, 1) + self.com_imp = com_imp + # Check for applying the stacking approach with the RidgeCV estimator + self.apply_ridge = False + # Check for the case of a coffeine transformer with provided groups + self.transformer_grp = True + self.coffeine_transformers 
= [] + + def fit(self, X, y=None): + """Build the provided estimator with the training set (X, y) + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The training input samples. + y : None + There is no need of a target in a transformer, yet the pipeline API + requires this parameter. + Returns + ------- + self : object + Returns self. + """ + # Disable the importance computation when only one group is provided + # (group case) + if self.groups is not None: + if len(self.groups) <= 1: + self.com_imp = False + + # Fixing the random generator's seed + self.rng = np.random.RandomState(self.random_state) + + # Switch to special binary case + if (self.prob_type == "classification") and (len(np.unique(y)) < 3): + self.prob_type = "binary" + # Convert list_nominal to a dictionary if initialized + # as an empty string + if not isinstance(self.list_nominal, dict): + self.list_nominal = { + "nominal": [], + "ordinal": [], + "binary": [], + } + + if "binary" not in self.list_nominal: + self.list_nominal["binary"] = [] + if "ordinal" not in self.list_nominal: + self.list_nominal["ordinal"] = [] + if "nominal" not in self.list_nominal: + self.list_nominal["nominal"] = [] + + # Move the ordinal columns with 2 values to the binary part + if self.list_nominal["ordinal"] != []: + for ord_col in set(self.list_nominal["ordinal"]).intersection(list(X.columns)): + if len(np.unique(X[ord_col])) < 3: + self.list_nominal["binary"].append(ord_col) + self.list_nominal["ordinal"].remove(ord_col) + + # Convert X to pandas dataframe + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) + else: + self.X_cols = list(X.columns) + if (self.groups is None) or (not bool(self.groups)): + # Initialize the list_cols variable with each feature + # in a seperate list (default case) + self.groups = [[col] for col in X.columns] + self.transformer_grp = False + if self.group_stacking: + # Remove the group_stacking flag not + # to complex the DNN architecture + self.group_stacking = False + warnings.warn( + "The groups are not provided to apply the stacking" + " approach, back to single variables case." 
+ ) + + # Convert dictionary of groups to a list of lists + if isinstance(self.groups, dict): + self.groups = list(self.groups.values()) + + # Checks if the number of variables in the groups is equal to the + # number of variables provided + list_count = [item for sublist in self.groups for item in sublist] + if self.coffeine_transformer is None: + if len(list_count) != X.shape[1]: + raise Exception("The provided groups are missing some variables!") + else: + if self.transformer_grp: + if len(list_count) != (X.shape[1] * self.coffeine_transformer[1]): + raise Exception("The provided groups are missing some variables!") + else: + if len(list_count) != X.shape[1]: + raise Exception("The provided groups are missing some variables!") + + # Check if categorical variables exist within the columns of the design + # matrix + self.list_nominal["binary"] = list(set(self.list_nominal["binary"]).intersection(list(X.columns))) + self.list_nominal["ordinal"] = list(set(self.list_nominal["ordinal"]).intersection(list(X.columns))) + self.list_nominal["nominal"] = list(set(self.list_nominal["nominal"]).intersection(list(X.columns))) + + self.list_cols = self.groups.copy() + self.list_cat_tot = list( + itertools.chain.from_iterable(self.list_nominal.values()) + ) + X_nominal_org = X.loc[:, self.list_cat_tot] + + # One-hot encoding of nominal variables + tmp_list = [] + self.dict_nom = {} + # A dictionary to save the encoders of the nominal variables + self.dict_enc = {} + if len(self.list_nominal["nominal"]) > 0: + for col_encode in self.list_nominal["nominal"]: + enc = OneHotEncoder(handle_unknown="ignore") + enc.fit(X[[col_encode]]) + labeled_cols = [ + enc.feature_names_in_[0] + "_" + str(enc.categories_[0][j]) + for j in range(len(enc.categories_[0])) + ] + hot_cols = pd.DataFrame( + enc.transform(X[[col_encode]]).toarray(), + dtype="int32", + columns=labeled_cols, + ) + X = X.drop(columns=[col_encode]) + X = pd.concat([X, hot_cols], axis=1) + self.dict_enc[col_encode] = enc + + # Create a dictionary for categorical variables with their indices + for col_cat in self.list_cat_tot: + current_list = [ + col + for col in range(len(X.columns)) + if X.columns[col].split("_")[0] == col_cat + ] + if len(current_list) > 0: + self.dict_nom[col_cat] = current_list + # A list to store the labels of the categorical variables + tmp_list.extend(current_list) + + # Create a dictionary for the continuous variables that will be scaled + self.dict_cont = {} + if self.coffeine_transformer is None: + for ind_col, col_cont in enumerate(X.columns): + if ind_col not in tmp_list: + self.dict_cont[col_cont] = [ind_col] + self.list_cont = [el[0] for el in self.dict_cont.values()] + else: + self.list_cols_tmp = [] + self.list_cont = list(np.arange(X.shape[1] * self.coffeine_transformer[1])) + for i in range(X.shape[1] * self.coffeine_transformer[1]): + self.dict_cont[i] = [i] + self.list_cols_tmp.append([i]) + if not self.transformer_grp: + self.list_cols = self.list_cols_tmp.copy() + self.coffeine_transformers = [ + copy(self.coffeine_transformer[0]) for _ in range(max(self.k_fold, 1)) + ] + + X = X.to_numpy() + + if len(y.shape) != 2: + y = np.array(y).reshape(-1, 1) + + if self.prob_type in ("classification", "binary"): + self.loss = log_loss + else: + self.loss = mean_squared_error + + # Replace groups' variables by the indices in the design matrix + self.list_grps = [] + self.inp_dim = None + if self.group_stacking: + for grp in self.groups: + current_grp = [] + for i in grp: + if i in self.dict_nom.keys(): + current_grp += 
self.dict_nom[i] + else: + current_grp += self.dict_cont[i] + self.list_grps.append(current_grp) + + if len(self.coffeine_transformers) == 1: + X = self.coffeine_transformers[0].fit_transform( + pd.DataFrame(X, columns=self.X_cols), np.ravel(y)) + + if self.estimator is not None: + # Force the output to 1 neurone per group + # in standard stacking case + self.prop_out_subLayers = 0 + # RidgeCV estimators + self.ridge_folds = 10 + list_alphas = np.logspace(-3, 5, num=100) + cv = KFold( + n_splits=self.ridge_folds, + random_state=self.random_state, + shuffle=True, + ) + self.ridge_mods = [ + [ + make_pipeline(StandardScaler(), RidgeCV(list_alphas)) + for __ in range(len(self.list_grps)) + ].copy() + for _ in range(self.ridge_folds) + ] + + X_prev = X.copy() + X = np.zeros((y.shape[0], len(self.list_grps))) + for ind_fold, (train, test) in enumerate(cv.split(X_prev)): + X_train, X_test = X_prev[train], X_prev[test] + y_train, _ = y[train], y[test] + if len(self.coffeine_transformers) > 1: + X_train = self.coffeine_transformers[ind_fold].fit_transform( + pd.DataFrame(X_train, columns=self.X_cols), np.ravel(y_train) + ) + X_test = self.coffeine_transformers[ind_fold].transform( + pd.DataFrame(X_test, columns=self.X_cols) + ) + for grp_ind, grp in enumerate(self.list_grps): + self.ridge_mods[ind_fold][grp_ind].fit(X_train[:, grp], y_train) + X[test, grp_ind] = ( + self.ridge_mods[ind_fold][grp_ind].predict(X_test[:, grp]).ravel() + ) + self.apply_ridge = True + + self.inp_dim = [ + max(1, int(self.prop_out_subLayers * len(grp))) + for grp in self.list_grps + ] + self.inp_dim.insert(0, 0) + self.inp_dim = np.cumsum(self.inp_dim) + self.list_cols = [ + list(np.arange(self.inp_dim[grp_ind], self.inp_dim[grp_ind + 1])) + for grp_ind in range(len(self.list_grps)) + ] + + # Initialize the first estimator (block learner) + if self.estimator is None: + self.estimator = DNN_learner( + prob_type=self.prob_type, + encode=True, + do_hyper=False, + list_cont=self.list_cont, + list_grps=self.list_grps, + group_stacking=self.group_stacking, + n_jobs=self.n_jobs, + inp_dim=self.inp_dim, + random_state=self.random_state, + verbose=self.verbose, + ) + self.type = "DNN" + # Initializing the dictionary for tuning the hyperparameters + if self.dict_hyper is None: + self.dict_hyper = { + "lr": [1e-4, 1e-3, 1e-2], + "l1_weight": [0, 1e-4, 1e-2], + "l2_weight": [0, 1e-4, 1e-2], + } + + elif self.estimator == "RF": + if self.prob_type == "regression": + self.estimator = RandomForestRegressor(random_state=2023) + else: + self.estimator = RandomForestClassifier(random_state=2023) + self.dict_hyper = {"max_depth": [2, 5, 10, 20]} + self.type = "RF" + + if self.k_fold != 0: + # Implementing k-fold cross validation as the default behavior + kf = KFold( + n_splits=self.k_fold, + random_state=self.random_state, + shuffle=True, + ) + for ind_fold, (train_index, test_index) in enumerate(kf.split(X)): + print(f"Processing: {ind_fold+1}") + X_fold = X.copy() + y_fold = y.copy() + + self.X_nominal[ind_fold] = X_nominal_org.iloc[test_index, :] + + X_train, X_test = ( + X_fold[train_index, :], + X_fold[test_index, :], + ) + + y_train, y_test = y_fold[train_index], y_fold[test_index] + + if not self.apply_ridge: + if self.coffeine_transformer is not None: + X_train = self.coffeine_transformers[ind_fold].fit_transform( + pd.DataFrame(X_train, columns=self.X_cols), np.ravel(y_train) + ) + + X_test = self.coffeine_transformers[ind_fold].transform( + pd.DataFrame(X_test, columns=self.X_cols) + ) + + self.X_test[ind_fold] = X_test.copy() + 
self.y_test[ind_fold] = y_test.copy() + + # Find the list of optimal sub-models to be used in the + # following steps (Default estimator) + if self.do_hyper: + self.__tuning_hyper(X_train, y_train, ind_fold) + if self.type == "DNN": + self.estimator.fit(X_train, y_train) + self.list_estimators[ind_fold] = copy(self.estimator) + + else: + if not self.apply_ridge: + if self.coffeine_transformer is not None: + X = self.coffeine_transformers[0].fit_transform( + pd.DataFrame(X, columns=self.X_cols), np.ravel(y)) + if not self.transformer_grp: + # Variables are provided as the third element of the + # coffeine transformer parameter + if len(self.coffeine_transformer) > 2: + X = X[:, self.coffeine_transformer[2]] + self.list_cont = np.arange(len(self.coffeine_transformer[2])) + + # Hyperparameter tuning + if self.do_hyper: + self.__tuning_hyper(X, y, 0) + + if self.type == "DNN": + self.estimator.fit(X, y) + self.list_estimators[0] = copy(self.estimator) + + self.is_fitted = True + return self + + def __tuning_hyper(self, X, y, ind_fold=None): + """ """ + if not ((self.apply_ridge) and (self.group_stacking)): + ( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + X_scaled, + __, + scaler_x, + scaler_y, + ___, + ) = create_X_y( + X, + y, + bootstrap=self.bootstrap, + split_perc=self.split_perc, + prob_type=self.prob_type, + list_cont=self.list_cont, + random_state=self.random_state, + ) + if self.dict_hyper is not None: + list_hyper = list(itertools.product(*list(self.dict_hyper.values()))) + list_loss = [] + if self.type == "DNN": + list_loss = self.estimator.hyper_tuning( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + list_hyper, + random_state=self.random_state, + ) + else: + if self.dict_hyper is None: + self.estimator.fit(X_scaled, y) + # If not a DNN learner case, need to save the scalers + self.scaler_x[ind_fold] = scaler_x + self.scaler_y[ind_fold] = scaler_y + return + else: + for ind_el, el in enumerate(list_hyper): + curr_params = dict( + (k, v) for v, k in zip(el, list(self.dict_hyper.keys())) + ) + list_hyper[ind_el] = curr_params + self.estimator.set_params(**curr_params) + if self.prob_type == "regression": + y_train_curr = ( + y_train_scaled * scaler_y.scale_ + scaler_y.mean_ + ) + y_valid_curr = ( + y_valid_scaled * scaler_y.scale_ + scaler_y.mean_ + ) + + def func(x): + return self.estimator.predict(x) + + else: + y_train_curr = y_train_scaled.copy() + y_valid_curr = y_valid_scaled.copy() + + def func(x): + return self.estimator.predict_proba(x) + + self.estimator.fit(X_train_scaled, y_train_curr) + + list_loss.append(self.loss(y_valid_curr, func(X_valid_scaled))) + + ind_min = np.argmin(list_loss) + best_hyper = list_hyper[ind_min] + if not isinstance(best_hyper, dict): + best_hyper = dict(zip(self.dict_hyper.keys(), best_hyper)) + + self.estimator.set_params(**best_hyper) + self.estimator.fit(X_scaled, y) + + # If not a DNN learner case, need to save the scalers + self.scaler_x[ind_fold] = scaler_x + self.scaler_y[ind_fold] = scaler_y + + else: + self.estimator.fit(X, y) + + def predict(self, X=None, encoding=True): + """Predict regression target for X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input samples. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. 
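+        Notes
+        -----
+        When ``X`` is a single array it is copied once per fitted fold and
+        the per-fold predictions are averaged and returned; when a list of
+        per-fold test sets is provided, the per-fold predictions are only
+        stored in ``self.org_pred`` and nothing is returned.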
+ """ + if not isinstance(X, list): + list_X = [X.copy() for el in range(max(self.k_fold, 1))] + mean_pred = True + else: + list_X = X.copy() + mean_pred = False + + for ind_fold, curr_X in enumerate(list_X): + # Prepare the test set for the prediction + if encoding: + X_tmp = self.__encode_input(curr_X) + else: + X_tmp = curr_X.copy() + + if self.type != "DNN": + if not isinstance(curr_X, np.ndarray): + X_tmp = np.array(X_tmp) + if self.scaler_x[ind_fold] is not None: + X_tmp[:, self.list_cont] = self.scaler_x[ind_fold].transform( + X_tmp[:, self.list_cont] + ) + self.X_proc[ind_fold] = [X_tmp.copy()] + + self.org_pred[ind_fold] = self.list_estimators[ind_fold].predict(X_tmp) + + # Convert to the (n_samples x n_outputs) format + if len(self.org_pred[ind_fold].shape) != 2: + self.org_pred[ind_fold] = self.org_pred[ind_fold].reshape(-1, 1) + + if self.type == "DNN": + self.X_proc[ind_fold] = np.array( + self.list_estimators[ind_fold].X_test.copy() + ).swapaxes(0, 1) + + if mean_pred: + return np.mean(np.array(self.org_pred), axis=0) + + def predict_proba(self, X=None, encoding=True): + """Predict class probabilities for X. ToDo + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input samples. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. + """ + if not isinstance(X, list): + list_X = [X.copy() for el in range(max(self.k_fold, 1))] + mean_pred = True + else: + list_X = X.copy() + mean_pred = False + + for ind_fold, curr_X in enumerate(list_X): + # Prepare the test set for the prediction + if encoding: + X_tmp = self.__encode_input(curr_X) + else: + X_tmp = curr_X.copy() + + if self.type != "DNN": + if not isinstance(curr_X, np.ndarray): + X_tmp = np.array(X_tmp) + if self.scaler_x[ind_fold] is not None: + X_tmp[:, self.list_cont] = self.scaler_x[ind_fold].transform( + X_tmp[:, self.list_cont] + ) + self.X_proc[ind_fold] = [X_tmp.copy()] + + self.org_pred[ind_fold] = self.list_estimators[ind_fold].predict_proba( + X_tmp + ) + + if self.type == "DNN": + self.X_proc[ind_fold] = np.array( + self.list_estimators[ind_fold].X_test.copy() + ).swapaxes(0, 1) + else: + self.org_pred[ind_fold] = convert_predict_proba(self.org_pred[ind_fold]) + + if mean_pred: + return np.mean(np.array(self.org_pred), axis=0) + + def __encode_input(self, X): + # Check is fit had been called + check_is_fitted(self, ["is_fitted"]) + + # One-hot encoding for the test set + if len(self.list_nominal["nominal"]) > 0: + for col_encode in self.list_nominal["nominal"]: + enc = self.dict_enc[col_encode] + labeled_cols = [ + enc.feature_names_in_[0] + "_" + str(enc.categories_[0][j]) + for j in range(len(enc.categories_[0])) + ] + hot_cols = pd.DataFrame( + enc.transform(X[[col_encode]]).toarray(), + dtype="int32", + columns=labeled_cols, + ) + X = X.drop(columns=[col_encode]) + X = pd.concat([X, hot_cols], axis=1) + + return X + + def compute_importance(self, X=None, y=None): + """This function is used to compute the importance scores and + the statistical guarantees for the different variables/groups + ---------- + X : {array-like, sparse-matrix}, shape (n_samples, n_features) + The input samples. + y : {array-like}, shape (n_samples,) + The output samples. + Returns + ------- + results : dict + The dictionary of importance scores, p-values and the corresponding + score. 
+ """ + # Check is fit had been called + check_is_fitted(self, ["is_fitted"]) + encoding = True + + if self.k_fold != 0: + X = self.X_test.copy() + y = self.y_test.copy() + encoding = False + else: + if self.coffeine_transformer is not None: + X = self.coffeine_transformers[0].transform(pd.DataFrame(X, columns=self.X_cols)) + # Variables are provided as the third element of the + # coffeine transformer parameter + if len(self.coffeine_transformer) > 2: + X = X[:, self.coffeine_transformer[2]] + self.list_cont = np.arange(len(self.coffeine_transformer[2])) + # Perform stacking if enabled + if self.apply_ridge: + X_prev = X.copy() + X = np.zeros((y.shape[0], len(self.list_grps), self.ridge_folds)) + for grp_ind, grp in enumerate(self.list_grps): + for ind_ridge in range(self.ridge_folds): + X[:, grp_ind, ind_ridge] = ( + self.ridge_mods[ind_ridge][grp_ind] + .predict(X_prev.iloc[:, grp]) + .ravel() + ) + X = np.mean(X, axis=-1) + + # Convert X to pandas dataframe + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) + self.X_nominal[0] = X.loc[:, self.list_cat_tot] + X = [X.copy() for _ in range(max(self.k_fold, 1))] + if self.prob_type in ("classification", "binary"): + pass + else: + if len(y.shape) != 2: + y = y.reshape(-1, 1) + y = [y.copy() for _ in range(max(self.k_fold, 1))] + + # Compute original predictions + if self.prob_type == "regression": + output_dim = y[0].shape[1] + self.predict(X, encoding=encoding) + else: + output_dim = 1 + self.predict_proba(X, encoding=encoding) + + list_seeds_imp = self.rng.randint(1e5, size=self.n_perm) + parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose) + score_imp_l = [] + results = {} + # n_features x n_permutations x n_samples + for ind_fold, estimator in enumerate(self.list_estimators): + if self.type == "DNN": + for y_col in range(y[ind_fold].shape[-1]): + y[ind_fold] = self.estimator.encode_outcome( + y[ind_fold], train=False + )[y_col] + else: + if self.prob_type in ("classification", "binary"): + y[ind_fold] = ( + OneHotEncoder(handle_unknown="ignore") + .fit_transform(y[ind_fold].reshape(-1, 1)) + .toarray() + ) + if self.com_imp: + if not self.conditional: + self.pred_scores[ind_fold], score_cur = list( + zip( + *parallel( + delayed(joblib_compute_permutation)( + self.list_cols[p_col], + perm, + estimator, + self.type, + self.X_proc[ind_fold], + y[ind_fold], + self.prob_type, + self.org_pred[ind_fold], + dict_cont=self.dict_cont, + dict_nom=self.dict_nom, + proc_col=p_col, + index_i=ind_fold + 1, + group_stacking=self.group_stacking, + random_state=list_seeds_imp[perm], + ) + for p_col in range(len(self.list_cols)) + for perm in range(self.n_perm) + ) + ) + ) + self.pred_scores[ind_fold] = np.array( + self.pred_scores[ind_fold] + ).reshape( + ( + len(self.list_cols), + self.n_perm, + y[ind_fold].shape[0], + output_dim, + ) + ) + else: + self.pred_scores[ind_fold], score_cur = list( + zip( + *parallel( + delayed(joblib_compute_conditional)( + self.list_cols[p_col], + self.n_perm, + estimator, + self.type, + self.importance_estimator, + self.X_proc[ind_fold], + y[ind_fold], + self.prob_type, + self.org_pred[ind_fold], + seed=self.random_state, + dict_cont=self.dict_cont, + dict_nom=self.dict_nom, + X_nominal=self.X_nominal[ind_fold], + list_nominal=self.list_nominal, + encoder=self.dict_enc, + proc_col=p_col, + index_i=ind_fold + 1, + group_stacking=self.group_stacking, + list_seeds=list_seeds_imp, + Perm=self.Perm, + output_dim=output_dim, + ) + for p_col in range(len(self.list_cols)) + ) + ) + ) + 
self.pred_scores[ind_fold] = np.array(self.pred_scores[ind_fold]) + score_imp_l.append(score_cur[0]) + else: + if self.prob_type in ("classification", "binary"): + score = roc_auc_score(y[ind_fold], self.org_pred[ind_fold]) + else: + score = ( + mean_absolute_error(y[ind_fold], self.org_pred[ind_fold]), + r2_score(y[ind_fold], self.org_pred[ind_fold]), + ) + score_imp_l.append(score) + + # Compute performance + if self.prob_type == "regression": + results["score_MAE"] = np.mean(np.array(score_imp_l), axis=0)[0] + results["score_R2"] = np.mean(np.array(score_imp_l), axis=0)[1] + else: + results["score_AUC"] = np.mean(np.array(score_imp_l), axis=0) + + if not self.com_imp: + return results + + # Compute Importance and P-values + pred_scores_full = [ + np.mean(self.pred_scores[ind_fold], axis=1) + for ind_fold in range(self.k_fold) + ] + results["importance"] = compute_imp_std(pred_scores_full)[0] + results["std"] = compute_imp_std(pred_scores_full)[1] + results["pval"] = norm.sf(results["importance"] / results["std"]) + results["pval"][np.isnan(results["pval"])] = 1 + + # Compute Z-scores across Permutations/Sampling + imp_z = compute_imp_std(self.pred_scores)[0] + std_z = compute_imp_std(self.pred_scores)[1] + results["zscores"] = imp_z / std_z + return results diff --git a/hidimstat/Dnn_learner.py b/hidimstat/Dnn_learner.py new file mode 100644 index 0000000..92eb768 --- /dev/null +++ b/hidimstat/Dnn_learner.py @@ -0,0 +1,293 @@ +import numpy as np +from sklearn.base import BaseEstimator + +from .Dnn_learner_single import DNN_learner_single +from .utils import sigmoid, softmax + + +class DNN_learner(BaseEstimator): + """ToDo + Parameters + ---------- + encode: bool, default=False + Encoding the categorical outcome. + do_hyper: bool, default=True + Tuning the hyperparameters of the provided estimator. + dict_hyper: dict, default=None + The dictionary of hyperparameters to tune. + n_ensemble: int, default=10 + The number of sub-DNN models to fit to the data. + min_keep: int, default=10 + The minimal number of DNNs to be kept. + batch_size: int, default=32 + The number of samples per batch for training. + batch_size_val: int, default=128 + The number of samples per batch for validation. + n_epoch: int, default=200 + The number of epochs for the DNN learner(s). + verbose: int, default=0 + If verbose > 0, the fitted iterations will be printed. + bootstrap: bool, default=True + Application of bootstrap sampling for the training set. + split_perc: float, default=0.8 + The training/validation cut for the provided data. + prob_type: str, default='regression' + A classification or a regression problem. + list_cont: list, default=None + The list of continuous variables. + list_grps: list of lists, default=None + A list collecting the indices of the groups' variables + while applying the stacking method. + beta1: float, default=0.9 + The exponential decay rate for the first moment estimates. + beta2: float, default=0.999 + The exponential decay rate for the second moment estimates. + lr: float, default=1e-3 + The learning rate. + epsilon: float, default=1e-8 + A small constant added to the denominator to prevent division by zero. + l1_weight: float, default=1e-2 + The L1-regularization paramter for weight decay. + l2_weight: float, default=0 + The L2-regularization paramter for weight decay. + n_jobs: int, default=1 + The number of workers for parallel processing. + group_stacking: bool, default=False + Apply the stacking-based method for the provided groups. 
+ inp_dim: list, default=None + The cumsum of inputs after the linear sub-layers. + random_state: int, default=2023 + Fixing the seeds of the random generator. + Attributes + ---------- + ToDO + """ + + def __init__( + self, + encode=False, + do_hyper=False, + dict_hyper=None, + n_ensemble=10, + min_keep=10, + batch_size=32, + batch_size_val=128, + n_epoch=200, + verbose=0, + bootstrap=True, + split_perc=0.8, + prob_type="regression", + list_cont=None, + list_grps=None, + beta1=0.9, + beta2=0.999, + lr=1e-2, + epsilon=1e-8, + l1_weight=1e-2, + l2_weight=1e-2, + n_jobs=1, + group_stacking=False, + inp_dim=None, + random_state=2023, + ): + self.list_estimators = [] + self.encode = encode + self.do_hyper = do_hyper + self.dict_hyper = dict_hyper + self.n_ensemble = n_ensemble + self.min_keep = min_keep + self.batch_size = batch_size + self.batch_size_val = batch_size_val + self.n_epoch = n_epoch + self.verbose = verbose + self.bootstrap = bootstrap + self.split_perc = split_perc + self.prob_type = prob_type + self.list_grps = list_grps + self.beta1 = beta1 + self.beta2 = beta2 + self.lr = lr + self.epsilon = epsilon + self.l1_weight = l1_weight + self.l2_weight = l2_weight + self.list_cont = list_cont + self.n_jobs = n_jobs + self.group_stacking = group_stacking + self.inp_dim = inp_dim + self.random_state = random_state + self.pred = [None] * n_ensemble + self.enc_y = [] + self.link_func = { + "classification": softmax, + "ordinal": sigmoid, + "binary": sigmoid, + } + self.is_encoded = False + self.dim_repeat = 1 + + def fit(self, X, y=None): + """Build the DNN learner with the training set (X, y) + Parameters + ---------- + X : {pandas dataframe}, shape (n_samples, n_features) + The training input samples. + y : None + There is no need of a target in a transformer, yet the pipeline API + requires this parameter. + Returns + ------- + self : object + Returns self. 
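+        Notes
+        -----
+        If ``X`` is not already of shape (n_outputs, n_samples, n_features),
+        it is tiled along a new leading axis and one ``DNN_learner_single``
+        is fitted per output column of ``y``.
+
+        Examples
+        --------
+        A small illustrative sketch for a single-output regression (the
+        shapes and settings below are hypothetical)::
+
+            import numpy as np
+            from hidimstat.Dnn_learner import DNN_learner
+
+            rng = np.random.RandomState(0)
+            X = rng.randn(50, 4)
+            y = (X[:, 0] + 0.1 * rng.randn(50)).reshape(-1, 1)
+            learner = DNN_learner(prob_type="regression", n_ensemble=2,
+                                  n_epoch=10, list_cont=list(range(4)),
+                                  random_state=0)
+            learner.fit(X, y)
+            preds = learner.predict(X)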
+ """ + if (len(X.shape) != 3) or (X.shape[0] != y.shape[-1]): + X = np.squeeze(X) + X = np.array([X for i in range(y.shape[-1])]) + self.dim_repeat = y.shape[-1] + + self.list_estimators = [None] * y.shape[-1] + self.X_test = [None] * y.shape[-1] + + for y_col in range(y.shape[-1]): + self.list_estimators[y_col] = DNN_learner_single( + encode=self.encode, + do_hyper=self.do_hyper, + dict_hyper=self.dict_hyper, + n_ensemble=self.n_ensemble, + min_keep=self.min_keep, + batch_size=self.batch_size, + batch_size_val=self.batch_size_val, + n_epoch=self.n_epoch, + verbose=self.verbose, + bootstrap=self.bootstrap, + split_perc=self.split_perc, + prob_type=self.prob_type, + list_cont=self.list_cont, + list_grps=self.list_grps, + beta1=self.beta1, + beta2=self.beta2, + lr=self.lr, + epsilon=self.epsilon, + l1_weight=self.l1_weight, + l2_weight=self.l2_weight, + n_jobs=self.n_jobs, + group_stacking=self.group_stacking, + inp_dim=self.inp_dim, + random_state=self.random_state, + ) + + self.list_estimators[y_col].fit(X[y_col, ...], y[:, [y_col]]) + + return self + + def hyper_tuning( + self, + X_train, + y_train, + X_valid, + y_valid, + list_hyper=None, + random_state=None, + ): + estimator = DNN_learner_single( + encode=self.encode, + do_hyper=self.do_hyper, + dict_hyper=self.dict_hyper, + n_ensemble=self.n_ensemble, + min_keep=self.min_keep, + batch_size=self.batch_size, + batch_size_val=self.batch_size_val, + n_epoch=self.n_epoch, + verbose=self.verbose, + bootstrap=self.bootstrap, + split_perc=self.split_perc, + prob_type=self.prob_type, + list_cont=self.list_cont, + list_grps=self.list_grps, + beta1=self.beta1, + beta2=self.beta2, + lr=self.lr, + epsilon=self.epsilon, + l1_weight=self.l1_weight, + l2_weight=self.l2_weight, + n_jobs=self.n_jobs, + group_stacking=self.group_stacking, + inp_dim=self.inp_dim, + random_state=self.random_state, + ) + return estimator.hyper_tuning( + X_train, y_train, X_valid, y_valid, list_hyper, random_state + ) + + def predict(self, X, scale=True): + """Predict regression target for X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The training input samples. + scale: bool, default=True + The continuous features will be standard scaled or not. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. + """ + if isinstance(X, list): + X = [self.check_X_dim(el) for el in X] + else: + X = self.check_X_dim(X) + list_res = [] + for estimator_ind, estimator in enumerate(self.list_estimators): + if isinstance(X, list): + curr_X = [el[estimator_ind, ...] for el in X] + list_res.append(estimator.predict(curr_X, scale)) + else: + list_res.append(estimator.predict(X[estimator_ind, ...], scale)) + self.X_test[estimator_ind] = estimator.X_test.copy() + return np.array(list_res) + + def predict_proba(self, X, scale=True): + """Predict class probabilities for X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The training input samples. + scale: bool, default=True + The continuous features will be standard scaled or not. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. + """ + if isinstance(X, list): + X = [self.check_X_dim(el) for el in X] + else: + X = self.check_X_dim(X) + + list_res = [] + for estimator_ind, estimator in enumerate(self.list_estimators): + if isinstance(X, list): + curr_X = [el[estimator_ind, ...] 
for el in X] + list_res.append(estimator.predict_proba(curr_X, scale)) + else: + list_res.append(estimator.predict_proba(X[estimator_ind, ...], scale)) + self.X_test[estimator_ind] = estimator.X_test.copy() + return np.squeeze(np.array(list_res)) + + def set_params(self, **kwargs): + """Set the parameters of this estimator.""" + for key, value in kwargs.items(): + setattr(self, key, value) + for estimator in self.list_estimators: + setattr(estimator, key, value) + + def check_X_dim(self, X): + if (len(X.shape) != 3) or (X.shape[0] != self.dim_repeat): + X = np.squeeze(X) + X = np.array([X for i in range(self.dim_repeat)]) + + return X + + def encode_outcome(self, y, train=True): + for y_col in range(y.shape[-1]): + y_enc = self.list_estimators[y_col].encode_outcome(y, train=train) + return y_enc \ No newline at end of file diff --git a/hidimstat/Dnn_learner_single.py b/hidimstat/Dnn_learner_single.py new file mode 100644 index 0000000..af7da51 --- /dev/null +++ b/hidimstat/Dnn_learner_single.py @@ -0,0 +1,550 @@ +import itertools + +import numpy as np +import pandas as pd +from joblib import Parallel, delayed +from sklearn.base import BaseEstimator +from sklearn.metrics import log_loss, mean_squared_error +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils.validation import check_is_fitted + +from .utils import ( + create_X_y, + dnn_net, + joblib_ensemble_dnnet, + ordinal_encode, + relu, + sigmoid, + softmax, +) + + +class DNN_learner_single(BaseEstimator): + """ + Parameters + ---------- + encode: bool, default=False + Encoding the categorical outcome. + do_hyper: bool, default=True + Tuning the hyperparameters of the provided estimator. + dict_hyper: dict, default=None + The dictionary of hyperparameters to tune. + n_ensemble: int, default=10 + The number of sub-DNN models to fit to the data + min_keep: int, default=10 + The minimal number of DNNs to be kept + batch_size: int, default=32 + The number of samples per batch for training + batch_size_val: int, default=128 + The number of samples per batch for validation + n_epoch: int, default=200 + The number of epochs for the DNN learner(s) + verbose: int, default=0 + If verbose > 0, the fitted iterations will be printed + bootstrap: bool, default=True + Application of bootstrap sampling for the training set + split_perc: float, default=0.8 + The training/validation cut for the provided data + prob_type: str, default='regression' + A classification or a regression problem + list_grps: list of lists, default=None + A list collecting the indices of the groups' variables + while applying the stacking method. + list_cont: list, default=None + The list of continuous variables + beta1: float, default=0.9 + The exponential decay rate for the first moment estimates. + beta2: float, default=0.999 + The exponential decay rate for the second moment estimates. + lr: float, default=1e-3 + The learning rate + epsilon: float, default=1e-8 + A small constant added to the denominator to prevent division by zero. + l1_weight: float, default=1e-2 + The L1-regularization paramter for weight decay. + l2_weight: float, default=0 + The L2-regularization paramter for weight decay. + n_jobs: int, default=1 + The number of workers for parallel processing. + group_stacking: bool, default=False + Apply the stacking-based method for the provided groups. + inp_dim: list, default=None + The cumsum of inputs after the linear sub-layers. 
+ random_state: int, default=2023 + Fixing the seeds of the random generator + """ + + def __init__( + self, + encode=False, + do_hyper=False, + dict_hyper=None, + n_ensemble=10, + min_keep=10, + batch_size=32, + batch_size_val=128, + n_epoch=200, + verbose=0, + bootstrap=True, + split_perc=0.8, + prob_type="regression", + list_cont=None, + list_grps=None, + beta1=0.9, + beta2=0.999, + lr=1e-3, + epsilon=1e-8, + l1_weight=1e-2, + l2_weight=0, + n_jobs=1, + group_stacking=False, + inp_dim=None, + random_state=2023, + ): + self.encode = encode + self.do_hyper = do_hyper + self.dict_hyper = dict_hyper + self.n_ensemble = n_ensemble + self.min_keep = min_keep + self.batch_size = batch_size + self.batch_size_val = batch_size_val + self.n_epoch = n_epoch + self.verbose = verbose + self.bootstrap = bootstrap + self.split_perc = split_perc + self.prob_type = prob_type + self.list_grps = list_grps + self.beta1 = beta1 + self.beta2 = beta2 + self.lr = lr + self.epsilon = epsilon + self.l1_weight = l1_weight + self.l2_weight = l2_weight + self.list_cont = list_cont + self.n_jobs = n_jobs + self.group_stacking = group_stacking + self.inp_dim = inp_dim + self.random_state = random_state + self.enc_y = [] + self.link_func = { + "classification": softmax, + "ordinal": sigmoid, + "binary": sigmoid, + } + self.is_encoded = False + + def fit(self, X, y=None): + """Build the DNN learner with the training set (X, y) + Parameters + ---------- + X : {pandas dataframe}, shape (n_samples, n_features) + The training input samples. + y : None + There is no need of a target in a transformer, yet the pipeline API + requires this parameter. + Returns + ------- + self : object + Returns self. + """ + # Disabling the encoding parameter with the regression case + if self.prob_type == "regression": + if len(y.shape) != 2: + y = y.reshape(-1, 1) + self.encode = False + + if self.encode: + y = self.encode_outcome(y) + self.is_encoded = True + y = np.squeeze(y, axis=0) + + # Initializing the dictionary for tuning the hyperparameters + if self.dict_hyper is None: + self.dict_hyper = { + "lr": [1e-2, 1e-3, 1e-4], + "l1_weight": [0, 1e-2, 1e-4], + "l2_weight": [0, 1e-2, 1e-4], + } + + # Switch to the special binary case + if (self.prob_type == "classification") and (y.shape[-1] < 3): + self.prob_type = "binary" + n, p = X.shape + self.min_keep = max(min(self.min_keep, self.n_ensemble), 1) + rng = np.random.RandomState(self.random_state) + list_seeds = rng.randint(1e5, size=(self.n_ensemble)) + + # Initialize the list of continuous variables + if self.list_cont is None: + self.list_cont = list(np.arange(p)) + + # Initialize the list of groups + if self.list_grps is None: + if not self.group_stacking: + self.list_grps = [] + + # Convert the matrix of predictors to numpy array + if not isinstance(X, np.ndarray): + X = np.array(X) + + # Hyperparameter tuning + if self.do_hyper: + self.__tuning_hyper(X, y) + + parallel = Parallel( + n_jobs=min(self.n_jobs, self.n_ensemble), verbose=self.verbose + ) + res_ens = list( + zip( + *parallel( + delayed(joblib_ensemble_dnnet)( + X, + y, + prob_type=self.prob_type, + link_func=self.link_func, + list_cont=self.list_cont, + list_grps=self.list_grps, + bootstrap=self.bootstrap, + split_perc=self.split_perc, + group_stacking=self.group_stacking, + inp_dim=self.inp_dim, + n_epoch=self.n_epoch, + batch_size=self.batch_size, + beta1=self.beta1, + beta2=self.beta2, + lr=self.lr, + l1_weight=self.l1_weight, + l2_weight=self.l2_weight, + epsilon=self.epsilon, + random_state=list_seeds[i], + ) + for i 
in range(self.n_ensemble) + ) + ) + ) + pred_m = np.array(res_ens[3]) + loss = np.array(res_ens[4]) + + if self.n_ensemble == 1: + return [(res_ens[0][0], (res_ens[1][0], res_ens[2][0]))] + + # Keeping the optimal subset of DNNs + sorted_loss = loss.copy() + sorted_loss.sort() + new_loss = np.empty(self.n_ensemble - 1) + for i in range(self.n_ensemble - 1): + current_pred = np.mean(pred_m[loss >= sorted_loss[i], :], axis=0) + if self.prob_type == "regression": + new_loss[i] = mean_squared_error(y, current_pred) + else: + new_loss[i] = log_loss(y, current_pred) + keep_dnn = ( + loss + >= sorted_loss[np.argmin(new_loss[: (self.n_ensemble - self.min_keep + 1)])] + ) + + self.optimal_list = [ + (res_ens[0][i], (res_ens[1][i], res_ens[2][i])) + for i in range(self.n_ensemble) + if keep_dnn[i] + ] + self.pred = [None] * len(self.optimal_list) + self.is_fitted = True + return self + + def encode_outcome(self, y, train=True): + list_y = [] + if len(y.shape) != 2: + y = y.reshape(-1, 1) + if self.prob_type == "regression": + list_y.append(y) + + for col in range(y.shape[1]): + if train: + # Encoding the target with the classification case + if self.prob_type in ("classification", "binary"): + self.enc_y.append(OneHotEncoder(handle_unknown="ignore")) + curr_y = self.enc_y[col].fit_transform(y[:, [col]]).toarray() + list_y.append(curr_y) + + # Encoding the target with the ordinal case + if self.prob_type == "ordinal": + y = ordinal_encode(y) + + else: + # Encoding the target with the classification case + if self.prob_type in ("classification", "binary"): + curr_y = self.enc_y[col].transform(y[:, [col]]).toarray() + list_y.append(curr_y) + + ## ToDo Add the ordinal case + return np.array(list_y) + + def hyper_tuning( + self, + X_train, + y_train, + X_valid, + y_valid, + list_hyper=None, + random_state=None, + ): + parallel = Parallel( + n_jobs=min(self.n_jobs, self.n_ensemble), verbose=self.verbose + ) + y_train = self.encode_outcome(y_train) + y_valid = self.encode_outcome(y_valid, train=False) + return [ + list( + zip( + *parallel( + delayed(dnn_net)( + X_train, + y_train[i, ...], + X_valid, + y_valid[i, ...], + prob_type=self.prob_type, + n_epoch=self.n_epoch, + batch_size=self.batch_size, + beta1=self.beta1, + beta2=self.beta2, + lr=el[0], + l1_weight=el[1], + l2_weight=el[2], + epsilon=self.epsilon, + list_grps=self.list_grps, + group_stacking=self.group_stacking, + inp_dim=self.inp_dim, + random_state=random_state, + ) + for el in list_hyper + ) + ) + )[2] + for i in range(y_train.shape[0]) + ] + + def __tuning_hyper(self, X, y): + ( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + X_scaled, + __, + scaler_x, + scaler_y, + ___, + ) = create_X_y( + X, + y, + bootstrap=self.bootstrap, + split_perc=self.split_perc, + prob_type=self.prob_type, + list_cont=self.list_cont, + random_state=self.random_state, + ) + list_hyper = list(itertools.product(*list(self.dict_hyper.values()))) + list_loss = self.hyper_tuning( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + list_hyper, + random_state=self.random_state, + ) + ind_min = np.argmin(list_loss) + best_hyper = list_hyper[ind_min] + if not isinstance(best_hyper, dict): + best_hyper = dict(zip(self.dict_hyper.keys(), best_hyper)) + self.set_params(**best_hyper) + + def predict(self, X, scale=True): + """Predict regression target for X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The training input samples. 
+ scale: bool, default=True + The continuous features will be standard scaled or not. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. + """ + if self.prob_type != "regression": + raise Exception("Use the predict_proba function for classification") + + # Prepare the test set for the prediction + if scale: + X = self.__scale_test(X) + + # Process the common prediction part + self.__pred_common(X) + + res_pred = np.zeros((self.pred[0].shape)) + total_n_elements = 0 + for ind_mod, pred in enumerate(self.pred): + res_pred += ( + pred * self.optimal_list[ind_mod][1][1].scale_ + + self.optimal_list[ind_mod][1][1].mean_ + ) + total_n_elements += 1 + res_pred = res_pred.copy() / total_n_elements + + return res_pred + + def predict_proba(self, X, scale=True): + """Predict class probabilities for X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The training input samples. + scale: bool, default=True + The continuous features will be standard scaled or not. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. + """ + if self.prob_type == "regression": + raise Exception("Use the predict function for classification") + + # Prepare the test set for the prediction + if scale: + X = self.__scale_test(X) + + # Process the common prediction part + self.__pred_common(X) + + res_pred = np.zeros((self.pred[0].shape)) + total_n_elements = 0 + for pred in self.pred: + res_pred += self.link_func[self.prob_type](pred) + total_n_elements += 1 + res_pred = res_pred.copy() / total_n_elements + if self.prob_type == "binary": + res_pred = np.array([[1-res_pred[i][0], res_pred[i][0]] for i in range(res_pred.shape[0])]) + + return res_pred + + def __scale_test(self, X): + """This function prepares the input for the DNN estimator either in the default + case or after applying the stacking method + Parameters + ---------- + X : {array-like, sparse-matrix}, shape (n_samples, n_features) + The input samples. 
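+        Returns
+        -------
+        X_test_n : list of ndarray
+            With ``group_stacking``, one entry per kept sub-model in which
+            the continuous features are scaled with that sub-model's scaler
+            and passed through its fitted per-group sub-layers; otherwise a
+            single-element list holding an unscaled copy of ``X`` (scaling
+            then happens per sub-model at prediction time).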
+ """ + # Check is fit had been called + check_is_fitted(self, ["is_fitted"]) + + if isinstance(X, pd.DataFrame): + X = np.array(X) + + # The input will be either the original input or the result + # of the provided sub-linear layers in a stacking way for the different groups + # In the stacking method, each sub-linear layer will have a corresponding output + if self.group_stacking: + X_test_n = [None] * len(self.optimal_list) + for mod in range(len(self.optimal_list)): + X_test_scaled = X.copy() + if len(self.list_cont) > 0: + X_test_scaled[:, self.list_cont] = self.optimal_list[mod][1][ + 0 + ].transform(X_test_scaled[:, self.list_cont]) + X_test_n_curr = np.zeros( + ( + X_test_scaled.shape[0], + self.inp_dim[-1], + ) + ) + for grp_ind in range(len(self.list_grps)): + curr_pred = X_test_scaled[:, self.list_grps[grp_ind]].copy() + n_layer_stacking = len(self.optimal_list[mod][0][3][grp_ind]) - 1 + for ind_w_b in range(n_layer_stacking): + if ind_w_b == 0: + curr_pred = relu( + X_test_scaled[:, self.list_grps[grp_ind]].dot( + self.optimal_list[mod][0][3][grp_ind][ind_w_b] + ) + + self.optimal_list[mod][0][4][grp_ind][ind_w_b] + ) + else: + curr_pred = relu( + curr_pred.dot( + self.optimal_list[mod][0][3][grp_ind][ind_w_b] + ) + + self.optimal_list[mod][0][4][grp_ind][ind_w_b] + ) + X_test_n_curr[ + :, + list( + np.arange( + self.inp_dim[grp_ind], + self.inp_dim[grp_ind + 1], + ) + ), + ] = ( + curr_pred.dot( + self.optimal_list[mod][0][3][grp_ind][n_layer_stacking] + ) + + self.optimal_list[mod][0][4][grp_ind][n_layer_stacking] + ) + # X_test_n_curr[ + # :, + # list( + # np.arange( + # self.inp_dim[grp_ind], + # self.inp_dim[grp_ind + 1], + # ) + # # np.arange( + # # self.n_out_subLayers * grp_ind, + # # (grp_ind + 1) * self.n_out_subLayers, + # # ) + # ), + # ] = ( + # X_test_scaled[:, self.list_grps[grp_ind]].dot( + # self.optimal_list[mod][0][3][grp_ind] + # ) + # + self.optimal_list[mod][0][4][grp_ind] + # ) + X_test_n[mod] = X_test_n_curr + else: + X_test_n = [X.copy()] + self.X_test = X_test_n.copy() + return X_test_n + + def __pred_common(self, X): + """ + Parameters + ---------- + X : {array-like, sparse-matrix}, shape (n_samples, n_features) + The input samples. 
+ """ + if not self.group_stacking: + X = [X[0].copy() for i in range(self.n_ensemble)] + + n_layer = len(self.optimal_list[0][0][0]) - 1 + for ind_mod, mod in enumerate(self.optimal_list): + X_test_scaled = X[ind_mod].copy() + for j in range(n_layer): + if not self.group_stacking: + if len(self.list_cont) > 0: + X_test_scaled[:, self.list_cont] = mod[1][0].transform( + X_test_scaled[:, self.list_cont] + ) + if j == 0: + pred = relu(X_test_scaled.dot(mod[0][0][j]) + mod[0][1][j]) + else: + pred = relu(pred.dot(mod[0][0][j]) + mod[0][1][j]) + + self.pred[ind_mod] = pred.dot(mod[0][0][n_layer]) + mod[0][1][n_layer] + + def set_params(self, **kwargs): + """Set the parameters of this estimator.""" + for key, value in kwargs.items(): + setattr(self, key, value) + + def clone(self): + return type(self)(**self.get_params()) diff --git a/hidimstat/RandomForestModified.py b/hidimstat/RandomForestModified.py new file mode 100644 index 0000000..7dd5b12 --- /dev/null +++ b/hidimstat/RandomForestModified.py @@ -0,0 +1,97 @@ +import numpy as np +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + + +class RandomForestClassifierModified(RandomForestClassifier): + def fit(self, X, y): + self.y_ = y + super().fit(X, y) + + def predict(self, X): + super().predict(X) + + def predict_proba(self, X): + super().predict_proba(X) + + def sample_same_leaf(self, X, y=None): + if not (y is None): + self.y_ = y + rng = np.random.RandomState(self.get_params()["random_state"]) + # The number of samples to sample from in the next step + random_samples = 10 + + # Get the leaf indices for each sample in the input data + leaf_indices = self.apply(X) + + # Initialize an array to store the predictions for each sample + predictions = [] + + # Loop over each sample in the input data + for i in range(X.shape[0]): + # Removing the row of the corresponding input sample + leaf_indices_minus_i = np.delete(leaf_indices, i, axis=0) + y_minus_i = np.delete(self.y_, i, axis=0) + + # The list of indices sampled from the same leaf of the input sample + leaf_samples = [] + # Loop over each tree in the forest + for j in range(self.n_estimators): + # Find the samples that fall into the same leaf node for this tree + samples_in_leaf = np.where( + leaf_indices_minus_i[:, j] == leaf_indices[i, j] + )[0] + + # Append the samples to the list + leaf_samples.append( + y_minus_i[rng.choice(samples_in_leaf, size=random_samples)] + ) + + predictions.append(leaf_samples) + + # Combine the predictions from all trees to make the final prediction + return np.array(predictions) + + +class RandomForestRegressorModified(RandomForestRegressor): + def fit(self, X, y): + self.y_ = y + super().fit(X, y) + + def predict(self, X): + super().predict(X) + + def sample_same_leaf(self, X): + rng = np.random.RandomState(self.get_params()["random_state"]) + # The number of samples to sample from in the next step + random_samples = 10 + + # Get the leaf indices for each sample in the input data + leaf_indices = self.apply(X) + + # Initialize an array to store the predictions for each sample + predictions = [] + + # Loop over each sample in the input data + for i in range(X.shape[0]): + # Removing the row of the corresponding input sample + leaf_indices_minus_i = np.delete(leaf_indices, i, axis=0) + y_minus_i = np.delete(self.y_, i, axis=0) + + # The list of indices sampled from the same leaf of the input sample + leaf_samples = [] + # Loop over each tree in the forest + for j in range(self.n_estimators): + # Find the samples that fall into the 
same leaf node for this tree + samples_in_leaf = np.where( + leaf_indices_minus_i[:, j] == leaf_indices[i, j] + )[0] + + # Append the samples to the list + leaf_samples.append( + y_minus_i[rng.choice(samples_in_leaf, size=random_samples)] + ) + + predictions.append(leaf_samples) + + # Combine the predictions from all trees to make the final prediction + return np.array(predictions) diff --git a/hidimstat/__init__.py b/hidimstat/__init__.py index 0e4b39e..d2d4180 100644 --- a/hidimstat/__init__.py +++ b/hidimstat/__init__.py @@ -1,6 +1,8 @@ from .adaptive_permutation_threshold import ada_svr +from .BBI import BlockBasedImportance from .clustered_inference import clustered_inference, hd_inference from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso +from .Dnn_learner_single import DNN_learner_single from .ensemble_clustered_inference import ensemble_clustered_inference from .knockoffs import model_x_knockoff from .knockoff_aggregation import knockoff_aggregation @@ -15,9 +17,11 @@ __all__ = [ 'ada_svr', 'aggregate_quantiles', + 'BlockBasedImportance', 'clustered_inference', 'desparsified_lasso', 'desparsified_group_lasso', + 'DNN_learner_single', 'ensemble_clustered_inference', 'group_reid', 'hd_inference', diff --git a/hidimstat/compute_importance.py b/hidimstat/compute_importance.py new file mode 100644 index 0000000..517547c --- /dev/null +++ b/hidimstat/compute_importance.py @@ -0,0 +1,688 @@ +import warnings +from collections import Counter + +import numpy as np +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.metrics import mean_absolute_error, r2_score, roc_auc_score +from sklearn.model_selection import GridSearchCV +from sklearn.preprocessing import OneHotEncoder + +from .RandomForestModified import ( + RandomForestClassifierModified, + RandomForestRegressorModified, +) +from .utils import convert_predict_proba, ordinal_encode, sample_predictions + +warnings.filterwarnings("ignore") + + +def joblib_compute_conditional( + p_col, + n_sample, + estimator, + type_predictor, + importance_estimator, + X_test_list, + y_test, + prob_type, + org_pred, + seed=None, + dict_cont={}, + dict_nom={}, + X_nominal=None, + list_nominal={}, + encoder={}, + proc_col=None, + index_i=None, + group_stacking=False, + list_seeds=None, + Perm=False, + output_dim=1, +): + """This function applies the conditional approach for feature importance. + Parameters + ---------- + p_col: list + The list of single variables/groups to compute the feature importance. + n_sample: int + The number of permutations/samples to loop + estimator: scikit-learn compatible estimator, default=None + The provided estimator for the prediction task (First block) + type_predictor: string + The provided predictor either the DNN learner or not. + The DNN learner will use different inputs for the different + sub-models especially while applying the stacking case. + importance_estimator: scikit-learn compatible estimator, default=None + The provided estimator for the importance task (Second block) + X_test_list: list + The list of inputs containing either one input or a number of inputs + equal to the number of sub-models of the DNN learner. + y_test: {array-like, sparse matrix}, shape (n_samples, n_output) + The output test samples. + prob_type: str, default='regression' + A classification or a regression problem. + org_pred: {array-like, sparse matrix}, shape (n_output, n_samples) + The predictions using the original samples. 
+ seed: int, default=None + Fixing the seeds of the random generator. + dict_cont: dict, default={} + The dictionary providing the indices of the continuous variables. + dict_nom: dict, default={} + The dictionary providing the indices of the categorical variables. + X_nominal: {array-like}, default=None + The dataframe of categorical variables without encoding. + list_nominal: dict, default={} + The dictionary of binary, nominal and ordinal variables. + encoder: dict, default={} + The dictionary of encoders for categorical variables. + proc_col: int, default=None + The processed column to print if verbose > 0. + index_i: int, default=None + The index of the current processed iteration. + group_stacking: bool, default=False + Apply the stacking-based method for the provided groups. + list_seeds: list, default=None + The list of seeds to fix the RandomState for reproducibility. + Perm: bool, default=False + The use of permutations or random sampling with CPI-DNN. + output_dim: + + """ + rng = np.random.RandomState(seed) + + res_ar = np.empty((n_sample, y_test.shape[0], output_dim)) + + # A list of copied items to avoid any overlapping in the process + current_X_test_list = [ + (lambda x: x.copy()[np.newaxis, ...] if len(x.shape) != 3 else x.copy())( + X_test_el + ) + for X_test_el in X_test_list + ] + + Res_col = [None] * len(current_X_test_list) + X_col_pred = { + "regression": [None] * len(current_X_test_list), + "classification": None, + "ordinal": None, + } + + # Group partitioning + grp_nom = [ + item + for item in p_col + if (item in dict_nom.keys()) + and (item in list_nominal["nominal"] + list_nominal["binary"]) + ] + grp_ord = [ + item for item in p_col if (item in dict_nom.keys()) and (item not in grp_nom) + ] + grp_cont = [item for item in p_col if item not in (grp_nom + grp_ord)] + + # If modified Random Forest is applied + pred_mod_RF = { + "regression": [None] * len(current_X_test_list), + "classification": [None], + "ordinal": [None] * len(grp_ord), + } + + p_col_n = {"regression": [], "classification": [], "ordinal": []} + if not group_stacking: + for val in p_col: + if val in grp_nom: + p_col_n["classification"] += dict_nom[val] + if val in grp_cont: + p_col_n["regression"] += dict_cont[val] + if val in grp_ord: + p_col_n["ordinal"] += dict_nom[val] + else: + p_col_n["regression"] = p_col + + # Dictionary of booleans checking for the encountered type of group + var_type = { + "regression": False, + "classification": False, + "ordinal": False, + } + + X_col_new = { + "regression": None, + "classification": None, + "ordinal": None, + } + output = {"regression": None, "classification": None, "ordinal": None} + importance_models = { + "regression": None, + "classification": None, + "ordinal": None, + } + + if importance_estimator is None: + importance_models["regression"] = RandomForestRegressor( + random_state=seed, max_depth=5 + ) + importance_models["classification"] = RandomForestClassifier( + random_state=seed, max_depth=5 + ) + importance_models["ordinal"] = RandomForestClassifier( + random_state=seed, max_depth=5 + ) + elif importance_estimator == "Mod_RF": + importance_models["regression"] = RandomForestRegressorModified( + random_state=seed, + min_samples_leaf=10, + ) + importance_models["classification"] = RandomForestClassifierModified( + random_state=seed, min_samples_leaf=10 + ) + importance_models["ordinal"] = RandomForestClassifierModified( + random_state=seed, min_samples_leaf=10 + ) + else: + importance_models = importance_estimator.copy() + + # Checking for 
pure vs hybrid groups + if len(grp_cont) > 0: + var_type["regression"] = True + if len(grp_nom) > 0: + var_type["classification"] = True + if len(grp_ord) > 0: + var_type["ordinal"] = True + + # All the variables of the same group will be removed simultaneously so it can be + # predicted conditional on the remaining variables + if importance_estimator != "Mod_RF": + if var_type["regression"]: + for counter_test, X_test_comp in enumerate(current_X_test_list): + X_test_minus_idx = np.delete( + np.copy(X_test_comp), + p_col_n["regression"] + + p_col_n["classification"] + + p_col_n["ordinal"], + -1, + ) + + # Nb of y outputs x Nb of samples x Nb of regression outputs + output["regression"] = X_test_comp[..., p_col_n["regression"]] + X_col_pred["regression"][counter_test] = [] + for cur_output_ind in range(X_test_minus_idx.shape[0]): + importance_models["regression"] = hypertune_predictor( + importance_models["regression"], + X_test_minus_idx[cur_output_ind, ...], + output["regression"][cur_output_ind, ...], + param_grid={"max_depth": [2, 5, 10]} + ) + importance_models["regression"].fit( + X_test_minus_idx[cur_output_ind, ...], + output["regression"][cur_output_ind, ...], + ) + X_col_pred["regression"][counter_test].append( + importance_models["regression"].predict( + X_test_minus_idx[cur_output_ind, ...] + ) + ) + + X_col_pred["regression"][counter_test] = np.array( + X_col_pred["regression"][counter_test] + ) + X_col_pred["regression"][counter_test] = X_col_pred["regression"][ + counter_test + ].reshape(output["regression"].shape) + Res_col[counter_test] = ( + output["regression"] - X_col_pred["regression"][counter_test] + ) + + # The loop doesn't include the classification part because the input across the different DNN sub-models + # is the same without the stacking part (where extra sub-linear layers are used), therefore identical inputs + # won't need looping classification process. This is not the case with the regression part. + if var_type["classification"]: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) + output["classification"] = np.array(X_nominal[grp_nom]) + X_col_pred["classification"] = [] + for cur_output_ind in range(X_test_minus_idx.shape[0]): + importance_models["classification"] = hypertune_predictor( + importance_models["classification"], + X_test_minus_idx[cur_output_ind, ...], + output["classification"], + param_grid={"max_depth": [2, 5, 10]} + ) + importance_models["classification"].fit( + X_test_minus_idx[cur_output_ind, ...], + output["classification"], + ) + X_col_pred["classification"].append( + importance_models["classification"].predict_proba( + X_test_minus_idx[cur_output_ind, ...] 
+ ) + ) + + if not isinstance(X_col_pred["classification"][0], list): + X_col_pred["classification"] = [X_col_pred["classification"]] + + if var_type["ordinal"]: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) + output["ordinal"] = ordinal_encode(np.array(X_nominal[grp_ord])) + X_col_pred["ordinal"] = [] + for cur_output_ind in range(X_test_minus_idx.shape[0]): + current_prediction = [] + for cur_ordinal in output["ordinal"]: + # To solve the problem of having multiple input matrices + # with multiple ordinal outcomes + importance_models["ordinal"] = hypertune_predictor( + importance_models["ordinal"], + X_test_minus_idx[cur_output_ind, ...], + cur_ordinal, + param_grid={"max_depth": [2, 5, 10]} + ) + importance_models["ordinal"].fit( + X_test_minus_idx[cur_output_ind, ...], cur_ordinal + ) + probs = importance_models["ordinal"].predict_proba( + X_test_minus_idx[cur_output_ind, ...] + ) + current_prediction.append(convert_predict_proba(np.array(probs))) + X_col_pred["ordinal"].append(current_prediction) + else: + for counter_test, X_test_comp in enumerate(current_X_test_list): + if var_type["regression"]: + X_test_minus_idx = np.delete( + np.copy(X_test_comp), + p_col_n["regression"] + + p_col_n["classification"] + + p_col_n["ordinal"], + -1, + ) + output["regression"] = X_test_comp[..., p_col_n["regression"]] + for cur_output_ind in range(X_test_minus_idx.shape[0]): + importance_models["regression"] = hypertune_predictor( + importance_models["regression"], + X_test_minus_idx[cur_output_ind, ...], + output["regression"][cur_output_ind, ...], + param_grid={"min_samples_leaf": [10, 15, 20]} + ) + + importance_models["regression"].fit( + X_test_minus_idx[cur_output_ind, ...], + output["regression"][cur_output_ind, ...], + ) + pred_mod_RF["regression"][counter_test] = importance_models[ + "regression" + ].sample_same_leaf(X_test_minus_idx[cur_output_ind, ...]) + + if var_type["classification"]: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) + output["classification"] = np.array(X_nominal[grp_nom]) + for cur_output_ind in range(X_test_minus_idx.shape[0]): + importance_models["classification"] = hypertune_predictor( + importance_models["classification"], + X_test_minus_idx[cur_output_ind, ...], + output["classification"], + param_grid={"min_samples_leaf": [10, 15, 20]} + ) + importance_models["classification"].fit( + X_test_minus_idx[cur_output_ind, ...], + output["classification"], + ) + pred_mod_RF["classification"] = importance_models[ + "classification" + ].sample_same_leaf(X_test_minus_idx[cur_output_ind, ...]) + + if var_type["ordinal"]: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) + output["ordinal"] = ordinal_encode(np.array(X_nominal[grp_ord])) + + for cur_output_ind in range(X_test_minus_idx.shape[0]): + for cur_ordinal_ind, cur_ordinal in enumerate(output["ordinal"]): + importance_models["ordinal"] = hypertune_predictor( + importance_models["ordinal"], + X_test_minus_idx[cur_output_ind, ...], + cur_ordinal, + param_grid={"min_samples_leaf": [10, 15, 20]} + ) + + importance_models["ordinal"].fit( + X_test_minus_idx[cur_output_ind, ...], cur_ordinal + ) + pred_mod_RF["ordinal"][cur_ordinal_ind] = importance_models[ + "ordinal" + ].sample_same_leaf( + 
X_test_minus_idx[cur_output_ind, ...], + np.array(X_nominal[grp_ord])[:, [cur_ordinal_ind]], + ) + + if prob_type in ("classification", "binary"): + score = roc_auc_score(y_test, org_pred) + else: + score = ( + mean_absolute_error(y_test, org_pred), + r2_score(y_test, org_pred), + ) + + for sample in range(n_sample): + if index_i is not None: + print( + f"Iteration/Fold:{index_i}, Processing col:{proc_col+1}, Sample:{sample+1}" + ) + else: + print(f"Processing col:{proc_col+1}") + # Same shuffled indices across the sub-models items + indices = np.arange(current_X_test_list[0].shape[1]) + if importance_estimator != "Mod_RF": + rng = np.random.RandomState(list_seeds[sample]) + if Perm: + rng.shuffle(indices) + else: + indices = rng.choice(indices, size=len(indices)) + + for counter_test, X_test_comp in enumerate(current_X_test_list): + if var_type["regression"]: + X_test_comp[..., p_col_n["regression"]] = ( + X_col_pred["regression"][counter_test] + + Res_col[counter_test][:, indices, :] + ) + + if var_type["classification"]: + # X_col_pred['classification'] is a list of arrays representing the probability of getting + # each value at the corresponding variable (resp. to each observation) + for cur_output in X_col_pred["classification"]: + list_seed_cat = rng.randint(1e5, size=len(cur_output)) + for cat_prob_ind in range(len(cur_output)): + rng_cat = np.random.RandomState(list_seed_cat[cat_prob_ind]) + current_X_col_new = np.array( + [ + [ + rng_cat.choice( + np.unique( + X_nominal[[grp_nom[cat_prob_ind]]] + ), + size=1, + p=cur_output[cat_prob_ind][i], + )[0] + for i in range( + cur_output[cat_prob_ind].shape[0] + ) + ] + ] + ).T + if grp_nom[cat_prob_ind] in encoder: + current_X_col_new = ( + encoder[grp_nom[cat_prob_ind]] + .transform(current_X_col_new) + .toarray() + .astype("int32") + ) + if cat_prob_ind == 0: + X_col_new["classification"] = current_X_col_new + else: + X_col_new["classification"] = np.concatenate( + ( + X_col_new["classification"], + current_X_col_new, + ), + axis=1, + ) + X_test_comp[..., p_col_n["classification"]] = X_col_new[ + "classification" + ] + + if var_type["ordinal"]: + for cur_output in X_col_pred["ordinal"]: + list_seed_ord = rng.randint(1e5, size=len(cur_output)) + for ord_prob_ind, ord_prob in enumerate(cur_output): + rng_ord = np.random.RandomState(list_seed_ord[ord_prob_ind]) + current_X_col_new = [None] * ord_prob.shape[0] + for i in range(ord_prob.shape[0]): + current_sample = [0] * ord_prob.shape[1] + for ind_prob, prob in enumerate(ord_prob[i]): + current_sample[ind_prob] = rng_ord.choice( + [0, 1], p=[1 - prob, prob] + ) + if current_sample[ind_prob] == 0: + break + current_X_col_new[i] = np.array(current_sample) + # while True: + # current_X_col_new[i] = rng_ord.binomial(1, ord_prob[i], ord_prob.shape[1]) + # print(current_X_col_new[i]) + # if np.any(np.all(output['ordinal'][ord_prob_ind] == current_X_col_new[i], axis=1)): + # break + ind_cat = np.where(current_X_col_new[i] == 1)[0] + current_X_col_new[i] = [ + np.unique(X_nominal[grp_ord[ord_prob_ind]])[ + len(ind_cat) + ] + ] + + current_X_col_new = np.array(current_X_col_new) + if ord_prob_ind == 0: + X_col_new["ordinal"] = current_X_col_new + else: + X_col_new["ordinal"] = np.concatenate( + ( + X_col_new["ordinal"], + current_X_col_new, + ), + axis=1, + ) + X_test_comp[..., p_col_n["ordinal"]] = X_col_new["ordinal"] + else: + for counter_test, X_test_comp in enumerate(current_X_test_list): + if var_type["regression"]: + X_test_comp[..., p_col_n["regression"]] = np.mean( + sample_predictions( + 
pred_mod_RF["regression"][counter_test], + list_seeds[sample], + ), + axis=1, + ) + + if var_type["classification"]: + predictions = sample_predictions( + pred_mod_RF["classification"], list_seeds[sample] + ) + max_val = lambda row: list(Counter(row).keys())[0] + for cat_prob_ind in range(predictions.shape[-1]): + class_col = np.array( + [[max_val(row)] for row in predictions[..., cat_prob_ind]] + ) + if grp_nom[cat_prob_ind] in encoder: + current_X_col_new = ( + encoder[grp_nom[cat_prob_ind]] + .transform(class_col) + .toarray() + .astype("int32") + ) + else: + current_X_col_new = class_col + if cat_prob_ind == 0: + X_col_new["classification"] = current_X_col_new + else: + X_col_new["classification"] = np.concatenate( + ( + X_col_new["classification"], + current_X_col_new, + ), + axis=1, + ) + X_test_comp[..., p_col_n["classification"]] = X_col_new[ + "classification" + ] + + if var_type["ordinal"]: + max_val = lambda row: list(Counter(row).keys())[0] + for cur_output_ind in range(len(pred_mod_RF["ordinal"])): + predictions = sample_predictions( + pred_mod_RF["ordinal"][cur_output_ind], + list_seeds[sample], + ) + class_ord = np.array( + [[max_val(row)] for row in predictions[..., 0]] + ) + if cur_output_ind == 0: + X_col_new["ordinal"] = class_ord + else: + X_col_new["ordinal"] = np.concatenate( + (X_col_new["ordinal"], class_ord), axis=1 + ) + X_test_comp[..., p_col_n["ordinal"]] = X_col_new["ordinal"] + + if prob_type == "regression": + if type_predictor == "DNN": + pred_i = estimator.predict(current_X_test_list, scale=False) + else: + pred_i = estimator.predict(current_X_test_list[0].squeeze()) + + # Convert to the (n_samples x n_outputs) format + if len(pred_i.shape) != 2: + pred_i = pred_i.reshape(-1, 1) + + res_ar[sample, ::] = (y_test - pred_i) ** 2 - (y_test - org_pred) ** 2 + else: + if type_predictor == "DNN": + pred_i = estimator.predict_proba(current_X_test_list, scale=False) + else: + pred_i = convert_predict_proba( + estimator.predict_proba(current_X_test_list[0].squeeze()) + ) + + pred_i = np.clip(pred_i, 1e-10, 1 - 1e-10) + org_pred = np.clip(org_pred, 1e-10, 1 - 1e-10) + res_ar[sample, :, 0] = -np.sum(y_test * np.log(pred_i), axis=1) + np.sum( + y_test * np.log(org_pred), axis=1 + ) + + return res_ar, score + + +def joblib_compute_permutation( + p_col, + perm, + estimator, + type_predictor, + X_test_list, + y_test, + prob_type, + org_pred, + dict_cont={}, + dict_nom={}, + proc_col=None, + index_i=None, + group_stacking=False, + random_state=None, +): + """This function applies the permutation feature importance (PFI). + + Parameters + ---------- + p_col: list + The list of single variables/groups to compute the feature importance. + perm: int + The current processed permutation (also to print if verbose > 0). + estimator: scikit-learn compatible estimator, default=None + The provided estimator for the prediction task (First block). + type_predictor: string + The provided predictor either the DNN learner or not. + The DNN learner will use different inputs for the different + sub-models especially while applying the stacking case. + X_test_list: list + The list of inputs containing either one input or a number of inputs + equal to the number of sub-models of the DNN learner. + y_test: {array-like, sparse matrix}, shape (n_samples, n_output) + The output test samples. + prob_type: str, default='regression' + A classification or a regression problem. + org_pred: {array-like, sparse matrix}, shape (n_output, n_samples) + The predictions using the original samples. 
+ dict_cont: dict, default={} + The dictionary providing the indices of the continuous variables. + dict_nom: dict, default={} + The dictionary providing the indices of the categorical variables. + proc_col: int, default=None + The processed column to print if verbose > 0. + index_i: int, default=None + The index of the current processed iteration. + group_stacking: bool, default=False + Apply the stacking-based method for the provided groups. + random_state: int, default=None + Fixing the seeds of the random generator. + """ + rng = np.random.RandomState(random_state) + + if index_i is not None: + print( + f"Iteration/Fold:{index_i}, Processing col:{proc_col+1}, Permutation:{perm+1}" + ) + else: + print(f"Processing col:{proc_col+1}, Permutation:{perm+1}") + + # A list of copied items to avoid any overlapping in the process + current_X_test_list = [X_test_el.copy() for X_test_el in X_test_list] + indices = np.arange(current_X_test_list[0].shape[-2]) + rng.shuffle(indices) + + if not group_stacking: + p_col_new = [] + for val in p_col: + if val in dict_nom.keys(): + p_col_new += dict_nom[val] + else: + p_col_new += dict_cont[val] + else: + p_col_new = p_col + + for X_test_comp in current_X_test_list: + X_test_comp[..., p_col_new] = X_test_comp[..., p_col_new].take(indices, axis=-2) + + if prob_type == "regression": + score = ( + mean_absolute_error(y_test, org_pred), + r2_score(y_test, org_pred), + ) + + if type_predictor == "DNN": + pred_i = estimator.predict(current_X_test_list, scale=False) + else: + pred_i = estimator.predict(current_X_test_list[0]) + + # Convert to the (n_samples x n_outputs) format + if len(pred_i.shape) != 2: + pred_i = pred_i.reshape(-1, 1) + + res = (y_test - pred_i) ** 2 - (y_test - org_pred) ** 2 + else: + score = roc_auc_score(y_test, org_pred) + if type_predictor == "DNN": + pred_i = estimator.predict_proba(current_X_test_list, scale=False) + else: + pred_i = convert_predict_proba( + estimator.predict_proba(current_X_test_list[0]) + ) + + pred_i = np.clip(pred_i, 1e-10, 1 - 1e-10) + org_pred = np.clip(org_pred, 1e-10, 1 - 1e-10) + res = -np.sum(y_test * np.log(pred_i), axis=1) + np.sum( + y_test * np.log(org_pred), axis=1 + ) + return res, score + + +def hypertune_predictor(estimator, X, y, param_grid): + # param_grid = {"max_depth": [2, 5, 10]} + grid_search = GridSearchCV(estimator, param_grid=param_grid, cv=2) + grid_search.fit(X, y) + return grid_search.best_estimator_ diff --git a/hidimstat/utils.py b/hidimstat/utils.py index 044f4c3..ab75b45 100644 --- a/hidimstat/utils.py +++ b/hidimstat/utils.py @@ -1,6 +1,13 @@ # -*- coding: utf-8 -*- # Author: Binh Nguyen & Jerome-Alexis Chevalier +import copy import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from sklearn.metrics import log_loss, mean_squared_error +from sklearn.preprocessing import StandardScaler +from torchmetrics import Accuracy def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False): @@ -124,3 +131,601 @@ def _adaptive_quantile_aggregation(pvals, gamma_min=0.05): _fixed_quantile_aggregation(pvals, gamma) for gamma in gammas]) return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0)) + + +def create_X_y( + X, + y, + bootstrap=True, + split_perc=0.8, + prob_type="regression", + list_cont=None, + random_state=None, +): + """Create train/valid split of input data X and target variable y. + Parameters + ---------- + X: {array-like, sparse matrix}, shape (n_samples, n_features) + The input samples before the splitting process. 
+ y: ndarray, shape (n_samples, ) + The output samples before the splitting process. + bootstrap: bool, default=True + Application of bootstrap sampling for the training set. + split_perc: float, default=0.8 + The training/validation cut for the provided data. + prob_type: str, default='regression' + A classification or a regression problem. + list_cont: list, default=[] + The list of continuous variables. + random_state: int, default=2023 + Fixing the seeds of the random generator. + + Returns + ------- + X_train_scaled: {array-like, sparse matrix}, shape (n_train_samples, n_features) + The bootstrapped training input samples with scaled continuous variables. + y_train_scaled: {array-like}, shape (n_train_samples, ) + The bootstrapped training output samples scaled if continous. + X_valid_scaled: {array-like, sparse matrix}, shape (n_valid_samples, n_features) + The validation input samples with scaled continuous variables. + y_valid_scaled: {array-like}, shape (n_valid_samples, ) + The validation output samples scaled if continous. + X_scaled: {array-like, sparse matrix}, shape (n_samples, n_features) + The original input samples with scaled continuous variables. + y_valid: {array-like}, shape (n_samples, ) + The original output samples with validation indices. + scaler_x: scikit-learn StandardScaler + The standard scaler encoder for the continuous variables of the input. + scaler_y: scikit-learn StandardScaler + The standard scaler encoder for the output if continuous. + valid_ind: list + The list of indices of the validation set. + """ + rng = np.random.RandomState(random_state) + scaler_x, scaler_y = StandardScaler(), StandardScaler() + n = X.shape[0] + + if bootstrap: + train_ind = rng.choice(n, n, replace=True) + else: + train_ind = rng.choice(n, size=int(np.floor(split_perc * n)), replace=False) + valid_ind = np.array([ind for ind in range(n) if ind not in train_ind]) + + X_train, X_valid = X[train_ind], X[valid_ind] + y_train, y_valid = y[train_ind], y[valid_ind] + + # Scaling X and y + X_train_scaled = X_train.copy() + X_valid_scaled = X_valid.copy() + X_scaled = X.copy() + + if len(list_cont) > 0: + X_train_scaled[:, list_cont] = scaler_x.fit_transform(X_train[:, list_cont]) + X_valid_scaled[:, list_cont] = scaler_x.transform(X_valid[:, list_cont]) + X_scaled[:, list_cont] = scaler_x.transform(X[:, list_cont]) + if prob_type == "regression": + y_train_scaled = scaler_y.fit_transform(y_train) + y_valid_scaled = scaler_y.transform(y_valid) + else: + y_train_scaled = y_train.copy() + y_valid_scaled = y_valid.copy() + + return ( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + X_scaled, + y_valid, + scaler_x, + scaler_y, + valid_ind, + ) + + +def sigmoid(x): + """The function applies the sigmoid function element-wise to the input array x.""" + return 1 / (1 + np.exp(-x)) + + +def softmax(x): + """The function applies the softmax function element-wise to the input array x.""" + # Ensure numerical stability by subtracting the maximum value of x from each element of x + # This prevents overflow errors when exponentiating large numbers + x = x - np.max(x, axis=-1, keepdims=True) + exp_x = np.exp(x) + return exp_x / np.sum(exp_x, axis=-1, keepdims=True) + + +def relu(x): + """The function applies the relu function element-wise to the input array x.""" + return (abs(x) + x) / 2 + + +def relu_(x): + """The function applies the derivative of the relu function element-wise + to the input array x. 
+ """ + return (x > 0) * 1 + + +def convert_predict_proba(list_probs): + """If the classification is done using a one-hot encoded variable, the list of + probabilites will be a list of lists for the probabilities of each of the categories. + This function takes the probabilities of having each category (=1 with binary) and stack + them into one ndarray. + """ + if len(list_probs.shape) == 3: + list_probs = np.array(list_probs)[..., 1].T + return list_probs + + +def ordinal_encode(y): + """This function encodes the ordinal variable with a special gradual encoding storing also + the natural order information. + """ + list_y = [] + for y_col in range(y.shape[-1]): + # Retrieve the unique values + unique_vals = np.unique(y[:, y_col]) + # Mapping each unique value to its corresponding index + mapping_dict = {} + for i, val in enumerate(unique_vals): + mapping_dict[val] = i + 1 + # create a zero-filled array for the ordinal encoding + y_ordinal = np.zeros((len(y[:, y_col]), len(set(y[:, y_col])))) + # set the appropriate indices to 1 for each ordinal value and all lower ordinal values + for ind_el, el in enumerate(y[:, y_col]): + y_ordinal[ind_el, np.arange(mapping_dict[el])] = 1 + list_y.append(y_ordinal[:, 1:]) + + return list_y + + +def sample_predictions(predictions, random_state=None): + """This function samples from the same leaf node of the input sample + in both the regression and the classification case + """ + rng = np.random.RandomState(random_state) + # print(predictions[..., rng.randint(predictions.shape[2]), :]) + # print(predictions.shape) + # exit(0) + return predictions[..., rng.randint(predictions.shape[2]), :] + + +def joblib_ensemble_dnnet( + X, + y, + prob_type="regression", + link_func=None, + list_cont=None, + list_grps=None, + bootstrap=False, + split_perc=0.8, + group_stacking=False, + inp_dim=None, + n_epoch=200, + batch_size=32, + beta1=0.9, + beta2=0.999, + lr=1e-3, + l1_weight=1e-2, + l2_weight=1e-2, + epsilon=1e-8, + random_state=None, +): + """ + Parameters + ---------- + X : {array-like, sparse-matrix}, shape (n_samples, n_features) + The input samples. + y : {array-like}, shape (n_samples,) + The output samples. 
+ random_state: int, default=None + Fixing the seeds of the random generator + """ + + pred_v = np.empty(X.shape[0]) + # Sampling and Train/Validate splitting + ( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + X_scaled, + y_valid, + scaler_x, + scaler_y, + valid_ind, + ) = create_X_y( + X, + y, + bootstrap=bootstrap, + split_perc=split_perc, + prob_type=prob_type, + list_cont=list_cont, + random_state=random_state, + ) + + current_model = dnn_net( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + prob_type=prob_type, + n_epoch=n_epoch, + batch_size=batch_size, + beta1=beta1, + beta2=beta2, + lr=lr, + l1_weight=l1_weight, + l2_weight=l2_weight, + epsilon=epsilon, + list_grps=list_grps, + group_stacking=group_stacking, + inp_dim=inp_dim, + random_state=random_state, + ) + + if not group_stacking: + X_scaled_n = X_scaled.copy() + else: + X_scaled_n = np.zeros((X_scaled.shape[0], inp_dim[-1])) + for grp_ind in range(len(list_grps)): + n_layer_stacking = len(current_model[3][grp_ind]) - 1 + curr_pred = X_scaled[:, list_grps[grp_ind]].copy() + for ind_w_b in range(n_layer_stacking): + if ind_w_b == 0: + curr_pred = relu( + X_scaled[:, list_grps[grp_ind]].dot( + current_model[3][grp_ind][ind_w_b] + ) + + current_model[4][grp_ind][ind_w_b] + ) + else: + curr_pred = relu( + curr_pred.dot(current_model[3][grp_ind][ind_w_b]) + + current_model[4][grp_ind][ind_w_b] + ) + X_scaled_n[ + :, + list(np.arange(inp_dim[grp_ind], inp_dim[grp_ind + 1])), + ] = ( + curr_pred.dot(current_model[3][grp_ind][n_layer_stacking]) + + current_model[4][grp_ind][n_layer_stacking] + ) + + n_layer = len(current_model[0]) - 1 + for j in range(n_layer): + if j == 0: + pred = relu(X_scaled_n.dot(current_model[0][j]) + current_model[1][j]) + else: + pred = relu(pred.dot(current_model[0][j]) + current_model[1][j]) + + pred = pred.dot(current_model[0][n_layer]) + current_model[1][n_layer] + + if prob_type not in ("classification", "binary"): + if prob_type != "ordinal": + pred_v = pred * scaler_y.scale_ + scaler_y.mean_ + else: + pred_v = link_func[prob_type](pred) + loss = np.std(y_valid) ** 2 - mean_squared_error(y_valid, pred_v[valid_ind]) + else: + pred_v = link_func[prob_type](pred) + loss = log_loss( + y_valid, np.ones(y_valid.shape) * np.mean(y_valid, axis=0) + ) - log_loss(y_valid, pred_v[valid_ind]) + + return (current_model, scaler_x, scaler_y, pred_v, loss) + + +def init_weights(layer): + if isinstance(layer, nn.Linear): + layer.weight.data = (layer.weight.data.uniform_() - 0.5) * 0.2 + layer.bias.data = (layer.bias.data.uniform_() - 0.5) * 0.1 + + +def Dataset_Loader(X, y, shuffle=False, batch_size=50): + if y.shape[-1] == 2: + y = y[:, [1]] + dataset = torch.utils.data.TensorDataset( + torch.from_numpy(X).float(), torch.from_numpy(y).float() + ) + + loader = torch.utils.data.DataLoader( + dataset, batch_size=batch_size, shuffle=shuffle + ) + return loader + + +class DNN(nn.Module): + """Feedfoward neural network with 4 hidden layers""" + + def __init__(self, input_dim, group_stacking, list_grps, out_dim, prob_type): + super().__init__() + if prob_type == "classification": + self.accuracy = Accuracy(task="multiclass", num_classes=out_dim) + else: + self.accuracy = Accuracy(task="binary") + self.list_grps = list_grps + self.group_stacking = group_stacking + if group_stacking: + self.layers_stacking = nn.ModuleList( + [ + nn.Linear( + in_features=len(grp), + out_features=input_dim[grp_ind + 1] - input_dim[grp_ind], + ) + # nn.Sequential( + # nn.Linear( + # 
in_features=len(grp), + # # out_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # nn.ReLU(), + # nn.Linear( + # in_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # nn.ReLU(), + # nn.Linear( + # in_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # ) + for grp_ind, grp in enumerate(list_grps) + ] + ) + input_dim = input_dim[-1] + self.layers = nn.Sequential( + # hidden layers + nn.Linear(input_dim, 50), + nn.ReLU(), + nn.Linear(50, 40), + nn.ReLU(), + nn.Linear(40, 30), + nn.ReLU(), + nn.Linear(30, 20), + nn.ReLU(), + # output layer + nn.Linear(20, out_dim), + ) + self.loss = 0 + + def forward(self, x): + if self.group_stacking: + list_stacking = [None] * len(self.layers_stacking) + for ind_layer, layer in enumerate(self.layers_stacking): + list_stacking[ind_layer] = layer(x[:, self.list_grps[ind_layer]]) + x = torch.cat(list_stacking, dim=1) + return self.layers(x) + + def training_step(self, batch, device, prob_type): + X, y = batch[0].to(device), batch[1].to(device) + y_pred = self(X) # Generate predictions + if prob_type == "regression": + loss = F.mse_loss(y_pred, y) + elif prob_type == "classification": + loss = F.cross_entropy(y_pred, y) # Calculate loss + else: + loss = F.binary_cross_entropy_with_logits(y_pred, y) + return loss + + def validation_step(self, batch, device, prob_type): + X, y = batch[0].to(device), batch[1].to(device) + y_pred = self(X) # Generate predictions + if prob_type == "regression": + loss = F.mse_loss(y_pred, y) + return { + "val_mse": loss, + "batch_size": len(X), + } + else: + if prob_type == "classification": + loss = F.cross_entropy(y_pred, y) # Calculate loss + else: + loss = F.binary_cross_entropy_with_logits(y_pred, y) + acc = self.accuracy(y_pred, y.int()) + return { + "val_loss": loss, + "val_acc": acc, + "batch_size": len(X), + } + + def validation_epoch_end(self, outputs, prob_type): + if prob_type in ("classification", "binary"): + batch_losses = [] + batch_accs = [] + batch_sizes = [] + for x in outputs: + batch_losses.append(x["val_loss"] * x["batch_size"]) + batch_accs.append(x["val_acc"] * x["batch_size"]) + batch_sizes.append(x["batch_size"]) + self.loss = torch.stack(batch_losses).sum().item() / np.sum( + batch_sizes + ) # Combine losses + epoch_acc = torch.stack(batch_accs).sum().item() / np.sum( + batch_sizes + ) # Combine accuracies + return {"val_loss": self.loss, "val_acc": epoch_acc} + else: + batch_losses = [x["val_mse"] * x["batch_size"] for x in outputs] + batch_sizes = [x["batch_size"] for x in outputs] + self.loss = torch.stack(batch_losses).sum().item() / np.sum( + batch_sizes + ) # Combine losses + return {"val_mse": self.loss} + + def epoch_end(self, epoch, result): + if len(result) == 2: + print( + "Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format( + epoch + 1, result["val_loss"], result["val_acc"] + ) + ) + else: + print("Epoch [{}], val_mse: {:.4f}".format(epoch + 1, result["val_mse"])) + + +def evaluate(model, loader, device, prob_type): + outputs = [model.validation_step(batch, device, prob_type) for batch in loader] + return model.validation_epoch_end(outputs, prob_type) + + +def dnn_net( + X_train, + y_train, + X_valid, + y_valid, + prob_type="regression", + n_epoch=200, + batch_size=32, + batch_size_val=128, + beta1=0.9, + beta2=0.999, + lr=1e-3, + l1_weight=1e-2, + l2_weight=1e-2, + epsilon=1e-8, + list_grps=None, + 
group_stacking=False, + inp_dim=None, + random_state=2023, + verbose=0, +): + """ + train_loader: DataLoader for Train data + val_loader: DataLoader for Validation data + original_loader: DataLoader for Original_data + p: Number of variables + n_epochs: The number of epochs + lr: learning rate + beta1: Beta1 parameter for Adam optimizer + beta2: Beta2 parameter for Adam optimizer + epsilon: Epsilon parameter for Adam optimizer + l1_weight: L1 regalurization weight + l2_weight: L2 regularization weight + verbose: If > 2, the metrics will be printed + prob_type: A classification or regression problem + """ + # Creating DataLoaders + train_loader = Dataset_Loader( + X_train, + y_train, + shuffle=True, + batch_size=batch_size, + ) + validate_loader = Dataset_Loader(X_valid, y_valid, batch_size=batch_size_val) + # Set the seed for PyTorch's random number generator + torch.manual_seed(random_state) + + # Set the seed for PyTorch's CUDA random number generator(s), if available + if torch.cuda.is_available(): + torch.cuda.manual_seed(random_state) + torch.cuda.manual_seed_all(random_state) + + # Specify whether to use GPU or CPU + # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = torch.device("cpu") + + if prob_type in ("regression", "binary"): + out_dim = 1 + else: + out_dim = y_train.shape[-1] + + # DNN model + input_dim = inp_dim.copy() if group_stacking else X_train.shape[1] + model = DNN(input_dim, group_stacking, list_grps, out_dim, prob_type) + model.to(device) + # Initializing weights/bias + model.apply(init_weights) + # Adam Optimizer + optimizer = torch.optim.Adam( + model.parameters(), lr=lr, betas=(beta1, beta2), eps=epsilon + ) + + best_loss = 1e100 + for epoch in range(n_epoch): + # Training Phase + model.train() + for batch in train_loader: + optimizer.zero_grad() + loss = model.training_step(batch, device, prob_type) + + loss.backward() + optimizer.step() + for name, param in model.named_parameters(): + if "bias" not in name: + # if name.split(".")[0] == "layers_stacking": + # param.data -= l2_weight * param.data + # else: + param.data -= ( + l1_weight * torch.sign(param.data) + l2_weight * param.data + ) + # Validation Phase + model.eval() + result = evaluate(model, validate_loader, device, prob_type) + if model.loss < best_loss: + best_loss = model.loss + dict_params = copy.deepcopy(model.state_dict()) + if verbose >= 2: + model.epoch_end(epoch, result) + + best_weight = [] + best_bias = [] + best_weight_stack = [[].copy() for _ in range(len(list_grps))] + best_bias_stack = [[].copy() for _ in range(len(list_grps))] + + for name, param in dict_params.items(): + if name.split(".")[0] == "layers": + if name.split(".")[-1] == "weight": + best_weight.append(param.numpy().T) + if name.split(".")[-1] == "bias": + best_bias.append(param.numpy()[np.newaxis, :]) + if name.split(".")[0] == "layers_stacking": + curr_ind = int(name.split(".")[1]) + if name.split(".")[-1] == "weight": + best_weight_stack[curr_ind].append(param.numpy().T) + if name.split(".")[-1] == "bias": + best_bias_stack[curr_ind].append(param.numpy()[np.newaxis, :]) + + return [ + best_weight, + best_bias, + best_loss, + best_weight_stack, + best_bias_stack, + ] + + +def compute_imp_std(pred_scores): + weights = np.array([el.shape[-2] for el in pred_scores]) + # Compute the mean of each fold over the number of observations + pred_mean = np.array( + [np.mean(el.copy(), axis=-2) for el in pred_scores] + ) + + # Weighted average + imp = np.average( + pred_mean, axis=0, weights=weights + ) + + 
# Compute the standard deviation of each fold + # over the number of observations + pred_std = np.array( + [ + np.mean( + (el - imp[..., np.newaxis]) ** 2, + axis=-2, + ) + for el in pred_scores + ] + ) + std = np.sqrt( + np.average(pred_std, axis=0, weights=weights) + / (np.sum(weights) - 1) + ) + return (imp, std)
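
A few illustrative sketches of the routines introduced above follow; none of this code is part of the patch, and anything not visible in the diff is flagged as an assumption. First, the predict and predict_proba overrides in RandomForestModified.py call the parent method but never return its result, so as written they return None; a minimal corrected sketch of those overrides, with the rest of each class unchanged:

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


class RandomForestClassifierModified(RandomForestClassifier):
    def fit(self, X, y):
        # Memorise y so that sample_same_leaf can resample it later
        self.y_ = y
        super().fit(X, y)
        return self  # returning self is an addition; the patch returns None here

    def predict(self, X):
        return super().predict(X)  # the patch drops this return value

    def predict_proba(self, X):
        return super().predict_proba(X)  # same issue as predict


class RandomForestRegressorModified(RandomForestRegressor):
    def fit(self, X, y):
        self.y_ = y
        super().fit(X, y)
        return self

    def predict(self, X):
        return super().predict(X)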
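
sample_same_leaf is the core of the "Mod_RF" importance estimator: for every test sample and every tree it draws a handful of training targets that fall in the same leaf, excluding the sample itself. A usage sketch on synthetic data; the (n_samples, n_estimators, 10) output shape is my reading of the implementation for a 1-D target:

import numpy as np
from sklearn.datasets import make_regression
from hidimstat.RandomForestModified import RandomForestRegressorModified  # added by this patch

X, y = make_regression(n_samples=200, n_features=5, random_state=0)

rf = RandomForestRegressorModified(
    n_estimators=20, min_samples_leaf=10, random_state=0
)
rf.fit(X, y)

# For each sample and each tree, 10 target values drawn (with replacement)
# from the other samples sharing that leaf
leaf_draws = rf.sample_same_leaf(X)
print(leaf_draws.shape)  # (200, 20, 10) for a 1-D target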
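
joblib_compute_conditional implements the conditional-permutation idea: model the column of interest from the remaining columns, keep the residuals, rebuild the column as "prediction plus shuffled residuals", and re-score the learner; the per-sample importance is the increase in loss. A self-contained sketch of that step for one continuous column of a regression problem (all variable names here are mine, not the patch's):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X, y = make_regression(n_samples=500, n_features=10, noise=5.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# First block: any fitted learner
learner = RidgeCV().fit(X_train, y_train)
err_orig = (y_test - learner.predict(X_test)) ** 2

j = 3  # column whose importance is tested
X_minus_j = np.delete(X_test, j, axis=1)

# Second block: predict x_j from the other columns, keep the residuals
cond = RandomForestRegressor(max_depth=5, random_state=0)
cond.fit(X_minus_j, X_test[:, j])
x_j_hat = cond.predict(X_minus_j)
residual = X_test[:, j] - x_j_hat

# Conditional "permutation": predicted part + shuffled residuals
X_pert = X_test.copy()
X_pert[:, j] = x_j_hat + rng.permutation(residual)
err_pert = (y_test - learner.predict(X_pert)) ** 2

# Per-sample scores; averaging over samples (and over several shuffles,
# as n_perm does in the patch) gives the importance of column j
print(np.mean(err_pert - err_orig))

joblib_compute_permutation uses the same scoring but replaces the reconstruction step with a plain shuffle of the column, which is classical permutation importance and ignores the dependence on the other covariates.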
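
create_X_y returns a nine-element tuple: the scaled train/validation splits, the scaled full matrix, the raw validation targets, both scalers, and the validation indices. With bootstrap=True the training rows are drawn with replacement and the out-of-bag rows form the validation set. A usage sketch; the 2-D reshape of y is needed because the target scaler is a StandardScaler:

import numpy as np
from sklearn.datasets import make_regression
from hidimstat.utils import create_X_y  # added by this patch

X, y = make_regression(n_samples=100, n_features=4, random_state=0)
y = y.reshape(-1, 1)  # StandardScaler on the target expects a 2-D array

(X_tr, y_tr, X_va, y_va, X_all, y_va_raw,
 scaler_x, scaler_y, valid_ind) = create_X_y(
    X, y,
    bootstrap=True,                      # out-of-bag rows become the validation set
    split_perc=0.8,                      # only used when bootstrap=False
    prob_type="regression",
    list_cont=list(range(X.shape[1])),   # all columns treated as continuous
    random_state=0,
)
print(X_tr.shape, X_va.shape, valid_ind.shape)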
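
ordinal_encode turns an ordinal column into a cumulative ("thermometer") encoding: the k-th ordered value becomes k leading ones, and the always-one first column is dropped. A small worked example, as I read the implementation:

import numpy as np
from hidimstat.utils import ordinal_encode  # added by this patch

y = np.array([[1], [3], [2], [1]])
encoded = ordinal_encode(y)[0]  # one encoded block per input column
# 1 -> [0, 0], 2 -> [1, 0], 3 -> [1, 1]
print(encoded)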
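
Both the DNN learner's predict path and joblib_ensemble_dnnet replay the trained network in NumPy: dnn_net exports each layer as (weights, bias) arrays, and the prediction loops apply relu on every hidden layer and a plain affine map on the last one. A minimal sketch of that replay with hypothetical exported parameters (forward_numpy is my name, not the patch's):

import numpy as np


def relu(x):
    return np.maximum(x, 0.0)


def forward_numpy(X, weights, biases):
    """Replay a dense ReLU network from exported NumPy weights.

    weights/biases follow the dnn_net convention: one entry per layer,
    biases stored with a leading singleton axis, last layer left linear.
    """
    h = X
    for W, b in zip(weights[:-1], biases[:-1]):
        h = relu(h @ W + b)
    return h @ weights[-1] + biases[-1]


# Hypothetical exported parameters for a 4 -> 8 -> 1 network
rng = np.random.RandomState(0)
weights = [rng.randn(4, 8), rng.randn(8, 1)]
biases = [rng.randn(1, 8), rng.randn(1, 1)]
print(forward_numpy(rng.randn(5, 4), weights, biases).shape)  # (5, 1)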
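
Rather than adding penalty terms to the loss, dnn_net regularises by shrinking the non-bias parameters in place after every Adam step: an L1 term proportional to the sign of the weight and an L2 term proportional to the weight itself. A minimal sketch of that update on a stand-in module:

import torch
import torch.nn as nn

l1_weight, l2_weight = 1e-2, 1e-2
model = nn.Linear(4, 1)  # stand-in for the DNN used in the patch

with torch.no_grad():
    for name, param in model.named_parameters():
        if "bias" not in name:
            # Same shrinkage applied after optimizer.step() in dnn_net
            param -= l1_weight * torch.sign(param) + l2_weight * param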
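
compute_imp_std pools the per-fold, per-sample scores into a weighted mean and a standard deviation scaled by the total number of observations. The natural next step for CPI-style inference is a one-sided z-test on imp/std; that last line below is my assumption about how these values are consumed, since that part is outside this hunk:

import numpy as np
from scipy.stats import norm
from hidimstat.utils import compute_imp_std  # added by this patch

# One array per fold, shaped (n_variables, n_samples_in_fold, n_outputs)
pred_scores = [np.random.RandomState(k).randn(3, 50, 1) for k in range(2)]

imp, std = compute_imp_std(pred_scores)

# Assumed downstream use: one-sided p-value per variable
pval = norm.sf(imp / std)
print(imp.ravel(), pval.ravel())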