From 8fd6f9287e4e48a4abfabf978949561adb16ed8f Mon Sep 17 00:00:00 2001 From: achamma Date: Fri, 29 Dec 2023 18:08:05 +0100 Subject: [PATCH] First draft: Add BBI --- README.md | 12 + hidimstat/BBI.py | 859 ++++++++++++++++++++++++++++++ hidimstat/Dnn_learner.py | 293 ++++++++++ hidimstat/Dnn_learner_single.py | 550 +++++++++++++++++++ hidimstat/RandomForestModified.py | 97 ++++ hidimstat/__init__.py | 4 + hidimstat/compute_importance.py | 688 ++++++++++++++++++++++++ hidimstat/utils.py | 605 +++++++++++++++++++++ 8 files changed, 3108 insertions(+) create mode 100644 hidimstat/BBI.py create mode 100644 hidimstat/Dnn_learner.py create mode 100644 hidimstat/Dnn_learner_single.py create mode 100644 hidimstat/RandomForestModified.py create mode 100644 hidimstat/compute_importance.py diff --git a/README.md b/README.md index 1d83e13..6bf7481 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,18 @@ Application to source localization (MEG/EEG data): * Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020). __Statistical control for spatio-temporal MEG/EEG source imaging with desparsified multi-task Lasso__. In _Proceedings of the 34th Conference on Neural Information Processing Systems (NeurIPS 2020)_, Vancouver, Canada. +Single/Group statistically validated importance using conditional permutations: + +* Chamma, A., Thirion, B., & Engemann, D. (2024). __Variable importance in + high-dimensional settings requires grouping__. In _Proceedings of + the 38th Conference of the Association for the Advancement of Artificial + Intelligence(AAAI 2024)_, Vancouver, Canada. + +* Chamma, A., Engemann, D., & Thirion, B. (2023). __Statistically Valid Variable + Importance Assessment through Conditional Permutations__. In _Proceedings of + the 37th Conference on Neural Information Processing Systems (NeurIPS 2023)_, + New Orleans, USA. + If you use our packages, we would appreciate citations to the relevant aforementioned papers. #### Other useful references: diff --git a/hidimstat/BBI.py b/hidimstat/BBI.py new file mode 100644 index 0000000..cb70a3e --- /dev/null +++ b/hidimstat/BBI.py @@ -0,0 +1,859 @@ +import itertools +import warnings +from copy import copy + +import numpy as np +import pandas as pd +from joblib import Parallel, delayed +from scipy.stats import norm +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import RidgeCV +from sklearn.metrics import ( + log_loss, + mean_absolute_error, + mean_squared_error, + roc_auc_score, + r2_score, +) +from sklearn.model_selection import KFold +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.utils.validation import check_is_fitted + +from .compute_importance import ( + joblib_compute_conditional, + joblib_compute_permutation, +) +from .Dnn_learner import DNN_learner +from .utils import convert_predict_proba, create_X_y, compute_imp_std + + +class BlockBasedImportance(BaseEstimator, TransformerMixin): + """This class implements the Block Based Importance (BBI), + it consists of the learner block (first block) + and the importance block (second block). + Parameters + ---------- + estimator: scikit-learn compatible estimator, default=None + The provided estimator for the prediction task (First block). + The default estimator is the DNN learner. Other options are (1) RF + for Random Forest. 
+ importance_estimator: {scikit-learn compatible estimator or string}, + default=Mod_RF + The provided estimator for the importance task (Second block). + Using "Mod_RF" will apply the modified version of the Random Forest as + the importance predictor. + do_hyper: bool, default=True + Tuning the hyperparameters of the provided estimator. + dict_hyper: dict, default=None + The dictionary of hyperparameters to tune. + prob_type: str, default='regression' + A classification or a regression problem. + bootstrap: bool, default=True + Application of bootstrap sampling for the training set. + split_perc: float, default=0.8 + The training/validation cut for the provided data. + conditional: bool, default=True + The permutation or the conditional sampling approach. + list_nominal: dict, default=None + The dictionary of binary, nominal and ordinal variables. + Perm: bool, default=False + The use of permutations or random sampling with CPI-DNN. + n_perm: int, default=50 + The number of permutations/random sampling for each column. + n_jobs: int, default=1 + The number of workers for parallel processing. + verbose: int, default=0 + If verbose > 0, the fitted iterations will be printed. + groups: dict, default=None + The knowledge-driven/data-driven grouping of the variables if provided. + group_stacking: bool, default=False + Apply the stacking-based method for the provided groups. + prop_out_subLayers: int, default=0. + If group_stacking is set to True, proportion of outputs for + the linear sub-layers per group. + index_i: int, default=None + The index of the current processed iteration. + random_state: int, default=2023 + Fixing the seeds of the random generator. + com_imp: boolean, default=True + Compute or not the importance scores. + Attributes + ---------- + ToDO + """ + + def __init__( + self, + estimator=None, + importance_estimator="Mod_RF", + coffeine_transformer=None, + do_hyper=True, + dict_hyper=None, + prob_type="regression", + bootstrap=True, + split_perc=0.8, + conditional=True, + list_nominal=None, + Perm=False, + n_perm=50, + n_jobs=1, + verbose=0, + groups=None, + group_stacking=False, + k_fold=2, + prop_out_subLayers=0, + index_i=None, + random_state=2023, + com_imp=True, + ): + self.estimator = estimator + self.importance_estimator = importance_estimator + self.coffeine_transformer = coffeine_transformer + self.do_hyper = do_hyper + self.dict_hyper = dict_hyper + self.prob_type = prob_type + self.bootstrap = bootstrap + self.split_perc = split_perc + self.conditional = conditional + self.list_nominal = list_nominal + self.Perm = Perm + self.n_perm = n_perm + self.n_jobs = n_jobs + self.verbose = verbose + self.groups = groups + self.group_stacking = group_stacking + self.k_fold = k_fold + self.prop_out_subLayers = prop_out_subLayers + self.index_i = index_i + self.random_state = random_state + self.X_test = [None] * max(self.k_fold, 1) + self.y_test = [None] * max(self.k_fold, 1) + self.org_pred = [None] * max(self.k_fold, 1) + self.pred_scores = [None] * max(self.k_fold, 1) + self.X_nominal = [None] * max(self.k_fold, 1) + self.type = None + self.list_estimators = [None] * max(self.k_fold, 1) + self.X_proc = [None] * max(self.k_fold, 1) + self.scaler_x = [None] * max(self.k_fold, 1) + self.scaler_y = [None] * max(self.k_fold, 1) + self.com_imp = com_imp + # Check for applying the stacking approach with the RidgeCV estimator + self.apply_ridge = False + # Check for the case of a coffeine transformer with provided groups + self.transformer_grp = True + self.coffeine_transformers 
= [] + + def fit(self, X, y=None): + """Build the provided estimator with the training set (X, y) + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The training input samples. + y : None + There is no need of a target in a transformer, yet the pipeline API + requires this parameter. + Returns + ------- + self : object + Returns self. + """ + # Disable the importance computation when only one group is provided + # (group case) + if self.groups is not None: + if len(self.groups) <= 1: + self.com_imp = False + + # Fixing the random generator's seed + self.rng = np.random.RandomState(self.random_state) + + # Switch to special binary case + if (self.prob_type == "classification") and (len(np.unique(y)) < 3): + self.prob_type = "binary" + # Convert list_nominal to a dictionary if initialized + # as an empty string + if not isinstance(self.list_nominal, dict): + self.list_nominal = { + "nominal": [], + "ordinal": [], + "binary": [], + } + + if "binary" not in self.list_nominal: + self.list_nominal["binary"] = [] + if "ordinal" not in self.list_nominal: + self.list_nominal["ordinal"] = [] + if "nominal" not in self.list_nominal: + self.list_nominal["nominal"] = [] + + # Move the ordinal columns with 2 values to the binary part + if self.list_nominal["ordinal"] != []: + for ord_col in set(self.list_nominal["ordinal"]).intersection(list(X.columns)): + if len(np.unique(X[ord_col])) < 3: + self.list_nominal["binary"].append(ord_col) + self.list_nominal["ordinal"].remove(ord_col) + + # Convert X to pandas dataframe + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) + else: + self.X_cols = list(X.columns) + if (self.groups is None) or (not bool(self.groups)): + # Initialize the list_cols variable with each feature + # in a seperate list (default case) + self.groups = [[col] for col in X.columns] + self.transformer_grp = False + if self.group_stacking: + # Remove the group_stacking flag not + # to complex the DNN architecture + self.group_stacking = False + warnings.warn( + "The groups are not provided to apply the stacking" + " approach, back to single variables case." 
+ ) + + # Convert dictionary of groups to a list of lists + if isinstance(self.groups, dict): + self.groups = list(self.groups.values()) + + # Checks if the number of variables in the groups is equal to the + # number of variables provided + list_count = [item for sublist in self.groups for item in sublist] + if self.coffeine_transformer is None: + if len(list_count) != X.shape[1]: + raise Exception("The provided groups are missing some variables!") + else: + if self.transformer_grp: + if len(list_count) != (X.shape[1] * self.coffeine_transformer[1]): + raise Exception("The provided groups are missing some variables!") + else: + if len(list_count) != X.shape[1]: + raise Exception("The provided groups are missing some variables!") + + # Check if categorical variables exist within the columns of the design + # matrix + self.list_nominal["binary"] = list(set(self.list_nominal["binary"]).intersection(list(X.columns))) + self.list_nominal["ordinal"] = list(set(self.list_nominal["ordinal"]).intersection(list(X.columns))) + self.list_nominal["nominal"] = list(set(self.list_nominal["nominal"]).intersection(list(X.columns))) + + self.list_cols = self.groups.copy() + self.list_cat_tot = list( + itertools.chain.from_iterable(self.list_nominal.values()) + ) + X_nominal_org = X.loc[:, self.list_cat_tot] + + # One-hot encoding of nominal variables + tmp_list = [] + self.dict_nom = {} + # A dictionary to save the encoders of the nominal variables + self.dict_enc = {} + if len(self.list_nominal["nominal"]) > 0: + for col_encode in self.list_nominal["nominal"]: + enc = OneHotEncoder(handle_unknown="ignore") + enc.fit(X[[col_encode]]) + labeled_cols = [ + enc.feature_names_in_[0] + "_" + str(enc.categories_[0][j]) + for j in range(len(enc.categories_[0])) + ] + hot_cols = pd.DataFrame( + enc.transform(X[[col_encode]]).toarray(), + dtype="int32", + columns=labeled_cols, + ) + X = X.drop(columns=[col_encode]) + X = pd.concat([X, hot_cols], axis=1) + self.dict_enc[col_encode] = enc + + # Create a dictionary for categorical variables with their indices + for col_cat in self.list_cat_tot: + current_list = [ + col + for col in range(len(X.columns)) + if X.columns[col].split("_")[0] == col_cat + ] + if len(current_list) > 0: + self.dict_nom[col_cat] = current_list + # A list to store the labels of the categorical variables + tmp_list.extend(current_list) + + # Create a dictionary for the continuous variables that will be scaled + self.dict_cont = {} + if self.coffeine_transformer is None: + for ind_col, col_cont in enumerate(X.columns): + if ind_col not in tmp_list: + self.dict_cont[col_cont] = [ind_col] + self.list_cont = [el[0] for el in self.dict_cont.values()] + else: + self.list_cols_tmp = [] + self.list_cont = list(np.arange(X.shape[1] * self.coffeine_transformer[1])) + for i in range(X.shape[1] * self.coffeine_transformer[1]): + self.dict_cont[i] = [i] + self.list_cols_tmp.append([i]) + if not self.transformer_grp: + self.list_cols = self.list_cols_tmp.copy() + self.coffeine_transformers = [ + copy(self.coffeine_transformer[0]) for _ in range(max(self.k_fold, 1)) + ] + + X = X.to_numpy() + + if len(y.shape) != 2: + y = np.array(y).reshape(-1, 1) + + if self.prob_type in ("classification", "binary"): + self.loss = log_loss + else: + self.loss = mean_squared_error + + # Replace groups' variables by the indices in the design matrix + self.list_grps = [] + self.inp_dim = None + if self.group_stacking: + for grp in self.groups: + current_grp = [] + for i in grp: + if i in self.dict_nom.keys(): + current_grp += 
self.dict_nom[i] + else: + current_grp += self.dict_cont[i] + self.list_grps.append(current_grp) + + if len(self.coffeine_transformers) == 1: + X = self.coffeine_transformers[0].fit_transform( + pd.DataFrame(X, columns=self.X_cols), np.ravel(y)) + + if self.estimator is not None: + # Force the output to 1 neurone per group + # in standard stacking case + self.prop_out_subLayers = 0 + # RidgeCV estimators + self.ridge_folds = 10 + list_alphas = np.logspace(-3, 5, num=100) + cv = KFold( + n_splits=self.ridge_folds, + random_state=self.random_state, + shuffle=True, + ) + self.ridge_mods = [ + [ + make_pipeline(StandardScaler(), RidgeCV(list_alphas)) + for __ in range(len(self.list_grps)) + ].copy() + for _ in range(self.ridge_folds) + ] + + X_prev = X.copy() + X = np.zeros((y.shape[0], len(self.list_grps))) + for ind_fold, (train, test) in enumerate(cv.split(X_prev)): + X_train, X_test = X_prev[train], X_prev[test] + y_train, _ = y[train], y[test] + if len(self.coffeine_transformers) > 1: + X_train = self.coffeine_transformers[ind_fold].fit_transform( + pd.DataFrame(X_train, columns=self.X_cols), np.ravel(y_train) + ) + X_test = self.coffeine_transformers[ind_fold].transform( + pd.DataFrame(X_test, columns=self.X_cols) + ) + for grp_ind, grp in enumerate(self.list_grps): + self.ridge_mods[ind_fold][grp_ind].fit(X_train[:, grp], y_train) + X[test, grp_ind] = ( + self.ridge_mods[ind_fold][grp_ind].predict(X_test[:, grp]).ravel() + ) + self.apply_ridge = True + + self.inp_dim = [ + max(1, int(self.prop_out_subLayers * len(grp))) + for grp in self.list_grps + ] + self.inp_dim.insert(0, 0) + self.inp_dim = np.cumsum(self.inp_dim) + self.list_cols = [ + list(np.arange(self.inp_dim[grp_ind], self.inp_dim[grp_ind + 1])) + for grp_ind in range(len(self.list_grps)) + ] + + # Initialize the first estimator (block learner) + if self.estimator is None: + self.estimator = DNN_learner( + prob_type=self.prob_type, + encode=True, + do_hyper=False, + list_cont=self.list_cont, + list_grps=self.list_grps, + group_stacking=self.group_stacking, + n_jobs=self.n_jobs, + inp_dim=self.inp_dim, + random_state=self.random_state, + verbose=self.verbose, + ) + self.type = "DNN" + # Initializing the dictionary for tuning the hyperparameters + if self.dict_hyper is None: + self.dict_hyper = { + "lr": [1e-4, 1e-3, 1e-2], + "l1_weight": [0, 1e-4, 1e-2], + "l2_weight": [0, 1e-4, 1e-2], + } + + elif self.estimator == "RF": + if self.prob_type == "regression": + self.estimator = RandomForestRegressor(random_state=2023) + else: + self.estimator = RandomForestClassifier(random_state=2023) + self.dict_hyper = {"max_depth": [2, 5, 10, 20]} + self.type = "RF" + + if self.k_fold != 0: + # Implementing k-fold cross validation as the default behavior + kf = KFold( + n_splits=self.k_fold, + random_state=self.random_state, + shuffle=True, + ) + for ind_fold, (train_index, test_index) in enumerate(kf.split(X)): + print(f"Processing: {ind_fold+1}") + X_fold = X.copy() + y_fold = y.copy() + + self.X_nominal[ind_fold] = X_nominal_org.iloc[test_index, :] + + X_train, X_test = ( + X_fold[train_index, :], + X_fold[test_index, :], + ) + + y_train, y_test = y_fold[train_index], y_fold[test_index] + + if not self.apply_ridge: + if self.coffeine_transformer is not None: + X_train = self.coffeine_transformers[ind_fold].fit_transform( + pd.DataFrame(X_train, columns=self.X_cols), np.ravel(y_train) + ) + + X_test = self.coffeine_transformers[ind_fold].transform( + pd.DataFrame(X_test, columns=self.X_cols) + ) + + self.X_test[ind_fold] = X_test.copy() + 
self.y_test[ind_fold] = y_test.copy() + + # Find the list of optimal sub-models to be used in the + # following steps (Default estimator) + if self.do_hyper: + self.__tuning_hyper(X_train, y_train, ind_fold) + if self.type == "DNN": + self.estimator.fit(X_train, y_train) + self.list_estimators[ind_fold] = copy(self.estimator) + + else: + if not self.apply_ridge: + if self.coffeine_transformer is not None: + X = self.coffeine_transformers[0].fit_transform( + pd.DataFrame(X, columns=self.X_cols), np.ravel(y)) + if not self.transformer_grp: + # Variables are provided as the third element of the + # coffeine transformer parameter + if len(self.coffeine_transformer) > 2: + X = X[:, self.coffeine_transformer[2]] + self.list_cont = np.arange(len(self.coffeine_transformer[2])) + + # Hyperparameter tuning + if self.do_hyper: + self.__tuning_hyper(X, y, 0) + + if self.type == "DNN": + self.estimator.fit(X, y) + self.list_estimators[0] = copy(self.estimator) + + self.is_fitted = True + return self + + def __tuning_hyper(self, X, y, ind_fold=None): + """ """ + if not ((self.apply_ridge) and (self.group_stacking)): + ( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + X_scaled, + __, + scaler_x, + scaler_y, + ___, + ) = create_X_y( + X, + y, + bootstrap=self.bootstrap, + split_perc=self.split_perc, + prob_type=self.prob_type, + list_cont=self.list_cont, + random_state=self.random_state, + ) + if self.dict_hyper is not None: + list_hyper = list(itertools.product(*list(self.dict_hyper.values()))) + list_loss = [] + if self.type == "DNN": + list_loss = self.estimator.hyper_tuning( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + list_hyper, + random_state=self.random_state, + ) + else: + if self.dict_hyper is None: + self.estimator.fit(X_scaled, y) + # If not a DNN learner case, need to save the scalers + self.scaler_x[ind_fold] = scaler_x + self.scaler_y[ind_fold] = scaler_y + return + else: + for ind_el, el in enumerate(list_hyper): + curr_params = dict( + (k, v) for v, k in zip(el, list(self.dict_hyper.keys())) + ) + list_hyper[ind_el] = curr_params + self.estimator.set_params(**curr_params) + if self.prob_type == "regression": + y_train_curr = ( + y_train_scaled * scaler_y.scale_ + scaler_y.mean_ + ) + y_valid_curr = ( + y_valid_scaled * scaler_y.scale_ + scaler_y.mean_ + ) + + def func(x): + return self.estimator.predict(x) + + else: + y_train_curr = y_train_scaled.copy() + y_valid_curr = y_valid_scaled.copy() + + def func(x): + return self.estimator.predict_proba(x) + + self.estimator.fit(X_train_scaled, y_train_curr) + + list_loss.append(self.loss(y_valid_curr, func(X_valid_scaled))) + + ind_min = np.argmin(list_loss) + best_hyper = list_hyper[ind_min] + if not isinstance(best_hyper, dict): + best_hyper = dict(zip(self.dict_hyper.keys(), best_hyper)) + + self.estimator.set_params(**best_hyper) + self.estimator.fit(X_scaled, y) + + # If not a DNN learner case, need to save the scalers + self.scaler_x[ind_fold] = scaler_x + self.scaler_y[ind_fold] = scaler_y + + else: + self.estimator.fit(X, y) + + def predict(self, X=None, encoding=True): + """Predict regression target for X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input samples. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. 
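+        Notes
+        -----
+        When ``X`` is a single array it is copied once per fitted fold and
+        the per-fold predictions are averaged and returned; when a list of
+        per-fold test sets is provided, the per-fold predictions are only
+        stored in ``self.org_pred`` and nothing is returned.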
+ """ + if not isinstance(X, list): + list_X = [X.copy() for el in range(max(self.k_fold, 1))] + mean_pred = True + else: + list_X = X.copy() + mean_pred = False + + for ind_fold, curr_X in enumerate(list_X): + # Prepare the test set for the prediction + if encoding: + X_tmp = self.__encode_input(curr_X) + else: + X_tmp = curr_X.copy() + + if self.type != "DNN": + if not isinstance(curr_X, np.ndarray): + X_tmp = np.array(X_tmp) + if self.scaler_x[ind_fold] is not None: + X_tmp[:, self.list_cont] = self.scaler_x[ind_fold].transform( + X_tmp[:, self.list_cont] + ) + self.X_proc[ind_fold] = [X_tmp.copy()] + + self.org_pred[ind_fold] = self.list_estimators[ind_fold].predict(X_tmp) + + # Convert to the (n_samples x n_outputs) format + if len(self.org_pred[ind_fold].shape) != 2: + self.org_pred[ind_fold] = self.org_pred[ind_fold].reshape(-1, 1) + + if self.type == "DNN": + self.X_proc[ind_fold] = np.array( + self.list_estimators[ind_fold].X_test.copy() + ).swapaxes(0, 1) + + if mean_pred: + return np.mean(np.array(self.org_pred), axis=0) + + def predict_proba(self, X=None, encoding=True): + """Predict class probabilities for X. ToDo + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input samples. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. + """ + if not isinstance(X, list): + list_X = [X.copy() for el in range(max(self.k_fold, 1))] + mean_pred = True + else: + list_X = X.copy() + mean_pred = False + + for ind_fold, curr_X in enumerate(list_X): + # Prepare the test set for the prediction + if encoding: + X_tmp = self.__encode_input(curr_X) + else: + X_tmp = curr_X.copy() + + if self.type != "DNN": + if not isinstance(curr_X, np.ndarray): + X_tmp = np.array(X_tmp) + if self.scaler_x[ind_fold] is not None: + X_tmp[:, self.list_cont] = self.scaler_x[ind_fold].transform( + X_tmp[:, self.list_cont] + ) + self.X_proc[ind_fold] = [X_tmp.copy()] + + self.org_pred[ind_fold] = self.list_estimators[ind_fold].predict_proba( + X_tmp + ) + + if self.type == "DNN": + self.X_proc[ind_fold] = np.array( + self.list_estimators[ind_fold].X_test.copy() + ).swapaxes(0, 1) + else: + self.org_pred[ind_fold] = convert_predict_proba(self.org_pred[ind_fold]) + + if mean_pred: + return np.mean(np.array(self.org_pred), axis=0) + + def __encode_input(self, X): + # Check is fit had been called + check_is_fitted(self, ["is_fitted"]) + + # One-hot encoding for the test set + if len(self.list_nominal["nominal"]) > 0: + for col_encode in self.list_nominal["nominal"]: + enc = self.dict_enc[col_encode] + labeled_cols = [ + enc.feature_names_in_[0] + "_" + str(enc.categories_[0][j]) + for j in range(len(enc.categories_[0])) + ] + hot_cols = pd.DataFrame( + enc.transform(X[[col_encode]]).toarray(), + dtype="int32", + columns=labeled_cols, + ) + X = X.drop(columns=[col_encode]) + X = pd.concat([X, hot_cols], axis=1) + + return X + + def compute_importance(self, X=None, y=None): + """This function is used to compute the importance scores and + the statistical guarantees for the different variables/groups + ---------- + X : {array-like, sparse-matrix}, shape (n_samples, n_features) + The input samples. + y : {array-like}, shape (n_samples,) + The output samples. + Returns + ------- + results : dict + The dictionary of importance scores, p-values and the corresponding + score. 
+ """ + # Check is fit had been called + check_is_fitted(self, ["is_fitted"]) + encoding = True + + if self.k_fold != 0: + X = self.X_test.copy() + y = self.y_test.copy() + encoding = False + else: + if self.coffeine_transformer is not None: + X = self.coffeine_transformers[0].transform(pd.DataFrame(X, columns=self.X_cols)) + # Variables are provided as the third element of the + # coffeine transformer parameter + if len(self.coffeine_transformer) > 2: + X = X[:, self.coffeine_transformer[2]] + self.list_cont = np.arange(len(self.coffeine_transformer[2])) + # Perform stacking if enabled + if self.apply_ridge: + X_prev = X.copy() + X = np.zeros((y.shape[0], len(self.list_grps), self.ridge_folds)) + for grp_ind, grp in enumerate(self.list_grps): + for ind_ridge in range(self.ridge_folds): + X[:, grp_ind, ind_ridge] = ( + self.ridge_mods[ind_ridge][grp_ind] + .predict(X_prev.iloc[:, grp]) + .ravel() + ) + X = np.mean(X, axis=-1) + + # Convert X to pandas dataframe + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) + self.X_nominal[0] = X.loc[:, self.list_cat_tot] + X = [X.copy() for _ in range(max(self.k_fold, 1))] + if self.prob_type in ("classification", "binary"): + pass + else: + if len(y.shape) != 2: + y = y.reshape(-1, 1) + y = [y.copy() for _ in range(max(self.k_fold, 1))] + + # Compute original predictions + if self.prob_type == "regression": + output_dim = y[0].shape[1] + self.predict(X, encoding=encoding) + else: + output_dim = 1 + self.predict_proba(X, encoding=encoding) + + list_seeds_imp = self.rng.randint(1e5, size=self.n_perm) + parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose) + score_imp_l = [] + results = {} + # n_features x n_permutations x n_samples + for ind_fold, estimator in enumerate(self.list_estimators): + if self.type == "DNN": + for y_col in range(y[ind_fold].shape[-1]): + y[ind_fold] = self.estimator.encode_outcome( + y[ind_fold], train=False + )[y_col] + else: + if self.prob_type in ("classification", "binary"): + y[ind_fold] = ( + OneHotEncoder(handle_unknown="ignore") + .fit_transform(y[ind_fold].reshape(-1, 1)) + .toarray() + ) + if self.com_imp: + if not self.conditional: + self.pred_scores[ind_fold], score_cur = list( + zip( + *parallel( + delayed(joblib_compute_permutation)( + self.list_cols[p_col], + perm, + estimator, + self.type, + self.X_proc[ind_fold], + y[ind_fold], + self.prob_type, + self.org_pred[ind_fold], + dict_cont=self.dict_cont, + dict_nom=self.dict_nom, + proc_col=p_col, + index_i=ind_fold + 1, + group_stacking=self.group_stacking, + random_state=list_seeds_imp[perm], + ) + for p_col in range(len(self.list_cols)) + for perm in range(self.n_perm) + ) + ) + ) + self.pred_scores[ind_fold] = np.array( + self.pred_scores[ind_fold] + ).reshape( + ( + len(self.list_cols), + self.n_perm, + y[ind_fold].shape[0], + output_dim, + ) + ) + else: + self.pred_scores[ind_fold], score_cur = list( + zip( + *parallel( + delayed(joblib_compute_conditional)( + self.list_cols[p_col], + self.n_perm, + estimator, + self.type, + self.importance_estimator, + self.X_proc[ind_fold], + y[ind_fold], + self.prob_type, + self.org_pred[ind_fold], + seed=self.random_state, + dict_cont=self.dict_cont, + dict_nom=self.dict_nom, + X_nominal=self.X_nominal[ind_fold], + list_nominal=self.list_nominal, + encoder=self.dict_enc, + proc_col=p_col, + index_i=ind_fold + 1, + group_stacking=self.group_stacking, + list_seeds=list_seeds_imp, + Perm=self.Perm, + output_dim=output_dim, + ) + for p_col in range(len(self.list_cols)) + ) + ) + ) + 
self.pred_scores[ind_fold] = np.array(self.pred_scores[ind_fold]) + score_imp_l.append(score_cur[0]) + else: + if self.prob_type in ("classification", "binary"): + score = roc_auc_score(y[ind_fold], self.org_pred[ind_fold]) + else: + score = ( + mean_absolute_error(y[ind_fold], self.org_pred[ind_fold]), + r2_score(y[ind_fold], self.org_pred[ind_fold]), + ) + score_imp_l.append(score) + + # Compute performance + if self.prob_type == "regression": + results["score_MAE"] = np.mean(np.array(score_imp_l), axis=0)[0] + results["score_R2"] = np.mean(np.array(score_imp_l), axis=0)[1] + else: + results["score_AUC"] = np.mean(np.array(score_imp_l), axis=0) + + if not self.com_imp: + return results + + # Compute Importance and P-values + pred_scores_full = [ + np.mean(self.pred_scores[ind_fold], axis=1) + for ind_fold in range(self.k_fold) + ] + results["importance"] = compute_imp_std(pred_scores_full)[0] + results["std"] = compute_imp_std(pred_scores_full)[1] + results["pval"] = norm.sf(results["importance"] / results["std"]) + results["pval"][np.isnan(results["pval"])] = 1 + + # Compute Z-scores across Permutations/Sampling + imp_z = compute_imp_std(self.pred_scores)[0] + std_z = compute_imp_std(self.pred_scores)[1] + results["zscores"] = imp_z / std_z + return results diff --git a/hidimstat/Dnn_learner.py b/hidimstat/Dnn_learner.py new file mode 100644 index 0000000..92eb768 --- /dev/null +++ b/hidimstat/Dnn_learner.py @@ -0,0 +1,293 @@ +import numpy as np +from sklearn.base import BaseEstimator + +from .Dnn_learner_single import DNN_learner_single +from .utils import sigmoid, softmax + + +class DNN_learner(BaseEstimator): + """ToDo + Parameters + ---------- + encode: bool, default=False + Encoding the categorical outcome. + do_hyper: bool, default=True + Tuning the hyperparameters of the provided estimator. + dict_hyper: dict, default=None + The dictionary of hyperparameters to tune. + n_ensemble: int, default=10 + The number of sub-DNN models to fit to the data. + min_keep: int, default=10 + The minimal number of DNNs to be kept. + batch_size: int, default=32 + The number of samples per batch for training. + batch_size_val: int, default=128 + The number of samples per batch for validation. + n_epoch: int, default=200 + The number of epochs for the DNN learner(s). + verbose: int, default=0 + If verbose > 0, the fitted iterations will be printed. + bootstrap: bool, default=True + Application of bootstrap sampling for the training set. + split_perc: float, default=0.8 + The training/validation cut for the provided data. + prob_type: str, default='regression' + A classification or a regression problem. + list_cont: list, default=None + The list of continuous variables. + list_grps: list of lists, default=None + A list collecting the indices of the groups' variables + while applying the stacking method. + beta1: float, default=0.9 + The exponential decay rate for the first moment estimates. + beta2: float, default=0.999 + The exponential decay rate for the second moment estimates. + lr: float, default=1e-3 + The learning rate. + epsilon: float, default=1e-8 + A small constant added to the denominator to prevent division by zero. + l1_weight: float, default=1e-2 + The L1-regularization paramter for weight decay. + l2_weight: float, default=0 + The L2-regularization paramter for weight decay. + n_jobs: int, default=1 + The number of workers for parallel processing. + group_stacking: bool, default=False + Apply the stacking-based method for the provided groups. 
+ inp_dim: list, default=None + The cumsum of inputs after the linear sub-layers. + random_state: int, default=2023 + Fixing the seeds of the random generator. + Attributes + ---------- + ToDO + """ + + def __init__( + self, + encode=False, + do_hyper=False, + dict_hyper=None, + n_ensemble=10, + min_keep=10, + batch_size=32, + batch_size_val=128, + n_epoch=200, + verbose=0, + bootstrap=True, + split_perc=0.8, + prob_type="regression", + list_cont=None, + list_grps=None, + beta1=0.9, + beta2=0.999, + lr=1e-2, + epsilon=1e-8, + l1_weight=1e-2, + l2_weight=1e-2, + n_jobs=1, + group_stacking=False, + inp_dim=None, + random_state=2023, + ): + self.list_estimators = [] + self.encode = encode + self.do_hyper = do_hyper + self.dict_hyper = dict_hyper + self.n_ensemble = n_ensemble + self.min_keep = min_keep + self.batch_size = batch_size + self.batch_size_val = batch_size_val + self.n_epoch = n_epoch + self.verbose = verbose + self.bootstrap = bootstrap + self.split_perc = split_perc + self.prob_type = prob_type + self.list_grps = list_grps + self.beta1 = beta1 + self.beta2 = beta2 + self.lr = lr + self.epsilon = epsilon + self.l1_weight = l1_weight + self.l2_weight = l2_weight + self.list_cont = list_cont + self.n_jobs = n_jobs + self.group_stacking = group_stacking + self.inp_dim = inp_dim + self.random_state = random_state + self.pred = [None] * n_ensemble + self.enc_y = [] + self.link_func = { + "classification": softmax, + "ordinal": sigmoid, + "binary": sigmoid, + } + self.is_encoded = False + self.dim_repeat = 1 + + def fit(self, X, y=None): + """Build the DNN learner with the training set (X, y) + Parameters + ---------- + X : {pandas dataframe}, shape (n_samples, n_features) + The training input samples. + y : None + There is no need of a target in a transformer, yet the pipeline API + requires this parameter. + Returns + ------- + self : object + Returns self. 
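+        Notes
+        -----
+        If ``X`` is not already of shape (n_outputs, n_samples, n_features),
+        it is tiled along a new leading axis and one ``DNN_learner_single``
+        is fitted per output column of ``y``.
+
+        Examples
+        --------
+        A small illustrative sketch for a single-output regression (the
+        shapes and settings below are hypothetical)::
+
+            import numpy as np
+            from hidimstat.Dnn_learner import DNN_learner
+
+            rng = np.random.RandomState(0)
+            X = rng.randn(50, 4)
+            y = (X[:, 0] + 0.1 * rng.randn(50)).reshape(-1, 1)
+            learner = DNN_learner(prob_type="regression", n_ensemble=2,
+                                  n_epoch=10, list_cont=list(range(4)),
+                                  random_state=0)
+            learner.fit(X, y)
+            preds = learner.predict(X)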
+ """ + if (len(X.shape) != 3) or (X.shape[0] != y.shape[-1]): + X = np.squeeze(X) + X = np.array([X for i in range(y.shape[-1])]) + self.dim_repeat = y.shape[-1] + + self.list_estimators = [None] * y.shape[-1] + self.X_test = [None] * y.shape[-1] + + for y_col in range(y.shape[-1]): + self.list_estimators[y_col] = DNN_learner_single( + encode=self.encode, + do_hyper=self.do_hyper, + dict_hyper=self.dict_hyper, + n_ensemble=self.n_ensemble, + min_keep=self.min_keep, + batch_size=self.batch_size, + batch_size_val=self.batch_size_val, + n_epoch=self.n_epoch, + verbose=self.verbose, + bootstrap=self.bootstrap, + split_perc=self.split_perc, + prob_type=self.prob_type, + list_cont=self.list_cont, + list_grps=self.list_grps, + beta1=self.beta1, + beta2=self.beta2, + lr=self.lr, + epsilon=self.epsilon, + l1_weight=self.l1_weight, + l2_weight=self.l2_weight, + n_jobs=self.n_jobs, + group_stacking=self.group_stacking, + inp_dim=self.inp_dim, + random_state=self.random_state, + ) + + self.list_estimators[y_col].fit(X[y_col, ...], y[:, [y_col]]) + + return self + + def hyper_tuning( + self, + X_train, + y_train, + X_valid, + y_valid, + list_hyper=None, + random_state=None, + ): + estimator = DNN_learner_single( + encode=self.encode, + do_hyper=self.do_hyper, + dict_hyper=self.dict_hyper, + n_ensemble=self.n_ensemble, + min_keep=self.min_keep, + batch_size=self.batch_size, + batch_size_val=self.batch_size_val, + n_epoch=self.n_epoch, + verbose=self.verbose, + bootstrap=self.bootstrap, + split_perc=self.split_perc, + prob_type=self.prob_type, + list_cont=self.list_cont, + list_grps=self.list_grps, + beta1=self.beta1, + beta2=self.beta2, + lr=self.lr, + epsilon=self.epsilon, + l1_weight=self.l1_weight, + l2_weight=self.l2_weight, + n_jobs=self.n_jobs, + group_stacking=self.group_stacking, + inp_dim=self.inp_dim, + random_state=self.random_state, + ) + return estimator.hyper_tuning( + X_train, y_train, X_valid, y_valid, list_hyper, random_state + ) + + def predict(self, X, scale=True): + """Predict regression target for X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The training input samples. + scale: bool, default=True + The continuous features will be standard scaled or not. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. + """ + if isinstance(X, list): + X = [self.check_X_dim(el) for el in X] + else: + X = self.check_X_dim(X) + list_res = [] + for estimator_ind, estimator in enumerate(self.list_estimators): + if isinstance(X, list): + curr_X = [el[estimator_ind, ...] for el in X] + list_res.append(estimator.predict(curr_X, scale)) + else: + list_res.append(estimator.predict(X[estimator_ind, ...], scale)) + self.X_test[estimator_ind] = estimator.X_test.copy() + return np.array(list_res) + + def predict_proba(self, X, scale=True): + """Predict class probabilities for X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The training input samples. + scale: bool, default=True + The continuous features will be standard scaled or not. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. + """ + if isinstance(X, list): + X = [self.check_X_dim(el) for el in X] + else: + X = self.check_X_dim(X) + + list_res = [] + for estimator_ind, estimator in enumerate(self.list_estimators): + if isinstance(X, list): + curr_X = [el[estimator_ind, ...] 
for el in X] + list_res.append(estimator.predict_proba(curr_X, scale)) + else: + list_res.append(estimator.predict_proba(X[estimator_ind, ...], scale)) + self.X_test[estimator_ind] = estimator.X_test.copy() + return np.squeeze(np.array(list_res)) + + def set_params(self, **kwargs): + """Set the parameters of this estimator.""" + for key, value in kwargs.items(): + setattr(self, key, value) + for estimator in self.list_estimators: + setattr(estimator, key, value) + + def check_X_dim(self, X): + if (len(X.shape) != 3) or (X.shape[0] != self.dim_repeat): + X = np.squeeze(X) + X = np.array([X for i in range(self.dim_repeat)]) + + return X + + def encode_outcome(self, y, train=True): + for y_col in range(y.shape[-1]): + y_enc = self.list_estimators[y_col].encode_outcome(y, train=train) + return y_enc \ No newline at end of file diff --git a/hidimstat/Dnn_learner_single.py b/hidimstat/Dnn_learner_single.py new file mode 100644 index 0000000..af7da51 --- /dev/null +++ b/hidimstat/Dnn_learner_single.py @@ -0,0 +1,550 @@ +import itertools + +import numpy as np +import pandas as pd +from joblib import Parallel, delayed +from sklearn.base import BaseEstimator +from sklearn.metrics import log_loss, mean_squared_error +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils.validation import check_is_fitted + +from .utils import ( + create_X_y, + dnn_net, + joblib_ensemble_dnnet, + ordinal_encode, + relu, + sigmoid, + softmax, +) + + +class DNN_learner_single(BaseEstimator): + """ + Parameters + ---------- + encode: bool, default=False + Encoding the categorical outcome. + do_hyper: bool, default=True + Tuning the hyperparameters of the provided estimator. + dict_hyper: dict, default=None + The dictionary of hyperparameters to tune. + n_ensemble: int, default=10 + The number of sub-DNN models to fit to the data + min_keep: int, default=10 + The minimal number of DNNs to be kept + batch_size: int, default=32 + The number of samples per batch for training + batch_size_val: int, default=128 + The number of samples per batch for validation + n_epoch: int, default=200 + The number of epochs for the DNN learner(s) + verbose: int, default=0 + If verbose > 0, the fitted iterations will be printed + bootstrap: bool, default=True + Application of bootstrap sampling for the training set + split_perc: float, default=0.8 + The training/validation cut for the provided data + prob_type: str, default='regression' + A classification or a regression problem + list_grps: list of lists, default=None + A list collecting the indices of the groups' variables + while applying the stacking method. + list_cont: list, default=None + The list of continuous variables + beta1: float, default=0.9 + The exponential decay rate for the first moment estimates. + beta2: float, default=0.999 + The exponential decay rate for the second moment estimates. + lr: float, default=1e-3 + The learning rate + epsilon: float, default=1e-8 + A small constant added to the denominator to prevent division by zero. + l1_weight: float, default=1e-2 + The L1-regularization paramter for weight decay. + l2_weight: float, default=0 + The L2-regularization paramter for weight decay. + n_jobs: int, default=1 + The number of workers for parallel processing. + group_stacking: bool, default=False + Apply the stacking-based method for the provided groups. + inp_dim: list, default=None + The cumsum of inputs after the linear sub-layers. 
+ random_state: int, default=2023 + Fixing the seeds of the random generator + """ + + def __init__( + self, + encode=False, + do_hyper=False, + dict_hyper=None, + n_ensemble=10, + min_keep=10, + batch_size=32, + batch_size_val=128, + n_epoch=200, + verbose=0, + bootstrap=True, + split_perc=0.8, + prob_type="regression", + list_cont=None, + list_grps=None, + beta1=0.9, + beta2=0.999, + lr=1e-3, + epsilon=1e-8, + l1_weight=1e-2, + l2_weight=0, + n_jobs=1, + group_stacking=False, + inp_dim=None, + random_state=2023, + ): + self.encode = encode + self.do_hyper = do_hyper + self.dict_hyper = dict_hyper + self.n_ensemble = n_ensemble + self.min_keep = min_keep + self.batch_size = batch_size + self.batch_size_val = batch_size_val + self.n_epoch = n_epoch + self.verbose = verbose + self.bootstrap = bootstrap + self.split_perc = split_perc + self.prob_type = prob_type + self.list_grps = list_grps + self.beta1 = beta1 + self.beta2 = beta2 + self.lr = lr + self.epsilon = epsilon + self.l1_weight = l1_weight + self.l2_weight = l2_weight + self.list_cont = list_cont + self.n_jobs = n_jobs + self.group_stacking = group_stacking + self.inp_dim = inp_dim + self.random_state = random_state + self.enc_y = [] + self.link_func = { + "classification": softmax, + "ordinal": sigmoid, + "binary": sigmoid, + } + self.is_encoded = False + + def fit(self, X, y=None): + """Build the DNN learner with the training set (X, y) + Parameters + ---------- + X : {pandas dataframe}, shape (n_samples, n_features) + The training input samples. + y : None + There is no need of a target in a transformer, yet the pipeline API + requires this parameter. + Returns + ------- + self : object + Returns self. + """ + # Disabling the encoding parameter with the regression case + if self.prob_type == "regression": + if len(y.shape) != 2: + y = y.reshape(-1, 1) + self.encode = False + + if self.encode: + y = self.encode_outcome(y) + self.is_encoded = True + y = np.squeeze(y, axis=0) + + # Initializing the dictionary for tuning the hyperparameters + if self.dict_hyper is None: + self.dict_hyper = { + "lr": [1e-2, 1e-3, 1e-4], + "l1_weight": [0, 1e-2, 1e-4], + "l2_weight": [0, 1e-2, 1e-4], + } + + # Switch to the special binary case + if (self.prob_type == "classification") and (y.shape[-1] < 3): + self.prob_type = "binary" + n, p = X.shape + self.min_keep = max(min(self.min_keep, self.n_ensemble), 1) + rng = np.random.RandomState(self.random_state) + list_seeds = rng.randint(1e5, size=(self.n_ensemble)) + + # Initialize the list of continuous variables + if self.list_cont is None: + self.list_cont = list(np.arange(p)) + + # Initialize the list of groups + if self.list_grps is None: + if not self.group_stacking: + self.list_grps = [] + + # Convert the matrix of predictors to numpy array + if not isinstance(X, np.ndarray): + X = np.array(X) + + # Hyperparameter tuning + if self.do_hyper: + self.__tuning_hyper(X, y) + + parallel = Parallel( + n_jobs=min(self.n_jobs, self.n_ensemble), verbose=self.verbose + ) + res_ens = list( + zip( + *parallel( + delayed(joblib_ensemble_dnnet)( + X, + y, + prob_type=self.prob_type, + link_func=self.link_func, + list_cont=self.list_cont, + list_grps=self.list_grps, + bootstrap=self.bootstrap, + split_perc=self.split_perc, + group_stacking=self.group_stacking, + inp_dim=self.inp_dim, + n_epoch=self.n_epoch, + batch_size=self.batch_size, + beta1=self.beta1, + beta2=self.beta2, + lr=self.lr, + l1_weight=self.l1_weight, + l2_weight=self.l2_weight, + epsilon=self.epsilon, + random_state=list_seeds[i], + ) + for i 
in range(self.n_ensemble) + ) + ) + ) + pred_m = np.array(res_ens[3]) + loss = np.array(res_ens[4]) + + if self.n_ensemble == 1: + return [(res_ens[0][0], (res_ens[1][0], res_ens[2][0]))] + + # Keeping the optimal subset of DNNs + sorted_loss = loss.copy() + sorted_loss.sort() + new_loss = np.empty(self.n_ensemble - 1) + for i in range(self.n_ensemble - 1): + current_pred = np.mean(pred_m[loss >= sorted_loss[i], :], axis=0) + if self.prob_type == "regression": + new_loss[i] = mean_squared_error(y, current_pred) + else: + new_loss[i] = log_loss(y, current_pred) + keep_dnn = ( + loss + >= sorted_loss[np.argmin(new_loss[: (self.n_ensemble - self.min_keep + 1)])] + ) + + self.optimal_list = [ + (res_ens[0][i], (res_ens[1][i], res_ens[2][i])) + for i in range(self.n_ensemble) + if keep_dnn[i] + ] + self.pred = [None] * len(self.optimal_list) + self.is_fitted = True + return self + + def encode_outcome(self, y, train=True): + list_y = [] + if len(y.shape) != 2: + y = y.reshape(-1, 1) + if self.prob_type == "regression": + list_y.append(y) + + for col in range(y.shape[1]): + if train: + # Encoding the target with the classification case + if self.prob_type in ("classification", "binary"): + self.enc_y.append(OneHotEncoder(handle_unknown="ignore")) + curr_y = self.enc_y[col].fit_transform(y[:, [col]]).toarray() + list_y.append(curr_y) + + # Encoding the target with the ordinal case + if self.prob_type == "ordinal": + y = ordinal_encode(y) + + else: + # Encoding the target with the classification case + if self.prob_type in ("classification", "binary"): + curr_y = self.enc_y[col].transform(y[:, [col]]).toarray() + list_y.append(curr_y) + + ## ToDo Add the ordinal case + return np.array(list_y) + + def hyper_tuning( + self, + X_train, + y_train, + X_valid, + y_valid, + list_hyper=None, + random_state=None, + ): + parallel = Parallel( + n_jobs=min(self.n_jobs, self.n_ensemble), verbose=self.verbose + ) + y_train = self.encode_outcome(y_train) + y_valid = self.encode_outcome(y_valid, train=False) + return [ + list( + zip( + *parallel( + delayed(dnn_net)( + X_train, + y_train[i, ...], + X_valid, + y_valid[i, ...], + prob_type=self.prob_type, + n_epoch=self.n_epoch, + batch_size=self.batch_size, + beta1=self.beta1, + beta2=self.beta2, + lr=el[0], + l1_weight=el[1], + l2_weight=el[2], + epsilon=self.epsilon, + list_grps=self.list_grps, + group_stacking=self.group_stacking, + inp_dim=self.inp_dim, + random_state=random_state, + ) + for el in list_hyper + ) + ) + )[2] + for i in range(y_train.shape[0]) + ] + + def __tuning_hyper(self, X, y): + ( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + X_scaled, + __, + scaler_x, + scaler_y, + ___, + ) = create_X_y( + X, + y, + bootstrap=self.bootstrap, + split_perc=self.split_perc, + prob_type=self.prob_type, + list_cont=self.list_cont, + random_state=self.random_state, + ) + list_hyper = list(itertools.product(*list(self.dict_hyper.values()))) + list_loss = self.hyper_tuning( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + list_hyper, + random_state=self.random_state, + ) + ind_min = np.argmin(list_loss) + best_hyper = list_hyper[ind_min] + if not isinstance(best_hyper, dict): + best_hyper = dict(zip(self.dict_hyper.keys(), best_hyper)) + self.set_params(**best_hyper) + + def predict(self, X, scale=True): + """Predict regression target for X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The training input samples. 
+ scale: bool, default=True + The continuous features will be standard scaled or not. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. + """ + if self.prob_type != "regression": + raise Exception("Use the predict_proba function for classification") + + # Prepare the test set for the prediction + if scale: + X = self.__scale_test(X) + + # Process the common prediction part + self.__pred_common(X) + + res_pred = np.zeros((self.pred[0].shape)) + total_n_elements = 0 + for ind_mod, pred in enumerate(self.pred): + res_pred += ( + pred * self.optimal_list[ind_mod][1][1].scale_ + + self.optimal_list[ind_mod][1][1].mean_ + ) + total_n_elements += 1 + res_pred = res_pred.copy() / total_n_elements + + return res_pred + + def predict_proba(self, X, scale=True): + """Predict class probabilities for X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The training input samples. + scale: bool, default=True + The continuous features will be standard scaled or not. + Returns + ------- + y : ndarray, shape (n_samples,) + Returns an array of ones. + """ + if self.prob_type == "regression": + raise Exception("Use the predict function for classification") + + # Prepare the test set for the prediction + if scale: + X = self.__scale_test(X) + + # Process the common prediction part + self.__pred_common(X) + + res_pred = np.zeros((self.pred[0].shape)) + total_n_elements = 0 + for pred in self.pred: + res_pred += self.link_func[self.prob_type](pred) + total_n_elements += 1 + res_pred = res_pred.copy() / total_n_elements + if self.prob_type == "binary": + res_pred = np.array([[1-res_pred[i][0], res_pred[i][0]] for i in range(res_pred.shape[0])]) + + return res_pred + + def __scale_test(self, X): + """This function prepares the input for the DNN estimator either in the default + case or after applying the stacking method + Parameters + ---------- + X : {array-like, sparse-matrix}, shape (n_samples, n_features) + The input samples. 
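+        Returns
+        -------
+        X_test_n : list of ndarray
+            With ``group_stacking``, one entry per kept sub-model in which
+            the continuous features are scaled with that sub-model's scaler
+            and passed through its fitted per-group sub-layers; otherwise a
+            single-element list holding an unscaled copy of ``X`` (scaling
+            then happens per sub-model at prediction time).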
+ """ + # Check is fit had been called + check_is_fitted(self, ["is_fitted"]) + + if isinstance(X, pd.DataFrame): + X = np.array(X) + + # The input will be either the original input or the result + # of the provided sub-linear layers in a stacking way for the different groups + # In the stacking method, each sub-linear layer will have a corresponding output + if self.group_stacking: + X_test_n = [None] * len(self.optimal_list) + for mod in range(len(self.optimal_list)): + X_test_scaled = X.copy() + if len(self.list_cont) > 0: + X_test_scaled[:, self.list_cont] = self.optimal_list[mod][1][ + 0 + ].transform(X_test_scaled[:, self.list_cont]) + X_test_n_curr = np.zeros( + ( + X_test_scaled.shape[0], + self.inp_dim[-1], + ) + ) + for grp_ind in range(len(self.list_grps)): + curr_pred = X_test_scaled[:, self.list_grps[grp_ind]].copy() + n_layer_stacking = len(self.optimal_list[mod][0][3][grp_ind]) - 1 + for ind_w_b in range(n_layer_stacking): + if ind_w_b == 0: + curr_pred = relu( + X_test_scaled[:, self.list_grps[grp_ind]].dot( + self.optimal_list[mod][0][3][grp_ind][ind_w_b] + ) + + self.optimal_list[mod][0][4][grp_ind][ind_w_b] + ) + else: + curr_pred = relu( + curr_pred.dot( + self.optimal_list[mod][0][3][grp_ind][ind_w_b] + ) + + self.optimal_list[mod][0][4][grp_ind][ind_w_b] + ) + X_test_n_curr[ + :, + list( + np.arange( + self.inp_dim[grp_ind], + self.inp_dim[grp_ind + 1], + ) + ), + ] = ( + curr_pred.dot( + self.optimal_list[mod][0][3][grp_ind][n_layer_stacking] + ) + + self.optimal_list[mod][0][4][grp_ind][n_layer_stacking] + ) + # X_test_n_curr[ + # :, + # list( + # np.arange( + # self.inp_dim[grp_ind], + # self.inp_dim[grp_ind + 1], + # ) + # # np.arange( + # # self.n_out_subLayers * grp_ind, + # # (grp_ind + 1) * self.n_out_subLayers, + # # ) + # ), + # ] = ( + # X_test_scaled[:, self.list_grps[grp_ind]].dot( + # self.optimal_list[mod][0][3][grp_ind] + # ) + # + self.optimal_list[mod][0][4][grp_ind] + # ) + X_test_n[mod] = X_test_n_curr + else: + X_test_n = [X.copy()] + self.X_test = X_test_n.copy() + return X_test_n + + def __pred_common(self, X): + """ + Parameters + ---------- + X : {array-like, sparse-matrix}, shape (n_samples, n_features) + The input samples. 
+ """ + if not self.group_stacking: + X = [X[0].copy() for i in range(self.n_ensemble)] + + n_layer = len(self.optimal_list[0][0][0]) - 1 + for ind_mod, mod in enumerate(self.optimal_list): + X_test_scaled = X[ind_mod].copy() + for j in range(n_layer): + if not self.group_stacking: + if len(self.list_cont) > 0: + X_test_scaled[:, self.list_cont] = mod[1][0].transform( + X_test_scaled[:, self.list_cont] + ) + if j == 0: + pred = relu(X_test_scaled.dot(mod[0][0][j]) + mod[0][1][j]) + else: + pred = relu(pred.dot(mod[0][0][j]) + mod[0][1][j]) + + self.pred[ind_mod] = pred.dot(mod[0][0][n_layer]) + mod[0][1][n_layer] + + def set_params(self, **kwargs): + """Set the parameters of this estimator.""" + for key, value in kwargs.items(): + setattr(self, key, value) + + def clone(self): + return type(self)(**self.get_params()) diff --git a/hidimstat/RandomForestModified.py b/hidimstat/RandomForestModified.py new file mode 100644 index 0000000..7dd5b12 --- /dev/null +++ b/hidimstat/RandomForestModified.py @@ -0,0 +1,97 @@ +import numpy as np +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + + +class RandomForestClassifierModified(RandomForestClassifier): + def fit(self, X, y): + self.y_ = y + super().fit(X, y) + + def predict(self, X): + super().predict(X) + + def predict_proba(self, X): + super().predict_proba(X) + + def sample_same_leaf(self, X, y=None): + if not (y is None): + self.y_ = y + rng = np.random.RandomState(self.get_params()["random_state"]) + # The number of samples to sample from in the next step + random_samples = 10 + + # Get the leaf indices for each sample in the input data + leaf_indices = self.apply(X) + + # Initialize an array to store the predictions for each sample + predictions = [] + + # Loop over each sample in the input data + for i in range(X.shape[0]): + # Removing the row of the corresponding input sample + leaf_indices_minus_i = np.delete(leaf_indices, i, axis=0) + y_minus_i = np.delete(self.y_, i, axis=0) + + # The list of indices sampled from the same leaf of the input sample + leaf_samples = [] + # Loop over each tree in the forest + for j in range(self.n_estimators): + # Find the samples that fall into the same leaf node for this tree + samples_in_leaf = np.where( + leaf_indices_minus_i[:, j] == leaf_indices[i, j] + )[0] + + # Append the samples to the list + leaf_samples.append( + y_minus_i[rng.choice(samples_in_leaf, size=random_samples)] + ) + + predictions.append(leaf_samples) + + # Combine the predictions from all trees to make the final prediction + return np.array(predictions) + + +class RandomForestRegressorModified(RandomForestRegressor): + def fit(self, X, y): + self.y_ = y + super().fit(X, y) + + def predict(self, X): + super().predict(X) + + def sample_same_leaf(self, X): + rng = np.random.RandomState(self.get_params()["random_state"]) + # The number of samples to sample from in the next step + random_samples = 10 + + # Get the leaf indices for each sample in the input data + leaf_indices = self.apply(X) + + # Initialize an array to store the predictions for each sample + predictions = [] + + # Loop over each sample in the input data + for i in range(X.shape[0]): + # Removing the row of the corresponding input sample + leaf_indices_minus_i = np.delete(leaf_indices, i, axis=0) + y_minus_i = np.delete(self.y_, i, axis=0) + + # The list of indices sampled from the same leaf of the input sample + leaf_samples = [] + # Loop over each tree in the forest + for j in range(self.n_estimators): + # Find the samples that fall into the 
same leaf node for this tree + samples_in_leaf = np.where( + leaf_indices_minus_i[:, j] == leaf_indices[i, j] + )[0] + + # Append the samples to the list + leaf_samples.append( + y_minus_i[rng.choice(samples_in_leaf, size=random_samples)] + ) + + predictions.append(leaf_samples) + + # Combine the predictions from all trees to make the final prediction + return np.array(predictions) diff --git a/hidimstat/__init__.py b/hidimstat/__init__.py index 0e4b39e..d2d4180 100644 --- a/hidimstat/__init__.py +++ b/hidimstat/__init__.py @@ -1,6 +1,8 @@ from .adaptive_permutation_threshold import ada_svr +from .BBI import BlockBasedImportance from .clustered_inference import clustered_inference, hd_inference from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso +from .Dnn_learner_single import DNN_learner_single from .ensemble_clustered_inference import ensemble_clustered_inference from .knockoffs import model_x_knockoff from .knockoff_aggregation import knockoff_aggregation @@ -15,9 +17,11 @@ __all__ = [ 'ada_svr', 'aggregate_quantiles', + 'BlockBasedImportance', 'clustered_inference', 'desparsified_lasso', 'desparsified_group_lasso', + 'DNN_learner_single', 'ensemble_clustered_inference', 'group_reid', 'hd_inference', diff --git a/hidimstat/compute_importance.py b/hidimstat/compute_importance.py new file mode 100644 index 0000000..517547c --- /dev/null +++ b/hidimstat/compute_importance.py @@ -0,0 +1,688 @@ +import warnings +from collections import Counter + +import numpy as np +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.metrics import mean_absolute_error, r2_score, roc_auc_score +from sklearn.model_selection import GridSearchCV +from sklearn.preprocessing import OneHotEncoder + +from .RandomForestModified import ( + RandomForestClassifierModified, + RandomForestRegressorModified, +) +from .utils import convert_predict_proba, ordinal_encode, sample_predictions + +warnings.filterwarnings("ignore") + + +def joblib_compute_conditional( + p_col, + n_sample, + estimator, + type_predictor, + importance_estimator, + X_test_list, + y_test, + prob_type, + org_pred, + seed=None, + dict_cont={}, + dict_nom={}, + X_nominal=None, + list_nominal={}, + encoder={}, + proc_col=None, + index_i=None, + group_stacking=False, + list_seeds=None, + Perm=False, + output_dim=1, +): + """This function applies the conditional approach for feature importance. + Parameters + ---------- + p_col: list + The list of single variables/groups to compute the feature importance. + n_sample: int + The number of permutations/samples to loop + estimator: scikit-learn compatible estimator, default=None + The provided estimator for the prediction task (First block) + type_predictor: string + The provided predictor either the DNN learner or not. + The DNN learner will use different inputs for the different + sub-models especially while applying the stacking case. + importance_estimator: scikit-learn compatible estimator, default=None + The provided estimator for the importance task (Second block) + X_test_list: list + The list of inputs containing either one input or a number of inputs + equal to the number of sub-models of the DNN learner. + y_test: {array-like, sparse matrix}, shape (n_samples, n_output) + The output test samples. + prob_type: str, default='regression' + A classification or a regression problem. + org_pred: {array-like, sparse matrix}, shape (n_output, n_samples) + The predictions using the original samples. 
+ seed: int, default=None + Fixing the seeds of the random generator. + dict_cont: dict, default={} + The dictionary providing the indices of the continuous variables. + dict_nom: dict, default={} + The dictionary providing the indices of the categorical variables. + X_nominal: {array-like}, default=None + The dataframe of categorical variables without encoding. + list_nominal: dict, default={} + The dictionary of binary, nominal and ordinal variables. + encoder: dict, default={} + The dictionary of encoders for categorical variables. + proc_col: int, default=None + The processed column to print if verbose > 0. + index_i: int, default=None + The index of the current processed iteration. + group_stacking: bool, default=False + Apply the stacking-based method for the provided groups. + list_seeds: list, default=None + The list of seeds to fix the RandomState for reproducibility. + Perm: bool, default=False + The use of permutations or random sampling with CPI-DNN. + output_dim: + + """ + rng = np.random.RandomState(seed) + + res_ar = np.empty((n_sample, y_test.shape[0], output_dim)) + + # A list of copied items to avoid any overlapping in the process + current_X_test_list = [ + (lambda x: x.copy()[np.newaxis, ...] if len(x.shape) != 3 else x.copy())( + X_test_el + ) + for X_test_el in X_test_list + ] + + Res_col = [None] * len(current_X_test_list) + X_col_pred = { + "regression": [None] * len(current_X_test_list), + "classification": None, + "ordinal": None, + } + + # Group partitioning + grp_nom = [ + item + for item in p_col + if (item in dict_nom.keys()) + and (item in list_nominal["nominal"] + list_nominal["binary"]) + ] + grp_ord = [ + item for item in p_col if (item in dict_nom.keys()) and (item not in grp_nom) + ] + grp_cont = [item for item in p_col if item not in (grp_nom + grp_ord)] + + # If modified Random Forest is applied + pred_mod_RF = { + "regression": [None] * len(current_X_test_list), + "classification": [None], + "ordinal": [None] * len(grp_ord), + } + + p_col_n = {"regression": [], "classification": [], "ordinal": []} + if not group_stacking: + for val in p_col: + if val in grp_nom: + p_col_n["classification"] += dict_nom[val] + if val in grp_cont: + p_col_n["regression"] += dict_cont[val] + if val in grp_ord: + p_col_n["ordinal"] += dict_nom[val] + else: + p_col_n["regression"] = p_col + + # Dictionary of booleans checking for the encountered type of group + var_type = { + "regression": False, + "classification": False, + "ordinal": False, + } + + X_col_new = { + "regression": None, + "classification": None, + "ordinal": None, + } + output = {"regression": None, "classification": None, "ordinal": None} + importance_models = { + "regression": None, + "classification": None, + "ordinal": None, + } + + if importance_estimator is None: + importance_models["regression"] = RandomForestRegressor( + random_state=seed, max_depth=5 + ) + importance_models["classification"] = RandomForestClassifier( + random_state=seed, max_depth=5 + ) + importance_models["ordinal"] = RandomForestClassifier( + random_state=seed, max_depth=5 + ) + elif importance_estimator == "Mod_RF": + importance_models["regression"] = RandomForestRegressorModified( + random_state=seed, + min_samples_leaf=10, + ) + importance_models["classification"] = RandomForestClassifierModified( + random_state=seed, min_samples_leaf=10 + ) + importance_models["ordinal"] = RandomForestClassifierModified( + random_state=seed, min_samples_leaf=10 + ) + else: + importance_models = importance_estimator.copy() + + # Checking for 
pure vs hybrid groups + if len(grp_cont) > 0: + var_type["regression"] = True + if len(grp_nom) > 0: + var_type["classification"] = True + if len(grp_ord) > 0: + var_type["ordinal"] = True + + # All the variables of the same group will be removed simultaneously so it can be + # predicted conditional on the remaining variables + if importance_estimator != "Mod_RF": + if var_type["regression"]: + for counter_test, X_test_comp in enumerate(current_X_test_list): + X_test_minus_idx = np.delete( + np.copy(X_test_comp), + p_col_n["regression"] + + p_col_n["classification"] + + p_col_n["ordinal"], + -1, + ) + + # Nb of y outputs x Nb of samples x Nb of regression outputs + output["regression"] = X_test_comp[..., p_col_n["regression"]] + X_col_pred["regression"][counter_test] = [] + for cur_output_ind in range(X_test_minus_idx.shape[0]): + importance_models["regression"] = hypertune_predictor( + importance_models["regression"], + X_test_minus_idx[cur_output_ind, ...], + output["regression"][cur_output_ind, ...], + param_grid={"max_depth": [2, 5, 10]} + ) + importance_models["regression"].fit( + X_test_minus_idx[cur_output_ind, ...], + output["regression"][cur_output_ind, ...], + ) + X_col_pred["regression"][counter_test].append( + importance_models["regression"].predict( + X_test_minus_idx[cur_output_ind, ...] + ) + ) + + X_col_pred["regression"][counter_test] = np.array( + X_col_pred["regression"][counter_test] + ) + X_col_pred["regression"][counter_test] = X_col_pred["regression"][ + counter_test + ].reshape(output["regression"].shape) + Res_col[counter_test] = ( + output["regression"] - X_col_pred["regression"][counter_test] + ) + + # The loop doesn't include the classification part because the input across the different DNN sub-models + # is the same without the stacking part (where extra sub-linear layers are used), therefore identical inputs + # won't need looping classification process. This is not the case with the regression part. + if var_type["classification"]: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) + output["classification"] = np.array(X_nominal[grp_nom]) + X_col_pred["classification"] = [] + for cur_output_ind in range(X_test_minus_idx.shape[0]): + importance_models["classification"] = hypertune_predictor( + importance_models["classification"], + X_test_minus_idx[cur_output_ind, ...], + output["classification"], + param_grid={"max_depth": [2, 5, 10]} + ) + importance_models["classification"].fit( + X_test_minus_idx[cur_output_ind, ...], + output["classification"], + ) + X_col_pred["classification"].append( + importance_models["classification"].predict_proba( + X_test_minus_idx[cur_output_ind, ...] 
+ ) + ) + + if not isinstance(X_col_pred["classification"][0], list): + X_col_pred["classification"] = [X_col_pred["classification"]] + + if var_type["ordinal"]: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) + output["ordinal"] = ordinal_encode(np.array(X_nominal[grp_ord])) + X_col_pred["ordinal"] = [] + for cur_output_ind in range(X_test_minus_idx.shape[0]): + current_prediction = [] + for cur_ordinal in output["ordinal"]: + # To solve the problem of having multiple input matrices + # with multiple ordinal outcomes + importance_models["ordinal"] = hypertune_predictor( + importance_models["ordinal"], + X_test_minus_idx[cur_output_ind, ...], + cur_ordinal, + param_grid={"max_depth": [2, 5, 10]} + ) + importance_models["ordinal"].fit( + X_test_minus_idx[cur_output_ind, ...], cur_ordinal + ) + probs = importance_models["ordinal"].predict_proba( + X_test_minus_idx[cur_output_ind, ...] + ) + current_prediction.append(convert_predict_proba(np.array(probs))) + X_col_pred["ordinal"].append(current_prediction) + else: + for counter_test, X_test_comp in enumerate(current_X_test_list): + if var_type["regression"]: + X_test_minus_idx = np.delete( + np.copy(X_test_comp), + p_col_n["regression"] + + p_col_n["classification"] + + p_col_n["ordinal"], + -1, + ) + output["regression"] = X_test_comp[..., p_col_n["regression"]] + for cur_output_ind in range(X_test_minus_idx.shape[0]): + importance_models["regression"] = hypertune_predictor( + importance_models["regression"], + X_test_minus_idx[cur_output_ind, ...], + output["regression"][cur_output_ind, ...], + param_grid={"min_samples_leaf": [10, 15, 20]} + ) + + importance_models["regression"].fit( + X_test_minus_idx[cur_output_ind, ...], + output["regression"][cur_output_ind, ...], + ) + pred_mod_RF["regression"][counter_test] = importance_models[ + "regression" + ].sample_same_leaf(X_test_minus_idx[cur_output_ind, ...]) + + if var_type["classification"]: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) + output["classification"] = np.array(X_nominal[grp_nom]) + for cur_output_ind in range(X_test_minus_idx.shape[0]): + importance_models["classification"] = hypertune_predictor( + importance_models["classification"], + X_test_minus_idx[cur_output_ind, ...], + output["classification"], + param_grid={"min_samples_leaf": [10, 15, 20]} + ) + importance_models["classification"].fit( + X_test_minus_idx[cur_output_ind, ...], + output["classification"], + ) + pred_mod_RF["classification"] = importance_models[ + "classification" + ].sample_same_leaf(X_test_minus_idx[cur_output_ind, ...]) + + if var_type["ordinal"]: + X_test_minus_idx = np.delete( + np.copy(current_X_test_list[0]), + p_col_n["regression"] + p_col_n["classification"] + p_col_n["ordinal"], + -1, + ) + output["ordinal"] = ordinal_encode(np.array(X_nominal[grp_ord])) + + for cur_output_ind in range(X_test_minus_idx.shape[0]): + for cur_ordinal_ind, cur_ordinal in enumerate(output["ordinal"]): + importance_models["ordinal"] = hypertune_predictor( + importance_models["ordinal"], + X_test_minus_idx[cur_output_ind, ...], + cur_ordinal, + param_grid={"min_samples_leaf": [10, 15, 20]} + ) + + importance_models["ordinal"].fit( + X_test_minus_idx[cur_output_ind, ...], cur_ordinal + ) + pred_mod_RF["ordinal"][cur_ordinal_ind] = importance_models[ + "ordinal" + ].sample_same_leaf( + 
X_test_minus_idx[cur_output_ind, ...], + np.array(X_nominal[grp_ord])[:, [cur_ordinal_ind]], + ) + + if prob_type in ("classification", "binary"): + score = roc_auc_score(y_test, org_pred) + else: + score = ( + mean_absolute_error(y_test, org_pred), + r2_score(y_test, org_pred), + ) + + for sample in range(n_sample): + if index_i is not None: + print( + f"Iteration/Fold:{index_i}, Processing col:{proc_col+1}, Sample:{sample+1}" + ) + else: + print(f"Processing col:{proc_col+1}") + # Same shuffled indices across the sub-models items + indices = np.arange(current_X_test_list[0].shape[1]) + if importance_estimator != "Mod_RF": + rng = np.random.RandomState(list_seeds[sample]) + if Perm: + rng.shuffle(indices) + else: + indices = rng.choice(indices, size=len(indices)) + + for counter_test, X_test_comp in enumerate(current_X_test_list): + if var_type["regression"]: + X_test_comp[..., p_col_n["regression"]] = ( + X_col_pred["regression"][counter_test] + + Res_col[counter_test][:, indices, :] + ) + + if var_type["classification"]: + # X_col_pred['classification'] is a list of arrays representing the probability of getting + # each value at the corresponding variable (resp. to each observation) + for cur_output in X_col_pred["classification"]: + list_seed_cat = rng.randint(1e5, size=len(cur_output)) + for cat_prob_ind in range(len(cur_output)): + rng_cat = np.random.RandomState(list_seed_cat[cat_prob_ind]) + current_X_col_new = np.array( + [ + [ + rng_cat.choice( + np.unique( + X_nominal[[grp_nom[cat_prob_ind]]] + ), + size=1, + p=cur_output[cat_prob_ind][i], + )[0] + for i in range( + cur_output[cat_prob_ind].shape[0] + ) + ] + ] + ).T + if grp_nom[cat_prob_ind] in encoder: + current_X_col_new = ( + encoder[grp_nom[cat_prob_ind]] + .transform(current_X_col_new) + .toarray() + .astype("int32") + ) + if cat_prob_ind == 0: + X_col_new["classification"] = current_X_col_new + else: + X_col_new["classification"] = np.concatenate( + ( + X_col_new["classification"], + current_X_col_new, + ), + axis=1, + ) + X_test_comp[..., p_col_n["classification"]] = X_col_new[ + "classification" + ] + + if var_type["ordinal"]: + for cur_output in X_col_pred["ordinal"]: + list_seed_ord = rng.randint(1e5, size=len(cur_output)) + for ord_prob_ind, ord_prob in enumerate(cur_output): + rng_ord = np.random.RandomState(list_seed_ord[ord_prob_ind]) + current_X_col_new = [None] * ord_prob.shape[0] + for i in range(ord_prob.shape[0]): + current_sample = [0] * ord_prob.shape[1] + for ind_prob, prob in enumerate(ord_prob[i]): + current_sample[ind_prob] = rng_ord.choice( + [0, 1], p=[1 - prob, prob] + ) + if current_sample[ind_prob] == 0: + break + current_X_col_new[i] = np.array(current_sample) + # while True: + # current_X_col_new[i] = rng_ord.binomial(1, ord_prob[i], ord_prob.shape[1]) + # print(current_X_col_new[i]) + # if np.any(np.all(output['ordinal'][ord_prob_ind] == current_X_col_new[i], axis=1)): + # break + ind_cat = np.where(current_X_col_new[i] == 1)[0] + current_X_col_new[i] = [ + np.unique(X_nominal[grp_ord[ord_prob_ind]])[ + len(ind_cat) + ] + ] + + current_X_col_new = np.array(current_X_col_new) + if ord_prob_ind == 0: + X_col_new["ordinal"] = current_X_col_new + else: + X_col_new["ordinal"] = np.concatenate( + ( + X_col_new["ordinal"], + current_X_col_new, + ), + axis=1, + ) + X_test_comp[..., p_col_n["ordinal"]] = X_col_new["ordinal"] + else: + for counter_test, X_test_comp in enumerate(current_X_test_list): + if var_type["regression"]: + X_test_comp[..., p_col_n["regression"]] = np.mean( + sample_predictions( + 
pred_mod_RF["regression"][counter_test], + list_seeds[sample], + ), + axis=1, + ) + + if var_type["classification"]: + predictions = sample_predictions( + pred_mod_RF["classification"], list_seeds[sample] + ) + max_val = lambda row: list(Counter(row).keys())[0] + for cat_prob_ind in range(predictions.shape[-1]): + class_col = np.array( + [[max_val(row)] for row in predictions[..., cat_prob_ind]] + ) + if grp_nom[cat_prob_ind] in encoder: + current_X_col_new = ( + encoder[grp_nom[cat_prob_ind]] + .transform(class_col) + .toarray() + .astype("int32") + ) + else: + current_X_col_new = class_col + if cat_prob_ind == 0: + X_col_new["classification"] = current_X_col_new + else: + X_col_new["classification"] = np.concatenate( + ( + X_col_new["classification"], + current_X_col_new, + ), + axis=1, + ) + X_test_comp[..., p_col_n["classification"]] = X_col_new[ + "classification" + ] + + if var_type["ordinal"]: + max_val = lambda row: list(Counter(row).keys())[0] + for cur_output_ind in range(len(pred_mod_RF["ordinal"])): + predictions = sample_predictions( + pred_mod_RF["ordinal"][cur_output_ind], + list_seeds[sample], + ) + class_ord = np.array( + [[max_val(row)] for row in predictions[..., 0]] + ) + if cur_output_ind == 0: + X_col_new["ordinal"] = class_ord + else: + X_col_new["ordinal"] = np.concatenate( + (X_col_new["ordinal"], class_ord), axis=1 + ) + X_test_comp[..., p_col_n["ordinal"]] = X_col_new["ordinal"] + + if prob_type == "regression": + if type_predictor == "DNN": + pred_i = estimator.predict(current_X_test_list, scale=False) + else: + pred_i = estimator.predict(current_X_test_list[0].squeeze()) + + # Convert to the (n_samples x n_outputs) format + if len(pred_i.shape) != 2: + pred_i = pred_i.reshape(-1, 1) + + res_ar[sample, ::] = (y_test - pred_i) ** 2 - (y_test - org_pred) ** 2 + else: + if type_predictor == "DNN": + pred_i = estimator.predict_proba(current_X_test_list, scale=False) + else: + pred_i = convert_predict_proba( + estimator.predict_proba(current_X_test_list[0].squeeze()) + ) + + pred_i = np.clip(pred_i, 1e-10, 1 - 1e-10) + org_pred = np.clip(org_pred, 1e-10, 1 - 1e-10) + res_ar[sample, :, 0] = -np.sum(y_test * np.log(pred_i), axis=1) + np.sum( + y_test * np.log(org_pred), axis=1 + ) + + return res_ar, score + + +def joblib_compute_permutation( + p_col, + perm, + estimator, + type_predictor, + X_test_list, + y_test, + prob_type, + org_pred, + dict_cont={}, + dict_nom={}, + proc_col=None, + index_i=None, + group_stacking=False, + random_state=None, +): + """This function applies the permutation feature importance (PFI). + + Parameters + ---------- + p_col: list + The list of single variables/groups to compute the feature importance. + perm: int + The current processed permutation (also to print if verbose > 0). + estimator: scikit-learn compatible estimator, default=None + The provided estimator for the prediction task (First block). + type_predictor: string + The provided predictor either the DNN learner or not. + The DNN learner will use different inputs for the different + sub-models especially while applying the stacking case. + X_test_list: list + The list of inputs containing either one input or a number of inputs + equal to the number of sub-models of the DNN learner. + y_test: {array-like, sparse matrix}, shape (n_samples, n_output) + The output test samples. + prob_type: str, default='regression' + A classification or a regression problem. + org_pred: {array-like, sparse matrix}, shape (n_output, n_samples) + The predictions using the original samples. 
+ dict_cont: dict, default={} + The dictionary providing the indices of the continuous variables. + dict_nom: dict, default={} + The dictionary providing the indices of the categorical variables. + proc_col: int, default=None + The processed column to print if verbose > 0. + index_i: int, default=None + The index of the current processed iteration. + group_stacking: bool, default=False + Apply the stacking-based method for the provided groups. + random_state: int, default=None + Fixing the seeds of the random generator. + """ + rng = np.random.RandomState(random_state) + + if index_i is not None: + print( + f"Iteration/Fold:{index_i}, Processing col:{proc_col+1}, Permutation:{perm+1}" + ) + else: + print(f"Processing col:{proc_col+1}, Permutation:{perm+1}") + + # A list of copied items to avoid any overlapping in the process + current_X_test_list = [X_test_el.copy() for X_test_el in X_test_list] + indices = np.arange(current_X_test_list[0].shape[-2]) + rng.shuffle(indices) + + if not group_stacking: + p_col_new = [] + for val in p_col: + if val in dict_nom.keys(): + p_col_new += dict_nom[val] + else: + p_col_new += dict_cont[val] + else: + p_col_new = p_col + + for X_test_comp in current_X_test_list: + X_test_comp[..., p_col_new] = X_test_comp[..., p_col_new].take(indices, axis=-2) + + if prob_type == "regression": + score = ( + mean_absolute_error(y_test, org_pred), + r2_score(y_test, org_pred), + ) + + if type_predictor == "DNN": + pred_i = estimator.predict(current_X_test_list, scale=False) + else: + pred_i = estimator.predict(current_X_test_list[0]) + + # Convert to the (n_samples x n_outputs) format + if len(pred_i.shape) != 2: + pred_i = pred_i.reshape(-1, 1) + + res = (y_test - pred_i) ** 2 - (y_test - org_pred) ** 2 + else: + score = roc_auc_score(y_test, org_pred) + if type_predictor == "DNN": + pred_i = estimator.predict_proba(current_X_test_list, scale=False) + else: + pred_i = convert_predict_proba( + estimator.predict_proba(current_X_test_list[0]) + ) + + pred_i = np.clip(pred_i, 1e-10, 1 - 1e-10) + org_pred = np.clip(org_pred, 1e-10, 1 - 1e-10) + res = -np.sum(y_test * np.log(pred_i), axis=1) + np.sum( + y_test * np.log(org_pred), axis=1 + ) + return res, score + + +def hypertune_predictor(estimator, X, y, param_grid): + # param_grid = {"max_depth": [2, 5, 10]} + grid_search = GridSearchCV(estimator, param_grid=param_grid, cv=2) + grid_search.fit(X, y) + return grid_search.best_estimator_ diff --git a/hidimstat/utils.py b/hidimstat/utils.py index 044f4c3..ab75b45 100644 --- a/hidimstat/utils.py +++ b/hidimstat/utils.py @@ -1,6 +1,13 @@ # -*- coding: utf-8 -*- # Author: Binh Nguyen & Jerome-Alexis Chevalier +import copy import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from sklearn.metrics import log_loss, mean_squared_error +from sklearn.preprocessing import StandardScaler +from torchmetrics import Accuracy def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False): @@ -124,3 +131,601 @@ def _adaptive_quantile_aggregation(pvals, gamma_min=0.05): _fixed_quantile_aggregation(pvals, gamma) for gamma in gammas]) return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0)) + + +def create_X_y( + X, + y, + bootstrap=True, + split_perc=0.8, + prob_type="regression", + list_cont=None, + random_state=None, +): + """Create train/valid split of input data X and target variable y. + Parameters + ---------- + X: {array-like, sparse matrix}, shape (n_samples, n_features) + The input samples before the splitting process. 
+ y: ndarray, shape (n_samples, ) + The output samples before the splitting process. + bootstrap: bool, default=True + Application of bootstrap sampling for the training set. + split_perc: float, default=0.8 + The training/validation cut for the provided data. + prob_type: str, default='regression' + A classification or a regression problem. + list_cont: list, default=[] + The list of continuous variables. + random_state: int, default=2023 + Fixing the seeds of the random generator. + + Returns + ------- + X_train_scaled: {array-like, sparse matrix}, shape (n_train_samples, n_features) + The bootstrapped training input samples with scaled continuous variables. + y_train_scaled: {array-like}, shape (n_train_samples, ) + The bootstrapped training output samples scaled if continous. + X_valid_scaled: {array-like, sparse matrix}, shape (n_valid_samples, n_features) + The validation input samples with scaled continuous variables. + y_valid_scaled: {array-like}, shape (n_valid_samples, ) + The validation output samples scaled if continous. + X_scaled: {array-like, sparse matrix}, shape (n_samples, n_features) + The original input samples with scaled continuous variables. + y_valid: {array-like}, shape (n_samples, ) + The original output samples with validation indices. + scaler_x: scikit-learn StandardScaler + The standard scaler encoder for the continuous variables of the input. + scaler_y: scikit-learn StandardScaler + The standard scaler encoder for the output if continuous. + valid_ind: list + The list of indices of the validation set. + """ + rng = np.random.RandomState(random_state) + scaler_x, scaler_y = StandardScaler(), StandardScaler() + n = X.shape[0] + + if bootstrap: + train_ind = rng.choice(n, n, replace=True) + else: + train_ind = rng.choice(n, size=int(np.floor(split_perc * n)), replace=False) + valid_ind = np.array([ind for ind in range(n) if ind not in train_ind]) + + X_train, X_valid = X[train_ind], X[valid_ind] + y_train, y_valid = y[train_ind], y[valid_ind] + + # Scaling X and y + X_train_scaled = X_train.copy() + X_valid_scaled = X_valid.copy() + X_scaled = X.copy() + + if len(list_cont) > 0: + X_train_scaled[:, list_cont] = scaler_x.fit_transform(X_train[:, list_cont]) + X_valid_scaled[:, list_cont] = scaler_x.transform(X_valid[:, list_cont]) + X_scaled[:, list_cont] = scaler_x.transform(X[:, list_cont]) + if prob_type == "regression": + y_train_scaled = scaler_y.fit_transform(y_train) + y_valid_scaled = scaler_y.transform(y_valid) + else: + y_train_scaled = y_train.copy() + y_valid_scaled = y_valid.copy() + + return ( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + X_scaled, + y_valid, + scaler_x, + scaler_y, + valid_ind, + ) + + +def sigmoid(x): + """The function applies the sigmoid function element-wise to the input array x.""" + return 1 / (1 + np.exp(-x)) + + +def softmax(x): + """The function applies the softmax function element-wise to the input array x.""" + # Ensure numerical stability by subtracting the maximum value of x from each element of x + # This prevents overflow errors when exponentiating large numbers + x = x - np.max(x, axis=-1, keepdims=True) + exp_x = np.exp(x) + return exp_x / np.sum(exp_x, axis=-1, keepdims=True) + + +def relu(x): + """The function applies the relu function element-wise to the input array x.""" + return (abs(x) + x) / 2 + + +def relu_(x): + """The function applies the derivative of the relu function element-wise + to the input array x. 
+ """ + return (x > 0) * 1 + + +def convert_predict_proba(list_probs): + """If the classification is done using a one-hot encoded variable, the list of + probabilites will be a list of lists for the probabilities of each of the categories. + This function takes the probabilities of having each category (=1 with binary) and stack + them into one ndarray. + """ + if len(list_probs.shape) == 3: + list_probs = np.array(list_probs)[..., 1].T + return list_probs + + +def ordinal_encode(y): + """This function encodes the ordinal variable with a special gradual encoding storing also + the natural order information. + """ + list_y = [] + for y_col in range(y.shape[-1]): + # Retrieve the unique values + unique_vals = np.unique(y[:, y_col]) + # Mapping each unique value to its corresponding index + mapping_dict = {} + for i, val in enumerate(unique_vals): + mapping_dict[val] = i + 1 + # create a zero-filled array for the ordinal encoding + y_ordinal = np.zeros((len(y[:, y_col]), len(set(y[:, y_col])))) + # set the appropriate indices to 1 for each ordinal value and all lower ordinal values + for ind_el, el in enumerate(y[:, y_col]): + y_ordinal[ind_el, np.arange(mapping_dict[el])] = 1 + list_y.append(y_ordinal[:, 1:]) + + return list_y + + +def sample_predictions(predictions, random_state=None): + """This function samples from the same leaf node of the input sample + in both the regression and the classification case + """ + rng = np.random.RandomState(random_state) + # print(predictions[..., rng.randint(predictions.shape[2]), :]) + # print(predictions.shape) + # exit(0) + return predictions[..., rng.randint(predictions.shape[2]), :] + + +def joblib_ensemble_dnnet( + X, + y, + prob_type="regression", + link_func=None, + list_cont=None, + list_grps=None, + bootstrap=False, + split_perc=0.8, + group_stacking=False, + inp_dim=None, + n_epoch=200, + batch_size=32, + beta1=0.9, + beta2=0.999, + lr=1e-3, + l1_weight=1e-2, + l2_weight=1e-2, + epsilon=1e-8, + random_state=None, +): + """ + Parameters + ---------- + X : {array-like, sparse-matrix}, shape (n_samples, n_features) + The input samples. + y : {array-like}, shape (n_samples,) + The output samples. 
+ random_state: int, default=None + Fixing the seeds of the random generator + """ + + pred_v = np.empty(X.shape[0]) + # Sampling and Train/Validate splitting + ( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + X_scaled, + y_valid, + scaler_x, + scaler_y, + valid_ind, + ) = create_X_y( + X, + y, + bootstrap=bootstrap, + split_perc=split_perc, + prob_type=prob_type, + list_cont=list_cont, + random_state=random_state, + ) + + current_model = dnn_net( + X_train_scaled, + y_train_scaled, + X_valid_scaled, + y_valid_scaled, + prob_type=prob_type, + n_epoch=n_epoch, + batch_size=batch_size, + beta1=beta1, + beta2=beta2, + lr=lr, + l1_weight=l1_weight, + l2_weight=l2_weight, + epsilon=epsilon, + list_grps=list_grps, + group_stacking=group_stacking, + inp_dim=inp_dim, + random_state=random_state, + ) + + if not group_stacking: + X_scaled_n = X_scaled.copy() + else: + X_scaled_n = np.zeros((X_scaled.shape[0], inp_dim[-1])) + for grp_ind in range(len(list_grps)): + n_layer_stacking = len(current_model[3][grp_ind]) - 1 + curr_pred = X_scaled[:, list_grps[grp_ind]].copy() + for ind_w_b in range(n_layer_stacking): + if ind_w_b == 0: + curr_pred = relu( + X_scaled[:, list_grps[grp_ind]].dot( + current_model[3][grp_ind][ind_w_b] + ) + + current_model[4][grp_ind][ind_w_b] + ) + else: + curr_pred = relu( + curr_pred.dot(current_model[3][grp_ind][ind_w_b]) + + current_model[4][grp_ind][ind_w_b] + ) + X_scaled_n[ + :, + list(np.arange(inp_dim[grp_ind], inp_dim[grp_ind + 1])), + ] = ( + curr_pred.dot(current_model[3][grp_ind][n_layer_stacking]) + + current_model[4][grp_ind][n_layer_stacking] + ) + + n_layer = len(current_model[0]) - 1 + for j in range(n_layer): + if j == 0: + pred = relu(X_scaled_n.dot(current_model[0][j]) + current_model[1][j]) + else: + pred = relu(pred.dot(current_model[0][j]) + current_model[1][j]) + + pred = pred.dot(current_model[0][n_layer]) + current_model[1][n_layer] + + if prob_type not in ("classification", "binary"): + if prob_type != "ordinal": + pred_v = pred * scaler_y.scale_ + scaler_y.mean_ + else: + pred_v = link_func[prob_type](pred) + loss = np.std(y_valid) ** 2 - mean_squared_error(y_valid, pred_v[valid_ind]) + else: + pred_v = link_func[prob_type](pred) + loss = log_loss( + y_valid, np.ones(y_valid.shape) * np.mean(y_valid, axis=0) + ) - log_loss(y_valid, pred_v[valid_ind]) + + return (current_model, scaler_x, scaler_y, pred_v, loss) + + +def init_weights(layer): + if isinstance(layer, nn.Linear): + layer.weight.data = (layer.weight.data.uniform_() - 0.5) * 0.2 + layer.bias.data = (layer.bias.data.uniform_() - 0.5) * 0.1 + + +def Dataset_Loader(X, y, shuffle=False, batch_size=50): + if y.shape[-1] == 2: + y = y[:, [1]] + dataset = torch.utils.data.TensorDataset( + torch.from_numpy(X).float(), torch.from_numpy(y).float() + ) + + loader = torch.utils.data.DataLoader( + dataset, batch_size=batch_size, shuffle=shuffle + ) + return loader + + +class DNN(nn.Module): + """Feedfoward neural network with 4 hidden layers""" + + def __init__(self, input_dim, group_stacking, list_grps, out_dim, prob_type): + super().__init__() + if prob_type == "classification": + self.accuracy = Accuracy(task="multiclass", num_classes=out_dim) + else: + self.accuracy = Accuracy(task="binary") + self.list_grps = list_grps + self.group_stacking = group_stacking + if group_stacking: + self.layers_stacking = nn.ModuleList( + [ + nn.Linear( + in_features=len(grp), + out_features=input_dim[grp_ind + 1] - input_dim[grp_ind], + ) + # nn.Sequential( + # nn.Linear( + # 
in_features=len(grp), + # # out_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # nn.ReLU(), + # nn.Linear( + # in_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # nn.ReLU(), + # nn.Linear( + # in_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # ) + for grp_ind, grp in enumerate(list_grps) + ] + ) + input_dim = input_dim[-1] + self.layers = nn.Sequential( + # hidden layers + nn.Linear(input_dim, 50), + nn.ReLU(), + nn.Linear(50, 40), + nn.ReLU(), + nn.Linear(40, 30), + nn.ReLU(), + nn.Linear(30, 20), + nn.ReLU(), + # output layer + nn.Linear(20, out_dim), + ) + self.loss = 0 + + def forward(self, x): + if self.group_stacking: + list_stacking = [None] * len(self.layers_stacking) + for ind_layer, layer in enumerate(self.layers_stacking): + list_stacking[ind_layer] = layer(x[:, self.list_grps[ind_layer]]) + x = torch.cat(list_stacking, dim=1) + return self.layers(x) + + def training_step(self, batch, device, prob_type): + X, y = batch[0].to(device), batch[1].to(device) + y_pred = self(X) # Generate predictions + if prob_type == "regression": + loss = F.mse_loss(y_pred, y) + elif prob_type == "classification": + loss = F.cross_entropy(y_pred, y) # Calculate loss + else: + loss = F.binary_cross_entropy_with_logits(y_pred, y) + return loss + + def validation_step(self, batch, device, prob_type): + X, y = batch[0].to(device), batch[1].to(device) + y_pred = self(X) # Generate predictions + if prob_type == "regression": + loss = F.mse_loss(y_pred, y) + return { + "val_mse": loss, + "batch_size": len(X), + } + else: + if prob_type == "classification": + loss = F.cross_entropy(y_pred, y) # Calculate loss + else: + loss = F.binary_cross_entropy_with_logits(y_pred, y) + acc = self.accuracy(y_pred, y.int()) + return { + "val_loss": loss, + "val_acc": acc, + "batch_size": len(X), + } + + def validation_epoch_end(self, outputs, prob_type): + if prob_type in ("classification", "binary"): + batch_losses = [] + batch_accs = [] + batch_sizes = [] + for x in outputs: + batch_losses.append(x["val_loss"] * x["batch_size"]) + batch_accs.append(x["val_acc"] * x["batch_size"]) + batch_sizes.append(x["batch_size"]) + self.loss = torch.stack(batch_losses).sum().item() / np.sum( + batch_sizes + ) # Combine losses + epoch_acc = torch.stack(batch_accs).sum().item() / np.sum( + batch_sizes + ) # Combine accuracies + return {"val_loss": self.loss, "val_acc": epoch_acc} + else: + batch_losses = [x["val_mse"] * x["batch_size"] for x in outputs] + batch_sizes = [x["batch_size"] for x in outputs] + self.loss = torch.stack(batch_losses).sum().item() / np.sum( + batch_sizes + ) # Combine losses + return {"val_mse": self.loss} + + def epoch_end(self, epoch, result): + if len(result) == 2: + print( + "Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format( + epoch + 1, result["val_loss"], result["val_acc"] + ) + ) + else: + print("Epoch [{}], val_mse: {:.4f}".format(epoch + 1, result["val_mse"])) + + +def evaluate(model, loader, device, prob_type): + outputs = [model.validation_step(batch, device, prob_type) for batch in loader] + return model.validation_epoch_end(outputs, prob_type) + + +def dnn_net( + X_train, + y_train, + X_valid, + y_valid, + prob_type="regression", + n_epoch=200, + batch_size=32, + batch_size_val=128, + beta1=0.9, + beta2=0.999, + lr=1e-3, + l1_weight=1e-2, + l2_weight=1e-2, + epsilon=1e-8, + list_grps=None, + 
group_stacking=False, + inp_dim=None, + random_state=2023, + verbose=0, +): + """ + train_loader: DataLoader for Train data + val_loader: DataLoader for Validation data + original_loader: DataLoader for Original_data + p: Number of variables + n_epochs: The number of epochs + lr: learning rate + beta1: Beta1 parameter for Adam optimizer + beta2: Beta2 parameter for Adam optimizer + epsilon: Epsilon parameter for Adam optimizer + l1_weight: L1 regalurization weight + l2_weight: L2 regularization weight + verbose: If > 2, the metrics will be printed + prob_type: A classification or regression problem + """ + # Creating DataLoaders + train_loader = Dataset_Loader( + X_train, + y_train, + shuffle=True, + batch_size=batch_size, + ) + validate_loader = Dataset_Loader(X_valid, y_valid, batch_size=batch_size_val) + # Set the seed for PyTorch's random number generator + torch.manual_seed(random_state) + + # Set the seed for PyTorch's CUDA random number generator(s), if available + if torch.cuda.is_available(): + torch.cuda.manual_seed(random_state) + torch.cuda.manual_seed_all(random_state) + + # Specify whether to use GPU or CPU + # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = torch.device("cpu") + + if prob_type in ("regression", "binary"): + out_dim = 1 + else: + out_dim = y_train.shape[-1] + + # DNN model + input_dim = inp_dim.copy() if group_stacking else X_train.shape[1] + model = DNN(input_dim, group_stacking, list_grps, out_dim, prob_type) + model.to(device) + # Initializing weights/bias + model.apply(init_weights) + # Adam Optimizer + optimizer = torch.optim.Adam( + model.parameters(), lr=lr, betas=(beta1, beta2), eps=epsilon + ) + + best_loss = 1e100 + for epoch in range(n_epoch): + # Training Phase + model.train() + for batch in train_loader: + optimizer.zero_grad() + loss = model.training_step(batch, device, prob_type) + + loss.backward() + optimizer.step() + for name, param in model.named_parameters(): + if "bias" not in name: + # if name.split(".")[0] == "layers_stacking": + # param.data -= l2_weight * param.data + # else: + param.data -= ( + l1_weight * torch.sign(param.data) + l2_weight * param.data + ) + # Validation Phase + model.eval() + result = evaluate(model, validate_loader, device, prob_type) + if model.loss < best_loss: + best_loss = model.loss + dict_params = copy.deepcopy(model.state_dict()) + if verbose >= 2: + model.epoch_end(epoch, result) + + best_weight = [] + best_bias = [] + best_weight_stack = [[].copy() for _ in range(len(list_grps))] + best_bias_stack = [[].copy() for _ in range(len(list_grps))] + + for name, param in dict_params.items(): + if name.split(".")[0] == "layers": + if name.split(".")[-1] == "weight": + best_weight.append(param.numpy().T) + if name.split(".")[-1] == "bias": + best_bias.append(param.numpy()[np.newaxis, :]) + if name.split(".")[0] == "layers_stacking": + curr_ind = int(name.split(".")[1]) + if name.split(".")[-1] == "weight": + best_weight_stack[curr_ind].append(param.numpy().T) + if name.split(".")[-1] == "bias": + best_bias_stack[curr_ind].append(param.numpy()[np.newaxis, :]) + + return [ + best_weight, + best_bias, + best_loss, + best_weight_stack, + best_bias_stack, + ] + + +def compute_imp_std(pred_scores): + weights = np.array([el.shape[-2] for el in pred_scores]) + # Compute the mean of each fold over the number of observations + pred_mean = np.array( + [np.mean(el.copy(), axis=-2) for el in pred_scores] + ) + + # Weighted average + imp = np.average( + pred_mean, axis=0, weights=weights + ) + + 
# Compute the standard deviation of each fold + # over the number of observations + pred_std = np.array( + [ + np.mean( + (el - imp[..., np.newaxis]) ** 2, + axis=-2, + ) + for el in pred_scores + ] + ) + std = np.sqrt( + np.average(pred_std, axis=0, weights=weights) + / (np.sum(weights) - 1) + ) + return (imp, std)
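
A few illustrative sketches of the routines introduced above follow; none of this code is part of the patch, and anything not visible in the diff is flagged as an assumption. First, the predict and predict_proba overrides in RandomForestModified.py call the parent method but never return its result, so as written they return None; a minimal corrected sketch of those overrides, with the rest of each class unchanged:

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


class RandomForestClassifierModified(RandomForestClassifier):
    def fit(self, X, y):
        # Memorise y so that sample_same_leaf can resample it later
        self.y_ = y
        super().fit(X, y)
        return self  # returning self is an addition; the patch returns None here

    def predict(self, X):
        return super().predict(X)  # the patch drops this return value

    def predict_proba(self, X):
        return super().predict_proba(X)  # same issue as predict


class RandomForestRegressorModified(RandomForestRegressor):
    def fit(self, X, y):
        self.y_ = y
        super().fit(X, y)
        return self

    def predict(self, X):
        return super().predict(X)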
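
sample_same_leaf is the core of the "Mod_RF" importance estimator: for every test sample and every tree it draws a handful of training targets that fall in the same leaf, excluding the sample itself. A usage sketch on synthetic data; the (n_samples, n_estimators, 10) output shape is my reading of the implementation for a 1-D target:

import numpy as np
from sklearn.datasets import make_regression
from hidimstat.RandomForestModified import RandomForestRegressorModified  # added by this patch

X, y = make_regression(n_samples=200, n_features=5, random_state=0)

rf = RandomForestRegressorModified(
    n_estimators=20, min_samples_leaf=10, random_state=0
)
rf.fit(X, y)

# For each sample and each tree, 10 target values drawn (with replacement)
# from the other samples sharing that leaf
leaf_draws = rf.sample_same_leaf(X)
print(leaf_draws.shape)  # (200, 20, 10) for a 1-D target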
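
joblib_compute_conditional implements the conditional-permutation idea: model the column of interest from the remaining columns, keep the residuals, rebuild the column as "prediction plus shuffled residuals", and re-score the learner; the per-sample importance is the increase in loss. A self-contained sketch of that step for one continuous column of a regression problem (all variable names here are mine, not the patch's):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X, y = make_regression(n_samples=500, n_features=10, noise=5.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# First block: any fitted learner
learner = RidgeCV().fit(X_train, y_train)
err_orig = (y_test - learner.predict(X_test)) ** 2

j = 3  # column whose importance is tested
X_minus_j = np.delete(X_test, j, axis=1)

# Second block: predict x_j from the other columns, keep the residuals
cond = RandomForestRegressor(max_depth=5, random_state=0)
cond.fit(X_minus_j, X_test[:, j])
x_j_hat = cond.predict(X_minus_j)
residual = X_test[:, j] - x_j_hat

# Conditional "permutation": predicted part + shuffled residuals
X_pert = X_test.copy()
X_pert[:, j] = x_j_hat + rng.permutation(residual)
err_pert = (y_test - learner.predict(X_pert)) ** 2

# Per-sample scores; averaging over samples (and over several shuffles,
# as n_perm does in the patch) gives the importance of column j
print(np.mean(err_pert - err_orig))

joblib_compute_permutation uses the same scoring but replaces the reconstruction step with a plain shuffle of the column, which is classical permutation importance and ignores the dependence on the other covariates.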
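
create_X_y returns a nine-element tuple: the scaled train/validation splits, the scaled full matrix, the raw validation targets, both scalers, and the validation indices. With bootstrap=True the training rows are drawn with replacement and the out-of-bag rows form the validation set. A usage sketch; the 2-D reshape of y is needed because the target scaler is a StandardScaler:

import numpy as np
from sklearn.datasets import make_regression
from hidimstat.utils import create_X_y  # added by this patch

X, y = make_regression(n_samples=100, n_features=4, random_state=0)
y = y.reshape(-1, 1)  # StandardScaler on the target expects a 2-D array

(X_tr, y_tr, X_va, y_va, X_all, y_va_raw,
 scaler_x, scaler_y, valid_ind) = create_X_y(
    X, y,
    bootstrap=True,                      # out-of-bag rows become the validation set
    split_perc=0.8,                      # only used when bootstrap=False
    prob_type="regression",
    list_cont=list(range(X.shape[1])),   # all columns treated as continuous
    random_state=0,
)
print(X_tr.shape, X_va.shape, valid_ind.shape)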
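
ordinal_encode turns an ordinal column into a cumulative ("thermometer") encoding: the k-th ordered value becomes k leading ones, and the always-one first column is dropped. A small worked example, as I read the implementation:

import numpy as np
from hidimstat.utils import ordinal_encode  # added by this patch

y = np.array([[1], [3], [2], [1]])
encoded = ordinal_encode(y)[0]  # one encoded block per input column
# 1 -> [0, 0], 2 -> [1, 0], 3 -> [1, 1]
print(encoded)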
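
Both the DNN learner's predict path and joblib_ensemble_dnnet replay the trained network in NumPy: dnn_net exports each layer as (weights, bias) arrays, and the prediction loops apply relu on every hidden layer and a plain affine map on the last one. A minimal sketch of that replay with hypothetical exported parameters (forward_numpy is my name, not the patch's):

import numpy as np


def relu(x):
    return np.maximum(x, 0.0)


def forward_numpy(X, weights, biases):
    """Replay a dense ReLU network from exported NumPy weights.

    weights/biases follow the dnn_net convention: one entry per layer,
    biases stored with a leading singleton axis, last layer left linear.
    """
    h = X
    for W, b in zip(weights[:-1], biases[:-1]):
        h = relu(h @ W + b)
    return h @ weights[-1] + biases[-1]


# Hypothetical exported parameters for a 4 -> 8 -> 1 network
rng = np.random.RandomState(0)
weights = [rng.randn(4, 8), rng.randn(8, 1)]
biases = [rng.randn(1, 8), rng.randn(1, 1)]
print(forward_numpy(rng.randn(5, 4), weights, biases).shape)  # (5, 1)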
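
Rather than adding penalty terms to the loss, dnn_net regularises by shrinking the non-bias parameters in place after every Adam step: an L1 term proportional to the sign of the weight and an L2 term proportional to the weight itself. A minimal sketch of that update on a stand-in module:

import torch
import torch.nn as nn

l1_weight, l2_weight = 1e-2, 1e-2
model = nn.Linear(4, 1)  # stand-in for the DNN used in the patch

with torch.no_grad():
    for name, param in model.named_parameters():
        if "bias" not in name:
            # Same shrinkage applied after optimizer.step() in dnn_net
            param -= l1_weight * torch.sign(param) + l2_weight * param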
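
compute_imp_std pools the per-fold, per-sample scores into a weighted mean and a standard deviation scaled by the total number of observations. The natural next step for CPI-style inference is a one-sided z-test on imp/std; that last line below is my assumption about how these values are consumed, since that part is outside this hunk:

import numpy as np
from scipy.stats import norm
from hidimstat.utils import compute_imp_std  # added by this patch

# One array per fold, shaped (n_variables, n_samples_in_fold, n_outputs)
pred_scores = [np.random.RandomState(k).randn(3, 50, 1) for k in range(2)]

imp, std = compute_imp_std(pred_scores)

# Assumed downstream use: one-sided p-value per variable
pval = norm.sf(imp / std)
print(imp.ravel(), pval.ravel())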