From 467e675a9c488eb87d4196f2649b4f5ff542681b Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 29 Nov 2022 14:19:58 -0800
Subject: [PATCH 01/48] omst graph theory

---
 idconn/networking/graph_theory.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/idconn/networking/graph_theory.py b/idconn/networking/graph_theory.py
index 2713929..710cadd 100644
--- a/idconn/networking/graph_theory.py
+++ b/idconn/networking/graph_theory.py
@@ -80,4 +80,5 @@ def graph_omst(matrix, measure, args):
 
     # calculate graph measure on thresholded matrix
     metric = measure(thresh_mat, args)
-    return metric
\ No newline at end of file
+    return metric
+

From 9c1feeb86214429298657886afc12baab9f732fa Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:08:10 -0800
Subject: [PATCH 02/48] update null modeling with new data shape

---
 idconn/networking/null_distribution.py | 69 +++++++-------------------
 1 file changed, 19 insertions(+), 50 deletions(-)

diff --git a/idconn/networking/null_distribution.py b/idconn/networking/null_distribution.py
index 623f64f..03f9ce7 100644
--- a/idconn/networking/null_distribution.py
+++ b/idconn/networking/null_distribution.py
@@ -4,47 +4,16 @@
 import bct
 import datetime
 
-def avg_corrmat(layout, task, session):
-    subjects = layout.get_subjects(task=task,session=session)
-    corrmats = {}
-    for subject in subjects:
-        try:
-            if task == "rest":
-                corrmat = np.genfromtxt(
-                    join(
-                        data_dir,
-                        sesh[session],
-                        subject,
-                        "{0}-session-{1}-{2}_network_corrmat_{3}.csv".format(
-                            subject, session, task, atlas
-                        ),
-                    ),
-                    delimiter=",",
-                )
-            else:
-                corrmat = np.genfromtxt(
-                    join(
-                        data_dir,
-                        sesh[session],
-                        subject,
-                        "{0}-session-{1}_{2}-{3}_{4}-corrmat.csv".format(
-                            subject, session, task, condition, atlas
-                        ),
-                    ),
-                    delimiter=" ",
-                )
-            # corrmat = np.genfromtxt(join(data_dir, '{0}-session-{1}_{2}-{3}_{4}-corrmat.csv'.format(subject, session, task, condition, atlas)), delimiter=' ')
-            corrmats[subject] = corrmat
-        except Exception as e:
-            print(subject, e)
-    data = list(corrmats.values())
-    stacked_corrmats = np.array(data)
+# this is all bullshit.
+# update to mesh with the BIDSy way of doing things
+def avg_corrmat(ppt_df):
+    stacked_corrmats = np.array(ppt_df['adj'])
     print('Stacked corrmats have dimensions', stacked_corrmats.shape)
     avg_corrmat = np.mean(stacked_corrmats, axis=0)
     return avg_corrmat
 
 
-def null_model_und_sign(W, bin_swaps=5, wei_freq=0.1, seed=None):
+def null_model(W, bin_swaps=5, wei_freq=0.1, seed=None):
     def get_rng(seed):
         if seed is None or seed == np.random:
             return np.random.mtrand._rand
@@ -53,7 +22,7 @@ def get_rng(seed):
         try:
             rstate = np.random.RandomState(seed)
         except ValueError:
-            rstate = np.random.RandomState(random.Random(seed).randint(0, 2 ** 32 - 1))
+            rstate = np.random.RandomState(np.random.Random(seed).randint(0, 2 ** 32 - 1))
         return rstate
 
     def randmio_und_signed(R, itr, seed=None):
@@ -194,28 +163,28 @@ def pick_four_unique_nodes_quickly(n, seed=None):
     W0 = W0 + W0.T
     return W0
 
-def generate_null(layout, task, session, mask):
+def generate_null(ppt_df, thresh_arr, measure):
+    '''
+    Generate a distribution of graph measure values based on a null connectivity matrix
+    that is like the average connectivity matrix across participants.
+    
+    '''
     null_dist = pd.DataFrame(index=subjects, columns=["mean", "sdev"])
     avg_corr = avg_corrmat(
-        layout, task, session, mask
+        ppt_df
     )
     eff_perm = []
-    j = 1
-    while j < 3:
+    j = 0
+    while j < 1000:
         effs = []
-        W = null_model_und_sign(avg_corr.values)
-        for thresh in np.arange(0.21, 0.31, 0.03):
+        W = null_model(avg_corr.values)
+        for thresh in thresh_arr:
             thresh_corr = bct.threshold_proportional(W, thresh)
-            leff = bct.efficiency_wei(thresh_corr)
+            leff = measure(thresh_corr)
             effs.append(leff)
         effs_arr = np.asarray(effs)
         leff_auc = np.trapz(effs_arr, dx=0.03, axis=0)
         eff_perm.append(leff_auc)
         j += 1
-    null_dist.at[(sesh[session], task, conds[i], mask), "mean"] = np.mean(
-        eff_perm
-    )
-    null_dist.at[(sesh[session], task, conds[i], mask), "sdev"] = np.std(
-        eff_perm
-    )
+    
     return null_dist
\ No newline at end of file

From 888604f8d69e56bc63b6f7f66e5e77ede99aeeec Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:16:23 -0800
Subject: [PATCH 03/48] draft of pynbs and nbspredict

---
 idconn/nbs.py | 297 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 297 insertions(+)
 create mode 100644 idconn/nbs.py

diff --git a/idconn/nbs.py b/idconn/nbs.py
new file mode 100644
index 0000000..8998e25
--- /dev/null
+++ b/idconn/nbs.py
@@ -0,0 +1,297 @@
+import numpy as np
+import statsmodels as sm
+import networkx as nx
+from utils import vectorize_corrmats, undo_vectorize
+
+
+def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000, stratified=False):
+    '''
+    Calculates the Network Based Statistic (Zalesky et al., 2010) on connectivity matrices provided
+    of shape ((subject x session)x node x node)
+    in the network.
+    Returns a dataframe containing the results of kfolds cross-validation,
+    including the indices of train and test samples, the resulting p-value and largest connected component,
+    the accuracy of the network in predicting group belonging in the test samples (using logistic regression),
+    the parameter estimates from each regression, and the model object from each regression. 
+    from a BIDS derivative folder. Optionally returns a subject x session dataframe
+    of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) 
+    array of vectorized upper triangles of those correlation matrices.
+    Parameters
+    ----------
+    matrices : numpy array of shape (p, n, n)
+        Represents the link strengths of the graphs (i.e., functional connectivity). 
+        Assumed to be an array of symmetric matrices.
+    outcome : list-like of shape (p,)
+        Y-value to be predicted with connectivity
+    confounds : list-like of shape (p,m)
+        Covariates, included as predictors in model.
+    alpha : float
+        Type-I error (i.e., false positive) rate, for outcome-related edge detection.
+    predict : bool
+        If True, bypasses `permutations` parameter and only runs edge detection + component identification.
+        Used for NBS-Predict.
+    permutations : int
+        If `predict=False`, specifies the number of permutations run to create a null distribution
+        for estimating the significance of the connected component size. Recommended 10,000.
+    stratified : bool or list-like of shape (p,)
+        If `predict=True` and there are groups that should be equally sampled across k-fold 
+        cross-validation, input should be a list of group belonging (i.e., one label per participant).
+
+    Returns
+    -------
+    pval : float
+        If `predict=False`, denotes the significance of the largest connected component.
+    S1 : Pandas dataframe
+        A binary matrix denoting the largest connected component (the only value returned if `predict=True`).
+    perms : numpy array of shape (permutations,)
+        If `predict=False`, largest connected component size per permutation.
+    '''
+    # need to do a mass-univariate test at every edge
+    # and retain significant edges
+    # then find the largest connected component
+    # and, if not predict, build a null distribution
+    n = matrices.shape[:-1]
+    ndims = len(matrices.shape)
+    
+    # vectorize_corrmats returns p x n^2
+    # we want to run pynbs per edge
+    # so vectorized edges must be transposed
+    
+    exog = np.hstack((outcome, confounds))
+    exog = sm.add_constant(exog, prepend=False)
+    # turn matrices into vectorized upper triangles
+    if ndims > 2:
+        edges = vectorize_corrmats(matrices)
+    else:
+        edges = matrices.copy()
+    edges = edges.T
+    
+    # run an ols per edge
+    # create significance matrix for predictor of interest (outcome)
+    # 1 if edge is significantly predicted by outcome
+    # 0 if it's not
+    sig_edges = []
+    for i in range(0, edges.shape[0]):
+        # statsmodels for regressing predictors on edges
+        mod = sm.OLS(edges[i,:], exog, hasconst=True)
+        results = mod.fit()
+        edge_pval = results.pvalues[0]
+        
+        # build binary significance edge vector
+        if edge_pval < alpha:
+            sig_edges.append(1)
+        else:
+            sig_edges.append(0)
+    
+    # find largest connected component of sig_edges
+    # turn sig_edges into an nxn matrix first
+    sig_matrix = undo_vectorize(sig_edges) # need to write this function
+    matrix = nx.from_numpy_array(sig_matrix)
+    
+    #use networkX to find connected components
+    comps = nx.connected_components(matrix)
+    
+    # rearrange networkx output into an array of matrices, S
+    S = [matrix.subgraph(c).copy() for c in comps]
+    # find size of each connected component, s in S
+    size = np.asarray([s.number_of_edges() for s in S])
+    (max_comp, ) = np.where(size == max(size))
+    largest_comp_size = max(size)
+    print(f'Connected component has {largest_comp_size} edges.')
+
+    # retain size of largest connected component 
+    # for NBS permutation-based significance testing
+    max_comp = max_comp[0]
+
+    # pull the subgraph with largest number of nodes
+    # i.e., the largest connected component
+    G = S[max_comp]
+
+    # grab list of nodes in largest connected component
+    nodes = list(G.nodes)
+    
+    unused_nodes = list(set(matrix.nodes) - set(nodes))
+    S1 = nx.to_pandas_adjacency(G, nodelist=nodes)
+
+    # add empty edges for unused nodes
+    # bc NBS-Predict needs all nodes for
+    # the eventual weighted average
+    # and NBS might need all nodes for easier
+    # plotting in brain space
+    for i in unused_nodes:
+        S1.loc[i] = 0
+        S1[i] = 0
+
+    S1.sort_index(axis=0, inplace=True)
+    S1.sort_index(axis=1, inplace=True)
+    
+    # permutation testing to create a null distribution of max component size
+    # only for regular NBS, -Predict doesn't need this
+    if predict == False:
+        perms = np.zeros((permutations,))
+        hit = 0
+        rng = np.random.default_rng()
+        exog_copy = exog.copy()
+        for i in range(0, permutations):
+            # shuffle outcome order
+            rng.shuffle(exog_copy, axis=0)
+            #print(exog_copy)
+            perm_edges = []
+            for j in range(0, edges.shape[0]):
+                # statsmodels for regressing predictors on edges
+                mod = sm.OLS(edges[j,:], exog_copy, hasconst=False)
+                results = mod.fit()
+                edge_pval = results.pvalues[0]
+                
+                if edge_pval < alpha:
+                    perm_edges.append(1)
+                else:
+                    perm_edges.append(0)
+            #print(np.sum(perm_edges))
+            # find largest connected component of sig_edges
+            # turn sig_edges into an nxn matrix first
+            perm_matrix = undo_vectorize(perm_edges) # need to write this function
+            perm_nx = nx.from_numpy_array(perm_matrix)
+
+            comps = nx.connected_components(perm_nx)
+
+            S = [perm_nx.subgraph(c).copy() for c in comps]
+            perm_size = np.asarray([s.number_of_edges() for s in S])
+            (max_comp, ) = np.where(perm_size == max(perm_size))
+            #print(perm_size, max_comp)
+
+            # retain for null distribution
+            perms[i] = max(perm_size)
+            if i % 10 == 0:
+                print(f'p-value is {np.size(np.where(perms >= largest_comp_size)) / permutations} as of permutation {i}')
+            
+            # bctpy nbs code uses hit to mark progress across permutations
+            # prob not necessary?
+        
+        # bctpy calcs pval for all components, not just largest?
+        # but I don't think that's relevant for the original implementation of NBS?
+        pval = np.size(np.where(perms >= largest_comp_size)) / permutations
+        print(largest_comp_size, permutations, pval)
+        
+        return pval, S1, perms
+    else:
+        return S1
+
+def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10, k=1000, shuffle=False, fig_dir=None):
+    """Calculates the Network Based Statistic (Zalesky et al., 2010) on connectivity matrices provided
+    of shape ((subject x session)x node x node)
+    in the network.
+    Returns a dataframe containing the results of kfolds cross-validation,
+    including the indices of train and test samples, the resulting p-value and largest connected component,
+    the accuracy of the network in predicting group belonging in the test samples (using logistic regression),
+    the parameter estimates from each regression, and the model object from each regression. 
+    from a BIDS derivative folder. Optionally returns a subject x session dataframe
+    of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) 
+    array of vectorized upper triangles of those correlation matrices.
+    Parameters
+    ----------
+    matrices : numpy array of shape (p, n, n)
+        Represents the link strengths of the graphs. Assumed to be
+        an array of symmetric matrices.
+    outcome : list-like of shape (p,)
+        Y-value to be predicted with connectivity
+    
+    Returns
+    -------
+    cv_results : Pandas dataframe
+        Includes the results of each cross-validation loop
+        the input matrices.
+    """
+    edges = vectorize_corrmats(matrices)
+    #print(edges.shape)
+    index = list(range(0,n_splits * n_iterations))
+
+    cv_results = pd.DataFrame(index=index, 
+                            columns=['split',  
+                                    'pval', 
+                                    'score',
+                                    'component',
+                                    'coefficient_matrix',
+                                    'coefficient_vector',
+                                    'model'])
+    if groups is not None:
+        cv = RepeatedStratifiedKFold(n_splits=n_splits,
+                                    n_repeats=n_iterations)
+        df = groups.shape[0] - 2
+    else:
+        cv = RepeatedKFold(n_splits=n_splits, 
+                        n_repeats=n_iterations)
+        df = edges.shape[0] - 1
+    
+    if tail == 'both':
+        alpha = 0.01
+    else:
+        alpha = 0.005
+    t_threshold = t.ppf(1 - alpha, df=df)
+    
+    if matrices.shape[0] != matrices.shape[1]:
+        if matrices.shape[1] == matrices.shape[2]:
+            num_node = matrices.shape[1]
+            matrices = np.moveaxis(matrices, 0, -1)
+        else:
+            raise ValueError(f'Matrices of shape {matrices.shape}',
+                             'requires matrices of shape (subject x session) x node x node',
+                             'or node x node x (subject x session).')
+    else:
+        num_node = matrices.shape[0]
+    upper_tri = np.triu_indices(num_node, k=1)
+    
+    i = 0
+    manager = enlighten.get_manager()
+    ticks = manager.counter(total=n_splits * n_iterations, desc='Progress', unit='folds')
+    for train_idx, test_idx in cv.split(edges, outcome, groups=groups):
+        cv_results.at[i, 'split'] = (train_idx, test_idx)
+        # all of this presumes the old bctpy version of nbs
+        # irrelevant for pynbs
+        #train_a_idx = [m for m in train_idx if outcome[m] == 0]
+        #train_b_idx = [m for m in train_idx if outcome[m] == 1]
+        #assert len(train_a_idx) == len(train_b_idx)
+        #train_a = matrices[:,:,train_a_idx]
+        #train_b = matrices[:,:,train_b_idx]
+        #print(train_a.shape, train_b.shape)
+        
+        # separate edges & covariates into 
+        train_y = outcome[train_idx]
+        test_y = outcome[test_idx]
+
+        pval, adj, _ = pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000)
+        pval, adj, _ = bct.nbs_bct(train_a,
+                                train_b,
+                                t_threshold,
+                                k=k,
+                                tail=tail)
+        cv_results.at[i, 'pval'] = pval
+        cv_results.at[i, 'component'] = adj
+
+        nbs_vector = adj[upper_tri]
+        mask = nbs_vector == 1
+        train_features = edges[train_idx, :].T[mask]
+        test_features = edges[test_idx, :].T[mask]
+
+        regressor = LogisticRegression(max_iter=1000)
+        model = regressor.fit(X=train_features.T, y=train_y)
+        cv_results.at[i, 'model'] = model
+        score = model.score(X=test_features.T, y=test_y)
+        cv_results.at[i, 'score'] = score
+
+        m = 0
+        param_vector = np.zeros_like(nbs_vector)
+        for l in range(0, nbs_vector.shape[0]):
+            if nbs_vector[l] == 1.:
+                param_vector[l] = model.coef_[0,m]
+                m+=1
+            else:
+                pass
+        X = np.zeros_like(adj)
+        X[np.triu_indices(X.shape[0], k=1)] = param_vector
+        X = X + X.T
+        cv_results.at[i, 'coefficient_matrix'] = X
+        cv_results.at[i, 'coefficient_vector'] = param_vector
+        i += 1
+        ticks.update()
+    return cv_results

From 2a78636f1b1e9d26db59ca4cc6becc84c7bb8821 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:29:01 -0800
Subject: [PATCH 04/48] move null dist into networking.py, delete empties

---
 idconn/data/missingness.py                    |   0
 .../graph_theory.py => networking.py}         |   0
 idconn/networking/__init__.py                 |   8 -
 idconn/networking/null_distribution.py        | 190 ------------------
 4 files changed, 198 deletions(-)
 delete mode 100644 idconn/data/missingness.py
 rename idconn/{networking/graph_theory.py => networking.py} (100%)
 delete mode 100644 idconn/networking/__init__.py
 delete mode 100644 idconn/networking/null_distribution.py

diff --git a/idconn/data/missingness.py b/idconn/data/missingness.py
deleted file mode 100644
index e69de29..0000000
diff --git a/idconn/networking/graph_theory.py b/idconn/networking.py
similarity index 100%
rename from idconn/networking/graph_theory.py
rename to idconn/networking.py
diff --git a/idconn/networking/__init__.py b/idconn/networking/__init__.py
deleted file mode 100644
index a4564bf..0000000
--- a/idconn/networking/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-"""
-Tools for computing network topology / graph theoretic measures
-"""
-
-from . import null_distribution
-from . import graph_theory
-
-__all__ = ["null_distribution", "graph_theory"]
diff --git a/idconn/networking/null_distribution.py b/idconn/networking/null_distribution.py
deleted file mode 100644
index 03f9ce7..0000000
--- a/idconn/networking/null_distribution.py
+++ /dev/null
@@ -1,190 +0,0 @@
-import numpy as np
-import pandas as pd
-from os.path import join, exists
-import bct
-import datetime
-
-# this is all bullshit.
-# update to mesh with the BIDSy way of doing things
-def avg_corrmat(ppt_df):
-    stacked_corrmats = np.array(ppt_df['adj'])
-    print('Stacked corrmats have dimensions', stacked_corrmats.shape)
-    avg_corrmat = np.mean(stacked_corrmats, axis=0)
-    return avg_corrmat
-
-
-def null_model(W, bin_swaps=5, wei_freq=0.1, seed=None):
-    def get_rng(seed):
-        if seed is None or seed == np.random:
-            return np.random.mtrand._rand
-        elif isinstance(seed, np.random.RandomState):
-            return seed
-        try:
-            rstate = np.random.RandomState(seed)
-        except ValueError:
-            rstate = np.random.RandomState(np.random.Random(seed).randint(0, 2 ** 32 - 1))
-        return rstate
-
-    def randmio_und_signed(R, itr, seed=None):
-        rng = get_rng(seed)
-        R = R.copy()
-        n = len(R)
-
-        itr *= int(n * (n - 1) / 2)
-
-        max_attempts = int(np.round(n / 2))
-        eff = 0
-
-        for it in range(int(itr)):
-            att = 0
-            while att <= max_attempts:
-
-                a, b, c, d = pick_four_unique_nodes_quickly(n, rng)
-
-                r0_ab = R[a, b]
-                r0_cd = R[c, d]
-                r0_ad = R[a, d]
-                r0_cb = R[c, b]
-
-                # rewiring condition
-                if (
-                    np.sign(r0_ab) == np.sign(r0_cd)
-                    and np.sign(r0_ad) == np.sign(r0_cb)
-                    and np.sign(r0_ab) != np.sign(r0_ad)
-                ):
-
-                    R[a, d] = R[d, a] = r0_ab
-                    R[a, b] = R[b, a] = r0_ad
-
-                    R[c, b] = R[b, c] = r0_cd
-                    R[c, d] = R[d, c] = r0_cb
-
-                    eff += 1
-                    break
-
-                att += 1
-
-        return R, eff
-
-    def pick_four_unique_nodes_quickly(n, seed=None):
-        """
-        This is equivalent to np.random.choice(n, 4, replace=False)
-        Another fellow suggested np.random.random_sample(n).argpartition(4) which is
-        clever but still substantially slower.
-        """
-        rng = get_rng(seed)
-        k = rng.randint(n ** 4)
-        a = k % n
-        b = k // n % n
-        c = k // n ** 2 % n
-        d = k // n ** 3 % n
-        if a != b and a != c and a != d and b != c and b != d and c != d:
-            return (a, b, c, d)
-        else:
-            # the probability of finding a wrong configuration is extremely low
-            # unless for extremely small n. if n is extremely small the
-            # computational demand is not a problem.
-
-            # In my profiling it only took 0.4 seconds to include the uniqueness
-            # check in 1 million runs of this function so I think it is OK.
-            return pick_four_unique_nodes_quickly(n, rng)
-
-    rng = get_rng(seed)
-    if not np.allclose(W, W.T):
-        print("Input must be undirected")
-    W = W.copy()
-    n = len(W)
-    np.fill_diagonal(W, 0)  # clear diagonal
-    Ap = W > 0  # positive adjmat
-    An = W < 0  # negative adjmat
-
-    if np.size(np.where(Ap.flat)) < (n * (n - 1)):
-        W_r, eff = randmio_und_signed(W, bin_swaps, seed=rng)
-        Ap_r = W_r > 0
-        An_r = W_r < 0
-    else:
-        Ap_r = Ap
-        An_r = An
-
-    W0 = np.zeros((n, n))
-    for s in (1, -1):
-        if s == 1:
-            Acur = Ap
-            A_rcur = Ap_r
-        else:
-            Acur = An
-            A_rcur = An_r
-
-        S = np.sum(W * Acur, axis=0)  # strengths
-        Wv = np.sort(W[np.where(np.triu(Acur))])  # sorted weights vector
-        i, j = np.where(np.triu(A_rcur))
-        (Lij,) = np.where(np.triu(A_rcur).flat)  # weights indices
-
-        P = np.outer(S, S)
-
-        if wei_freq == 0:  # get indices of Lij that sort P
-            Oind = np.argsort(P.flat[Lij])  # assign corresponding sorted
-            W0.flat[Lij[Oind]] = s * Wv  # weight at this index
-        else:
-            wsize = np.size(Wv)
-            wei_period = np.round(1 / wei_freq).astype(
-                int
-            )  # convert frequency to period
-            lq = np.arange(wsize, 0, -wei_period, dtype=int)
-            for m in lq:  # iteratively explore at this period
-                # get indices of Lij that sort P
-                Oind = np.argsort(P.flat[Lij])
-                R = rng.permutation(m)[: np.min((m, wei_period))]
-                for q, r in enumerate(R):
-                    # choose random index of sorted expected weight
-                    o = Oind[r]
-                    W0.flat[Lij[o]] = s * Wv[r]  # assign corresponding weight
-
-                    # readjust expected weighted probability for i[o],j[o]
-                    f = 1 - Wv[r] / S[i[o]]
-                    P[i[o], :] *= f
-                    P[:, i[o]] *= f
-                    f = 1 - Wv[r] / S[j[o]]
-                    P[j[o], :] *= f
-                    P[:, j[o]] *= f
-
-                    # readjust strength of i[o]
-                    S[i[o]] -= Wv[r]
-                    # readjust strength of j[o]
-                    S[j[o]] -= Wv[r]
-
-                O = Oind[R]
-                # remove current indices from further consideration
-                Lij = np.delete(Lij, O)
-                i = np.delete(i, O)
-                j = np.delete(j, O)
-                Wv = np.delete(Wv, R)
-
-    W0 = W0 + W0.T
-    return W0
-
-def generate_null(ppt_df, thresh_arr, measure):
-    '''
-    Generate a distribution of graph measure values based on a null connectivity matrix
-    that is like the average connectivity matrix across participants.
-    
-    '''
-    null_dist = pd.DataFrame(index=subjects, columns=["mean", "sdev"])
-    avg_corr = avg_corrmat(
-        ppt_df
-    )
-    eff_perm = []
-    j = 0
-    while j < 1000:
-        effs = []
-        W = null_model(avg_corr.values)
-        for thresh in thresh_arr:
-            thresh_corr = bct.threshold_proportional(W, thresh)
-            leff = measure(thresh_corr)
-            effs.append(leff)
-        effs_arr = np.asarray(effs)
-        leff_auc = np.trapz(effs_arr, dx=0.03, axis=0)
-        eff_perm.append(leff_auc)
-        j += 1
-    
-    return null_dist
\ No newline at end of file

From 4deb84fafaa1c2a570dd8d5da96198f96e6d65d8 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:29:29 -0800
Subject: [PATCH 05/48] clean up imputation

---
 idconn/data/iterative_imputation.py | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/idconn/data/iterative_imputation.py b/idconn/data/iterative_imputation.py
index 73505fe..575e7bc 100644
--- a/idconn/data/iterative_imputation.py
+++ b/idconn/data/iterative_imputation.py
@@ -5,23 +5,17 @@
 from sklearn.impute import IterativeImputer
 
 
-#sink_dir = "/Users/kbottenh/Dropbox/Projects/physics-retrieval/data/rescored"
-# sink_dir = '/home/kbott006/physics-retrieval'
-# fig_dir = '/Users/kbottenh/Dropbox/Projects/physics-retrieval/figures/'
-#data_dir = "/Users/kbottenh/Dropbox/Projects/physics-retrieval/data/rescored"
-# roi_dir = '/Users/kbottenh/Dropbox/Data/templates/shen2015/'
-# data_dir = '/home/kbott006/physics-retrieval'
-
-# big_df = pd.read_csv(join(data_dir, 'physics_learning-nonbrain_OLS-missing+fd+local_efficiency.csv'),
-#                index_col=0, header=0)
-
-# impute first?
-def impute(data, max_iter):
+def impute(data, max_iter=10000):
+    '''
+    Fill in missing data with an iterative imputation algorithm from scikit learn.
+    NOTE: Will not impute connectivity data.
+    '''
+    
     non_numeric = data.select_dtypes(exclude=['number']).columns
     dumb = pd.get_dummies(data[non_numeric], prefix='dummy')
     df = pd.concat([data.drop(non_numeric, axis=1), dumb])
     impute_pls = IterativeImputer(
-        max_iter=10000, skip_complete=True, verbose=1, tol=5e-3, n_nearest_features=1000
+        max_iter=max_iter, skip_complete=True, verbose=1, tol=5e-3, n_nearest_features=1000
     )
     imputed = impute_pls.fit_transform(df)
     imp_df = pd.DataFrame(imputed,columns=data.drop(non_numeric, axis=1).columns, index=data.index,

From 26db135a96a669c56206908f89dd3376c52cba6c Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:29:49 -0800
Subject: [PATCH 06/48] all graph functions in networking.py

---
 idconn/networking.py | 187 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 187 insertions(+)

diff --git a/idconn/networking.py b/idconn/networking.py
index 710cadd..3932e0a 100644
--- a/idconn/networking.py
+++ b/idconn/networking.py
@@ -8,6 +8,193 @@
 import bct
 #import datetime
 
+
+def avg_corrmat(ppt_df):
+    '''
+    Reads in adjacency matrices from the pandas df with ppt info and adj, then computes an average.
+    '''
+    stacked_corrmats = np.array(ppt_df['adj'])
+    print('Stacked corrmats have dimensions', stacked_corrmats.shape)
+    avg_corrmat = np.mean(stacked_corrmats, axis=0)
+    return avg_corrmat
+
+
+def null_model(W, bin_swaps=5, wei_freq=0.1, seed=None):
+    def get_rng(seed):
+        if seed is None or seed == np.random:
+            return np.random.mtrand._rand
+        elif isinstance(seed, np.random.RandomState):
+            return seed
+        try:
+            rstate = np.random.RandomState(seed)
+        except ValueError:
+            rstate = np.random.RandomState(np.random.Random(seed).randint(0, 2 ** 32 - 1))
+        return rstate
+
+    def randmio_und_signed(R, itr, seed=None):
+        rng = get_rng(seed)
+        R = R.copy()
+        n = len(R)
+
+        itr *= int(n * (n - 1) / 2)
+
+        max_attempts = int(np.round(n / 2))
+        eff = 0
+
+        for it in range(int(itr)):
+            att = 0
+            while att <= max_attempts:
+
+                a, b, c, d = pick_four_unique_nodes_quickly(n, rng)
+
+                r0_ab = R[a, b]
+                r0_cd = R[c, d]
+                r0_ad = R[a, d]
+                r0_cb = R[c, b]
+
+                # rewiring condition
+                if (
+                    np.sign(r0_ab) == np.sign(r0_cd)
+                    and np.sign(r0_ad) == np.sign(r0_cb)
+                    and np.sign(r0_ab) != np.sign(r0_ad)
+                ):
+
+                    R[a, d] = R[d, a] = r0_ab
+                    R[a, b] = R[b, a] = r0_ad
+
+                    R[c, b] = R[b, c] = r0_cd
+                    R[c, d] = R[d, c] = r0_cb
+
+                    eff += 1
+                    break
+
+                att += 1
+
+        return R, eff
+
+    def pick_four_unique_nodes_quickly(n, seed=None):
+        """
+        This is equivalent to np.random.choice(n, 4, replace=False)
+        Another fellow suggested np.random.random_sample(n).argpartition(4) which is
+        clever but still substantially slower.
+        """
+        rng = get_rng(seed)
+        k = rng.randint(n ** 4)
+        a = k % n
+        b = k // n % n
+        c = k // n ** 2 % n
+        d = k // n ** 3 % n
+        if a != b and a != c and a != d and b != c and b != d and c != d:
+            return (a, b, c, d)
+        else:
+            # the probability of finding a wrong configuration is extremely low
+            # unless for extremely small n. if n is extremely small the
+            # computational demand is not a problem.
+
+            # In my profiling it only took 0.4 seconds to include the uniqueness
+            # check in 1 million runs of this function so I think it is OK.
+            return pick_four_unique_nodes_quickly(n, rng)
+
+    rng = get_rng(seed)
+    if not np.allclose(W, W.T):
+        print("Input must be undirected")
+    W = W.copy()
+    n = len(W)
+    np.fill_diagonal(W, 0)  # clear diagonal
+    Ap = W > 0  # positive adjmat
+    An = W < 0  # negative adjmat
+
+    if np.size(np.where(Ap.flat)) < (n * (n - 1)):
+        W_r, eff = randmio_und_signed(W, bin_swaps, seed=rng)
+        Ap_r = W_r > 0
+        An_r = W_r < 0
+    else:
+        Ap_r = Ap
+        An_r = An
+
+    W0 = np.zeros((n, n))
+    for s in (1, -1):
+        if s == 1:
+            Acur = Ap
+            A_rcur = Ap_r
+        else:
+            Acur = An
+            A_rcur = An_r
+
+        S = np.sum(W * Acur, axis=0)  # strengths
+        Wv = np.sort(W[np.where(np.triu(Acur))])  # sorted weights vector
+        i, j = np.where(np.triu(A_rcur))
+        (Lij,) = np.where(np.triu(A_rcur).flat)  # weights indices
+
+        P = np.outer(S, S)
+
+        if wei_freq == 0:  # get indices of Lij that sort P
+            Oind = np.argsort(P.flat[Lij])  # assign corresponding sorted
+            W0.flat[Lij[Oind]] = s * Wv  # weight at this index
+        else:
+            wsize = np.size(Wv)
+            wei_period = np.round(1 / wei_freq).astype(
+                int
+            )  # convert frequency to period
+            lq = np.arange(wsize, 0, -wei_period, dtype=int)
+            for m in lq:  # iteratively explore at this period
+                # get indices of Lij that sort P
+                Oind = np.argsort(P.flat[Lij])
+                R = rng.permutation(m)[: np.min((m, wei_period))]
+                for q, r in enumerate(R):
+                    # choose random index of sorted expected weight
+                    o = Oind[r]
+                    W0.flat[Lij[o]] = s * Wv[r]  # assign corresponding weight
+
+                    # readjust expected weighted probability for i[o],j[o]
+                    f = 1 - Wv[r] / S[i[o]]
+                    P[i[o], :] *= f
+                    P[:, i[o]] *= f
+                    f = 1 - Wv[r] / S[j[o]]
+                    P[j[o], :] *= f
+                    P[:, j[o]] *= f
+
+                    # readjust strength of i[o]
+                    S[i[o]] -= Wv[r]
+                    # readjust strength of j[o]
+                    S[j[o]] -= Wv[r]
+
+                O = Oind[R]
+                # remove current indices from further consideration
+                Lij = np.delete(Lij, O)
+                i = np.delete(i, O)
+                j = np.delete(j, O)
+                Wv = np.delete(Wv, R)
+
+    W0 = W0 + W0.T
+    return W0
+
+def generate_null(ppt_df, thresh_arr, measure):
+    '''
+    Generate a distribution of graph measure values based on a null connectivity matrix
+    that is like the average connectivity matrix across participants.
+    
+    '''
+    null_dist = pd.DataFrame(index=subjects, columns=["mean", "sdev"])
+    avg_corr = avg_corrmat(
+        ppt_df
+    )
+    eff_perm = []
+    j = 0
+    while j < 1000:
+        effs = []
+        W = null_model(avg_corr.values)
+        for thresh in thresh_arr:
+            thresh_corr = bct.threshold_proportional(W, thresh)
+            leff = measure(thresh_corr)
+            effs.append(leff)
+        effs_arr = np.asarray(effs)
+        leff_auc = np.trapz(effs_arr, dx=0.03, axis=0)
+        eff_perm.append(leff_auc)
+        j += 1
+    
+    return null_dist
+
 def omst(matrix, density=True, plot=False):
     '''
     WARNING: THIS IS SLOW AF, REPLACING WITH NETWORKX VERSION IN NEAR FUTURE

From 25992dbfa5340fc66f753c880278f1e5743492f7 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:32:01 -0800
Subject: [PATCH 07/48] convert data folder to script

---
 idconn/{data/iterative_imputation.py => data.py} | 0
 idconn/data/__init__.py                          | 8 --------
 2 files changed, 8 deletions(-)
 rename idconn/{data/iterative_imputation.py => data.py} (100%)
 delete mode 100644 idconn/data/__init__.py

diff --git a/idconn/data/iterative_imputation.py b/idconn/data.py
similarity index 100%
rename from idconn/data/iterative_imputation.py
rename to idconn/data.py
diff --git a/idconn/data/__init__.py b/idconn/data/__init__.py
deleted file mode 100644
index e0be4c5..0000000
--- a/idconn/data/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-"""
-Tools for arranging data and addressing missing data
-"""
-
-from . import iterative_imputation
-from . import missingness
-
-__all__ = ["iterative_imputation", "missingness"]

From 65eb863bbeccc92c1997c8864cb7e6505342dd64 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:32:48 -0800
Subject: [PATCH 08/48] untested versions of pynbs and nbspredict

---
 idconn/nbs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/idconn/nbs.py b/idconn/nbs.py
index 8998e25..cc5b59a 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -294,4 +294,4 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
         cv_results.at[i, 'coefficient_vector'] = param_vector
         i += 1
         ticks.update()
-    return cv_results
+    return cv_results
\ No newline at end of file

From 1b70102c74ade0f600ac78f65c9893b31ced81b5 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:36:02 -0800
Subject: [PATCH 09/48] add utils to io

---
 .gitignore   |   1 -
 idconn/io.py | 372 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 372 insertions(+), 1 deletion(-)
 create mode 100644 idconn/io.py

diff --git a/.gitignore b/.gitignore
index 33bdf27..ce30a02 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,7 +32,6 @@ idconn/networking/task-graph-theory-fci.py
 idconn/networking/task-graph-theory-local-nodal.py
 idconn/networking/task-graph-theory-local.py
 idconn/networking/task-graph-theory-nodal.py
-idconn/io.py
 
 docs/_build/
 docs/generated/
diff --git a/idconn/io.py b/idconn/io.py
new file mode 100644
index 0000000..6ba41f1
--- /dev/null
+++ b/idconn/io.py
@@ -0,0 +1,372 @@
+import bids
+import json
+from nilearn import datasets
+import nibabel as nib
+from os.path import exists, join, basename
+
+
+import nibabel as nib
+import numpy as np
+import pandas as pd
+import seaborn as sns
+#from matplotlib import projections
+from matplotlib import pyplot as plt
+from matplotlib.gridspec import GridSpec
+from nilearn import datasets, plotting, surface
+
+
+def build_statsmodel_json(name, task, contrast, confounds, highpass, 
+                          mask, conn_meas, graph_meas=None, exclude=None, outfile=None):
+    '''
+    Creates a BIDS Stats Models json with analysis details for further use.
+
+    Parameters
+    ----------
+    root_dir : str
+        Location of BIDS dataset root
+    validate : bool
+        If true, pybids will check if this is a valid BIDS-format
+        dataset before continuing.
+    absolute_paths : bool
+        If true, will assume paths are absolute, instead of relative.
+    derivatives : str
+        Location of preprocessed data (i.e., name of fmriprep dir).
+    verbose : bool
+        If true, will narrate finding of dataset and describe it.
+    Returns
+    -------
+    atlas : str
+        Name of the atlas chosen.
+    path : str
+        File path of atlas. If user-provided, will be copied into
+        `derivatives/idconn`. If using an atlas from Nilearn, will
+        be path to downloaded nifti.
+    shape : str
+        Indicates shape of map (3d, 4d, coords) for choosing appropriate
+        Nilearn masker for extracting BOLD signals from nifti files.
+    
+    '''
+    mask_builtins = ['shen270', 'craddock270', 'schaefer400', 'yeo7', 'yeo17']
+    if '.nii' in mask:
+        assert exists(mask), 'Mask file does not exist at {mask}'.format(mask=mask)
+        if '.gz' in mask:
+            mask_name = basename(mask).rsplit('.', 2)[0]
+        else:
+            mask_name = basename(mask).rsplit('.', 1)[0]
+    else:
+        assert mask in mask_builtins, 'Mask {mask} not in built-in mask options. Please provide file path or one of {mask_builtins}'.format(mask=mask, mask_builtins=mask_builtins)
+    variables = confounds + ["{mask_name}*".format(mask_name=mask_name)]
+    statsmodel = {
+        "name": name,
+        "description": "A functional connectivity analysis of {task}, comparing {contrast}".format(task=task, 
+                                                                                                   contrast=contrast), 
+        "input":{
+            "task": task
+        },
+        "blocks":[{
+                "level": "run",
+                "transformations":{
+                        "name": "load_image_data",
+                        "input": ["bold"],
+                        "aggregate": ["mean"],
+                        "mask": [mask_name],
+                        "output": ["{mask_name}*".format(mask_name=mask_name)]
+                    },
+        },
+            {
+                "level": "session",
+                "model": {
+                    "variables": variables,
+                    "options": {
+                        "confounds": confounds,
+                        "high_pass_filter_cutoff_secs": highpass
+                    },
+                    "variances": {
+                        "name": "session_level",
+                        "groupBy": "session"
+                    },
+                    "software": {
+                        "IDConn": {
+                            "ConnectivityMeasure": [conn_meas],
+                            "GraphMetrics": [graph_meas]
+                        }
+                    }
+                }
+                
+            }
+        ]
+    }
+    statsmodel_json = json.dumps(statsmodel, indent = 2)
+    
+    outfile = '{name}-statsmodel.json'.format(name=name)
+    with open(outfile, 'w') as outfile:
+        json.dump(statsmodel, outfile)
+    return statsmodel_json
+
+def atlas_picker(atlas, path, key=None):
+    """Takes in atlas name and path to file, if local, returns
+    nifti-like object (usually file path to downloaded atlas),
+    and atlas name (for tagging output files). If atlas is from
+    Nilearn, will download atlas, **and space must be == 'MNI'.
+    If atlas is provided by user (path must be specified), then
+    space of atlas must match space of fMRI data, but that is up
+    to the user to determine.
+    Parameters
+    ----------
+    atlas : str
+        Name of the atlas/parcellation used to define nodes from 
+        voxels. If using an atlas fetchable by Nilearn, atlas name 
+        must match the function `fetch_atlas_[name]`.
+    path : str
+        Path to the atlas specified, if not using a dataset from Nilearn. 
+        If using `nilearn.datasets` to fetch an atlas, will revert to 
+        `derivatives/idconn` path.
+    key : str
+        Atlas-specific key for denoting which of multiple versions
+        will be used. Default behavior is described in the "atlases"
+        section of the docs. NOT IMPLEMENTED
+    Returns
+    -------
+    atlas : str
+        Name of the atlas chosen.
+    path : str
+        File path of atlas. If user-provided, will be copied into
+        `derivatives/idconn`. If using an atlas from Nilearn, will
+        be path to downloaded nifti.
+    shape : str
+        Indicates shape of map (3d, 4d, coords) for choosing appropriate
+        Nilearn masker for extracting BOLD signals from nifti files.
+    """
+    nilearn_3d = ['craddock_2012', 'destrieux_2009', 'harvard_oxford', 'smith_2009', 'yeo_2011', 'aal', 'pauli_2017', 'msdl']
+    #nilearn_coord = ['power_2011', 'dosenbach_2010', 'seitzman_2018']
+    #nilearn_4d = ['allen_2011', '']
+    if atlas in nilearn_3d:
+        if atlas == 'craddock_2012':
+            atlas_dict = datasets.fetch_atlas_craddock_2012(data_dir=path)
+            atlas_path = atlas_dict['tcorr_2level']
+            nifti = nib.load(atlas_path)
+            nifti_arr = nifti.get_fdata()
+            #selecting one volume of the nifti, each represent different granularity of parcellation
+            #selecting N = 270, the 27th volume per http://ccraddock.github.io/cluster_roi/atlases.html
+            nifti = nib.Nifti1Image(nifti_arr[:,:,:,26], nifti.affine)
+            nifti.to_filename()
+
+    return atlas, path
+
+def vectorize_corrmats(matrices):
+    """Returns the vectorized upper triangles of a 3-dimensional array
+    (i.e., node x node x matrix) of matrices. Output will be a 2-dimensional
+    array (i.e., matrix x node^2)
+    Parameters
+    ----------
+    matrices : numpy array of shape (p, n, n)
+        Represents the link strengths of the graphs. Assumed to be
+        an array of symmetric matrices.
+    
+    Returns
+    -------
+    edge_vector : numpy array of shape (p, n^2)
+        Represents an array of vectorized upper triangles of 
+        the input matrices.
+    """
+    #print(matrices.shape, matrices.ndim)
+    num_node = matrices.shape[1]
+    upper_tri = np.triu_indices(num_node, k=1)
+    if matrices.ndim == 3:
+        num_node = matrices.shape[1]
+        upper_tri = np.triu_indices(num_node, k=1)
+        num_matrices = matrices.shape[0]
+        edge_vector = []
+        for matrix in range(0,num_matrices):
+            vectorized = matrices[matrix,:,:][upper_tri]
+            edge_vector.append(vectorized)
+    
+    elif matrices.ndim == 2:
+        true = matrices[0].T == matrices[0]
+        if true.all():
+            edge_vector = matrices[upper_tri]
+        else:
+            print('Matrices of incompatible shape:', matrices.shape, 
+                '\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).')
+    elif matrices.ndim == 1:
+        if matrices[0].ndim == 2:
+            num_node = matrices[0].shape[0]
+            upper_tri = np.triu_indices(num_node, k=1)
+            edge_vector = []
+            for matrix in matrices:
+                vectorized = matrix[upper_tri]
+                edge_vector.append(vectorized)
+        else:
+            print('Matrices of incompatible shape:', matrices.shape, 
+                  '\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).')
+    edge_vector = np.asarray(edge_vector)
+    return edge_vector
+
+def read_corrmats(layout, task, deriv_name, conf_measures=None, z_score=True, vectorized=True, verbose=False):
+    """Returns a node x node x (subject x session) matrix of correlation matrices  
+    from a BIDS derivative folder. Optionally returns a subject x session dataframe
+    of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) 
+    array of vectorized upper triangles of those correlation matrices.
+    Parameters
+    ----------
+    matrices : numpy array of shape (n, n, p)
+        Represents the link strengths of the graphs. Assumed to be
+        an array of symmetric matrices.
+    
+    Returns
+    -------
+    edge_vector : numpy array of shape (p, n^2)
+        Represents an array of vectorized upper triangles of 
+        the input matrices.
+    """
+    subjects = layout.get(return_type='id', 
+                          target='subject', 
+                          suffix='bold', 
+                          scope=deriv_name
+                         )
+    all_sesh = layout.get(return_type='id',
+           target='session',
+           task=task, 
+           suffix='bold',
+           scope=deriv_name
+          )
+    ppts_fname = layout.get_file('participants.tsv').path
+    ppt_df = pd.read_csv(ppts_fname, sep='\t', index_col=[0,1])
+    ppt_df['adj'] = ''
+    if vectorized:
+        ppt_df['edge_vector'] = ''
+    
+    for subject in subjects:
+        if verbose:
+            print(subject)
+        else:
+            pass
+        sessions = layout.get(return_type='id', 
+                              target='session', 
+                              task=task, 
+                              suffix='bold', 
+                              subject=subject, 
+                              scope=deriv_name)
+        
+        for session in sessions:
+            if verbose:
+                print(session)
+            else:
+                pass
+            path = layout.get(return_type='filename',
+                               task=task, 
+                               subject=subject,
+                               session=session,
+                               suffix='bold',
+                               scope='IDConn'
+                              )
+            if verbose:
+                print(f'Corrmat path for sub-{subject}, ses-{session}: \t{path}')
+            else:
+                pass
+            if type(path) == list:
+                #print(len(path))
+                path = path[0]
+            else:
+                pass
+            assert exists(path), f'Corrmat file not found at {path}'
+            adj_matrix = pd.read_csv(path, sep='\t', header=0, index_col=0)
+            if z_score == True:
+                z_adj = np.arctanh(adj_matrix.values)
+                z_adj = np.where(z_adj == np.inf, 0, z_adj)
+                #print(z_adj.shape)
+                ppt_df.at[(f'sub-{subject}', 
+                           f'ses-{session}'), 
+                          'adj'] = z_adj
+            else:
+                #print(adj_matrix.values.shape)
+                ppt_df.at[(f'sub-{subject}', 
+                           f'ses-{session}'), 
+                          'adj'] = adj_matrix.values
+                
+            
+            if vectorized == True:
+                edge_vector = vectorize_corrmats(adj_matrix.values)
+                #print(edge_vector.shape)
+                ppt_df.at[(f'sub-{subject}', 
+                                   f'ses-{session}'), 
+                                  'edge_vector'] = edge_vector
+    ppt_df.replace({'': np.nan}, inplace=True)
+    return ppt_df
+
+def undo_vectorize(edges):
+    j = len(edges)
+    num_node = (np.sqrt((8 * j) + 1) + 1) / 2
+    X = np.zeros((num_node,num_node))
+    X[np.triu_indices(X.shape[0], k = 1)] = edges
+    X = X + X.T
+    return X
+
+def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='icefire', node_size='strength'):
+    coords = plotting.find_parcellation_cut_coords(atlas_nii)
+    num_node = adj.shape[0]
+    # only plot the top t% of edges
+    if threshold == 'computed':
+        threshold = f'{(1 - (100 / num_node ** 2)) * 100}%'
+    elif type(threshold) == float or type(threshold) == int:
+        threshold = f'{threshold}%'
+    else:
+        threshold = '99%'
+    print('edge plotting threshold: ', threshold)
+
+    if node_size == 'strength':
+        node_strength = np.sum((np.abs(adj)), axis=0)
+        node_strength /= np.max(node_strength)
+        node_strength **= 4
+        node_size = node_strength
+    fig = plt.figure(figsize=(12,4))
+    if title is not None:
+        fig.suptitle(title)
+    gs = GridSpec(1, 2, width_ratios=[4,2])
+    ax0 = fig.add_subplot(gs[0])
+    ax1 = fig.add_subplot(gs[1])
+
+    plt.tight_layout(w_pad=5)
+    g = plotting.plot_connectome(adj, coords, 
+                                node_size=node_size,
+                                edge_threshold=threshold, 
+                                edge_cmap=cmap, 
+                                figure=fig, 
+                                axes=ax0,
+                                colorbar=False, 
+                                annotate=False)
+    h = sns.heatmap(adj, square=True, cmap=cmap, ax=ax1)
+    if strength:
+        fig2 = plt.figure(figsize=(12,4))
+        if title is not None:
+            fig2.suptitle(title)
+        fsaverage = datasets.fetch_surf_fsaverage()
+        nimg = nib.load(atlas_nii)
+        regn_sch_arr = nimg.get_fdata()
+        for i in np.arange(0,num_node):
+            regn_sch_arr[np.where(regn_sch_arr == i+1)] = np.sum(adj[i])
+        strength_nimg = nib.Nifti1Image(regn_sch_arr, nimg.affine)
+        nib.save(strength_nimg, '/Users/katherine.b/Dropbox/HC_Use_predictive-strength.nii')
+
+        gs = GridSpec(1, 4)
+        # plot edge weights on surfaces
+        ax2 = fig2.add_subplot(gs[0], projection='3d')
+        ax3 = fig2.add_subplot(gs[1], projection='3d')
+        ax4 = fig2.add_subplot(gs[2], projection='3d')
+        ax5 = fig2.add_subplot(gs[3], projection='3d')
+
+        texture_l = surface.vol_to_surf(strength_nimg, fsaverage.pial_left, interpolation='nearest')
+        texture_r = surface.vol_to_surf(strength_nimg, fsaverage.pial_right, interpolation='nearest')
+
+        plt.tight_layout(w_pad=-1)
+        i = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5,
+                                                cmap=cmap, view='lateral', colorbar=False, axes=ax2)
+        j = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5,
+                                                cmap=cmap, view='medial', colorbar=False, axes=ax3)
+        k = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5,
+                                                cmap=cmap, view='lateral', colorbar=False, axes=ax4)
+        l = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5,
+                                                cmap=cmap, view='medial', colorbar=False, axes=ax5)
+        return fig, fig2
+    else:
+        return fig
\ No newline at end of file

From 141140e25441180c7188eb180ee88f2b20e05722 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:42:18 -0800
Subject: [PATCH 10/48] add imports to nbs

---
 idconn/nbs.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/idconn/nbs.py b/idconn/nbs.py
index cc5b59a..28420db 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -1,7 +1,14 @@
 import numpy as np
 import statsmodels as sm
 import networkx as nx
-from utils import vectorize_corrmats, undo_vectorize
+import pandas as pd
+from io import vectorize_corrmats, undo_vectorize
+from scipy.stats import t
+import enlighten
+import bct
+
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold
+from sklearn.linear_model import LogisticRegression
 
 
 def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000, stratified=False):
@@ -260,11 +267,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
         test_y = outcome[test_idx]
 
         pval, adj, _ = pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000)
-        pval, adj, _ = bct.nbs_bct(train_a,
-                                train_b,
-                                t_threshold,
-                                k=k,
-                                tail=tail)
+        
         cv_results.at[i, 'pval'] = pval
         cv_results.at[i, 'component'] = adj
 
@@ -273,6 +276,8 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
         train_features = edges[train_idx, :].T[mask]
         test_features = edges[test_idx, :].T[mask]
 
+        # need an IF GROUPS statement
+        # ELSE statsmodels OLS
         regressor = LogisticRegression(max_iter=1000)
         model = regressor.fit(X=train_features.T, y=train_y)
         cv_results.at[i, 'model'] = model

From f2a33e5bc0c163799e09b054d6fc5167b76358a6 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:45:37 -0800
Subject: [PATCH 11/48] add thresholding functions to networking

---
 idconn/networking.py | 66 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/idconn/networking.py b/idconn/networking.py
index 3932e0a..273af7c 100644
--- a/idconn/networking.py
+++ b/idconn/networking.py
@@ -1,10 +1,12 @@
 import numpy as np
-#import pandas as pd
+import pandas as pd
 import seaborn as sns
+import networkx as nx
 import matplotlib.pyplot as plt
 from os.path import join
 #from nilearn.connectome import ConnectivityMeasure
 from scipy.sparse.csgraph import minimum_spanning_tree
+from scipy.stats import skew
 import bct
 #import datetime
 
@@ -18,7 +20,6 @@ def avg_corrmat(ppt_df):
     avg_corrmat = np.mean(stacked_corrmats, axis=0)
     return avg_corrmat
 
-
 def null_model(W, bin_swaps=5, wei_freq=0.1, seed=None):
     def get_rng(seed):
         if seed is None or seed == np.random:
@@ -169,19 +170,19 @@ def pick_four_unique_nodes_quickly(n, seed=None):
     W0 = W0 + W0.T
     return W0
 
-def generate_null(ppt_df, thresh_arr, measure):
+def generate_null(ppt_df, thresh_arr, measure, permutations=1000):
     '''
     Generate a distribution of graph measure values based on a null connectivity matrix
     that is like the average connectivity matrix across participants.
     
     '''
-    null_dist = pd.DataFrame(index=subjects, columns=["mean", "sdev"])
+    null_dist = pd.DataFrame(index=range(0,permutations), columns=["mean", "sdev"])
     avg_corr = avg_corrmat(
         ppt_df
     )
     eff_perm = []
     j = 0
-    while j < 1000:
+    while j < permutations:
         effs = []
         W = null_model(avg_corr.values)
         for thresh in thresh_arr:
@@ -269,3 +270,58 @@ def graph_omst(matrix, measure, args):
     metric = measure(thresh_mat, args)
     return metric
 
+
+def scale_free_tau(corrmat, skew_thresh, proportional=True):
+    ''''
+    Calculates threshold at which network becomes scale-free, estimated from the skewness of the networks degree distribution.
+    Parameters
+    ----------
+    corrmat : numpy.array
+        Correlation or other connectivity matrix from which tau_connected will be estimated.
+        Should be values between 0 and 1.
+    proportional : bool
+        Determines whether connectivity matrix is thresholded proportionally or absolutely.
+        Default is proportional as maintaining network density across participants is a priority
+    Returns
+    -------
+    tau : float
+        Lowest vaue of tau (threshold) at which network is scale-free.
+    '''
+    tau = 0.01
+    skewness = 1
+    while abs(skewness) > 0.3:
+        if proportional:
+            w = bct.threshold_proportional(corrmat, tau)
+        else:
+            w = bct.threshold_absolute(corrmat, tau)
+        skewness = skew(bct.degrees_und(w))
+        tau += 0.01
+    return tau
+
+def connected_tau(corrmat, proportional=True):
+    '''
+    Calculates threshold at network becomes node connected, using NetworkX's `is_connected` function.
+    Parameters
+    ----------
+    corrmat : numpy.array
+        Correlation or other connectivity matrix from which tau_connected will be estimated.
+        Should be values between 0 and 1.
+    proportional : bool
+        Determines whether connectivity matrix is thresholded proportionally or absolutely.
+        Default is proportional as maintaining network density across participants is a priority
+    Returns
+    -------
+    tau : float
+        Highest vaue of tau (threshold) at which network becomes node-connected.
+    '''
+    tau = 0.01
+    connected = False
+    while connected == False:
+        if proportional:
+            w = bct.threshold_proportional(corrmat, tau)
+        else:
+            w = bct.threshold_absolute(corrmat, tau)
+        w_nx = nx.convert_matrix.from_numpy_array(w)
+        connected = nx.algorithms.components.is_connected(w_nx)
+        tau += 0.01
+    return tau
\ No newline at end of file

From 745c206e21037399b4d924b6507d31b04ced8fe5 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:47:18 -0800
Subject: [PATCH 12/48] move connectivity estimating to connectivity.py

---
 .../build_networks.py => connectivity.py}     |   0
 idconn/connectivity/__init__.py               |   8 ---
 .../__pycache__/__init__.cpython-37.pyc       | Bin 348 -> 0 bytes
 idconn/connectivity/estimate_thresh.py        |  60 ------------------
 4 files changed, 68 deletions(-)
 rename idconn/{connectivity/build_networks.py => connectivity.py} (100%)
 delete mode 100644 idconn/connectivity/__init__.py
 delete mode 100644 idconn/connectivity/__pycache__/__init__.cpython-37.pyc
 delete mode 100644 idconn/connectivity/estimate_thresh.py

diff --git a/idconn/connectivity/build_networks.py b/idconn/connectivity.py
similarity index 100%
rename from idconn/connectivity/build_networks.py
rename to idconn/connectivity.py
diff --git a/idconn/connectivity/__init__.py b/idconn/connectivity/__init__.py
deleted file mode 100644
index afe46b9..0000000
--- a/idconn/connectivity/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-"""
-Tools for computing connectivity matrices/graphs
-"""
-
-from . import build_networks
-from . import estimate_thresh
-
-__all__ = ["build_networks", "estimate_thresh"]
diff --git a/idconn/connectivity/__pycache__/__init__.cpython-37.pyc b/idconn/connectivity/__pycache__/__init__.cpython-37.pyc
deleted file mode 100644
index 587b05d21a7b91244269ae5024ce1fb989e55580..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 348
zcmYk0!Ait16h+gtGnF!<ztE*!Xu!3I=xp4G;0V&qqtnDTPMVY?9UXD&uL%B9SAr{l
z!Idwd=!4vxTpr|{&F2}xd3k?^SG+&M<98^+DIT|v!AVJY#HorpTGA+Rj2xFSa>6GK
zEfYSyCPn&mlr4=>F26H2uT9qvUg`!(Yf*c-k$#(ZmAA4Mjy1OGTbF%C_(c(oyQ@Je
z4qEsJW7iIwJ>k5>bpgJ$!nHP7*%;%pn2ad^H#NXW{`yc>N&&nPn}+2OU$^YW3G3K;
zWxN-<WeaQim3d@W*4(3)V;75aRAiC|(*rBeP;4soQ?UR33_xn>0S^1^xO3bYJ}7Y#
Nq}&uEp|fa4zXASXVy6HA

diff --git a/idconn/connectivity/estimate_thresh.py b/idconn/connectivity/estimate_thresh.py
deleted file mode 100644
index 7dd99a4..0000000
--- a/idconn/connectivity/estimate_thresh.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import numpy as np
-import networkx as nx
-import pandas as pd
-import bct 
-
-
-def scale_free_tau(corrmat, skew_thresh, proportional=True):
-    ''''
-    Calculates threshold at which network becomes scale-free, estimated from the skewness of the networks degree distribution.
-    Parameters
-    ----------
-    corrmat : numpy.array
-        Correlation or other connectivity matrix from which tau_connected will be estimated.
-        Should be values between 0 and 1.
-    proportional : bool
-        Determines whether connectivity matrix is thresholded proportionally or absolutely.
-        Default is proportional as maintaining network density across participants is a priority
-    Returns
-    -------
-    tau : float
-        Lowest vaue of tau (threshold) at which network is scale-free.
-    '''
-    tau = 0.01
-    skewness = 1
-    while abs(skewness) > 0.3:
-        if proportional:
-            w = bct.threshold_proportional(corrmat, tau)
-        else:
-            w = bct.threshold_absolute(corrmat, tau)
-        skewness = skew(bct.degrees_und(w))
-        tau += 0.01
-    return tau
-
-def connected_tau(corrmat, proportional=True):
-    '''
-    Calculates threshold at network becomes node connected, using NetworkX's `is_connected` function.
-    Parameters
-    ----------
-    corrmat : numpy.array
-        Correlation or other connectivity matrix from which tau_connected will be estimated.
-        Should be values between 0 and 1.
-    proportional : bool
-        Determines whether connectivity matrix is thresholded proportionally or absolutely.
-        Default is proportional as maintaining network density across participants is a priority
-    Returns
-    -------
-    tau : float
-        Highest vaue of tau (threshold) at which network becomes node-connected.
-    '''
-    tau = 0.01
-    connected = False
-    while connected == False:
-        if proportional:
-            w = bct.threshold_proportional(corrmat, tau)
-        else:
-            w = bct.threshold_absolute(corrmat, tau)
-        w_nx = nx.convert_matrix.from_numpy_array(w)
-        connected = nx.algorithms.components.is_connected(w_nx)
-        tau += 0.01
-    return tau
\ No newline at end of file

From 6c57950f14fcc8eec019c48cc2d3c6886737c521 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 12:49:51 -0800
Subject: [PATCH 13/48] add nbs to init

---
 idconn/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/idconn/__init__.py b/idconn/__init__.py
index 83ad6d8..75cdf18 100644
--- a/idconn/__init__.py
+++ b/idconn/__init__.py
@@ -31,7 +31,8 @@
         # "preprocessing",
         #"statistics",
         # "utils",
-        # "io",
+        "io",
+        "nbs",
         "__version__",
     ]
 

From bc5da00874eb3da698131171b6dc0e271d3f6ec0 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 1 Dec 2022 13:09:03 -0800
Subject: [PATCH 14/48] update imports in connectivity and pipeline

---
 idconn/connectivity.py | 9 ++++-----
 idconn/pipeline.py     | 8 ++++----
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/idconn/connectivity.py b/idconn/connectivity.py
index 15e3017..6f15e51 100644
--- a/idconn/connectivity.py
+++ b/idconn/connectivity.py
@@ -1,11 +1,11 @@
 from posixpath import sep
 import numpy as np
 import pandas as pd
-import idconn.connectivity.build_networks
+#import idconn.connectivity.build_networks
 from os import makedirs
 from os.path import join, exists, basename
 from nilearn import input_data, datasets, connectome, image, plotting
-
+from . import __version__
 #from .utils import contrast
 
 def _check_dims(matrix):
@@ -18,7 +18,6 @@ def _check_dims(matrix):
     if matrix.ndim != 2:
         raise ValueError('Expected a square matrix, got array of shape'
                          ' {0}.'.format(matrix.shape))
-    
 
 def task_connectivity(layout, subject, session, task, atlas, confounds, connectivity_metric='correlation', out_dir=None):
     """
@@ -52,7 +51,7 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti
     """
     #version = '0.1.1'
     try:
-        version = idconn.__version__
+        version = __version__
     except:
         version = 'test'
     if '.nii' in atlas:
@@ -191,7 +190,7 @@ def connectivity(layout, subject, session, task, atlas, connectivity_metric='cor
     adjacency_matrix
     """
     try:
-        version = idconn.__version__
+        version = __version__
     except:
         version = 'test'
     if '.nii' in atlas:
diff --git a/idconn/pipeline.py b/idconn/pipeline.py
index 667870f..38c0ccd 100644
--- a/idconn/pipeline.py
+++ b/idconn/pipeline.py
@@ -23,7 +23,7 @@
 from os.path import exists
 #from glob import glob
 #from nilearn import input_data, connectome, plotting, image
-from idconn.connectivity import build_networks
+from idconn.connectivity import connectivity, task_connectivity
 from idconn.parser_utils import is_valid_file, is_valid_path
 
 #from idconn.networking import graph_theory, null_distribution
@@ -116,17 +116,17 @@ def idconn_workflow(dset_dir, atlas, task, out_dir, space="MNI152NLin2009cAsym",
             print(f"here are the inputs: {layout, subject, session, task, atlas, conn, space, confounds}")
             if 'rest' in task:
                 try:
-                    adj_matrix = build_networks.connectivity(layout, subject, session, task, atlas, conn, space, confounds)
+                    adj_matrix = connectivity(layout, subject, session, task, atlas, conn, space, confounds)
                 except Exception as e:
                     print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}')
             if len(conditions) < 1:
                 try:
-                    adj_matrix = build_networks.connectivity(layout, subject, session, task, atlas, conn, space, confounds)
+                    adj_matrix = connectivity(layout, subject, session, task, atlas, conn, space, confounds)
                 except Exception as e:
                     print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}')
             else:
                 try:
-                    adj_matrix = build_networks.task_connectivity(layout=layout, subject=subject, session=session, task=task, atlas=atlas, confounds=confounds, connectivity_metric=conn)
+                    adj_matrix = task_connectivity(layout=layout, subject=subject, session=session, task=task, atlas=atlas, confounds=confounds, connectivity_metric=conn)
                 except Exception as e:
                     print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}')
 

From 6fe67219329eaa589f1e05ea5e7dd3d77e11fcd0 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 6 Dec 2022 13:16:57 -0800
Subject: [PATCH 15/48] fixed version import

---
 idconn/connectivity.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/idconn/connectivity.py b/idconn/connectivity.py
index 6f15e51..3746433 100644
--- a/idconn/connectivity.py
+++ b/idconn/connectivity.py
@@ -5,7 +5,7 @@
 from os import makedirs
 from os.path import join, exists, basename
 from nilearn import input_data, datasets, connectome, image, plotting
-from . import __version__
+from ._version import get_versions
 #from .utils import contrast
 
 def _check_dims(matrix):
@@ -51,7 +51,7 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti
     """
     #version = '0.1.1'
     try:
-        version = __version__
+        version = get_versions()["version"]
     except:
         version = 'test'
     if '.nii' in atlas:

From 423cb45d3bd1c58c83bac3e3fcb9482112f4ee56 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 6 Dec 2022 21:34:02 -0800
Subject: [PATCH 16/48] bug fixing, creating nbspredict wf

---
 idconn/__init__.py              |   2 +-
 idconn/connectivity.py          |   4 +-
 idconn/io.py                    |  19 +++---
 idconn/nbs.py                   | 117 +++++++++++++++++++++-----------
 idconn/networking.py            |   1 -
 idconn/workflows/nbs_predict.py |  60 ++++++++++++++++
 setup.py                        |   3 +-
 7 files changed, 154 insertions(+), 52 deletions(-)
 create mode 100644 idconn/workflows/nbs_predict.py

diff --git a/idconn/__init__.py b/idconn/__init__.py
index 75cdf18..000932b 100644
--- a/idconn/__init__.py
+++ b/idconn/__init__.py
@@ -12,7 +12,7 @@
     warnings.simplefilter("ignore")
     from . import connectivity
     from . import data
-    #from . import figures
+    from . import nbs
     from . import networking
 
     # from . import preprocessing
diff --git a/idconn/connectivity.py b/idconn/connectivity.py
index 3746433..e54914b 100644
--- a/idconn/connectivity.py
+++ b/idconn/connectivity.py
@@ -163,7 +163,7 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti
             print('saving corrmat...', e)
     return files, avg_corrmats
 
-def connectivity(layout, subject, session, task, atlas, connectivity_metric='correlation', confounds=None, out_dir=None):
+def rest_connectivity(layout, subject, session, task, atlas, connectivity_metric='correlation', confounds=None, out_dir=None):
 
     """
     Makes connectivity matrices per subject per session per task per condition.
@@ -190,7 +190,7 @@ def connectivity(layout, subject, session, task, atlas, connectivity_metric='cor
     adjacency_matrix
     """
     try:
-        version = __version__
+        version = get_versions()["version"]
     except:
         version = 'test'
     if '.nii' in atlas:
diff --git a/idconn/io.py b/idconn/io.py
index 6ba41f1..487844b 100644
--- a/idconn/io.py
+++ b/idconn/io.py
@@ -161,7 +161,7 @@ def vectorize_corrmats(matrices):
     ----------
     matrices : numpy array of shape (p, n, n)
         Represents the link strengths of the graphs. Assumed to be
-        an array of symmetric matrices.
+        an array of symmetric nxn matrices per participant and/or timepoint (p).
     
     Returns
     -------
@@ -169,7 +169,7 @@ def vectorize_corrmats(matrices):
         Represents an array of vectorized upper triangles of 
         the input matrices.
     """
-    #print(matrices.shape, matrices.ndim)
+    #print(f'\n\n\n{matrices.shape}, {matrices.ndim}\n\n\n')
     num_node = matrices.shape[1]
     upper_tri = np.triu_indices(num_node, k=1)
     if matrices.ndim == 3:
@@ -202,7 +202,7 @@ def vectorize_corrmats(matrices):
     edge_vector = np.asarray(edge_vector)
     return edge_vector
 
-def read_corrmats(layout, task, deriv_name, conf_measures=None, z_score=True, vectorized=True, verbose=False):
+def read_corrmats(layout, task, deriv_name='IDConn', atlas=None, conf_measures=None, z_score=True, vectorized=True, verbose=False):
     """Returns a node x node x (subject x session) matrix of correlation matrices  
     from a BIDS derivative folder. Optionally returns a subject x session dataframe
     of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) 
@@ -258,14 +258,15 @@ def read_corrmats(layout, task, deriv_name, conf_measures=None, z_score=True, ve
                                subject=subject,
                                session=session,
                                suffix='bold',
-                               scope='IDConn'
+                               scope='IDConn', 
+                               atlas=atlas,
                               )
             if verbose:
                 print(f'Corrmat path for sub-{subject}, ses-{session}: \t{path}')
             else:
                 pass
             if type(path) == list:
-                #print(len(path))
+                #print(path)
                 path = path[0]
             else:
                 pass
@@ -294,9 +295,9 @@ def read_corrmats(layout, task, deriv_name, conf_measures=None, z_score=True, ve
     ppt_df.replace({'': np.nan}, inplace=True)
     return ppt_df
 
-def undo_vectorize(edges):
-    j = len(edges)
-    num_node = (np.sqrt((8 * j) + 1) + 1) / 2
+def undo_vectorize(edges, num_node):
+    #j = len(edges)
+    #num_node = (np.sqrt((8 * j) + 1) + 1) / 2
     X = np.zeros((num_node,num_node))
     X[np.triu_indices(X.shape[0], k = 1)] = edges
     X = X + X.T
@@ -335,7 +336,7 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap=
                                 axes=ax0,
                                 colorbar=False, 
                                 annotate=False)
-    h = sns.heatmap(adj, square=True, cmap=cmap, ax=ax1)
+    h = sns.heatmap(adj, square=True, cmap=cmap, ax=ax1, center=0)
     if strength:
         fig2 = plt.figure(figsize=(12,4))
         if title is not None:
diff --git a/idconn/nbs.py b/idconn/nbs.py
index 28420db..c5b4caa 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -1,15 +1,27 @@
 import numpy as np
-import statsmodels as sm
+import statsmodels.api as sm
 import networkx as nx
 import pandas as pd
-from io import vectorize_corrmats, undo_vectorize
+from idconn.io import vectorize_corrmats, undo_vectorize
 from scipy.stats import t
 import enlighten
-import bct
+#import bct
 
 from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegression, LinearRegression
 
+def calc_number_of_nodes(matrices):
+    if matrices.shape[0] != matrices.shape[1]:
+        if matrices.shape[1] == matrices.shape[2]:
+            num_node = matrices.shape[1]
+            matrices = np.moveaxis(matrices, 0, -1)
+        else:
+            raise ValueError(f'Matrices of shape {matrices.shape}',
+                             'requires matrices of shape (subject x session) x node x node',
+                             'or node x node x (subject x session).')
+    else:
+        num_node = matrices.shape[0]
+    return num_node
 
 def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000, stratified=False):
     '''
@@ -59,6 +71,9 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000
     # and, if not predict, build a null distribution
     n = matrices.shape[:-1]
     ndims = len(matrices.shape)
+    #print(ndims)
+    #if ndims >=2
+    num_node = calc_number_of_nodes(matrices)
     
     # vectorize_corrmats returns p x n^2
     # we want to run pynbs per edge
@@ -70,8 +85,10 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000
     if ndims > 2:
         edges = vectorize_corrmats(matrices)
     else:
-        edges = matrices.copy()
+        raise ValueError(f'Input matrices have shape {matrices.shape},',
+                             'pyNBS requires matrices of shape (subject x session) x node x node.')
     edges = edges.T
+    #print(f'\n\n\n{edges.shape}\n\n\n')
     
     # run an ols per edge
     # create significancs matrix for predictor of interest (outcome)
@@ -92,7 +109,7 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000
     
     # find largest connected component of sig_edges
     # turn sig_edges into an nxn matrix first
-    sig_matrix = undo_vectorize(sig_edges) # need to write this function
+    sig_matrix = undo_vectorize(sig_edges, num_node) # need to write this function
     matrix = nx.from_numpy_array(sig_matrix)
     
     #use networkX to find connected components
@@ -104,7 +121,10 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000
     size = np.asarray([s.number_of_edges() for s in S])
     (max_comp, ) = np.where(size == max(size))
     largest_comp_size = max(size)
-    print(f'Connected component has {largest_comp_size} edges.')
+    if predict == False:
+        print(f'Connected component has {largest_comp_size} edges.')
+    else:
+        pass
 
     # retain size of largest connected component 
     # for NBS permutation-based significance testing
@@ -127,8 +147,10 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000
     # plotting in brain space
     for i in unused_nodes:
         S1.loc[i] = 0
-        S1[i] = 0
-
+        temp = S1.copy()
+        temp[i] = 0
+        S1 = temp.copy()
+    
     S1.sort_index(axis=0, inplace=True)
     S1.sort_index(axis=1, inplace=True)
     
@@ -157,7 +179,7 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000
             #print(np.sum(perm_edges))
             # find largest connected component of sig_edges
             # turn sig_edges into an nxn matrix first
-            perm_matrix = undo_vectorize(perm_edges) # need to write this function
+            perm_matrix = undo_vectorize(perm_edges, num_node) # need to write this function
             perm_nx = nx.from_numpy_array(perm_matrix)
 
             comps = nx.connected_components(perm_nx)
@@ -202,6 +224,8 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
         an array of symmetric matrices.
     outcome : list-like of shape (p,)
         Y-value to be predicted with connectivity
+    groups : list-like of shape (p,)
+        Grouping variable - currently only works for 2 groups
     
     Returns
     -------
@@ -215,7 +239,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
 
     cv_results = pd.DataFrame(index=index, 
                             columns=['split',  
-                                    'pval', 
+                                    #'pval', 
                                     'score',
                                     'component',
                                     'coefficient_matrix',
@@ -224,28 +248,32 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
     if groups is not None:
         cv = RepeatedStratifiedKFold(n_splits=n_splits,
                                     n_repeats=n_iterations)
-        df = groups.shape[0] - 2
+        dof = groups.shape[0] - 2
     else:
         cv = RepeatedKFold(n_splits=n_splits, 
                         n_repeats=n_iterations)
-        df = edges.shape[0] - 1
+        dof = edges.shape[0] - 1
     
     if tail == 'both':
         alpha = 0.01
     else:
         alpha = 0.005
-    t_threshold = t.ppf(1 - alpha, df=df)
+    t_threshold = t.ppf(1 - alpha, df=dof)
     
-    if matrices.shape[0] != matrices.shape[1]:
-        if matrices.shape[1] == matrices.shape[2]:
-            num_node = matrices.shape[1]
-            matrices = np.moveaxis(matrices, 0, -1)
-        else:
-            raise ValueError(f'Matrices of shape {matrices.shape}',
-                             'requires matrices of shape (subject x session) x node x node',
-                             'or node x node x (subject x session).')
-    else:
-        num_node = matrices.shape[0]
+    # really can't remember why tf I did this?
+    # maybe it's an artifact of permuted_ols?
+    num_node = calc_number_of_nodes(matrices)
+    #print(num_node)
+    #if matrices.shape[0] != matrices.shape[1]:
+    #    if matrices.shape[1] == matrices.shape[2]:
+    #        num_node = matrices.shape[1]
+            #matrices = np.moveaxis(matrices, 0, -1)
+    #    else:
+    #        raise ValueError(f'Matrices of shape {matrices.shape}',
+                             #'requires matrices of shape (subject x session) x node x node',
+                             #'or node x node x (subject x session).')
+    #else:
+    #    num_node = matrices.shape[0]
     upper_tri = np.triu_indices(num_node, k=1)
     
     i = 0
@@ -255,32 +283,47 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
         cv_results.at[i, 'split'] = (train_idx, test_idx)
         # all of this presumes the old bctpy version of nbs
         # irrelevant for pynbs
-        #train_a_idx = [m for m in train_idx if outcome[m] == 0]
-        #train_b_idx = [m for m in train_idx if outcome[m] == 1]
+        
         #assert len(train_a_idx) == len(train_b_idx)
-        #train_a = matrices[:,:,train_a_idx]
-        #train_b = matrices[:,:,train_b_idx]
+        if groups is not None:
+            train_a_idx = [m for m in train_idx if groups[m] == 0]
+            train_b_idx = [m for m in train_idx if groups[m] == 1]
+            regressor = LogisticRegression(max_iter=1000)
+        else:
+            regressor = LinearRegression()
+        train_mats = matrices[train_idx,:,:]
         #print(train_a.shape, train_b.shape)
         
         # separate edges & covariates into 
         train_y = outcome[train_idx]
         test_y = outcome[test_idx]
 
-        pval, adj, _ = pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000)
+        train_confounds = confounds.values[train_idx]
+        #test_confounds = confounds.values[test_idx]
+        
+        # perform NBS wooooooooo
+        # note: output is a dataframe :)
+        adj = pynbs(train_mats, train_y, train_confounds, alpha, predict=True)
+        #print(adj.shape, adj.ndim, adj[0].shape, upper_tri)
         
-        cv_results.at[i, 'pval'] = pval
-        cv_results.at[i, 'component'] = adj
+        #cv_results.at[i, 'pval'] = pval
+        cv_results.at[i, 'component'] = adj.values
 
-        nbs_vector = adj[upper_tri]
+        # grab the values of the adjacency matrix that are just in the upper triangle
+        # so you don't have repeated edges
+        nbs_vector = adj.values[upper_tri]
+        # use those to make a "significant edges" mask
         mask = nbs_vector == 1
+
+        # grab only the significant edges from testing and training sets of edges
+        # for use as features in the predictive models
         train_features = edges[train_idx, :].T[mask]
         test_features = edges[test_idx, :].T[mask]
 
-        # need an IF GROUPS statement
-        # ELSE statsmodels OLS
-        regressor = LogisticRegression(max_iter=1000)
+        # train model predicting outcome from brain (note: no mas covariates)
         model = regressor.fit(X=train_features.T, y=train_y)
         cv_results.at[i, 'model'] = model
+        # score that model on the testing data
         score = model.score(X=test_features.T, y=test_y)
         cv_results.at[i, 'score'] = score
 
@@ -292,9 +335,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
                 m+=1
             else:
                 pass
-        X = np.zeros_like(adj)
-        X[np.triu_indices(X.shape[0], k=1)] = param_vector
-        X = X + X.T
+        X = undo_vectorize(param_vector, num_node=num_node)
         cv_results.at[i, 'coefficient_matrix'] = X
         cv_results.at[i, 'coefficient_vector'] = param_vector
         i += 1
diff --git a/idconn/networking.py b/idconn/networking.py
index 273af7c..f74ee12 100644
--- a/idconn/networking.py
+++ b/idconn/networking.py
@@ -270,7 +270,6 @@ def graph_omst(matrix, measure, args):
     metric = measure(thresh_mat, args)
     return metric
 
-
 def scale_free_tau(corrmat, skew_thresh, proportional=True):
     ''''
     Calculates threshold at which network becomes scale-free, estimated from the skewness of the networks degree distribution.
diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
new file mode 100644
index 0000000..169531a
--- /dev/null
+++ b/idconn/workflows/nbs_predict.py
@@ -0,0 +1,60 @@
+from idconn import nbs, io
+import pandas as pd
+import numpy as np
+import bids
+from os.path import join
+from datetime import datetime
+from time import strftime
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset'
+TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset'
+DERIV_NAME = 'IDConn'
+OUTCOME = 'Mean E2 (pg/mL)'
+atlas_fname = '/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz'
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task='rest', atlas='craddock2012', z_score=False)
+
+keep = dat['adj'].dropna().index
+dat = dat.loc[keep]
+#print(dat['adj'].values.shape)
+num_node = dat.iloc[0]['adj'].shape[0]
+
+matrices = np.vstack(dat['adj'].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]),1))
+confounds = dat[['bc', 'menst_cycle-day']]
+alpha = 0.1
+fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn'
+
+cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=4, n_iterations=2, k=1000, shuffle=False, fig_dir=fig_dir)
+
+cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t')
+best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0]
+subnetwork = cv_results.loc[best]['component']
+subnetwork_df = pd.DataFrame(subnetwork,
+                             index=range(0,num_node), 
+                             columns=range(0,num_node))
+
+subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_edge_parameters-{today_str}.tsv'),sep='\t')
+
+nbs_vector = subnetwork[upper_tri]
+mask = nbs_vector == 1
+edges = np.vstack(dat['edge_vector'].values)
+features = edges[:,mask]
+#plot the parameters
+param_mat = cv_results.loc[best]['coefficient_matrix']
+odds = 10 ** param_mat 
+prob = odds / (1 + odds)
+
+# run the model on the whole 28andMe dataset to get params
+model = cv_results.loc[best]['model']
+model.fit(features, outcome)
+fig,fig2 = io.plot_edges(param_mat, atlas_fname, title=None, strength=True, cmap='icefire', node_size='strength')
+fig.savefig('/Users/katherine.b/Dropbox/Projects/IDConn/test1.png')
+fig2.savefig('/Users/katherine.b/Dropbox/Projects/IDConn/test2.png')
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 6d42c12..abab8f5 100644
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,8 @@
         "bctpy",
         "pybids",
         "networkx",
-        "matplotlib",  # necessary until nilearn includes mpl as a dependency
+        "matplotlib", # necessary until nilearn includes mpl as a dependency
+        "enlighten",  
     ],
     extras_require={
         "doc": [

From 49ae2746abb8d0ce9dcb6b1c7c2da181868e9c2f Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 6 Dec 2022 21:52:12 -0800
Subject: [PATCH 17/48] changed default color palette

---
 idconn/io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/idconn/io.py b/idconn/io.py
index 487844b..0930d06 100644
--- a/idconn/io.py
+++ b/idconn/io.py
@@ -303,7 +303,7 @@ def undo_vectorize(edges, num_node):
     X = X + X.T
     return X
 
-def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='icefire', node_size='strength'):
+def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='vlag', node_size='strength'):
     coords = plotting.find_parcellation_cut_coords(atlas_nii)
     num_node = adj.shape[0]
     # only plot the top t% of edges

From 5dd8ba725b13b571fcf489a344a29228fa994223 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 6 Dec 2022 22:05:32 -0800
Subject: [PATCH 18/48] nbs-predict runs

---
 idconn/workflows/nbs_predict.py | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 169531a..1436528 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -9,15 +9,18 @@
 today = datetime.today()
 today_str = strftime("%m_%d_%Y")
 
-TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset'
+TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/ds002674'
 TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset'
 DERIV_NAME = 'IDConn'
-OUTCOME = 'Mean E2 (pg/mL)'
+OUTCOME = 'estradiol'
+CONFOUNDS = ['bc']
+TASK = 'rest'
+ATLAS = 'craddock2012'
 atlas_fname = '/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz'
 
 layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
 
-dat = io.read_corrmats(layout, task='rest', atlas='craddock2012', z_score=False)
+dat = io.read_corrmats(layout, task=TASK, atlas=ATLAS, z_score=False)
 
 keep = dat['adj'].dropna().index
 dat = dat.loc[keep]
@@ -28,7 +31,7 @@
 upper_tri = np.triu_indices(num_node, k=1)
 
 outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]),1))
-confounds = dat[['bc', 'menst_cycle-day']]
+confounds = dat[CONFOUNDS]
 alpha = 0.1
 fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn'
 
@@ -56,5 +59,22 @@
 model = cv_results.loc[best]['model']
 model.fit(features, outcome)
 fig,fig2 = io.plot_edges(param_mat, atlas_fname, title=None, strength=True, cmap='icefire', node_size='strength')
-fig.savefig('/Users/katherine.b/Dropbox/Projects/IDConn/test1.png')
-fig2.savefig('/Users/katherine.b/Dropbox/Projects/IDConn/test2.png')
\ No newline at end of file
+fig.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_betas-{today_str}.png'), dpi=400)
+fig2.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_betas-strength-{today_str}.png'), dpi=400)
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, atlas=ATLAS, z_score=False)
+
+test_df.dropna(inplace=True)
+
+outcome_test = test_df[OUTCOME].values
+groups_test = outcome
+matrices_test = np.vstack(test_df['adj'].dropna().values).reshape((len(test_df['adj'].dropna().index),num_node,num_node))
+edges_test = np.vstack(test_df['edge_vector'].dropna().values)
+
+test_features = edges_test.T[mask,:]
+test_outcome = test_df[OUTCOME].values
+accuracy = model.score(test_features.T, test_outcome)
+print('Independent prediction accuracy:\t', accuracy)
+np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'accuracy-{today_str}.txt'), [accuracy])
\ No newline at end of file

From 2aa185feccff6a9d968cbc13a9796ca164682546 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 6 Dec 2022 22:11:34 -0800
Subject: [PATCH 19/48] remove unused options

---
 idconn/nbs.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/idconn/nbs.py b/idconn/nbs.py
index c5b4caa..41b63ca 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -78,8 +78,10 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000
     # vectorize_corrmats returns p x n^2
     # we want to run pynbs per edge
     # so vectorized edges must be transposed
-    
-    exog = np.hstack((outcome, confounds))
+    if confounds:
+        exog = np.hstack((outcome, confounds))
+    else:
+        exog = outcome
     exog = sm.add_constant(exog, prepend=False)
     # turn matrices into vectorized upper triangles
     if ndims > 2:
@@ -206,7 +208,7 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000
     else:
         return S1
 
-def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10, k=1000, shuffle=False, fig_dir=None):
+def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10):
     """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided
     of shape ((subject x session)x node x node)
     in the network.
@@ -289,6 +291,8 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
             train_a_idx = [m for m in train_idx if groups[m] == 0]
             train_b_idx = [m for m in train_idx if groups[m] == 1]
             regressor = LogisticRegression(max_iter=1000)
+        elif np.unique(outcome).shape[0] >2:
+            regressor = LogisticRegression(max_iter=1000)
         else:
             regressor = LinearRegression()
         train_mats = matrices[train_idx,:,:]

From f4b8531d48deb97de8aad5952d8fed24666dde23 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 6 Dec 2022 22:12:00 -0800
Subject: [PATCH 20/48] remove unused options

---
 idconn/workflows/nbs_predict.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 1436528..d3bfbb8 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -35,7 +35,7 @@
 alpha = 0.1
 fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn'
 
-cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=4, n_iterations=2, k=1000, shuffle=False, fig_dir=fig_dir)
+cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10000)
 
 cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t')
 best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0]

From 8228235495b169cf937c2a9479f4620a15f082ff Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 6 Dec 2022 22:15:15 -0800
Subject: [PATCH 21/48] auto-detect binary outcome and select logistic
 regression

---
 idconn/nbs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/idconn/nbs.py b/idconn/nbs.py
index 41b63ca..164eb68 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -23,7 +23,7 @@ def calc_number_of_nodes(matrices):
         num_node = matrices.shape[0]
     return num_node
 
-def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000, stratified=False):
+def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutations=10000, stratified=False):
     '''
     Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided
     of shape ((subject x session)x node x node)
@@ -78,7 +78,7 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000
     # vectorize_corrmats returns p x n^2
     # we want to run pynbs per edge
     # so vectorized edges must be transposed
-    if confounds:
+    if confounds is not None:
         exog = np.hstack((outcome, confounds))
     else:
         exog = outcome
@@ -291,7 +291,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
             train_a_idx = [m for m in train_idx if groups[m] == 0]
             train_b_idx = [m for m in train_idx if groups[m] == 1]
             regressor = LogisticRegression(max_iter=1000)
-        elif np.unique(outcome).shape[0] >2:
+        elif np.unique(outcome).shape[0] == 2:
             regressor = LogisticRegression(max_iter=1000)
         else:
             regressor = LinearRegression()

From 37f29b7f7a233b77c213f58f0946ff8f5b82cc89 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Wed, 7 Dec 2022 09:15:44 -0800
Subject: [PATCH 22/48] bypass regression if no significant edges

---
 idconn/nbs.py | 59 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 27 deletions(-)

diff --git a/idconn/nbs.py b/idconn/nbs.py
index 164eb68..9513eef 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -312,36 +312,41 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
         
         #cv_results.at[i, 'pval'] = pval
         cv_results.at[i, 'component'] = adj.values
+        
+        # in the event of no edges significantly related to <outcome>
+        if sum(adj) > 0:
+            # grab the values of the adjacency matrix that are just in the upper triangle
+            # so you don't have repeated edges
+            nbs_vector = adj.values[upper_tri]
+            # use those to make a "significant edges" mask
+            mask = nbs_vector == 1
 
-        # grab the values of the adjacency matrix that are just in the upper triangle
-        # so you don't have repeated edges
-        nbs_vector = adj.values[upper_tri]
-        # use those to make a "significant edges" mask
-        mask = nbs_vector == 1
+            # grab only the significant edges from testing and training sets of edges
+            # for use as features in the predictive models
+            train_features = edges[train_idx, :].T[mask]
+            test_features = edges[test_idx, :].T[mask]
 
-        # grab only the significant edges from testing and training sets of edges
-        # for use as features in the predictive models
-        train_features = edges[train_idx, :].T[mask]
-        test_features = edges[test_idx, :].T[mask]
 
-        # train model predicting outcome from brain (note: no mas covariates)
-        model = regressor.fit(X=train_features.T, y=train_y)
-        cv_results.at[i, 'model'] = model
-        # score that model on the testing data
-        score = model.score(X=test_features.T, y=test_y)
-        cv_results.at[i, 'score'] = score
+            # train model predicting outcome from brain (note: no mas covariates)
+            model = regressor.fit(X=train_features.T, y=train_y)
+            cv_results.at[i, 'model'] = model
+            # score that model on the testing data
+            score = model.score(X=test_features.T, y=test_y)
+            cv_results.at[i, 'score'] = score
 
-        m = 0
-        param_vector = np.zeros_like(nbs_vector)
-        for l in range(0, nbs_vector.shape[0]):
-            if nbs_vector[l] == 1.:
-                param_vector[l] = model.coef_[0,m]
-                m+=1
-            else:
-                pass
-        X = undo_vectorize(param_vector, num_node=num_node)
-        cv_results.at[i, 'coefficient_matrix'] = X
-        cv_results.at[i, 'coefficient_vector'] = param_vector
-        i += 1
+            m = 0
+            param_vector = np.zeros_like(nbs_vector)
+            for l in range(0, nbs_vector.shape[0]):
+                if nbs_vector[l] == 1.:
+                    param_vector[l] = model.coef_[0,m]
+                    m+=1
+                else:
+                    pass
+            X = undo_vectorize(param_vector, num_node=num_node)
+            cv_results.at[i, 'coefficient_matrix'] = X
+            cv_results.at[i, 'coefficient_vector'] = param_vector
+            i += 1
+        else:
+            pass
         ticks.update()
     return cv_results
\ No newline at end of file

From dfc470d87a182c5689f724b1f09327b901f35801 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Wed, 7 Dec 2022 09:25:41 -0800
Subject: [PATCH 23/48] testing nbs-predict (it runs)

---
 idconn/workflows/nbs_predict.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index d3bfbb8..0ab55c0 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -35,7 +35,7 @@
 alpha = 0.1
 fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn'
 
-cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10000)
+cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=1000)
 
 cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t')
 best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0]

From 26f69187b4c2b920a0d4d53f58c70252ad353570 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Wed, 7 Dec 2022 10:43:48 -0800
Subject: [PATCH 24/48] standardize output file names

---
 idconn/workflows/nbs_predict.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 0ab55c0..dfdae87 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -37,14 +37,14 @@
 
 cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=1000)
 
-cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t')
+cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t')
 best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0]
 subnetwork = cv_results.loc[best]['component']
 subnetwork_df = pd.DataFrame(subnetwork,
                              index=range(0,num_node), 
                              columns=range(0,num_node))
 
-subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_edge_parameters-{today_str}.tsv'),sep='\t')
+subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_edge-parameters-{today_str}.tsv'),sep='\t')
 
 nbs_vector = subnetwork[upper_tri]
 mask = nbs_vector == 1
@@ -59,8 +59,8 @@
 model = cv_results.loc[best]['model']
 model.fit(features, outcome)
 fig,fig2 = io.plot_edges(param_mat, atlas_fname, title=None, strength=True, cmap='icefire', node_size='strength')
-fig.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_betas-{today_str}.png'), dpi=400)
-fig2.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_betas-strength-{today_str}.png'), dpi=400)
+fig.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_betas-{today_str}.png'), dpi=400)
+fig2.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_betas-strength-{today_str}.png'), dpi=400)
 
 layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
 
@@ -77,4 +77,4 @@
 test_outcome = test_df[OUTCOME].values
 accuracy = model.score(test_features.T, test_outcome)
 print('Independent prediction accuracy:\t', accuracy)
-np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'accuracy-{today_str}.txt'), [accuracy])
\ No newline at end of file
+np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_accuracy-{today_str}.txt'), [accuracy])
\ No newline at end of file

From c8ec46f4bce0dcf749ade1aa7aa418b0d254e113 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Wed, 7 Dec 2022 20:35:58 -0800
Subject: [PATCH 25/48] add correlation, fix logistic conditional

---
 idconn/workflows/nbs_predict.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index dfdae87..b09fac7 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -5,6 +5,7 @@
 from os.path import join
 from datetime import datetime
 from time import strftime
+from scipy.stats import spearmanr
 
 today = datetime.today()
 today_str = strftime("%m_%d_%Y")
@@ -31,7 +32,10 @@
 upper_tri = np.triu_indices(num_node, k=1)
 
 outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]),1))
-confounds = dat[CONFOUNDS]
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+else:
+    confounds = None
 alpha = 0.1
 fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn'
 
@@ -55,7 +59,7 @@
 odds = 10 ** param_mat 
 prob = odds / (1 + odds)
 
-# run the model on the whole 28andMe dataset to get params
+# run the model on the whole test dataset to get params
 model = cv_results.loc[best]['model']
 model.fit(features, outcome)
 fig,fig2 = io.plot_edges(param_mat, atlas_fname, title=None, strength=True, cmap='icefire', node_size='strength')
@@ -75,6 +79,17 @@
 
 test_features = edges_test.T[mask,:]
 test_outcome = test_df[OUTCOME].values
-accuracy = model.score(test_features.T, test_outcome)
-print('Independent prediction accuracy:\t', accuracy)
-np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_accuracy-{today_str}.txt'), [accuracy])
\ No newline at end of file
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+score = model.score(test_features.T, test_outcome)
+print('Independent prediction accuracy:\t', score)
+pred_outcome = model.predict(test_features.T)
+if len(np.unique(test_outcome)) > 2:
+    corr = spearmanr(test_outcome, pred_outcome)
+    print('\nSpearman correlation:\t', corr)
+    np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score, corr[0], corr[1]])
+else: 
+    np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score])
+

From e83affa503ecb01a654f9f4e989e4d2a5c2a49e7 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 8 Dec 2022 17:02:12 -0800
Subject: [PATCH 26/48] remove out_dir, add docs

---
 idconn/connectivity.py | 54 ++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 28 deletions(-)

diff --git a/idconn/connectivity.py b/idconn/connectivity.py
index e54914b..cf14137 100644
--- a/idconn/connectivity.py
+++ b/idconn/connectivity.py
@@ -19,35 +19,37 @@ def _check_dims(matrix):
         raise ValueError('Expected a square matrix, got array of shape'
                          ' {0}.'.format(matrix.shape))
 
-def task_connectivity(layout, subject, session, task, atlas, confounds, connectivity_metric='correlation', out_dir=None):
+def task_connectivity(layout, subject, session, task, atlas, confounds, connectivity_metric='correlation'):
     """
     Makes connectivity matrices per subject per session per task per condition.
     Parameters
     ----------
-    dset_dir : str
-        BIDS-formatted dataset path (top-level, in which a 'derivatives/' directory will be made if one does not exist)
+    layout : BIDSLayout object
+        BIDSLayout (i.e., pybids layout object) for directory containing data for analysis (with `derivative=True`, as we're using fmriprep output).
     subject : str
         Subject ID for which the networks will be calculated.
     session : str, optional
-        Session of data collection. If there's only one session, we'll find it.
+        Session of data collection for which networks will be calculated. If there's only one session, we'll find it.
     task : str
-        Name of task fMRI scan from which networks will be calculated.
+        Name of task fMRI scan (can be "rest") from which networks will be calculated.
     connectivity_metric : {"correlation", "partial correlation", "tangent",\
                            "covariance", "precision"}, optional
-        The matrix kind. Passed to Nilearn's `ConnectivityMeasure`.
+        The matrix kind. Passed to Nilearn's `ConnectivityMeasure`. Default is product-moment correlation, "correlation".
     space : str
-        'native' if analyses will be performed in subjects' functional native space (atlas(es) should be transformed)
-        'mni152-2mm' if analyses will be performed in MNI125 2mm isotropic space (fMRI data should already be transformed)
+        'native' if analyses will be performed in subjects' functional native space (atlas(es) should be transformed into this space already).
+        'mni152-2mm' if analyses will be performed in MNI125 2mm isotropic space (fMRI data should already be transformed into MNI space).
     atlas : str
         If you want to grab an atlas using Nilearn, this is the name of the atlas and 
         must match the corresponding function `fetch_atlas_[name]` in `nilearn.datasets`. 
-        If you have your own atlas, this is the path to that nifti file.`
+        If you have your own atlas, this is the path to that nifti file. Currently: only works with paths. 
     confounds : list-like
-        Filenames of confounds files.
+        Columns from fMRIPrep confounds output to be regressed out of fMRI data before correlation matrices are made.
     Returns
     -------
-    confounds_file : str
-        Filename of merged confounds .tsv file
+    avg_corrmats: numpy array
+        Average corrmat (per condition, if applicable).
+    files : list
+        Filenames of computed correlation matrices.
     """
     #version = '0.1.1'
     try:
@@ -57,10 +59,8 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti
     if '.nii' in atlas:
         assert exists(atlas), f'Mask file does not exist at {atlas}'
     
-    if not out_dir:
-        deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}')
-    else:
-        deriv_dir = out_dir
+    deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}')
+    
     space = 'MNI152NLin2009cAsym'
     atlas_name = basename(atlas).rsplit('.', 2)[0]
     # use pybids here to grab # of runs and preproc bold filenames
@@ -163,14 +163,14 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti
             print('saving corrmat...', e)
     return files, avg_corrmats
 
-def rest_connectivity(layout, subject, session, task, atlas, connectivity_metric='correlation', confounds=None, out_dir=None):
+def rest_connectivity(layout, subject, session, task, atlas, confounds=None,connectivity_metric='correlation'):
 
     """
     Makes connectivity matrices per subject per session per task per condition.
     Parameters
     ----------
     layout : str
-        BIDS layout with derivatives indexed from pyBIDS
+        BIDS layout with fMRIPrep derivatives indexed from pyBIDS
     subject : str
         Subject ID for which the networks will be calculated.
     session : str, optional
@@ -178,16 +178,17 @@ def rest_connectivity(layout, subject, session, task, atlas, connectivity_metric
     connectivity_metric : {"correlation", "partial correlation", "tangent",\
                            "covariance", "precision"}, optional
         The matrix kind. Passed to Nilearn's `ConnectivityMeasure`.
-    space : str
-        'native' if analyses will be performed in subjects' functional native space (atlas(es) should be transformed)
-        'mni152-2mm' if analyses will be performed in MNI125 2mm isotropic space (fMRI data should already be transformed)
     atlas : str
-        Name of atlas for parcellating voxels into nodes, must be in the same `space` given above.
+        Name of atlas for parcellating voxels into nodes, must be in the same `space` as preprocessed rsfMRI data from fMRIPrep.
     confounds : list-like
         Names of confounds (should be columns in fmriprep output confounds.tsv).
     Returns
     -------
-    adjacency_matrix
+    corrmat_df : Pandas dataframe 
+        Functional connectivity matrix with labeled nodes (i.e., rows, columns) and weighted edges (i.e., elements) based on
+        the connectivity metric selected. If multiple runs, represents average across runs.
+    corrmat_file : str
+        Path to saved correlation matrix.
     """
     try:
         version = get_versions()["version"]
@@ -196,16 +197,13 @@ def rest_connectivity(layout, subject, session, task, atlas, connectivity_metric
     if '.nii' in atlas:
         assert exists(atlas), f'Mask file does not exist at {atlas}'
     
-    if not out_dir:
-        deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}')
-    else:
-        deriv_dir = out_dir
+    deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}')
     atlas_name = basename(atlas).rsplit('.', 2)[0]
     # use pybids here to grab # of runs and preproc bold filenames
     connectivity_measure = connectome.ConnectivityMeasure(kind=connectivity_metric)
     bold_files = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz') # should be preprocessed BOLD file from fmriprep, grabbed with pybids
     print(f'BOLD files found at {bold_files}')
-    confounds_files = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task)
+    #confounds_files = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task)
 
     runs = []
     if len(bold_files) > 1:

From 8e119f622cdf5c68dbd1fb8e01717eecb6100d71 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 8 Dec 2022 17:13:53 -0800
Subject: [PATCH 27/48] removed unused params, added docs

---
 idconn/nbs.py | 75 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/idconn/nbs.py b/idconn/nbs.py
index 9513eef..e9d59fe 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -3,7 +3,7 @@
 import networkx as nx
 import pandas as pd
 from idconn.io import vectorize_corrmats, undo_vectorize
-from scipy.stats import t
+from scipy.stats import t, pearsonr, pointbiserialr, spearmanr
 import enlighten
 #import bct
 
@@ -23,7 +23,7 @@ def calc_number_of_nodes(matrices):
         num_node = matrices.shape[0]
     return num_node
 
-def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutations=10000, stratified=False):
+def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutations=10000):
     '''
     Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided
     of shape ((subject x session)x node x node)
@@ -52,10 +52,7 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
     permutations : int
         If `predict=False`, specifies the number of permutations run to create a null distribution
         for estimating the significance of the connected component size. Recommended 10,000.
-    stratified : bool or list-like of shape (p,)
-        If `predict=True` and there are groups that should be equally sampled across k-fold 
-        cross-validation, input should be a list of group belonging (i.e., one label per participant).
-
+    
     Returns
     -------
     S1 : Pandas dataframe
@@ -69,7 +66,7 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
     # and retain significant edges
     # then find the largest connected component
     # and, if not predict, build a null distribution
-    n = matrices.shape[:-1]
+    n = matrices.shape[0]
     ndims = len(matrices.shape)
     #print(ndims)
     #if ndims >=2
@@ -98,10 +95,17 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
     # 0 if it's not
     sig_edges = []
     for i in range(0, edges.shape[0]):
+        y = edges[i,:]
         # statsmodels for regressing predictors on edges
-        mod = sm.OLS(edges[i,:], exog, hasconst=True)
-        results = mod.fit()
-        edge_pval = results.pvalues[0]
+        #mod = sm.OLS(y, exog, hasconst=True)
+        #results = mod.fit()
+        #edge_pval = results.pvalues[0]
+        
+        # let's try straight up correlations?
+        if len(np.unique(outcome)) > 2:
+            r, edge_pval = pearsonr(outcome.reshape(n,), y.reshape(n,))
+        else:
+            r, edge_pval = pointbiserialr(outcome.reshape(n,), y.reshape(n,))
         
         # build binary significance edge vector
         if edge_pval < alpha:
@@ -148,9 +152,9 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
     # and NBS might need all nodes for easier
     # plotting in brain space
     for i in unused_nodes:
-        S1.loc[i] = 0
+        S1.loc[i] = 0.0
         temp = S1.copy()
-        temp[i] = 0
+        temp[i] = 0.0
         S1 = temp.copy()
     
     S1.sort_index(axis=0, inplace=True)
@@ -160,7 +164,6 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
     # only for regular NBS, -Predict doesn't need this
     if predict == False:
         perms = np.zeros((permutations,))
-        hit = 0
         rng = np.random.default_rng()
         exog_copy = exog.copy()
         for i in range(0, permutations):
@@ -208,7 +211,7 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
     else:
         return S1
 
-def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10):
+def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10, n_iterations=10):
     """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided
     of shape ((subject x session)x node x node)
     in the network.
@@ -226,14 +229,28 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
         an array of symmetric matrices.
     outcome : list-like of shape (p,)
         Y-value to be predicted with connectivity
+    confounds : list-like
+        Names of columns in `participants.tsv` to be regressed out of connectivity and outcome 
+        data in each CV fold (per recommendation from Snoek et al., 2019).
+    alpha : float
+        Proportion of type II errors (i.e., false positives) we're willing to put up with. 
+        This is the upper limit for pvalues in the edge detection process.
     groups : list-like of shape (p,)
-        Grouping variable - currently only works for 2 groups
+        Grouping variable - currently only works for 2 groups. Will enforce stratified k-fold CV.
+    n_splits : int
+        Value of K for K-fold cross-validation. Will split data into K chunks, train on K-1 chunks and test on the Kth.
+    n_iterations : int
+        Number of times to run K-fold cross-validation. More times = more stable results.
     
     Returns
     -------
+    weighted_average : Pandas dataframe
+        Includes the average of all largest components across folds and iterations, weighted by
+        their prediction performance (i.e., accuracy for binary outcome, correlation for continuous).
+        Could be used for out-of-sample prediction, once thresholded and binarized.
     cv_results : Pandas dataframe
-        Includes the results of each cross-validation loop
-        the input matrices.
+        Includes the results of each cross-validation loop 
+        (e.g., predictive performance, data split, largest connected component per fold per iteration).
     """
     edges = vectorize_corrmats(matrices)
     #print(edges.shape)
@@ -256,11 +273,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
                         n_repeats=n_iterations)
         dof = edges.shape[0] - 1
     
-    if tail == 'both':
-        alpha = 0.01
-    else:
-        alpha = 0.005
-    t_threshold = t.ppf(1 - alpha, df=dof)
+    #t_threshold = t.ppf(1 - alpha, df=dof)
     
     # really can't remember why tf I did this?
     # maybe it's an artifact of permuted_ols?
@@ -302,7 +315,10 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
         train_y = outcome[train_idx]
         test_y = outcome[test_idx]
 
-        train_confounds = confounds.values[train_idx]
+        if confounds is not None:
+            train_confounds = confounds.values[train_idx]
+        else:
+            train_confounds = None
         #test_confounds = confounds.values[test_idx]
         
         # perform NBS wooooooooo
@@ -314,24 +330,25 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s
         cv_results.at[i, 'component'] = adj.values
         
         # in the event of no edges significantly related to <outcome>
-        if sum(adj) > 0:
+        #print(sum(sum(adj.values)), '\n', adj.values.shape)
+        if sum(sum(adj.values)) > 0:
             # grab the values of the adjacency matrix that are just in the upper triangle
             # so you don't have repeated edges
             nbs_vector = adj.values[upper_tri]
             # use those to make a "significant edges" mask
-            mask = nbs_vector == 1
+            mask = nbs_vector == 1.0
 
             # grab only the significant edges from testing and training sets of edges
             # for use as features in the predictive models
             train_features = edges[train_idx, :].T[mask]
             test_features = edges[test_idx, :].T[mask]
 
-
             # train model predicting outcome from brain (note: no mas covariates)
-            model = regressor.fit(X=train_features.T, y=train_y)
-            cv_results.at[i, 'model'] = model
+            #print(train_features.T.shape, train_y.shape)
+            model = regressor.fit(X=train_features.T, y=train_y.ravel())
+            #cv_results.at[i, 'model'] = model
             # score that model on the testing data
-            score = model.score(X=test_features.T, y=test_y)
+            score = model.score(X=test_features.T, y=test_y.ravel())
             cv_results.at[i, 'score'] = score
 
             m = 0

From 8336a8b09560bf7ca7328671deab7f8338bb2f45 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 9 Mar 2023 19:50:18 -0800
Subject: [PATCH 28/48] commit before overhauling nbs.py

---
 idconn/nbs.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/idconn/nbs.py b/idconn/nbs.py
index e9d59fe..cbcc395 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -8,6 +8,7 @@
 #import bct
 
 from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold
+from sklearn.feature_selection import f_regression, f_classif
 from sklearn.linear_model import LogisticRegression, LinearRegression
 
 def calc_number_of_nodes(matrices):
@@ -115,17 +116,17 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
     
     # find largest connected component of sig_edges
     # turn sig_edges into an nxn matrix first
-    sig_matrix = undo_vectorize(sig_edges, num_node) # need to write this function
+    sig_matrix = undo_vectorize(sig_edges, num_node)
+
+    # turn it into a networkx matrix
     matrix = nx.from_numpy_array(sig_matrix)
     
     #use networkX to find connected components
-    comps = nx.connected_components(matrix)
+    largest_cc = max(nx.connected_components(matrix), key=len)
+    G0 = G.subgraph(largest_cc)
+    
+    # grab number of edges from G0
     
-    # rearrange networkx output into an array of matrices, S
-    S = [matrix.subgraph(c).copy() for c in comps]
-    # find size of each connected component, s in S
-    size = np.asarray([s.number_of_edges() for s in S])
-    (max_comp, ) = np.where(size == max(size))
     largest_comp_size = max(size)
     if predict == False:
         print(f'Connected component has {largest_comp_size} edges.')
@@ -187,7 +188,9 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
             perm_matrix = undo_vectorize(perm_edges, num_node) # need to write this function
             perm_nx = nx.from_numpy_array(perm_matrix)
 
-            comps = nx.connected_components(perm_nx)
+            #comps = nx.connected_components(perm_nx)
+
+            
 
             S = [perm_nx.subgraph(c).copy() for c in comps]
             perm_size = np.asarray([s.number_of_edges() for s in S])

From a3869a07ae48e44ca5031510c8c209fa19171eeb Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 9 Mar 2023 19:51:04 -0800
Subject: [PATCH 29/48] add docstrings to io

---
 idconn/io.py | 79 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 67 insertions(+), 12 deletions(-)

diff --git a/idconn/io.py b/idconn/io.py
index 0930d06..7690615 100644
--- a/idconn/io.py
+++ b/idconn/io.py
@@ -19,6 +19,7 @@ def build_statsmodel_json(name, task, contrast, confounds, highpass,
                           mask, conn_meas, graph_meas=None, exclude=None, outfile=None):
     '''
     Creates a BIDS Stats Models json with analysis details for further use.
+    DOES NOT WORK YET.
 
     Parameters
     ----------
@@ -202,16 +203,25 @@ def vectorize_corrmats(matrices):
     edge_vector = np.asarray(edge_vector)
     return edge_vector
 
-def read_corrmats(layout, task, deriv_name='IDConn', atlas=None, conf_measures=None, z_score=True, vectorized=True, verbose=False):
+def read_corrmats(layout, task, deriv_name='IDConn', z_score=True, vectorized=True, verbose=False):
     """Returns a node x node x (subject x session) matrix of correlation matrices  
     from a BIDS derivative folder. Optionally returns a subject x session dataframe
     of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) 
     array of vectorized upper triangles of those correlation matrices.
     Parameters
     ----------
-    matrices : numpy array of shape (n, n, p)
-        Represents the link strengths of the graphs. Assumed to be
-        an array of symmetric matrices.
+    layout : BIDSLayout object
+        BIDSLayout (i.e., pybids layout object) for directory containing data for analysis (with `derivative=True`, as we're using fmriprep output).
+    task : str
+        Name of task fMRI scan (can be "rest") from which networks will be calculated.
+    deriv_name : str
+        Name of the package used to generate the correlation matrices to be read. Could be IDConn, could be something else.
+    z_score : bool
+        If True, assumes computed connectivity matrices are product-moment correlations, uses Fisher's r-to-Z.
+    vectorized : bool
+        Would you also like this function to return the vectorized upper triangles of all your matrices?
+    verbose : bool
+        Print statements? Y/N?
     
     Returns
     -------
@@ -296,6 +306,20 @@ def read_corrmats(layout, task, deriv_name='IDConn', atlas=None, conf_measures=N
     return ppt_df
 
 def undo_vectorize(edges, num_node):
+    '''
+    Puts an edge vector back into an adjacency matrix.
+    Parameters
+    ----------
+    edges : list-like of shape ((n^2-n)/2,) 
+        Vectorized upper triangle of an adjacency matrix.
+    num_node : int
+        The number of nodes in the graph. I would calculate this myself, but I'd rather not.
+    
+    Returns
+    -------
+    matrix : numpy array of size (n,n)
+        Symmetric array of connectivity values.
+    '''
     #j = len(edges)
     #num_node = (np.sqrt((8 * j) + 1) + 1) / 2
     X = np.zeros((num_node,num_node))
@@ -303,7 +327,37 @@ def undo_vectorize(edges, num_node):
     X = X + X.T
     return X
 
-def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='vlag', node_size='strength'):
+def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='coolwarm', node_size='strength'):
+    '''
+    Plots the edges of a connectivity/adjacency matrix both in a heatmap and in brain space, with the option to include
+    a surface plot of node strength.
+    Parameters
+    ----------
+    adj : array-like of shape (n, n) 
+        Adjacency matrix to be plotted. Can be numpy array or Pandas dataframe.
+    atlas_nii : str
+        Path to the atlas used to define nodes in the adjacency matrix. 
+        Should be one value per node, with the same number of values as rows and columns in adj (i.e., n).
+        Background should be 0, should be in MNI space.
+    threshold : int
+        Percentile of edges to plot, between 0 and 100 such that 0 plots all the edges and 100 plots none. 
+        If not specified, default is 99, which plots the top 1% of edges.
+    title : str
+        Title for plots. 
+    strength : bool
+        If True, plots surface maps of node strength (i.e., the sum of all a node's edge weights) 
+    cmap : str
+        One of the matplotlib colormaps. 
+    node_size : int or 'strength'
+        Size to plot nodes in brain space. If 'strength', node size varies according to a node's summed edges (i.e., strength).
+    
+    Returns
+    -------
+    fig1 : Matplotlib figure object
+        Connectivity figure.
+    fig2 : Matplotlib figure object
+        If `strength=True`,  the surface node strength plot.
+    '''
     coords = plotting.find_parcellation_cut_coords(atlas_nii)
     num_node = adj.shape[0]
     # only plot the top t% of edges
@@ -331,12 +385,12 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap=
     g = plotting.plot_connectome(adj, coords, 
                                 node_size=node_size,
                                 edge_threshold=threshold, 
-                                edge_cmap=cmap, 
+                                edge_cmap='coolwarm', 
                                 figure=fig, 
                                 axes=ax0,
                                 colorbar=False, 
                                 annotate=False)
-    h = sns.heatmap(adj, square=True, cmap=cmap, ax=ax1, center=0)
+    h = sns.heatmap(adj, square=True, cmap='coolwarm', ax=ax1, center=0)
     if strength:
         fig2 = plt.figure(figsize=(12,4))
         if title is not None:
@@ -347,7 +401,8 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap=
         for i in np.arange(0,num_node):
             regn_sch_arr[np.where(regn_sch_arr == i+1)] = np.sum(adj[i])
         strength_nimg = nib.Nifti1Image(regn_sch_arr, nimg.affine)
-        nib.save(strength_nimg, '/Users/katherine.b/Dropbox/HC_Use_predictive-strength.nii')
+        # replace this filename with BIDSy output
+        #nib.save(strength_nimg, f'/Users/katherine.b/Dropbox/{title}predictive-strength.nii')
 
         gs = GridSpec(1, 4)
         # plot edge weights on surfaces
@@ -361,13 +416,13 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap=
 
         plt.tight_layout(w_pad=-1)
         i = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5,
-                                                cmap=cmap, view='lateral', colorbar=False, axes=ax2)
+                                                cmap='coolwarm', view='lateral', colorbar=False, axes=ax2)
         j = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5,
-                                                cmap=cmap, view='medial', colorbar=False, axes=ax3)
+                                                cmap='coolwarm', view='medial', colorbar=False, axes=ax3)
         k = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5,
-                                                cmap=cmap, view='lateral', colorbar=False, axes=ax4)
+                                                cmap='coolwarm', view='lateral', colorbar=False, axes=ax4)
         l = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5,
-                                                cmap=cmap, view='medial', colorbar=False, axes=ax5)
+                                                cmap='coolwarm', view='medial', colorbar=False, axes=ax5)
         return fig, fig2
     else:
         return fig
\ No newline at end of file

From cadd0d40811500b9f3da296384956e77a4e982c8 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 9 Mar 2023 19:52:50 -0800
Subject: [PATCH 30/48] rename task/rest conn modules

---
 idconn/pipeline.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/idconn/pipeline.py b/idconn/pipeline.py
index 38c0ccd..8c82eea 100644
--- a/idconn/pipeline.py
+++ b/idconn/pipeline.py
@@ -23,7 +23,7 @@
 from os.path import exists
 #from glob import glob
 #from nilearn import input_data, connectome, plotting, image
-from idconn.connectivity import connectivity, task_connectivity
+from idconn.connectivity import rest_connectivity, task_connectivity
 from idconn.parser_utils import is_valid_file, is_valid_path
 
 #from idconn.networking import graph_theory, null_distribution
@@ -116,12 +116,12 @@ def idconn_workflow(dset_dir, atlas, task, out_dir, space="MNI152NLin2009cAsym",
             print(f"here are the inputs: {layout, subject, session, task, atlas, conn, space, confounds}")
             if 'rest' in task:
                 try:
-                    adj_matrix = connectivity(layout, subject, session, task, atlas, conn, space, confounds)
+                    adj_matrix = rest_connectivity(layout, subject, session, task, atlas, conn, space, confounds)
                 except Exception as e:
                     print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}')
             if len(conditions) < 1:
                 try:
-                    adj_matrix = connectivity(layout, subject, session, task, atlas, conn, space, confounds)
+                    adj_matrix = rest_connectivity(layout, subject, session, task, atlas, conn, space, confounds)
                 except Exception as e:
                     print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}')
             else:

From adc613968c0537ebc6349caf794467d90b5826b3 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 9 Mar 2023 19:53:45 -0800
Subject: [PATCH 31/48] update nbs_predict script

---
 idconn/workflows/nbs_predict.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index b09fac7..01b5e8f 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -11,12 +11,13 @@
 today_str = strftime("%m_%d_%Y")
 
 TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/ds002674'
-TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset'
+TEST_DSET = '/Users/katherine.b/Dropbox/Data/ds002674'
 DERIV_NAME = 'IDConn'
 OUTCOME = 'estradiol'
-CONFOUNDS = ['bc']
+CONFOUNDS = None
 TASK = 'rest'
 ATLAS = 'craddock2012'
+alpha = 0.01
 atlas_fname = '/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz'
 
 layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
@@ -36,10 +37,9 @@
     confounds = dat[CONFOUNDS]
 else:
     confounds = None
-alpha = 0.1
-fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn'
 
-cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=1000)
+
+cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10)
 
 cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t')
 best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0]
@@ -84,11 +84,11 @@
 # if the model is a linear regression, i.e., with a continuous outcome
 # then the score is R^2 (coefficient of determination)
 score = model.score(test_features.T, test_outcome)
-print('Independent prediction accuracy:\t', score)
+print('Out-of-sample prediction score:\t', score)
 pred_outcome = model.predict(test_features.T)
 if len(np.unique(test_outcome)) > 2:
     corr = spearmanr(test_outcome, pred_outcome)
-    print('\nSpearman correlation:\t', corr)
+    print(f'\nSpearman correlation between predicted and actual {OUTCOME}:\t', corr)
     np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score, corr[0], corr[1]])
 else: 
     np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score])

From afb2308cf4453203a7a41baa0b284b6ffc4f8fce Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 9 Mar 2023 19:56:57 -0800
Subject: [PATCH 32/48] update nbspy, use f_classif/f_regression, so fast!

---
 idconn/nbs.py | 128 ++++++++++++++++++++------------------------------
 1 file changed, 51 insertions(+), 77 deletions(-)

diff --git a/idconn/nbs.py b/idconn/nbs.py
index cbcc395..abf6885 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -24,7 +24,7 @@ def calc_number_of_nodes(matrices):
         num_node = matrices.shape[0]
     return num_node
 
-def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutations=10000):
+def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations=10000):
     '''
     Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided
     of shape ((subject x session)x node x node)
@@ -46,7 +46,7 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
     confounds : list-like of shape (p,m)
         Covariates, included as predictors in model.
     alpha : float
-        Type-I error (i.e., false positive) rate, for outcome-related edge detection.
+        Type-I error (i.e., false positive) rate, for outcome-related edge detection. Default = 0.05
     predict : bool
         If True, bypasses `permutations` parameter and only runs edge detection + component identification.
         Used for NBS-Predict.
@@ -67,85 +67,64 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
     # and retain significant edges
     # then find the largest connected component
     # and, if not predict, build a null distribution
-    n = matrices.shape[0]
+    n = matrices.shape[:-1]
     ndims = len(matrices.shape)
-    #print(ndims)
-    #if ndims >=2
-    num_node = calc_number_of_nodes(matrices)
     
     # vectorize_corrmats returns p x n^2
     # we want to run pynbs per edge
     # so vectorized edges must be transposed
+    
     if confounds is not None:
-        exog = np.hstack((outcome, confounds))
+        #regress out the confounds, use the residuals for the rest of the algorithm
+        pass
     else:
-        exog = outcome
-    exog = sm.add_constant(exog, prepend=False)
+        pass
+    exog = outcome
+    
     # turn matrices into vectorized upper triangles
     if ndims > 2:
         edges = vectorize_corrmats(matrices)
     else:
-        raise ValueError(f'Input matrices have shape {matrices.shape},',
-                             'pyNBS requires matrices of shape (subject x session) x node x node.')
-    edges = edges.T
-    #print(f'\n\n\n{edges.shape}\n\n\n')
+        edges = matrices.copy()
+    #edges = edges.T
     
     # run an ols per edge
     # create significancs matrix for predictor of interest (outcome)
     # 1 if edge is significantly predicted by outcome
     # 0 if it's not
-    sig_edges = []
-    for i in range(0, edges.shape[0]):
-        y = edges[i,:]
-        # statsmodels for regressing predictors on edges
-        #mod = sm.OLS(y, exog, hasconst=True)
-        #results = mod.fit()
-        #edge_pval = results.pvalues[0]
-        
-        # let's try straight up correlations?
-        if len(np.unique(outcome)) > 2:
-            r, edge_pval = pearsonr(outcome.reshape(n,), y.reshape(n,))
-        else:
-            r, edge_pval = pointbiserialr(outcome.reshape(n,), y.reshape(n,))
-        
-        # build binary significance edge vector
-        if edge_pval < alpha:
-            sig_edges.append(1)
-        else:
-            sig_edges.append(0)
+    
+    if len(np.unique(exog)) < 5:
+        (f, p) = f_classif(edges, exog)
+    else:
+        (f, p) = f_regression(edges, exog, center=False)
+    sig_edges = np.where(p < alpha, 1, 0)
     
     # find largest connected component of sig_edges
     # turn sig_edges into an nxn matrix first
-    sig_matrix = undo_vectorize(sig_edges, num_node)
-
-    # turn it into a networkx matrix
+    sig_matrix = undo_vectorize(sig_edges) # need to write this function
     matrix = nx.from_numpy_array(sig_matrix)
     
     #use networkX to find connected components
     largest_cc = max(nx.connected_components(matrix), key=len)
-    G0 = G.subgraph(largest_cc)
+    G0 = matrix.subgraph(largest_cc)
+    print(G0)
     
-    # grab number of edges from G0
-    
-    largest_comp_size = max(size)
-    if predict == False:
-        print(f'Connected component has {largest_comp_size} edges.')
-    else:
-        pass
-
     # retain size of largest connected component 
     # for NBS permutation-based significance testing
-    max_comp = max_comp[0]
+    max_comp = G0.number_of_edges()
+    print(f'Connected component has {max_comp} edges.')
+
+    
+    
 
     # pull the subgraph with largest number of nodes
     # i.e., the largest connected component
-    G = S[max_comp]
-
+    
     # grab list of nodes in largest connected component
-    nodes = list(G.nodes)
+    nodes = list(G0.nodes)
     
     unused_nodes = list(set(matrix.nodes) - set(nodes))
-    S1 = nx.to_pandas_adjacency(G, nodelist=nodes)
+    S1 = nx.to_pandas_adjacency(G0, nodelist=nodes)
 
     # add empty edges for unused nodes
     # bc NBS-Predict needs all nodes for
@@ -153,11 +132,9 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
     # and NBS might need all nodes for easier
     # plotting in brain space
     for i in unused_nodes:
-        S1.loc[i] = 0.0
-        temp = S1.copy()
-        temp[i] = 0.0
-        S1 = temp.copy()
-    
+        S1.loc[i] = 0
+        S1[i] = 0
+
     S1.sort_index(axis=0, inplace=True)
     S1.sort_index(axis=1, inplace=True)
     
@@ -165,50 +142,47 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat
     # only for regular NBS, -Predict doesn't need this
     if predict == False:
         perms = np.zeros((permutations,))
+        hit = 0
         rng = np.random.default_rng()
         exog_copy = exog.copy()
         for i in range(0, permutations):
             # shuffle outcome order
             rng.shuffle(exog_copy, axis=0)
             #print(exog_copy)
-            perm_edges = []
-            for j in range(0, edges.shape[0]):
-                # statsmodels for regressing predictors on edges
-                mod = sm.OLS(edges[j,:], exog_copy, hasconst=False)
-                results = mod.fit()
-                edge_pval = results.pvalues[0]
-                
-                if edge_pval < alpha:
-                    perm_edges.append(1)
-                else:
-                    perm_edges.append(0)
+            
+            if len(np.unique(exog)) < 5:
+                (f1, p1) = f_classif(edges, exog_copy)
+            else:
+                (f1, p1) = f_regression(edges, exog_copy, center=False)
+            
+            perm_edges = np.where(p1 < alpha, 1, 0)
+            
             #print(np.sum(perm_edges))
             # find largest connected component of sig_edges
             # turn sig_edges into an nxn matrix first
-            perm_matrix = undo_vectorize(perm_edges, num_node) # need to write this function
+            perm_matrix = undo_vectorize(perm_edges) # need to write this function
             perm_nx = nx.from_numpy_array(perm_matrix)
 
-            #comps = nx.connected_components(perm_nx)
+            largest_cc = max(nx.connected_components(perm_nx), key=len)
+            S = perm_nx.subgraph(largest_cc)
 
+            perm_comp_size = S.number_of_edges()
             
 
-            S = [perm_nx.subgraph(c).copy() for c in comps]
-            perm_size = np.asarray([s.number_of_edges() for s in S])
-            (max_comp, ) = np.where(perm_size == max(perm_size))
-            #print(perm_size, max_comp)
-
             # retain for null distribution
-            perms[i] = max(perm_size)
-            if i % 10 == 0:
-                print(f'p-value is {np.size(np.where(perms >= largest_comp_size)) / permutations} as of permutation {i}')
+            perms[i] = perm_comp_size
+            if i == 0:
+                pass
+            elif i % 100 == 0:
+                print(f'p-value is {np.round(np.sum(np.where(perms >= max_comp, 1, 0)) / i, 3)} as of permutation {i}')
             
             # bctpy nbs code uses hit to mark progress across permutations
             # prob not necessary?
         
         # bctpy calcs pval for all components, not just largest?
         # but I don't think that's relevant for the og implimentation of nbs?
-        pval = np.size(np.where(perms >= largest_comp_size)) / permutations
-        print(largest_comp_size, permutations, pval)
+        pval = np.size(np.where(perms >= max_comp)) / permutations
+        print(max_comp, permutations, pval)
         
         return pval, S1, perms
     else:

From d49847e49e52fa4a1f0d6e8ff1a871bfbe8d96e2 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Wed, 29 Mar 2023 09:37:55 -0700
Subject: [PATCH 33/48] nbs-predict works now

---
 CONTRIBUTING.md                 |   0
 idconn/__init__.py              |   2 +-
 idconn/io.py                    |  95 +++++++-----
 idconn/nbs.py                   | 200 +++++++++++++++++--------
 idconn/workflows/nbs_predict.py | 257 +++++++++++++++++++++++++++-----
 5 files changed, 412 insertions(+), 142 deletions(-)
 create mode 100644 CONTRIBUTING.md

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..e69de29
diff --git a/idconn/__init__.py b/idconn/__init__.py
index 000932b..6915dae 100644
--- a/idconn/__init__.py
+++ b/idconn/__init__.py
@@ -18,7 +18,7 @@
     # from . import preprocessing
     # from . import statistics
     # from . import utils
-    # from . import io
+    from . import io
 
     __version__ = get_versions()["version"]
 
diff --git a/idconn/io.py b/idconn/io.py
index 7690615..b14abb6 100644
--- a/idconn/io.py
+++ b/idconn/io.py
@@ -15,6 +15,7 @@
 from nilearn import datasets, plotting, surface
 
 
+
 def build_statsmodel_json(name, task, contrast, confounds, highpass, 
                           mask, conn_meas, graph_meas=None, exclude=None, outfile=None):
     '''
@@ -203,43 +204,47 @@ def vectorize_corrmats(matrices):
     edge_vector = np.asarray(edge_vector)
     return edge_vector
 
-def read_corrmats(layout, task, deriv_name='IDConn', z_score=True, vectorized=True, verbose=False):
+def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True, verbose=False):
     """Returns a node x node x (subject x session) matrix of correlation matrices  
-    from a BIDS derivative folder. Optionally returns a subject x session dataframe
-    of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) 
+    from a BIDS derivative folder. Optionally returns a node^2 x (subject x session) 
     array of vectorized upper triangles of those correlation matrices.
     Parameters
     ----------
-    layout : BIDSLayout object
-        BIDSLayout (i.e., pybids layout object) for directory containing data for analysis (with `derivative=True`, as we're using fmriprep output).
+    layout : BIDSLayout or str
+        A valid BIDSLayout or directory. If BIDSLayout, must be generated with derivatives=True,
+        in order to find the derivatives folder containing the relevant correlation matrices.
     task : str
-        Name of task fMRI scan (can be "rest") from which networks will be calculated.
+        The task used to collect fMRI data from which correlation matrices were computed.
     deriv_name : str
-        Name of the package used to generate the correlation matrices to be read. Could be IDConn, could be something else.
-    z_score : bool
-        If True, assumes computed connectivity matrices are product-moment correlations, uses Fisher's r-to-Z.
-    vectorized : bool
-        Would you also like this function to return the vectorized upper triangles of all your matrices?
-    verbose : bool
-        Print statements? Y/N?
+        The name of the derivatives subdirectory in which correlation matrices can be found
+    atlas: str
+        The name of the atlas used to make the correlation matrix. Must match the string in corrmat filename.
+    z_score : Bool
+        Would you like the correlation matrices z-scored? (Uses Fishers r-to-z, 
+        thus assumes elements/edges of corrmats are product-moment correlations).
+    vectorized : Bool
+        If True, returns the vectorized upper triangles of correlation matrices in a p x (n^2 - n)/2 array. 
+        If false, returns the full correlation matrices in a p x n x n array.
+    verbose : Bool
+        If True, prints out subjects/sessions as their correlationmatrices are being read. 
+        If False, prints nothing.
     
     Returns
     -------
-    edge_vector : numpy array of shape (p, n^2)
+    # NOT TRUE CURRENTLY RETURNS DATAFRAME
+    edge_vector : numpy array of shape (p, (n^2-n)/2)
         Represents an array of vectorized upper triangles of 
-        the input matrices.
+        the input nxn matrices if vectorized=True.
+    edge_cube : numpy array of shape (p, n^2)
+        Represents an array of the input nxn matrices 
+        if vectorized=False.
     """
     subjects = layout.get(return_type='id', 
                           target='subject', 
                           suffix='bold', 
                           scope=deriv_name
                          )
-    all_sesh = layout.get(return_type='id',
-           target='session',
-           task=task, 
-           suffix='bold',
-           scope=deriv_name
-          )
+    
     ppts_fname = layout.get_file('participants.tsv').path
     ppt_df = pd.read_csv(ppts_fname, sep='\t', index_col=[0,1])
     ppt_df['adj'] = ''
@@ -258,7 +263,9 @@ def read_corrmats(layout, task, deriv_name='IDConn', z_score=True, vectorized=Tr
                               subject=subject, 
                               scope=deriv_name)
         
+        
         for session in sessions:
+            
             if verbose:
                 print(session)
             else:
@@ -267,21 +274,22 @@ def read_corrmats(layout, task, deriv_name='IDConn', z_score=True, vectorized=Tr
                                task=task, 
                                subject=subject,
                                session=session,
+                                atlas=atlas,
                                suffix='bold',
-                               scope='IDConn', 
-                               atlas=atlas,
+                               scope='IDConn'
                               )
             if verbose:
                 print(f'Corrmat path for sub-{subject}, ses-{session}: \t{path}')
             else:
                 pass
             if type(path) == list:
-                #print(path)
+                #print(len(path))
                 path = path[0]
             else:
                 pass
             assert exists(path), f'Corrmat file not found at {path}'
             adj_matrix = pd.read_csv(path, sep='\t', header=0, index_col=0)
+            
             if z_score == True:
                 z_adj = np.arctanh(adj_matrix.values)
                 z_adj = np.where(z_adj == np.inf, 0, z_adj)
@@ -305,7 +313,7 @@ def read_corrmats(layout, task, deriv_name='IDConn', z_score=True, vectorized=Tr
     ppt_df.replace({'': np.nan}, inplace=True)
     return ppt_df
 
-def undo_vectorize(edges, num_node):
+def undo_vectorize(edges, num_node=None):
     '''
     Puts an edge vector back into an adjacency matrix.
     Parameters
@@ -322,12 +330,17 @@ def undo_vectorize(edges, num_node):
     '''
     #j = len(edges)
     #num_node = (np.sqrt((8 * j) + 1) + 1) / 2
+    if num_node == None:
+        j = len(edges)
+        num_node = int((np.sqrt((8 * j) + 1) + 1) / 2)
+    else:
+        num_node = int(num_node)
     X = np.zeros((num_node,num_node))
     X[np.triu_indices(X.shape[0], k = 1)] = edges
     X = X + X.T
     return X
 
-def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='coolwarm', node_size='strength'):
+def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='seismic', node_size='strength'):
     '''
     Plots the edges of a connectivity/adjacency matrix both in a heatmap and in brain space, with the option to include
     a surface plot of node strength.
@@ -366,18 +379,20 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap=
     elif type(threshold) == float or type(threshold) == int:
         threshold = f'{threshold}%'
     else:
-        threshold = '99%'
+        threshold = '99.99%'
     print('edge plotting threshold: ', threshold)
 
     if node_size == 'strength':
-        node_strength = np.sum((np.abs(adj)), axis=0)
-        node_strength /= np.max(node_strength)
-        node_strength **= 4
+        node_strength = np.sum(adj, axis=0)
+        #node_strength /= np.max(node_strength)
+        #node_strength **= 4
+        node_strength = node_strength / np.max(node_strength) * 60
         node_size = node_strength
+    
     fig = plt.figure(figsize=(12,4))
     if title is not None:
         fig.suptitle(title)
-    gs = GridSpec(1, 2, width_ratios=[4,2])
+    gs = GridSpec(1, 2, width_ratios=[3,1])
     ax0 = fig.add_subplot(gs[0])
     ax1 = fig.add_subplot(gs[1])
 
@@ -385,12 +400,14 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap=
     g = plotting.plot_connectome(adj, coords, 
                                 node_size=node_size,
                                 edge_threshold=threshold, 
-                                edge_cmap='coolwarm', 
+                                edge_cmap=cmap,
+                                edge_kwargs={'alpha': 0.4},
+                                display_mode='lyrz', 
                                 figure=fig, 
                                 axes=ax0,
                                 colorbar=False, 
-                                annotate=False)
-    h = sns.heatmap(adj, square=True, cmap='coolwarm', ax=ax1, center=0)
+                                annotate=True)
+    h = sns.heatmap(adj, square=True, linewidths=0, cmap=cmap, ax=ax1, center=0)
     if strength:
         fig2 = plt.figure(figsize=(12,4))
         if title is not None:
@@ -416,13 +433,13 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap=
 
         plt.tight_layout(w_pad=-1)
         i = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5,
-                                                cmap='coolwarm', view='lateral', colorbar=False, axes=ax2)
+                                                cmap=cmap, view='lateral', colorbar=False, axes=ax2)
         j = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5,
-                                                cmap='coolwarm', view='medial', colorbar=False, axes=ax3)
+                                                cmap=cmap, view='medial', colorbar=False, axes=ax3)
         k = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5,
-                                                cmap='coolwarm', view='lateral', colorbar=False, axes=ax4)
+                                                cmap=cmap, view='lateral', colorbar=False, axes=ax4)
         l = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5,
-                                                cmap='coolwarm', view='medial', colorbar=False, axes=ax5)
-        return fig, fig2
+                                                cmap=cmap, view='medial', colorbar=False, axes=ax5)
+        return fig, fig2, strength_nimg
     else:
         return fig
\ No newline at end of file
diff --git a/idconn/nbs.py b/idconn/nbs.py
index abf6885..ea7025b 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -1,5 +1,5 @@
 import numpy as np
-import statsmodels.api as sm
+import pingouin as pg
 import networkx as nx
 import pandas as pd
 from idconn.io import vectorize_corrmats, undo_vectorize
@@ -7,9 +7,13 @@
 import enlighten
 #import bct
 
-from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, GridSearchCV, StratifiedKFold, KFold
+
 from sklearn.feature_selection import f_regression, f_classif
-from sklearn.linear_model import LogisticRegression, LinearRegression
+from sklearn.linear_model import LogisticRegression, ElasticNet
+from sklearn.preprocessing import StandardScaler
+
+from sklearn.metrics import mean_squared_error
 
 def calc_number_of_nodes(matrices):
     if matrices.shape[0] != matrices.shape[1]:
@@ -24,7 +28,41 @@ def calc_number_of_nodes(matrices):
         num_node = matrices.shape[0]
     return num_node
 
-def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations=10000):
+def residualize(X, y=None, confounds=None):
+    """Regress confounds out of each column of X (and of y, if given); return (resid_y, resid_X), or just resid_X when y is None."""
+    if confounds is not None:  # NOTE: confounds is the third positional parameter; pass it by keyword when y is omitted
+        if y is not None:
+            temp_y = np.reshape(y, (y.shape[0],))
+            y = pg.linear_regression(confounds, temp_y)
+            resid_y = y.residuals_
+            # `y` now names the regression result rather than the outcome; only resid_y is used from here on
+            # residualize features: one confound regression per column of X
+            resid_X = np.zeros_like(X)
+            # resid_X is allocated with X's shape and filled in column by column
+            for i in range(0, X.shape[1]):
+                X_temp = X[:,i]
+                # regress the confounds out of column i
+                X_ = pg.linear_regression(confounds, X_temp)
+                # residuals are flattened before being written into column i
+                resid_X[:,i] = X_.residuals_.flatten()
+            return resid_y, resid_X
+        else:
+            # no outcome supplied: residualize the features only
+            resid_X = np.zeros_like(X)
+            # same per-column procedure as in the branch above
+            for i in range(0, X.shape[1]):
+                X_temp = X[:,i]
+                # regress the confounds out of column i
+                X_ = pg.linear_regression(confounds, X_temp)
+                # residuals are flattened before being written into column i
+                resid_X[:,i] = X_.residuals_.flatten()
+            return resid_X
+    else:  # no confounds: only a message is printed and the function returns None implicitly
+        print('Confound matrix wasn\'t provided, so no confounding was done')
+        
+    
+
+def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
     '''
     Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided
     of shape ((subject x session)x node x node)
@@ -67,25 +105,19 @@ def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations=
     # and retain significant edges
     # then find the largest connected component
     # and, if not predict, build a null distribution
-    n = matrices.shape[:-1]
+    #n = matrices.shape[:-1]
     ndims = len(matrices.shape)
     
     # vectorize_corrmats returns p x n^2
-    # we want to run pynbs per edge
-    # so vectorized edges must be transposed
-    
-    if confounds is not None:
-        #regress out the confounds, use the residuals for the rest of the algorithm
-        pass
-    else:
-        pass
-    exog = outcome
-    
+
     # turn matrices into vectorized upper triangles
     if ndims > 2:
         edges = vectorize_corrmats(matrices)
     else:
         edges = matrices.copy()
+    #print(edges.shape)
+    
+    
     #edges = edges.T
     
     # run an ols per edge
@@ -93,10 +125,10 @@ def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations=
     # 1 if edge is significantly predicted by outcome
     # 0 if it's not
     
-    if len(np.unique(exog)) < 5:
-        (f, p) = f_classif(edges, exog)
+    if len(np.unique(outcome)) < 5:
+        (f, p) = f_classif(X=edges, y=outcome)
     else:
-        (f, p) = f_regression(edges, exog, center=False)
+        (f, p) = f_regression(X=edges, y=outcome, center=False)
     sig_edges = np.where(p < alpha, 1, 0)
     
     # find largest connected component of sig_edges
@@ -107,15 +139,12 @@ def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations=
     #use networkX to find connected components
     largest_cc = max(nx.connected_components(matrix), key=len)
     G0 = matrix.subgraph(largest_cc)
-    print(G0)
+    #print(G0)
     
     # retain size of largest connected component 
     # for NBS permutation-based significance testing
     max_comp = G0.number_of_edges()
-    print(f'Connected component has {max_comp} edges.')
-
-    
-    
+    #print(f'Connected component has {max_comp} edges.')    
 
     # pull the subgraph with largest number of nodes
     # i.e., the largest connected component
@@ -142,18 +171,17 @@ def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations=
     # only for regular NBS, -Predict doesn't need this
     if predict == False:
         perms = np.zeros((permutations,))
-        hit = 0
         rng = np.random.default_rng()
-        exog_copy = exog.copy()
+        outcome_copy = outcome.copy()
         for i in range(0, permutations):
             # shuffle outcome order
-            rng.shuffle(exog_copy, axis=0)
-            #print(exog_copy)
+            rng.shuffle(outcome_copy, axis=0)
+            #print(outcome_copy)
             
-            if len(np.unique(exog)) < 5:
-                (f1, p1) = f_classif(edges, exog_copy)
+            if len(np.unique(outcome)) < 5:
+                (f1, p1) = f_classif(edges, outcome_copy)
             else:
-                (f1, p1) = f_regression(edges, exog_copy, center=False)
+                (f1, p1) = f_regression(edges, outcome_copy, center=False)
             
             perm_edges = np.where(p1 < alpha, 1, 0)
             
@@ -188,7 +216,7 @@ def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations=
     else:
         return S1
 
-def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10, n_iterations=10):
+def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_splits=10, n_iterations=10):
     """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided
     of shape ((subject x session)x node x node)
     in the network.
@@ -207,13 +235,15 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10
     outcome : list-like of shape (p,)
         Y-value to be predicted with connectivity
     confounds : list-like
-        Names of columns in `participants.tsv` to be regressed out of connectivity and outcome 
+        Columns in `participants.tsv` to be regressed out of connectivity and outcome 
         data in each CV fold (per recommendation from Snoek et al., 2019).
     alpha : float
         Proportion of type II errors (i.e., false positives) we're willing to put up with. 
         This is the upper limit for pvalues in the edge detection process.
     groups : list-like of shape (p,)
         Grouping variable - currently only works for 2 groups. Will enforce stratified k-fold CV.
+        Currently intended for use where grouping variable is the outcome of interest, assumed by StratifiedKFold.
+        NEED TO FIX THIS: ALLOW THE CASE WHERE GROUPING VAR != OUTCOME VAR
     n_splits : int
         Value of K for K-fold cross-validation. Will split data into K chunks, train on K-1 chunks and test on the Kth.
     n_iterations : int
@@ -231,6 +261,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10
     """
     edges = vectorize_corrmats(matrices)
     #print(edges.shape)
+    #print(edges.shape)
     index = list(range(0,n_splits * n_iterations))
 
     cv_results = pd.DataFrame(index=index, 
@@ -244,16 +275,13 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10
     if groups is not None:
         cv = RepeatedStratifiedKFold(n_splits=n_splits,
                                     n_repeats=n_iterations)
-        dof = groups.shape[0] - 2
+        split_y = groups
+        
     else:
         cv = RepeatedKFold(n_splits=n_splits, 
-                        n_repeats=n_iterations)
-        dof = edges.shape[0] - 1
-    
-    #t_threshold = t.ppf(1 - alpha, df=dof)
+                        n_repeats=n_iterations) 
+        split_y = outcome   
     
-    # really can't remember why tf I did this?
-    # maybe it's an artifact of permuted_ols?
     num_node = calc_number_of_nodes(matrices)
     #print(num_node)
     #if matrices.shape[0] != matrices.shape[1]:
@@ -271,36 +299,50 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10
     i = 0
     manager = enlighten.get_manager()
     ticks = manager.counter(total=n_splits * n_iterations, desc='Progress', unit='folds')
-    for train_idx, test_idx in cv.split(edges, outcome, groups=groups):
+    for train_idx, test_idx in cv.split(edges, split_y):
+        scaler = StandardScaler()
         cv_results.at[i, 'split'] = (train_idx, test_idx)
-        # all of this presumes the old bctpy version of nbs
-        # irrelevant for pynbs
         
         #assert len(train_a_idx) == len(train_b_idx)
-        if groups is not None:
-            train_a_idx = [m for m in train_idx if groups[m] == 0]
-            train_b_idx = [m for m in train_idx if groups[m] == 1]
-            regressor = LogisticRegression(max_iter=1000)
-        elif np.unique(outcome).shape[0] == 2:
-            regressor = LogisticRegression(max_iter=1000)
+        if np.unique(outcome).shape[0] == 2:
+            regressor = LogisticRegression(l1_ratio=0.25, max_iter=1000, penalty='elasticnet', solver='saga')
         else:
-            regressor = LinearRegression()
-        train_mats = matrices[train_idx,:,:]
-        #print(train_a.shape, train_b.shape)
-        
-        # separate edges & covariates into 
+            regressor = ElasticNet(l1_ratio=0.25, max_iter=1000)
+
         train_y = outcome[train_idx]
         test_y = outcome[test_idx]
 
+        train_edges = edges[train_idx, :]
+        test_edges = edges[test_idx, :]
+        
         if confounds is not None:
             train_confounds = confounds.values[train_idx]
+            test_confounds = confounds.values[test_idx]
+            #print(train_edges.shape, train_confounds.shape, train_y.shape)
+            
+            # residualize the edges and outcome
+            if np.unique(outcome).shape[0] == 2:
+                train_edges = residualize(train_edges,train_confounds)
+                test_edges = residualize(test_edges, test_confounds)
+            elif np.unique(outcome).shape[0] > 3:
+                train_y, train_edges = residualize(train_edges, train_y, train_confounds)
+                test_y, test_edges = residualize(test_edges, test_y, test_confounds)
         else:
-            train_confounds = None
-        #test_confounds = confounds.values[test_idx]
+            pass
+        
+        train_edges = scaler.fit_transform(train_edges)
+        test_edges = scaler.fit_transform(test_edges)
+
+        if np.unique(outcome).shape[0] == 2:
+            pass
+        else:
+            train_y = scaler.fit_transform(train_y.reshape(-1, 1))
+            test_y = scaler.fit_transform(test_y.reshape(-1, 1))
         
         # perform NBS wooooooooo
         # note: output is a dataframe :)
-        adj = pynbs(train_mats, train_y, train_confounds, alpha, predict=True)
+        # PYNBS SHOULD NOT DO CONFOUND REGRESSION?
+        adj = pynbs(train_edges, train_y, alpha, predict=True)
         #print(adj.shape, adj.ndim, adj[0].shape, upper_tri)
         
         #cv_results.at[i, 'pval'] = pval
@@ -311,28 +353,45 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10
         if sum(sum(adj.values)) > 0:
             # grab the values of the adjacency matrix that are just in the upper triangle
             # so you don't have repeated edges
+            # returns (n_edges, )
             nbs_vector = adj.values[upper_tri]
+            #print(nbs_vector.shape)
             # use those to make a "significant edges" mask
             mask = nbs_vector == 1.0
 
             # grab only the significant edges from testing and training sets of edges
             # for use as features in the predictive models
-            train_features = edges[train_idx, :].T[mask]
-            test_features = edges[test_idx, :].T[mask]
+            # these are already residualized
+            #print(train_edges.shape)
+            # returns (n_edges, samples)
+            train_features = train_edges.T[mask]
+            test_features = test_edges.T[mask]
 
+            train_features = scaler.fit_transform(train_features.T)
+            test_features = scaler.fit_transform(test_features.T)
+            #print(np.ravel(train_y))
             # train model predicting outcome from brain (note: no mas covariates)
-            #print(train_features.T.shape, train_y.shape)
-            model = regressor.fit(X=train_features.T, y=train_y.ravel())
-            #cv_results.at[i, 'model'] = model
+            model = regressor.fit(X=train_features, y=np.ravel(train_y))
+            cv_results.at[i, 'model'] = model
+            
             # score that model on the testing data
-            score = model.score(X=test_features.T, y=test_y.ravel())
+            # if logistic regression: score = mean accuracy
+            # if linear regression: score = coefficient of determination (R^2)
+            # both from 0 (low) to 1 (high)
+            score = model.score(X=test_features, y=np.ravel(test_y))
             cv_results.at[i, 'score'] = score
+            #print(model.coef_.shape)
 
             m = 0
             param_vector = np.zeros_like(nbs_vector)
             for l in range(0, nbs_vector.shape[0]):
                 if nbs_vector[l] == 1.:
-                    param_vector[l] = model.coef_[0,m]
+                    ###
+                    # NEEDS IF STATEMENT BC LOGISTIC AND LINEAR HAVE DIFFERENT COEF_ SHAPES
+                    if np.unique(outcome).shape[0] == 2:
+                        param_vector[l] = model.coef_[0,m]
+                    else:
+                        param_vector[l] = model.coef_[m]   
                     m+=1
                 else:
                     pass
@@ -343,4 +402,17 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10
         else:
             pass
         ticks.update()
-    return cv_results
\ No newline at end of file
+    # calculate weighted average
+    #print(cv_results['score'])
+    weighted_stack = cv_results.at[0, 'component'] * cv_results.at[0, 'score']
+    #print(weighted_stack.shape)
+    for j in index[1:]:
+        #print(cv_results.at[j, 'score'])
+        if cv_results.at[j, 'score'] > 0:
+            weighted = cv_results.at[j, 'component'] * cv_results.at[j, 'score']
+            weighted_stack = np.dstack([weighted_stack, weighted])
+        else:
+            pass
+        #print(weighted_stack.shape, weighted.shape)
+    weighted_average = np.mean(weighted_stack, axis=-1)
+    return weighted_average, cv_results
\ No newline at end of file
diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 01b5e8f..9d20d0d 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -1,28 +1,42 @@
-from idconn import nbs, io
+#!/usr/bin/env python3
 import pandas as pd
 import numpy as np
+import pingouin as pg
+import nibabel as nib
 import bids
 from os.path import join
 from datetime import datetime
 from time import strftime
 from scipy.stats import spearmanr
+from idconn import nbs, io
+
+
+from sklearn.linear_model import LogisticRegression, ElasticNet
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import mean_squared_error
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
 
 today = datetime.today()
 today_str = strftime("%m_%d_%Y")
 
 TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/ds002674'
-TEST_DSET = '/Users/katherine.b/Dropbox/Data/ds002674'
+TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset'
 DERIV_NAME = 'IDConn'
-OUTCOME = 'estradiol'
-CONFOUNDS = None
+OUTCOME = 'bc'
+CONFOUNDS = 'fd'
 TASK = 'rest'
 ATLAS = 'craddock2012'
-alpha = 0.01
+alpha = 0.05
 atlas_fname = '/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz'
 
+
 layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
 
-dat = io.read_corrmats(layout, task=TASK, atlas=ATLAS, z_score=False)
+dat = io.read_corrmats(layout, task=TASK, deriv_name='IDConn', atlas=ATLAS, z_score=True)
 
 keep = dat['adj'].dropna().index
 dat = dat.loc[keep]
@@ -33,63 +47,230 @@
 upper_tri = np.triu_indices(num_node, k=1)
 
 outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]),1))
+
 if CONFOUNDS is not None:
     confounds = dat[CONFOUNDS]
 else:
     confounds = None
+#print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=100)
+
+fig,fig2, nimg = io.plot_edges(weighted_average, 
+                         atlas_fname, 
+                         threshold='computed', 
+                         title=f'{OUTCOME} Precition-Weighted Average', 
+                         strength=True, 
+                         cmap='seismic', 
+                         node_size='strength')
+
+if CONFOUNDS is not None:
+    base_name = f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}'
+else:
+    base_name = f'nbs-predict_outcome-{OUTCOME}'
 
+fig.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-{today_str}.png'), dpi=400)
+fig2.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}.png'), dpi=400)
+nib.save(nimg, join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}'))
+
+
+avg_df = pd.DataFrame(weighted_average, 
+                      index=range(0,weighted_average.shape[0]),
+                      columns=range(0,weighted_average.shape[1]))
+
+cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_models-{today_str}.tsv'),sep='\t')
+avg_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-{today_str}.tsv'),sep='\t')
+
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+#best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0]
+#subnetwork = cv_results.loc[best]['component']
+#subnetwork_df = pd.DataFrame(subnetwork,
+#                             index=range(0,num_node), 
+#                             columns=range(0,num_node))
+
+#if CONFOUNDS is not None:
+#    subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}_edge-parameters-{today_str}.tsv'),sep='\t')
+#else:
+#    subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_edge-parameters-{today_str}.tsv'),sep='\t')
+
+# here is where we'd threshold the weighted average to use for elastic-net
+
+nbs_vector = weighted_average[upper_tri]
+p50 = np.percentile(nbs_vector, 50)
+filter = np.where(nbs_vector >= p50, True, False)
+#print(nbs_vector.shape, filter.shape)
+
+#mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat['edge_vector'].dropna().values)
+
+#print(features.shape)
+
+scaler = StandardScaler()
+edges_train = scaler.fit_transform(edges_train)
+if len(np.unique(outcome)) <= 2:
+    pass
+else:
+    outcome = scaler.fit_transform(outcome)
 
-cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10)
 
-cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t')
-best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0]
-subnetwork = cv_results.loc[best]['component']
-subnetwork_df = pd.DataFrame(subnetwork,
-                             index=range(0,num_node), 
-                             columns=range(0,num_node))
+#edges = np.vstack(dat['edge_vector'].values)
+#features = edges[:,mask]
 
-subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_edge-parameters-{today_str}.tsv'),sep='\t')
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    #regress out the confounds from each edge and the outcome variable, 
+    # use the residuals for the rest of the algorithm
+    #print(confounds.shape, outcome.shape)
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    y = pg.linear_regression(confounds, outcome_train)
+    train_outcome = y.residuals_
 
-nbs_vector = subnetwork[upper_tri]
-mask = nbs_vector == 1
-edges = np.vstack(dat['edge_vector'].values)
-features = edges[:,mask]
-#plot the parameters
-param_mat = cv_results.loc[best]['coefficient_matrix']
-odds = 10 ** param_mat 
-prob = odds / (1 + odds)
+    resid_edges = np.zeros_like(edges_train)
+    for i in range(0, edges_train.shape[1]):
+        x = pg.linear_regression(confounds, edges_train[:,i])
+        resid_edges[:,i] = x.residuals_
+    train_features = resid_edges[:,filter]
+else:
+    train_features = edges_train[:,filter]
+    train_outcome = outcome
 
 # run the model on the whole test dataset to get params
-model = cv_results.loc[best]['model']
-model.fit(features, outcome)
-fig,fig2 = io.plot_edges(param_mat, atlas_fname, title=None, strength=True, cmap='icefire', node_size='strength')
-fig.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_betas-{today_str}.png'), dpi=400)
-fig2.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_betas-strength-{today_str}.png'), dpi=400)
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
+
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.25, warm_start=True)
+else:
+    model = ElasticNet(l1_ratio=0.25, warm_start=True)
+
+# train ElasticNet on full train dataset, using feature extraction from NBS-Predict
+train_metrics = {}
+fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+in_sample_score = fitted.score(X=train_features, y=np.ravel(train_outcome))
+if len(np.unique(outcome)) == 2:
+    train_metrics['accuracy'] = in_sample_score
+else:
+    train_metrics['coefficient of determination'] = in_sample_score
+y_pred = fitted.predict(X=train_features)
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics['mean squared error'] = mse
+print('In-sample prediction score: ', in_sample_score)
+print('In-sample mean squared error: ', mse)
+#print(np.mean(train_features))
+with open(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp:
+    json.dump(train_metrics, fp)
+
+
+
+# yoink the coefficients? for a more parsimonious figure?
+coeff_vec = np.zeros_like(filter)
+j = 0
+for i in range(0, filter.shape[0]):
+    if filter[i] == True:
+        if len(np.unique(outcome)) == 2:
+            coeff_vec[i] = fitted.coef_[0,j]
+        else:
+            coeff_vec[i] = fitted.coef_[j]
+        j += 1
+    else:
+        pass
+
+#print(coeff_vec)
+
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+#print(coef_mat == coef_mat.T)
+
+fig,fig2, nimg = io.plot_edges(coef_mat, 
+                         atlas_fname, 
+                         threshold='computed',
+                         title=f'{OUTCOME} Coefficients', 
+                         strength=True, 
+                         cmap='seismic', 
+                         node_size='strength')
+
+fig.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-{today_str}.png'), dpi=400)
+fig2.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-strength-{today_str}.png'), dpi=400)
+nib.save(nimg, join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-strength-{today_str}'))
+
 
 layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
 
-test_df = io.read_corrmats(layout, task=TASK, atlas=ATLAS, z_score=False)
+test_df = io.read_corrmats(layout, task=TASK, deriv_name='IDConn', atlas=ATLAS, z_score=True)
 
-test_df.dropna(inplace=True)
+keep = test_df[[OUTCOME, 'adj']].dropna().index
+#print(keep)
+
+test_df = test_df.loc[keep]
 
 outcome_test = test_df[OUTCOME].values
-groups_test = outcome
+
+if len(np.unique(outcome_test)) <= 2:
+    pass
+else:
+    outcome_test = scaler.fit_transform(outcome_test.reshape(-1, 1))
+
+#print(outcome_test)
 matrices_test = np.vstack(test_df['adj'].dropna().values).reshape((len(test_df['adj'].dropna().index),num_node,num_node))
 edges_test = np.vstack(test_df['edge_vector'].dropna().values)
+edges_test = scaler.fit_transform(edges_test)
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if confounds is not None:
+    confounds_test = test_df[CONFOUNDS].values
+    #regress out the confounds from each edge and the outcome variable, 
+    # use the residuals for the rest of the algorithm
+    #print(confounds.shape, outcome.shape)
+    outcome_test = np.reshape(outcome_test, (outcome_test.shape[0],))
+    y = pg.linear_regression(confounds_test, outcome_test)
+    test_outcome = y.residuals_
 
-test_features = edges_test.T[mask,:]
-test_outcome = test_df[OUTCOME].values
+    resid_edges = np.zeros_like(edges_test)
+    for i in range(0, edges_test.shape[1]):
+        x = pg.linear_regression(confounds_test, edges_test[:,i])
+        resid_edges[:,i] = x.residuals_
+    test_features = resid_edges[:,filter]
+else:
+    test_features = edges_test[:,filter]
+    test_outcome = outcome_test
+
+#print(test_features.shape)
 # if the model is a logistic regression, i.e. with a binary outcome
 # then score is prediction accuracy
 # if the model is a linear regression, i.e., with a continuous outcome
 # then the score is R^2 (coefficient of determination)
-score = model.score(test_features.T, test_outcome)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+#fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+#score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics['accuracy'] = score
+else:
+    test_metrics['coefficient of determination'] = score
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics['mean squared error'] = mse
 print('Out-of-sample prediction score:\t', score)
-pred_outcome = model.predict(test_features.T)
+print('Out-of-sample mean squared error:\t', mse)
+#print(np.mean(test_features))
+pred_outcome = fitted.predict(test_features)
+
+#print(test_outcome, '\n',pred_outcome)
+#print(pred_outcome)
 if len(np.unique(test_outcome)) > 2:
     corr = spearmanr(test_outcome, pred_outcome)
     print(f'\nSpearman correlation between predicted and actual {OUTCOME}:\t', corr)
-    np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score, corr[0], corr[1]])
-else: 
-    np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score])
-
+    test_metrics['spearman correlation'] = corr
+with open(join(TEST_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp:
+    json.dump(test_metrics, fp)

From d5f557f4cc9cc8afda345b1add05232ecf9c3ec8 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 30 Mar 2023 12:55:37 -0700
Subject: [PATCH 34/48] add auto-FD comp, residualizing

---
 idconn/io.py                    | 79 ++++++++++++++++++++++++++++++++-
 idconn/nbs.py                   |  8 ++--
 idconn/workflows/nbs_predict.py | 79 ++++++++++++---------------------
 3 files changed, 111 insertions(+), 55 deletions(-)

diff --git a/idconn/io.py b/idconn/io.py
index b14abb6..61d5d93 100644
--- a/idconn/io.py
+++ b/idconn/io.py
@@ -14,7 +14,24 @@
 from matplotlib.gridspec import GridSpec
 from nilearn import datasets, plotting, surface
 
+def calc_fd(confounds):  # framewise displacement from the six motion-parameter columns of `confounds`
+    x = confounds['trans_x'].values
+    y = confounds['trans_y'].values
+    z = confounds['trans_z'].values
+    alpha = confounds['rot_x'].values
+    beta = confounds['rot_y'].values
+    gamma = confounds['rot_z'].values
+    # absolute change between consecutive rows for each translation parameter
+    delta_x = [np.abs(t - s) for s, t in zip(x, x[1:])]
+    delta_y = [np.abs(t - s) for s, t in zip(y, y[1:])]
+    delta_z = [np.abs(t - s) for s, t in zip(z, z[1:])]
+    # same for the rotations; NOTE(review): summed as-is, with no conversion to a displacement in mm -- confirm units
+    delta_alpha = [np.abs(t - s) for s, t in zip(alpha, alpha[1:])]
+    delta_beta = [np.abs(t - s) for s, t in zip(beta, beta[1:])]
+    delta_gamma = [np.abs(t - s) for s, t in zip(gamma, gamma[1:])]
 
+    fd = np.sum([delta_x, delta_y, delta_z, delta_alpha, delta_beta, delta_gamma], axis=0)  # one value per consecutive-row pair, i.e. one fewer than the number of rows
+    return fd
 
 def build_statsmodel_json(name, task, contrast, confounds, highpass, 
                           mask, conn_meas, graph_meas=None, exclude=None, outfile=None):
@@ -265,7 +282,67 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True
         
         
         for session in sessions:
-            
+            runs = layout.get(return_type='id', 
+                              session=session,
+                              target='run', 
+                              task=task, 
+                              suffix='timeseries', 
+                              subject=subject, 
+                              scope=deriv_name)
+            if len(runs) > 0:
+                path = layout.get(return_type='filename', 
+                                    session=session,
+                                    run=runs[0], 
+                                    task=task, 
+                                    suffix='timeseries', 
+                                    subject=subject, 
+                                    scope=deriv_name)
+                confounds = pd.read_table(path[0], header=0, index_col=0)
+                if not 'framewise_displacement' in confounds.columns:
+                    fd = calc_fd(confounds)
+                    #fd.append(0)
+                    fd = np.append(fd, [0])
+                    confounds['framewise_displacement'] = fd
+                confound_means = confounds.mean(axis=0)
+                if len(runs) > 1:
+                    for run in runs[1:]:
+                        path = layout.get(return_type='filename', 
+                                        session=session,
+                                        run=run, 
+                                        task=task, 
+                                        suffix='timeseries', 
+                                        subject=subject, 
+                                        scope=deriv_name)
+                        confounds = pd.read_table(path[0], header=0, index_col=0)
+                        if not 'framewise_displacement' in confounds.columns:
+                            fd = calc_fd(confounds)
+                            #fd.append(0)
+                            fd = np.append(fd, [0])
+                            confounds['framewise_displacement'] = fd
+                        confound_means_temp = confounds.mean(axis=0)
+                        confound_means = np.mean(pd.concat([confound_means, confound_means_temp], axis=1), axis=1)
+                        #print(confound_means)
+            else:
+                path = path = layout.get(return_type='filename', 
+                                    session=session,
+                                    desc='confounds', 
+                                    task=task, 
+                                    suffix='timeseries', 
+                                    subject=subject, 
+                                    scope=deriv_name)
+                
+                confounds = pd.read_table(path[0], header=0, index_col=0)
+                if not 'framewise_displacement' in confounds.columns:
+                    fd = calc_fd(confounds)
+                    fd = np.append(fd, [0])
+                    confounds['framewise_displacement'] = fd 
+                confound_means = confounds.mean(axis=0)
+                #print(confound_means)
+            for confound in confound_means.index:
+                ppt_df.at[(f'sub-{subject}', 
+                        f'ses-{session}'), 
+                        confound] = confound_means[confound]
+
             if verbose:
                 print(session)
             else:
diff --git a/idconn/nbs.py b/idconn/nbs.py
index ea7025b..ad236c0 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -322,11 +322,11 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli
             
             # residualize the edges and outcome
             if np.unique(outcome).shape[0] == 2:
-                train_edges = residualize(train_edges,train_confounds)
-                test_edges = residualize(test_edges, test_confounds)
+                train_edges = residualize(X=train_edges, confounds=train_confounds)
+                test_edges = residualize(X=test_edges, confounds=test_confounds)
             elif np.unique(outcome).shape[0] > 3:
-                train_y, train_edges = residualize(train_edges, train_y, train_confounds)
-                test_y, test_edges = residualize(test_edges, test_y, test_confounds)
+                train_y, train_edges = residualize(X=train_edges, y=train_y, confounds=train_confounds)
+                test_y, test_edges = residualize(X=test_edges, y=test_y, confounds=test_confounds)
         else:
             pass
         
diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 9d20d0d..1830a82 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -27,7 +27,7 @@
 TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset'
 DERIV_NAME = 'IDConn'
 OUTCOME = 'bc'
-CONFOUNDS = 'fd'
+CONFOUNDS = 'framewise_displacement'
 TASK = 'rest'
 ATLAS = 'craddock2012'
 alpha = 0.05
@@ -90,19 +90,7 @@
 # - increases parsimony while handling multicollinearity...
 # either way, I don't think cv_results is necessary
 
-#best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0]
-#subnetwork = cv_results.loc[best]['component']
-#subnetwork_df = pd.DataFrame(subnetwork,
-#                             index=range(0,num_node), 
-#                             columns=range(0,num_node))
-
-#if CONFOUNDS is not None:
-#    subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}_edge-parameters-{today_str}.tsv'),sep='\t')
-#else:
-#    subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_edge-parameters-{today_str}.tsv'),sep='\t')
-
 # here is where we'd threshold the weighted average to use for elastic-net
-
 nbs_vector = weighted_average[upper_tri]
 p50 = np.percentile(nbs_vector, 50)
 filter = np.where(nbs_vector >= p50, True, False)
@@ -111,37 +99,30 @@
 #mask = io.vectorize_corrmats(filter)
 edges_train = np.vstack(dat['edge_vector'].dropna().values)
 
-#print(features.shape)
-
-scaler = StandardScaler()
-edges_train = scaler.fit_transform(edges_train)
-if len(np.unique(outcome)) <= 2:
-    pass
-else:
-    outcome = scaler.fit_transform(outcome)
-
-
-#edges = np.vstack(dat['edge_vector'].values)
-#features = edges[:,mask]
-
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
 if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
     #regress out the confounds from each edge and the outcome variable, 
     # use the residuals for the rest of the algorithm
     #print(confounds.shape, outcome.shape)
-    outcome_train = np.reshape(outcome, (outcome.shape[0],))
-    y = pg.linear_regression(confounds, outcome_train)
-    train_outcome = y.residuals_
-
-    resid_edges = np.zeros_like(edges_train)
-    for i in range(0, edges_train.shape[1]):
-        x = pg.linear_regression(confounds, edges_train[:,i])
-        resid_edges[:,i] = x.residuals_
+    if np.unique(outcome).shape[0] == 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    elif np.unique(outcome).shape[0] > 3:
+        train_outcome, resid_edges = nbs.residualize(X=edges_train, y=outcome_train, confounds=confounds_train)
     train_features = resid_edges[:,filter]
 else:
     train_features = edges_train[:,filter]
     train_outcome = outcome
 
+scaler = StandardScaler()
+train_features = scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    outcome_test = scaler.fit_transform(train_outcome.reshape(-1, 1))
+
 # run the model on the whole test dataset to get params
 
 # classification if the outcome is binary (for now)
@@ -210,38 +191,36 @@
 #print(keep)
 
 test_df = test_df.loc[keep]
-
 outcome_test = test_df[OUTCOME].values
-
-if len(np.unique(outcome_test)) <= 2:
-    pass
-else:
-    outcome_test = scaler.fit_transform(outcome_test.reshape(-1, 1))
+#print(test_df)
 
 #print(outcome_test)
 matrices_test = np.vstack(test_df['adj'].dropna().values).reshape((len(test_df['adj'].dropna().index),num_node,num_node))
 edges_test = np.vstack(test_df['edge_vector'].dropna().values)
-edges_test = scaler.fit_transform(edges_test)
 
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
-if confounds is not None:
+if CONFOUNDS is not None:
     confounds_test = test_df[CONFOUNDS].values
+    
     #regress out the confounds from each edge and the outcome variable, 
     # use the residuals for the rest of the algorithm
     #print(confounds.shape, outcome.shape)
-    outcome_test = np.reshape(outcome_test, (outcome_test.shape[0],))
-    y = pg.linear_regression(confounds_test, outcome_test)
-    test_outcome = y.residuals_
-
-    resid_edges = np.zeros_like(edges_test)
-    for i in range(0, edges_test.shape[1]):
-        x = pg.linear_regression(confounds_test, edges_test[:,i])
-        resid_edges[:,i] = x.residuals_
+    if np.unique(outcome_test).shape[0] == 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    elif np.unique(outcome_test).shape[0] > 3:
+        test_outcome, resid_edges = nbs.residualize(X=edges_test, y=outcome_test, confounds=confounds_test)
     test_features = resid_edges[:,filter]
 else:
     test_features = edges_test[:,filter]
     test_outcome = outcome_test
 
+# scale after residualizing omg
+test_features = scaler.fit_transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = scaler.fit_transform(test_outcome.reshape(-1, 1))
 #print(test_features.shape)
 # if the model is a logistic regression, i.e. with a binary outcome
 # then score is prediction accuracy

From c1a7878c76783b2e7ded1750b5bacf88a846da4c Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Fri, 31 Mar 2023 17:29:38 -0700
Subject: [PATCH 35/48] fix scaling bug in training data

---
 idconn/workflows/nbs_predict.py | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 1830a82..7444bfa 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -50,11 +50,13 @@
 
 if CONFOUNDS is not None:
     confounds = dat[CONFOUNDS]
+    base_name = f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}'
 else:
     confounds = None
+    base_name = f'nbs-predict_outcome-{OUTCOME}'
 #print(dat['bc'])
 
-weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=100)
+weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=1000)
 
 fig,fig2, nimg = io.plot_edges(weighted_average, 
                          atlas_fname, 
@@ -64,11 +66,6 @@
                          cmap='seismic', 
                          node_size='strength')
 
-if CONFOUNDS is not None:
-    base_name = f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}'
-else:
-    base_name = f'nbs-predict_outcome-{OUTCOME}'
-
 fig.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-{today_str}.png'), dpi=400)
 fig2.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}.png'), dpi=400)
 nib.save(nimg, join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}'))
@@ -106,10 +103,10 @@
     #regress out the confounds from each edge and the outcome variable, 
     # use the residuals for the rest of the algorithm
     #print(confounds.shape, outcome.shape)
-    if np.unique(outcome).shape[0] == 2:
+    if len(np.unique(outcome_train)) <= 2:
         resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
         train_outcome = outcome
-    elif np.unique(outcome).shape[0] > 3:
+    elif len(np.unique(outcome_train)) > 3:
         train_outcome, resid_edges = nbs.residualize(X=edges_train, y=outcome_train, confounds=confounds_train)
     train_features = resid_edges[:,filter]
 else:
@@ -121,7 +118,7 @@
 if len(np.unique(train_outcome)) <= 2:
     pass
 else:
-    outcome_test = scaler.fit_transform(train_outcome.reshape(-1, 1))
+    train_outcome = scaler.fit_transform(train_outcome.reshape(-1, 1))
 
 # run the model on the whole test dataset to get params
 
@@ -129,9 +126,9 @@
 # could be extended to the multiclass case?
 
 if len(np.unique(outcome)) == 2:
-    model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.25, warm_start=True)
+    model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.25)
 else:
-    model = ElasticNet(l1_ratio=0.25, warm_start=True)
+    model = ElasticNet(l1_ratio=0.25)
 
 # train ElasticNet on full train dataset, using feature extraction from NBS-Predict
 train_metrics = {}
@@ -150,8 +147,6 @@
 with open(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp:
     json.dump(train_metrics, fp)
 
-
-
 # yoink the coefficients? for a more parsimonious figure?
 coeff_vec = np.zeros_like(filter)
 j = 0
@@ -205,10 +200,10 @@
     #regress out the confounds from each edge and the outcome variable, 
     # use the residuals for the rest of the algorithm
     #print(confounds.shape, outcome.shape)
-    if np.unique(outcome_test).shape[0] == 2:
+    if len(np.unique(outcome_test)) <= 2:
         resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
         test_outcome = outcome_test
-    elif np.unique(outcome_test).shape[0] > 3:
+    elif len(np.unique(outcome_test)) > 3:
         test_outcome, resid_edges = nbs.residualize(X=edges_test, y=outcome_test, confounds=confounds_test)
     test_features = resid_edges[:,filter]
 else:
@@ -243,13 +238,15 @@
 print('Out-of-sample prediction score:\t', score)
 print('Out-of-sample mean squared error:\t', mse)
 #print(np.mean(test_features))
-pred_outcome = fitted.predict(test_features)
+#pred_outcome = fitted.predict(test_features)
+
 
-#print(test_outcome, '\n',pred_outcome)
+print(test_outcome, '\n',y_pred)
 #print(pred_outcome)
 if len(np.unique(test_outcome)) > 2:
-    corr = spearmanr(test_outcome, pred_outcome)
+    corr = spearmanr(test_outcome, y_pred)
     print(f'\nSpearman correlation between predicted and actual {OUTCOME}:\t', corr)
     test_metrics['spearman correlation'] = corr
 with open(join(TEST_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp:
     json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f'{base_name}_predicted-values_fit-{today_str}.txt'), y_pred)
\ No newline at end of file

From fa48a1745eb191bce5b045706f67a78fb75a4adf Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 4 Apr 2023 10:45:12 -0700
Subject: [PATCH 36/48] just changed number of iterations

---
 idconn/workflows/nbs_predict.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 7444bfa..ed6b664 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -56,7 +56,7 @@
     base_name = f'nbs-predict_outcome-{OUTCOME}'
 #print(dat['bc'])
 
-weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=1000)
+weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=100)
 
 fig,fig2, nimg = io.plot_edges(weighted_average, 
                          atlas_fname, 

From b33e24fad88803e009a57828a145d664fcce5b1f Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 4 Apr 2023 11:03:46 -0700
Subject: [PATCH 37/48] linted

---
 idconn/__init__.py              |   4 +-
 idconn/connectivity.py          | 436 ++++++++++++++++++--------
 idconn/data.py                  |  17 +-
 idconn/io.py                    | 538 ++++++++++++++++++--------------
 idconn/nbs.py                   | 269 ++++++++--------
 idconn/networking.py            |  97 +++---
 idconn/parser_utils.py          |   4 +-
 idconn/pipeline.py              | 179 +++++++----
 idconn/workflows/nbs_predict.py | 234 ++++++++------
 setup.py                        |   8 +-
 versioneer.py                   |  25 +-
 11 files changed, 1088 insertions(+), 723 deletions(-)

diff --git a/idconn/__init__.py b/idconn/__init__.py
index 6915dae..79ab307 100644
--- a/idconn/__init__.py
+++ b/idconn/__init__.py
@@ -26,10 +26,10 @@
         "idconn",
         "connectivity",
         "data",
-        #"figures",
+        # "figures",
         "networking",
         # "preprocessing",
-        #"statistics",
+        # "statistics",
         # "utils",
         "io",
         "nbs",
diff --git a/idconn/connectivity.py b/idconn/connectivity.py
index cf14137..1e79998 100644
--- a/idconn/connectivity.py
+++ b/idconn/connectivity.py
@@ -1,12 +1,15 @@
 from posixpath import sep
 import numpy as np
 import pandas as pd
-#import idconn.connectivity.build_networks
+
+# import idconn.connectivity.build_networks
 from os import makedirs
 from os.path import join, exists, basename
 from nilearn import input_data, datasets, connectome, image, plotting
 from ._version import get_versions
-#from .utils import contrast
+
+# from .utils import contrast
+
 
 def _check_dims(matrix):
     """Raise a ValueError if the input matrix has more than two square.
@@ -16,10 +19,14 @@ def _check_dims(matrix):
         Input array.
     """
     if matrix.ndim != 2:
-        raise ValueError('Expected a square matrix, got array of shape'
-                         ' {0}.'.format(matrix.shape))
+        raise ValueError(
+            "Expected a square matrix, got array of shape" " {0}.".format(matrix.shape)
+        )
+
 
-def task_connectivity(layout, subject, session, task, atlas, confounds, connectivity_metric='correlation'):
+def task_connectivity(
+    layout, subject, session, task, atlas, confounds, connectivity_metric="correlation"
+):
     """
     Makes connectivity matrices per subject per session per task per condition.
     Parameters
@@ -51,120 +58,202 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti
     files : list
         Filenames of computed correlation matrices.
     """
-    #version = '0.1.1'
+    # version = '0.1.1'
     try:
         version = get_versions()["version"]
     except:
-        version = 'test'
-    if '.nii' in atlas:
-        assert exists(atlas), f'Mask file does not exist at {atlas}'
-    
-    deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}')
-    
-    space = 'MNI152NLin2009cAsym'
-    atlas_name = basename(atlas).rsplit('.', 2)[0]
+        version = "test"
+    if ".nii" in atlas:
+        assert exists(atlas), f"Mask file does not exist at {atlas}"
+
+    deriv_dir = join(layout.root, "derivatives", f"idconn-{version}")
+
+    space = "MNI152NLin2009cAsym"
+    atlas_name = basename(atlas).rsplit(".", 2)[0]
     # use pybids here to grab # of runs and preproc bold filenames
     connectivity_measure = connectome.ConnectivityMeasure(kind=connectivity_metric)
-    bold_files = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space=space,subject=subject, session=session, extension='nii.gz') # should be preprocessed BOLD file from fmriprep, grabbed with pybids
-    print(f'BOLD files found at {bold_files}')
+    bold_files = layout.get(
+        scope="derivatives",
+        return_type="file",
+        suffix="bold",
+        task=task,
+        space=space,
+        subject=subject,
+        session=session,
+        extension="nii.gz",
+    )  # should be preprocessed BOLD file from fmriprep, grabbed with pybids
+    print(f"BOLD files found at {bold_files}")
 
     runs = []
     if len(bold_files) > 1:
         for i in range(0, len(bold_files)):
-            assert exists(bold_files[i]), "Preprocessed bold file(s) does not exist at {0}".format(bold_files)
-            runs.append(layout.parse_file_entities(bold_files[i])['run'])
+            assert exists(bold_files[i]), "Preprocessed bold file(s) does not exist at {0}".format(
+                bold_files
+            )
+            runs.append(layout.parse_file_entities(bold_files[i])["run"])
     else:
         runs = None
-    print(f'Found runs: {runs}')
+    print(f"Found runs: {runs}")
 
-    out = join(deriv_dir,  f'sub-{subject}', f'ses-{session}', 'func')
+    out = join(deriv_dir, f"sub-{subject}", f"ses-{session}", "func")
     if not exists(out):
-            makedirs(out)
-    
-    event_files = layout.get(return_type='filename', suffix='events', task=task, subject=subject)
-    timing = pd.read_csv(event_files[0], header=0, index_col=0, sep='\t')
-    conditions = timing['trial_type'].unique()
+        makedirs(out)
+
+    event_files = layout.get(return_type="filename", suffix="events", task=task, subject=subject)
+    timing = pd.read_csv(event_files[0], header=0, index_col=0, sep="\t")
+    conditions = timing["trial_type"].unique()
 
     run_cond = {}
     corrmats = {}
     for run in runs:
-        bold_file = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz', run=run)
-        assert len(bold_file) == 1, f'BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}'
+        bold_file = layout.get(
+            scope="derivatives",
+            return_type="file",
+            suffix="bold",
+            task=task,
+            space="MNI152NLin2009cAsym",
+            subject=subject,
+            session=session,
+            extension="nii.gz",
+            run=run,
+        )
+        assert (
+            len(bold_file) == 1
+        ), f"BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}"
         tr = layout.get_tr(bold_file)
-	
-        #load timing file 
-        #update to use pyBIDS + layout
-        event_file = layout.get(return_type='filename', suffix='events', task=task, subject=subject, run=run, session=session)
-        print('# of event files =', len(event_file), '\nfilename = ', event_file[0])
+
+        # load timing file
+        # update to use pyBIDS + layout
+        event_file = layout.get(
+            return_type="filename",
+            suffix="events",
+            task=task,
+            subject=subject,
+            run=run,
+            session=session,
+        )
+        print("# of event files =", len(event_file), "\nfilename = ", event_file[0])
         the_file = str(event_file[0])
-        assert exists(the_file), 'file really does not exist'
-        timing = pd.read_csv(the_file, header=0, index_col=0, sep='\t')
-        timing.sort_values('onset')
+        assert exists(the_file), "file really does not exist"
+        timing = pd.read_csv(the_file, header=0, index_col=0, sep="\t")
+        timing.sort_values("onset")
 
-        confounds_file = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task, run=run, extension='tsv')
-        print(f'Confounds file located at: {confounds_file}')
-        confounds_df = pd.read_csv(confounds_file[0], header=0, sep='\t')
+        confounds_file = layout.get(
+            scope="derivatives",
+            return_type="file",
+            desc="confounds",
+            subject=subject,
+            session=session,
+            task=task,
+            run=run,
+            extension="tsv",
+        )
+        print(f"Confounds file located at: {confounds_file}")
+        confounds_df = pd.read_csv(confounds_file[0], header=0, sep="\t")
         confounds_df = confounds_df[confounds].fillna(0)
-        confounds_fname = join(deriv_dir,  f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv')
-        confounds_df.to_csv(confounds_fname, sep='\t')
+        confounds_fname = join(
+            deriv_dir,
+            f"sub-{subject}",
+            f"ses-{session}",
+            "func",
+            f"sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv",
+        )
+        confounds_df.to_csv(confounds_fname, sep="\t")
 
         masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2)
         ex_bold = image.index_img(bold_file[0], 2)
         display = plotting.plot_epi(ex_bold)
         display.add_contours(atlas)
-        display.savefig(join(deriv_dir,  f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_space-MNI152NLin2009cAsym_space-{atlas_name}_overlay.png'))
-            
-        print(f'BOLD file located at {bold_file}\nTR = {tr}s')
-        
+        display.savefig(
+            join(
+                deriv_dir,
+                f"sub-{subject}",
+                f"ses-{session}",
+                "func",
+                f"sub-{subject}_ses-{session}_task-{task}_run-{run}_space-MNI152NLin2009cAsym_space-{atlas_name}_overlay.png",
+            )
+        )
+
+        print(f"BOLD file located at {bold_file}\nTR = {tr}s")
+
         masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=1)
         timeseries = masker.fit_transform(bold_file[0], confounds=confounds_fname)
-        #load timing file 
-        #update to use pyBIDS + layout
+        # load timing file
+        # update to use pyBIDS + layout
         try:
-            #and now we slice into conditions
+            # and now we slice into conditions
             for condition in conditions:
                 run_cond[condition] = {}
                 corrmats[condition] = {}
                 blocks = []
-                cond_timing = timing[timing['trial_type'] == condition]
+                cond_timing = timing[timing["trial_type"] == condition]
                 for i in cond_timing.index:
-                    blocks.append((cond_timing.loc[i]['onset'] / tr, ((cond_timing.loc[i]['onset'] + cond_timing.loc[i]['duration']) / tr) + 1))
+                    blocks.append(
+                        (
+                            cond_timing.loc[i]["onset"] / tr,
+                            ((cond_timing.loc[i]["onset"] + cond_timing.loc[i]["duration"]) / tr)
+                            + 1,
+                        )
+                    )
                 if len(blocks) > 1:
-                    run_cond[condition][run] = np.vstack((timeseries[int(blocks[0][0]):int(blocks[0][1]), :], timeseries[int(blocks[1][0]):int(blocks[1][1]), :]))
+                    run_cond[condition][run] = np.vstack(
+                        (
+                            timeseries[int(blocks[0][0]) : int(blocks[0][1]), :],
+                            timeseries[int(blocks[1][0]) : int(blocks[1][1]), :],
+                        )
+                    )
                 if len(blocks) > 2:
-                    for i in np.arange(2,len(blocks)):
-                        run_cond[condition][run] = np.vstack((timeseries[int(blocks[0][0]):int(blocks[0][1]), :], timeseries[int(blocks[1][0]):int(blocks[1][1]), :]))
-                    #print('extracted signals for {0}, {1}, {2}'.format(task, run, condition), run_cond['{0}-{1}'.format(run, condition)].shape)
+                    for i in np.arange(2, len(blocks)):
+                        run_cond[condition][run] = np.vstack(
+                            (
+                                timeseries[int(blocks[0][0]) : int(blocks[0][1]), :],
+                                timeseries[int(blocks[1][0]) : int(blocks[1][1]), :],
+                            )
+                        )
+                    # print('extracted signals for {0}, {1}, {2}'.format(task, run, condition), run_cond['{0}-{1}'.format(run, condition)].shape)
                 else:
                     pass
-                print(f'Making correlation matrix for {run}, {condition}.')
-                corrmats[condition][run] = connectivity_measure.fit_transform([run_cond[condition][run]])[0]
-                print('And that correlation matrix is', corrmats[condition][run].shape)
+                print(f"Making correlation matrix for {run}, {condition}.")
+                corrmats[condition][run] = connectivity_measure.fit_transform(
+                    [run_cond[condition][run]]
+                )[0]
+                print("And that correlation matrix is", corrmats[condition][run].shape)
         except Exception as e:
-            print('trying to slice and dice, but', e)
-    #and paste together the timeseries from each run together per condition
+            print("trying to slice and dice, but", e)
+    # and paste together the timeseries from each run together per condition
     files = []
     avg_corrmats = {}
-    print('Corrmats per run per condition have been made!')
+    print("Corrmats per run per condition have been made!")
     for condition in conditions:
-        print(f'Merging corrmats for {task}-{condition}...')
+        print(f"Merging corrmats for {task}-{condition}...")
         data = list(corrmats[condition].values())
         stacked_corrmats = np.array(data)
-        print('Stacked corrmats have dimensions', stacked_corrmats.shape)
+        print("Stacked corrmats have dimensions", stacked_corrmats.shape)
         avg_corrmat = np.mean(stacked_corrmats, axis=0)
-        corrmat_df = pd.DataFrame(index=np.arange(1, avg_corrmat.shape[0]+1), columns=np.arange(1, avg_corrmat.shape[0]+1),data=avg_corrmat)
+        corrmat_df = pd.DataFrame(
+            index=np.arange(1, avg_corrmat.shape[0] + 1),
+            columns=np.arange(1, avg_corrmat.shape[0] + 1),
+            data=avg_corrmat,
+        )
         avg_corrmats[condition] = corrmat_df
-        corrmat_file = join(deriv_dir,  
-                            f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_desc-{condition}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_corrmat.tsv')
+        corrmat_file = join(
+            deriv_dir,
+            f"sub-{subject}",
+            f"ses-{session}",
+            "func",
+            f"sub-{subject}_ses-{session}_task-{task}_desc-{condition}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_corrmat.tsv",
+        )
         try:
-            corrmat_df.to_csv(corrmat_file, sep='\t')
+            corrmat_df.to_csv(corrmat_file, sep="\t")
             files.append(corrmat_file)
         except Exception as e:
-            print('saving corrmat...', e)
+            print("saving corrmat...", e)
     return files, avg_corrmats
 
-def rest_connectivity(layout, subject, session, task, atlas, confounds=None,connectivity_metric='correlation'):
 
+def rest_connectivity(
+    layout, subject, session, task, atlas, confounds=None, connectivity_metric="correlation"
+):
     """
     Makes connectivity matrices per subject per session per task per condition.
     Parameters
@@ -193,116 +282,201 @@ def rest_connectivity(layout, subject, session, task, atlas, confounds=None,conn
     try:
         version = get_versions()["version"]
     except:
-        version = 'test'
-    if '.nii' in atlas:
-        assert exists(atlas), f'Mask file does not exist at {atlas}'
-    
-    deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}')
-    atlas_name = basename(atlas).rsplit('.', 2)[0]
+        version = "test"
+    if ".nii" in atlas:
+        assert exists(atlas), f"Mask file does not exist at {atlas}"
+
+    deriv_dir = join(layout.root, "derivatives", f"idconn-{version}")
+    atlas_name = basename(atlas).rsplit(".", 2)[0]
     # use pybids here to grab # of runs and preproc bold filenames
     connectivity_measure = connectome.ConnectivityMeasure(kind=connectivity_metric)
-    bold_files = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz') # should be preprocessed BOLD file from fmriprep, grabbed with pybids
-    print(f'BOLD files found at {bold_files}')
-    #confounds_files = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task)
+    bold_files = layout.get(
+        scope="derivatives",
+        return_type="file",
+        suffix="bold",
+        task=task,
+        space="MNI152NLin2009cAsym",
+        subject=subject,
+        session=session,
+        extension="nii.gz",
+    )  # should be preprocessed BOLD file from fmriprep, grabbed with pybids
+    print(f"BOLD files found at {bold_files}")
+    # confounds_files = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task)
 
     runs = []
     if len(bold_files) > 1:
         for i in range(0, len(bold_files)):
-            assert exists(bold_files[i]), "Preprocessed bold file(s) does not exist at {0}".format(bold_files)
-            runs.append(layout.parse_file_entities(bold_files[i])['run'])
+            assert exists(bold_files[i]), "Preprocessed bold file(s) does not exist at {0}".format(
+                bold_files
+            )
+            runs.append(layout.parse_file_entities(bold_files[i])["run"])
     else:
         runs = None
-    print(f'Found runs: {runs}')
+    print(f"Found runs: {runs}")
 
-    out = join(deriv_dir,  f'sub-{subject}', f'ses-{session}', 'func')
+    out = join(deriv_dir, f"sub-{subject}", f"ses-{session}", "func")
     if not exists(out):
-            makedirs(out)
-    
-    
-    #event_files = layout.get(return_type='filename', suffix='events', task=task, subject=subject)
-    #timing = pd.read_csv(event_files[0], header=0, index_col=0, sep='\t')
-    #conditions = timing['trial_type'].unique()
+        makedirs(out)
+
+    # event_files = layout.get(return_type='filename', suffix='events', task=task, subject=subject)
+    # timing = pd.read_csv(event_files[0], header=0, index_col=0, sep='\t')
+    # conditions = timing['trial_type'].unique()
 
     if runs:
         corrmats = {}
         for run in runs:
-            print('run = ', run)
+            print("run = ", run)
             # read in events file for this subject, task, and run
-            
 
-            confounds_file = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task, run=run, extension='tsv')
-            print(f'Confounds file located at: {confounds_file}')
-            confounds_df = pd.read_csv(confounds_file[0], header=0, sep='\t')
+            confounds_file = layout.get(
+                scope="derivatives",
+                return_type="file",
+                desc="confounds",
+                subject=subject,
+                session=session,
+                task=task,
+                run=run,
+                extension="tsv",
+            )
+            print(f"Confounds file located at: {confounds_file}")
+            confounds_df = pd.read_csv(confounds_file[0], header=0, sep="\t")
             confounds_df = confounds_df[confounds].fillna(0)
-            confounds_fname = join(deriv_dir,  f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv')
-            confounds_df.to_csv(confounds_fname, sep='\t')
+            confounds_fname = join(
+                deriv_dir,
+                f"sub-{subject}",
+                f"ses-{session}",
+                "func",
+                f"sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv",
+            )
+            confounds_df.to_csv(confounds_fname, sep="\t")
 
-            bold_file = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz', run=run)
-            assert len(bold_file) == 1, f'BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}'
+            bold_file = layout.get(
+                scope="derivatives",
+                return_type="file",
+                suffix="bold",
+                task=task,
+                space="MNI152NLin2009cAsym",
+                subject=subject,
+                session=session,
+                extension="nii.gz",
+                run=run,
+            )
+            assert (
+                len(bold_file) == 1
+            ), f"BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}"
             tr = layout.get_tr(bold_file)
             masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2)
 
             ex_bold = image.index_img(bold_file[0], 2)
             display = plotting.plot_epi(ex_bold)
             display.add_contours(atlas)
-            display.savefig(join(deriv_dir,  f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-atlas_overlay.png'))
-                
-            print(f'BOLD file located at {bold_file}\nTR = {tr}s')
+            display.savefig(
+                join(
+                    deriv_dir,
+                    f"sub-{subject}",
+                    f"ses-{session}",
+                    "func",
+                    f"sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-atlas_overlay.png",
+                )
+            )
+
+            print(f"BOLD file located at {bold_file}\nTR = {tr}s")
             try:
-                #for each parcellation, extract BOLD timeseries
-                print(f'Extracting bold signal for sub-{subject}, ses-{session}, run-{run}...')
-                timeseries = masker.fit_transform(bold_file[0], confounds_fname)   
+                # for each parcellation, extract BOLD timeseries
+                print(f"Extracting bold signal for sub-{subject}, ses-{session}, run-{run}...")
+                timeseries = masker.fit_transform(bold_file[0], confounds_fname)
             except Exception as e:
-                print('ERROR: Trying to extract BOLD signals, but', e)
+                print("ERROR: Trying to extract BOLD signals, but", e)
             try:
-                print(f'Making correlation matrix for for sub-{subject}, ses-{session}, task-{task}, run-{run}...')
+                print(
+                    f"Making correlation matrix for for sub-{subject}, ses-{session}, task-{task}, run-{run}..."
+                )
                 corrmats[run] = connectivity_measure.fit_transform([timeseries])[0]
             except Exception as e:
-                print('ERROR: Trying to make corrmat, but', e)
+                print("ERROR: Trying to make corrmat, but", e)
         data = list(corrmats.values())
         stacked_corrmats = np.array(data)
-        print('Stacked corrmats have dimensions', stacked_corrmats.shape)
+        print("Stacked corrmats have dimensions", stacked_corrmats.shape)
         avg_corrmat = np.mean(stacked_corrmats, axis=0)
     else:
-        confounds_file = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task, extension='tsv')
-        print(f'Confounds file located at: {confounds_file}')
-        confounds_df = pd.read_csv(confounds_file[0], header=0, sep='\t')
+        confounds_file = layout.get(
+            scope="derivatives",
+            return_type="file",
+            desc="confounds",
+            subject=subject,
+            session=session,
+            task=task,
+            extension="tsv",
+        )
+        print(f"Confounds file located at: {confounds_file}")
+        confounds_df = pd.read_csv(confounds_file[0], header=0, sep="\t")
         confounds_df = confounds_df[confounds].fillna(0)
-        confounds_fname = join(deriv_dir,  f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_desc-confounds_timeseries.tsv')
-        confounds_df.to_csv(confounds_fname, sep='\t')
+        confounds_fname = join(
+            deriv_dir,
+            f"sub-{subject}",
+            f"ses-{session}",
+            "func",
+            f"sub-{subject}_ses-{session}_task-{task}_desc-confounds_timeseries.tsv",
+        )
+        confounds_df.to_csv(confounds_fname, sep="\t")
 
-        bold_file = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz')
-        assert len(bold_file) == 1, f'BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}: {bold_file}'
+        bold_file = layout.get(
+            scope="derivatives",
+            return_type="file",
+            suffix="bold",
+            task=task,
+            space="MNI152NLin2009cAsym",
+            subject=subject,
+            session=session,
+            extension="nii.gz",
+        )
+        assert (
+            len(bold_file) == 1
+        ), f"BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}: {bold_file}"
         tr = layout.get_tr(bold_file)
         masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2)
-	
+
         ex_bold = image.index_img(bold_file[0], 2)
         display = plotting.plot_epi(ex_bold)
         display.add_contours(atlas)
-        display.savefig(join(deriv_dir,  f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_desc-atlas_overlay.png'))
-            
-        print(f'BOLD file located at {bold_file}\nTR = {tr}s')
+        display.savefig(
+            join(
+                deriv_dir,
+                f"sub-{subject}",
+                f"ses-{session}",
+                "func",
+                f"sub-{subject}_ses-{session}_task-{task}_desc-atlas_overlay.png",
+            )
+        )
+
+        print(f"BOLD file located at {bold_file}\nTR = {tr}s")
         try:
-            #for each parcellation, extract BOLD timeseries
-            print(f'Extracting bold signal for sub-{subject}, ses-{session}...')
-            timeseries = masker.fit_transform(bold_file[0], confounds_fname)   
+            # for each parcellation, extract BOLD timeseries
+            print(f"Extracting bold signal for sub-{subject}, ses-{session}...")
+            timeseries = masker.fit_transform(bold_file[0], confounds_fname)
         except Exception as e:
-            print('ERROR: Trying to extract BOLD signals, but', e)
+            print("ERROR: Trying to extract BOLD signals, but", e)
         try:
-            print(f'Making correlation matrix for for sub-{subject}, ses-{session}...')
+            print(f"Making correlation matrix for for sub-{subject}, ses-{session}...")
             avg_corrmat = connectivity_measure.fit_transform([timeseries])[0]
         except Exception as e:
-            print('ERROR: Trying to make corrmat, but', e)
+            print("ERROR: Trying to make corrmat, but", e)
 
-    print('Correlation matrix created, dimensions:', avg_corrmat.shape)
+    print("Correlation matrix created, dimensions:", avg_corrmat.shape)
     try:
-        corrmat_df = pd.DataFrame(index=np.arange(1, avg_corrmat.shape[0]+1), columns=np.arange(1, avg_corrmat.shape[0]+1),data=avg_corrmat)
-        corrmat_file = join(deriv_dir,  
-                            f'sub-{subject}', 
-                            f'ses-{session}', 
-                            'func', 
-                            f'sub-{subject}_ses-{session}_task-{task}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_desc-corrmat_bold.tsv')
-        corrmat_df.to_csv(corrmat_file, sep='\t')
+        corrmat_df = pd.DataFrame(
+            index=np.arange(1, avg_corrmat.shape[0] + 1),
+            columns=np.arange(1, avg_corrmat.shape[0] + 1),
+            data=avg_corrmat,
+        )
+        corrmat_file = join(
+            deriv_dir,
+            f"sub-{subject}",
+            f"ses-{session}",
+            "func",
+            f"sub-{subject}_ses-{session}_task-{task}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_desc-corrmat_bold.tsv",
+        )
+        corrmat_df.to_csv(corrmat_file, sep="\t")
     except Exception as e:
-        print('ERROR saving corrmat...', e)
+        print("ERROR saving corrmat...", e)
     return corrmat_df, corrmat_file
diff --git a/idconn/data.py b/idconn/data.py
index 575e7bc..0e18186 100644
--- a/idconn/data.py
+++ b/idconn/data.py
@@ -6,20 +6,21 @@
 
 
 def impute(data, max_iter=10000):
-    '''
+    """
     Fill in missing data with an iterative imputation algorithm from scikit learn.
     NOTE: Will not imput connectivity data.
-    '''
-    
-    non_numeric = data.select_dtypes(exclude=['number']).columns
-    dumb = pd.get_dummies(data[non_numeric], prefix='dummy')
+    """
+
+    non_numeric = data.select_dtypes(exclude=["number"]).columns
+    dumb = pd.get_dummies(data[non_numeric], prefix="dummy")
     df = pd.concat([data.drop(non_numeric, axis=1), dumb])
     impute_pls = IterativeImputer(
         max_iter=max_iter, skip_complete=True, verbose=1, tol=5e-3, n_nearest_features=1000
     )
     imputed = impute_pls.fit_transform(df)
-    imp_df = pd.DataFrame(imputed,columns=data.drop(non_numeric, axis=1).columns, index=data.index,
+    imp_df = pd.DataFrame(
+        imputed,
+        columns=data.drop(non_numeric, axis=1).columns,
+        index=data.index,
     )
     return imp_df
-
-    
diff --git a/idconn/io.py b/idconn/io.py
index 61d5d93..b5f43e1 100644
--- a/idconn/io.py
+++ b/idconn/io.py
@@ -9,19 +9,21 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
-#from matplotlib import projections
+
+# from matplotlib import projections
 from matplotlib import pyplot as plt
 from matplotlib.gridspec import GridSpec
 from nilearn import datasets, plotting, surface
 
+
 def calc_fd(confounds):
-    x = confounds['trans_x'].values
-    y = confounds['trans_y'].values
-    z = confounds['trans_z'].values
-    alpha = confounds['rot_x'].values
-    beta = confounds['rot_y'].values
-    gamma = confounds['rot_z'].values
-    
+    x = confounds["trans_x"].values
+    y = confounds["trans_y"].values
+    z = confounds["trans_z"].values
+    alpha = confounds["rot_x"].values
+    beta = confounds["rot_y"].values
+    gamma = confounds["rot_z"].values
+
     delta_x = [np.abs(t - s) for s, t in zip(x, x[1:])]
     delta_y = [np.abs(t - s) for s, t in zip(y, y[1:])]
     delta_z = [np.abs(t - s) for s, t in zip(z, z[1:])]
@@ -33,9 +35,20 @@ def calc_fd(confounds):
     fd = np.sum([delta_x, delta_y, delta_z, delta_alpha, delta_beta, delta_gamma], axis=0)
     return fd
 
-def build_statsmodel_json(name, task, contrast, confounds, highpass, 
-                          mask, conn_meas, graph_meas=None, exclude=None, outfile=None):
-    '''
+
+def build_statsmodel_json(
+    name,
+    task,
+    contrast,
+    confounds,
+    highpass,
+    mask,
+    conn_meas,
+    graph_meas=None,
+    exclude=None,
+    outfile=None,
+):
+    """
     Creates a BIDS Stats Models json with analysis details for further use.
     DOES NOT WORK YET.
 
@@ -63,65 +76,63 @@ def build_statsmodel_json(name, task, contrast, confounds, highpass,
     shape : str
         Indicates shape of map (3d, 4d, coords) for choosing appropriate
         Nilearn masker for extracting BOLD signals from nifti files.
-    
-    '''
-    mask_builtins = ['shen270', 'craddock270', 'schaefer400', 'yeo7', 'yeo17']
-    if '.nii' in mask:
-        assert exists(mask), 'Mask file does not exist at {mask}'.format(mask=mask)
-        if '.gz' in mask:
-            mask_name = basename(mask).rsplit('.', 2)[0]
+
+    """
+    mask_builtins = ["shen270", "craddock270", "schaefer400", "yeo7", "yeo17"]
+    if ".nii" in mask:
+        assert exists(mask), "Mask file does not exist at {mask}".format(mask=mask)
+        if ".gz" in mask:
+            mask_name = basename(mask).rsplit(".", 2)[0]
         else:
-            mask_name = basename(mask).rsplit('.', 1)[0]
+            mask_name = basename(mask).rsplit(".", 1)[0]
     else:
-        assert mask in mask_builtins, 'Mask {mask} not in built-in mask options. Please provide file path or one of {mask_builtins}'.format(mask=mask, mask_builtins=mask_builtins)
+        assert (
+            mask in mask_builtins
+        ), "Mask {mask} not in built-in mask options. Please provide file path or one of {mask_builtins}".format(
+            mask=mask, mask_builtins=mask_builtins
+        )
     variables = confounds + ["{mask_name}*".format(mask_name=mask_name)]
     statsmodel = {
         "name": name,
-        "description": "A functional connectivity analysis of {task}, comparing {contrast}".format(task=task, 
-                                                                                                   contrast=contrast), 
-        "input":{
-            "task": task
-        },
-        "blocks":[{
+        "description": "A functional connectivity analysis of {task}, comparing {contrast}".format(
+            task=task, contrast=contrast
+        ),
+        "input": {"task": task},
+        "blocks": [
+            {
                 "level": "run",
-                "transformations":{
-                        "name": "load_image_data",
-                        "input": ["bold"],
-                        "aggregate": ["mean"],
-                        "mask": [mask_name],
-                        "output": ["{mask_name}*".format(mask_name=mask_name)]
-                    },
-        },
+                "transformations": {
+                    "name": "load_image_data",
+                    "input": ["bold"],
+                    "aggregate": ["mean"],
+                    "mask": [mask_name],
+                    "output": ["{mask_name}*".format(mask_name=mask_name)],
+                },
+            },
             {
                 "level": "session",
                 "model": {
                     "variables": variables,
-                    "options": {
-                        "confounds": confounds,
-                        "high_pass_filter_cutoff_secs": highpass
-                    },
-                    "variances": {
-                        "name": "session_level",
-                        "groupBy": "session"
-                    },
+                    "options": {"confounds": confounds, "high_pass_filter_cutoff_secs": highpass},
+                    "variances": {"name": "session_level", "groupBy": "session"},
                     "software": {
                         "IDConn": {
                             "ConnectivityMeasure": [conn_meas],
-                            "GraphMetrics": [graph_meas]
+                            "GraphMetrics": [graph_meas],
                         }
-                    }
-                }
-                
-            }
-        ]
+                    },
+                },
+            },
+        ],
     }
-    statsmodel_json = json.dumps(statsmodel, indent = 2)
-    
-    outfile = '{name}-statsmodel.json'.format(name=name)
-    with open(outfile, 'w') as outfile:
+    statsmodel_json = json.dumps(statsmodel, indent=2)
+
+    outfile = "{name}-statsmodel.json".format(name=name)
+    with open(outfile, "w") as outfile:
         json.dump(statsmodel, outfile)
     return statsmodel_json
 
+
 def atlas_picker(atlas, path, key=None):
     """Takes in atlas name and path to file, if local, returns
     nifti-like object (usually file path to downloaded atlas),
@@ -133,12 +144,12 @@ def atlas_picker(atlas, path, key=None):
     Parameters
     ----------
     atlas : str
-        Name of the atlas/parcellation used to define nodes from 
-        voxels. If using an atlas fetchable by Nilearn, atlas name 
+        Name of the atlas/parcellation used to define nodes from
+        voxels. If using an atlas fetchable by Nilearn, atlas name
         must match the function `fetch_atlas_[name]`.
     path : str
-        Path to the atlas specified, if not using a dataset from Nilearn. 
-        If using `nilearn.datasets` to fetch an atlas, will revert to 
+        Path to the atlas specified, if not using a dataset from Nilearn.
+        If using `nilearn.datasets` to fetch an atlas, will revert to
         `derivatives/idconn` path.
     key : str
         Atlas-specific key for denoting which of multiple versions
@@ -156,22 +167,32 @@ def atlas_picker(atlas, path, key=None):
         Indicates shape of map (3d, 4d, coords) for choosing appropriate
         Nilearn masker for extracting BOLD signals from nifti files.
     """
-    nilearn_3d = ['craddock_2012', 'destrieux_2009', 'harvard_oxford', 'smith_2009', 'yeo_2011', 'aal', 'pauli_2017', 'msdl']
-    #nilearn_coord = ['power_2011', 'dosenbach_2010', 'seitzman_2018']
-    #nilearn_4d = ['allen_2011', '']
+    nilearn_3d = [
+        "craddock_2012",
+        "destrieux_2009",
+        "harvard_oxford",
+        "smith_2009",
+        "yeo_2011",
+        "aal",
+        "pauli_2017",
+        "msdl",
+    ]
+    # nilearn_coord = ['power_2011', 'dosenbach_2010', 'seitzman_2018']
+    # nilearn_4d = ['allen_2011', '']
     if atlas in nilearn_3d:
-        if atlas == 'craddock_2012':
+        if atlas == "craddock_2012":
             atlas_dict = datasets.fetch_atlas_craddock_2012(data_dir=path)
-            atlas_path = atlas_dict['tcorr_2level']
+            atlas_path = atlas_dict["tcorr_2level"]
             nifti = nib.load(atlas_path)
             nifti_arr = nifti.get_fdata()
-            #selecting one volume of the nifti, each represent different granularity of parcellation
-            #selecting N = 270, the 27th volume per http://ccraddock.github.io/cluster_roi/atlases.html
-            nifti = nib.Nifti1Image(nifti_arr[:,:,:,26], nifti.affine)
+            # selecting one volume of the nifti, each represent different granularity of parcellation
+            # selecting N = 270, the 27th volume per http://ccraddock.github.io/cluster_roi/atlases.html
+            nifti = nib.Nifti1Image(nifti_arr[:, :, :, 26], nifti.affine)
             nifti.to_filename()
 
     return atlas, path
 
+
 def vectorize_corrmats(matrices):
     """Returns the vectorized upper triangles of a 3-dimensional array
     (i.e., node x node x matrix) of matrices. Output will be a 2-dimensional
@@ -181,14 +202,14 @@ def vectorize_corrmats(matrices):
     matrices : numpy array of shape (p, n, n)
         Represents the link strengths of the graphs. Assumed to be
         an array of symmetric nxn matrices per participant and/or timepoint (p).
-    
+
     Returns
     -------
     edge_vector : numpy array of shape (p, n^2)
-        Represents an array of vectorized upper triangles of 
+        Represents an array of vectorized upper triangles of
         the input matrices.
     """
-    #print(f'\n\n\n{matrices.shape}, {matrices.ndim}\n\n\n')
+    # print(f'\n\n\n{matrices.shape}, {matrices.ndim}\n\n\n')
     num_node = matrices.shape[1]
     upper_tri = np.triu_indices(num_node, k=1)
     if matrices.ndim == 3:
@@ -196,17 +217,20 @@ def vectorize_corrmats(matrices):
         upper_tri = np.triu_indices(num_node, k=1)
         num_matrices = matrices.shape[0]
         edge_vector = []
-        for matrix in range(0,num_matrices):
-            vectorized = matrices[matrix,:,:][upper_tri]
+        for matrix in range(0, num_matrices):
+            vectorized = matrices[matrix, :, :][upper_tri]
             edge_vector.append(vectorized)
-    
+
     elif matrices.ndim == 2:
         true = matrices[0].T == matrices[0]
         if true.all():
             edge_vector = matrices[upper_tri]
         else:
-            print('Matrices of incompatible shape:', matrices.shape, 
-                '\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).')
+            print(
+                "Matrices of incompatible shape:",
+                matrices.shape,
+                "\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).",
+            )
     elif matrices.ndim == 1:
         if matrices[0].ndim == 2:
             num_node = matrices[0].shape[0]
@@ -216,14 +240,18 @@ def vectorize_corrmats(matrices):
                 vectorized = matrix[upper_tri]
                 edge_vector.append(vectorized)
         else:
-            print('Matrices of incompatible shape:', matrices.shape, 
-                  '\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).')
+            print(
+                "Matrices of incompatible shape:",
+                matrices.shape,
+                "\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).",
+            )
     edge_vector = np.asarray(edge_vector)
     return edge_vector
 
+
 def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True, verbose=False):
-    """Returns a node x node x (subject x session) matrix of correlation matrices  
-    from a BIDS derivative folder. Optionally returns a node^2 x (subject x session) 
+    """Returns a node x node x (subject x session) matrix of correlation matrices
+    from a BIDS derivative folder. Optionally returns a node^2 x (subject x session)
     array of vectorized upper triangles of those correlation matrices.
     Parameters
     ----------
@@ -237,286 +265,336 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True
     atlas: str
         The name of the atlas used to make the correlation matrix. Must match the string in corrmat filename.
     z_score : Bool
-        Would you like the correlation matrices z-scored? (Uses Fishers r-to-z, 
+        Would you like the correlation matrices z-scored? (Uses Fishers r-to-z,
         thus assumes elements/edges of corrmats are product-moment correlations).
     vectorized : Bool
-        If True, returns the vectorized upper triangles of correlation matrices in a p x (n^2 - n)/2 array. 
+        If True, returns the vectorized upper triangles of correlation matrices in a p x (n^2 - n)/2 array.
         If false, returns the full correlation matrices in a p x n x n array.
     verbose : Bool
-        If True, prints out subjects/sessions as their correlationmatrices are being read. 
+        If True, prints out subjects/sessions as their correlation matrices are being read.
         If False, prints nothing.
-    
+
     Returns
     -------
     # NOT TRUE CURRENTLY RETURNS DATAFRAME
     edge_vector : numpy array of shape (p, (n^2-n)/2)
-        Represents an array of vectorized upper triangles of 
+        Represents an array of vectorized upper triangles of
         the input nxn matrices if vectorized=True.
     edge_cube : numpy array of shape (p, n^2)
-        Represents an array of the input nxn matrices 
+        Represents an array of the input nxn matrices
         if vectorized=False.
     """
-    subjects = layout.get(return_type='id', 
-                          target='subject', 
-                          suffix='bold', 
-                          scope=deriv_name
-                         )
-    
-    ppts_fname = layout.get_file('participants.tsv').path
-    ppt_df = pd.read_csv(ppts_fname, sep='\t', index_col=[0,1])
-    ppt_df['adj'] = ''
+    subjects = layout.get(return_type="id", target="subject", suffix="bold", scope=deriv_name)
+
+    ppts_fname = layout.get_file("participants.tsv").path
+    ppt_df = pd.read_csv(ppts_fname, sep="\t", index_col=[0, 1])
+    ppt_df["adj"] = ""
     if vectorized:
-        ppt_df['edge_vector'] = ''
-    
+        ppt_df["edge_vector"] = ""
+
     for subject in subjects:
         if verbose:
             print(subject)
         else:
             pass
-        sessions = layout.get(return_type='id', 
-                              target='session', 
-                              task=task, 
-                              suffix='bold', 
-                              subject=subject, 
-                              scope=deriv_name)
-        
-        
+        sessions = layout.get(
+            return_type="id",
+            target="session",
+            task=task,
+            suffix="bold",
+            subject=subject,
+            scope=deriv_name,
+        )
+
         for session in sessions:
-            runs = layout.get(return_type='id', 
-                              session=session,
-                              target='run', 
-                              task=task, 
-                              suffix='timeseries', 
-                              subject=subject, 
-                              scope=deriv_name)
+            runs = layout.get(
+                return_type="id",
+                session=session,
+                target="run",
+                task=task,
+                suffix="timeseries",
+                subject=subject,
+                scope=deriv_name,
+            )
             if len(runs) > 0:
-                path = layout.get(return_type='filename', 
-                                    session=session,
-                                    run=runs[0], 
-                                    task=task, 
-                                    suffix='timeseries', 
-                                    subject=subject, 
-                                    scope=deriv_name)
+                path = layout.get(
+                    return_type="filename",
+                    session=session,
+                    run=runs[0],
+                    task=task,
+                    suffix="timeseries",
+                    subject=subject,
+                    scope=deriv_name,
+                )
                 confounds = pd.read_table(path[0], header=0, index_col=0)
-                if not 'framewise_displacement' in confounds.columns:
+                if not "framewise_displacement" in confounds.columns:
                     fd = calc_fd(confounds)
-                    #fd.append(0)
+                    # fd.append(0)
                     fd = np.append(fd, [0])
-                    confounds['framewise_displacement'] = fd
+                    confounds["framewise_displacement"] = fd
                 confound_means = confounds.mean(axis=0)
                 if len(runs) > 1:
                     for run in runs[1:]:
-                        path = layout.get(return_type='filename', 
-                                        session=session,
-                                        run=run, 
-                                        task=task, 
-                                        suffix='timeseries', 
-                                        subject=subject, 
-                                        scope=deriv_name)
+                        path = layout.get(
+                            return_type="filename",
+                            session=session,
+                            run=run,
+                            task=task,
+                            suffix="timeseries",
+                            subject=subject,
+                            scope=deriv_name,
+                        )
                         confounds = pd.read_table(path[0], header=0, index_col=0)
-                        if not 'framewise_displacement' in confounds.columns:
+                        if not "framewise_displacement" in confounds.columns:
                             fd = calc_fd(confounds)
-                            #fd.append(0)
+                            # fd.append(0)
                             fd = np.append(fd, [0])
-                            confounds['framewise_displacement'] = fd
+                            confounds["framewise_displacement"] = fd
                         confound_means_temp = confounds.mean(axis=0)
-                        confound_means = np.mean(pd.concat([confound_means, confound_means_temp], axis=1), axis=1)
-                        #print(confound_means)
+                        confound_means = np.mean(
+                            pd.concat([confound_means, confound_means_temp], axis=1), axis=1
+                        )
+                        # print(confound_means)
             else:
-                path = path = layout.get(return_type='filename', 
-                                    session=session,
-                                    desc='confounds', 
-                                    task=task, 
-                                    suffix='timeseries', 
-                                    subject=subject, 
-                                    scope=deriv_name)
-                
+                path = path = layout.get(
+                    return_type="filename",
+                    session=session,
+                    desc="confounds",
+                    task=task,
+                    suffix="timeseries",
+                    subject=subject,
+                    scope=deriv_name,
+                )
+
                 confounds = pd.read_table(path[0], header=0, index_col=0)
-                if not 'framewise_displacement' in confounds.columns:
+                if not "framewise_displacement" in confounds.columns:
                     fd = calc_fd(confounds)
                     fd = np.append(fd, [0])
-                    confounds['framewise_displacement'] = fd 
+                    confounds["framewise_displacement"] = fd
                 confound_means = confounds.mean(axis=0)
-                #print(confound_means)
+                # print(confound_means)
             for confound in confound_means.index:
-                ppt_df.at[(f'sub-{subject}', 
-                        f'ses-{session}'), 
-                        confound] = confound_means[confound]
+                ppt_df.at[(f"sub-{subject}", f"ses-{session}"), confound] = confound_means[
+                    confound
+                ]
 
             if verbose:
                 print(session)
             else:
                 pass
-            path = layout.get(return_type='filename',
-                               task=task, 
-                               subject=subject,
-                               session=session,
-                                atlas=atlas,
-                               suffix='bold',
-                               scope='IDConn'
-                              )
+            path = layout.get(
+                return_type="filename",
+                task=task,
+                subject=subject,
+                session=session,
+                atlas=atlas,
+                suffix="bold",
+                scope="IDConn",
+            )
             if verbose:
-                print(f'Corrmat path for sub-{subject}, ses-{session}: \t{path}')
+                print(f"Corrmat path for sub-{subject}, ses-{session}: \t{path}")
             else:
                 pass
             if type(path) == list:
-                #print(len(path))
+                # print(len(path))
                 path = path[0]
             else:
                 pass
-            assert exists(path), f'Corrmat file not found at {path}'
-            adj_matrix = pd.read_csv(path, sep='\t', header=0, index_col=0)
-            
+            assert exists(path), f"Corrmat file not found at {path}"
+            adj_matrix = pd.read_csv(path, sep="\t", header=0, index_col=0)
+
             if z_score == True:
                 z_adj = np.arctanh(adj_matrix.values)
                 z_adj = np.where(z_adj == np.inf, 0, z_adj)
-                #print(z_adj.shape)
-                ppt_df.at[(f'sub-{subject}', 
-                           f'ses-{session}'), 
-                          'adj'] = z_adj
+                # print(z_adj.shape)
+                ppt_df.at[(f"sub-{subject}", f"ses-{session}"), "adj"] = z_adj
             else:
-                #print(adj_matrix.values.shape)
-                ppt_df.at[(f'sub-{subject}', 
-                           f'ses-{session}'), 
-                          'adj'] = adj_matrix.values
-                
-            
+                # print(adj_matrix.values.shape)
+                ppt_df.at[(f"sub-{subject}", f"ses-{session}"), "adj"] = adj_matrix.values
+
             if vectorized == True:
                 edge_vector = vectorize_corrmats(adj_matrix.values)
-                #print(edge_vector.shape)
-                ppt_df.at[(f'sub-{subject}', 
-                                   f'ses-{session}'), 
-                                  'edge_vector'] = edge_vector
-    ppt_df.replace({'': np.nan}, inplace=True)
+                # print(edge_vector.shape)
+                ppt_df.at[(f"sub-{subject}", f"ses-{session}"), "edge_vector"] = edge_vector
+    ppt_df.replace({"": np.nan}, inplace=True)
     return ppt_df
 
+
 def undo_vectorize(edges, num_node=None):
-    '''
+    """
     Puts an edge vector back into an adjacency matrix.
     Parameters
     ----------
-    edges : list-like of shape ((n^2-n)/2,) 
+    edges : list-like of shape ((n^2-n)/2,)
         Vectorized upper triangle of an adjacency matrix.
     num_node : int
         The number of nodes in the graph. I would calculate this myself, but I'd rather not.
-    
+
     Returns
     -------
     matrix : numpy array of size (n,n)
         Symmetric array of connectivity values.
-    '''
-    #j = len(edges)
-    #num_node = (np.sqrt((8 * j) + 1) + 1) / 2
+    """
+    # j = len(edges)
+    # num_node = (np.sqrt((8 * j) + 1) + 1) / 2
     if num_node == None:
         j = len(edges)
         num_node = int((np.sqrt((8 * j) + 1) + 1) / 2)
     else:
         num_node = int(num_node)
-    X = np.zeros((num_node,num_node))
-    X[np.triu_indices(X.shape[0], k = 1)] = edges
+    X = np.zeros((num_node, num_node))
+    X[np.triu_indices(X.shape[0], k=1)] = edges
     X = X + X.T
     return X
 
-def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='seismic', node_size='strength'):
-    '''
+
+def plot_edges(
+    adj,
+    atlas_nii,
+    threshold=None,
+    title=None,
+    strength=False,
+    cmap="seismic",
+    node_size="strength",
+):
+    """
     Plots the edges of a connectivity/adjacency matrix both in a heatmap and in brain space, with the option to include
     a surface plot of node strength.
     Parameters
     ----------
-    adj : array-like of shape (n, n) 
+    adj : array-like of shape (n, n)
         Adjacency matrix to be plotted. Can be numpy array or Pandas dataframe.
     atlas_nii : str
-        Path to the atlas used to define nodes in the adjacency matrix. 
+        Path to the atlas used to define nodes in the adjacency matrix.
         Should be one value per node, with the same number of values as rows and columns in adj (i.e., n).
         Background should be 0, should be in MNI space.
     threshold : int
-        Percentile of edges to plot, between 0 and 100 such that 0 plots all the edges and 100 plots none. 
+        Percentile of edges to plot, between 0 and 100 such that 0 plots all the edges and 100 plots none.
         If not specified, default is 99, which plots the top 1% of edges.
     title : str
-        Title for plots. 
+        Title for plots.
     strength : bool
-        If True, plots surface maps of node strength (i.e., the sum of all a node's edge weights) 
+        If True, plots surface maps of node strength (i.e., the sum of all a node's edge weights)
     cmap : str
-        One of the matplotlib colormaps. 
+        One of the matplotlib colormaps.
     node_size : int or 'strength'
         Size to plot nodes in brain space. If 'strength', node size varies according to a node's summed edges (i.e., strength).
-    
+
     Returns
     -------
     fig1 : Matplotlib figure object
         Connectivity figure.
     fig2 : Matplotlib figure object
         If `strength=True`,  the surface node strength plot.
-    '''
+    """
     coords = plotting.find_parcellation_cut_coords(atlas_nii)
     num_node = adj.shape[0]
     # only plot the top t% of edges
-    if threshold == 'computed':
-        threshold = f'{(1 - (100 / num_node ** 2)) * 100}%'
+    if threshold == "computed":
+        threshold = f"{(1 - (100 / num_node ** 2)) * 100}%"
     elif type(threshold) == float or type(threshold) == int:
-        threshold = f'{threshold}%'
+        threshold = f"{threshold}%"
     else:
-        threshold = '99.99%'
-    print('edge plotting threshold: ', threshold)
+        threshold = "99.99%"
+    print("edge plotting threshold: ", threshold)
 
-    if node_size == 'strength':
+    if node_size == "strength":
         node_strength = np.sum(adj, axis=0)
-        #node_strength /= np.max(node_strength)
-        #node_strength **= 4
+        # node_strength /= np.max(node_strength)
+        # node_strength **= 4
         node_strength = node_strength / np.max(node_strength) * 60
         node_size = node_strength
-    
-    fig = plt.figure(figsize=(12,4))
+
+    fig = plt.figure(figsize=(12, 4))
     if title is not None:
         fig.suptitle(title)
-    gs = GridSpec(1, 2, width_ratios=[3,1])
+    gs = GridSpec(1, 2, width_ratios=[3, 1])
     ax0 = fig.add_subplot(gs[0])
     ax1 = fig.add_subplot(gs[1])
 
     plt.tight_layout(w_pad=5)
-    g = plotting.plot_connectome(adj, coords, 
-                                node_size=node_size,
-                                edge_threshold=threshold, 
-                                edge_cmap=cmap,
-                                edge_kwargs={'alpha': 0.4},
-                                display_mode='lyrz', 
-                                figure=fig, 
-                                axes=ax0,
-                                colorbar=False, 
-                                annotate=True)
+    g = plotting.plot_connectome(
+        adj,
+        coords,
+        node_size=node_size,
+        edge_threshold=threshold,
+        edge_cmap=cmap,
+        edge_kwargs={"alpha": 0.4},
+        display_mode="lyrz",
+        figure=fig,
+        axes=ax0,
+        colorbar=False,
+        annotate=True,
+    )
     h = sns.heatmap(adj, square=True, linewidths=0, cmap=cmap, ax=ax1, center=0)
     if strength:
-        fig2 = plt.figure(figsize=(12,4))
+        fig2 = plt.figure(figsize=(12, 4))
         if title is not None:
             fig2.suptitle(title)
         fsaverage = datasets.fetch_surf_fsaverage()
         nimg = nib.load(atlas_nii)
         regn_sch_arr = nimg.get_fdata()
-        for i in np.arange(0,num_node):
-            regn_sch_arr[np.where(regn_sch_arr == i+1)] = np.sum(adj[i])
+        for i in np.arange(0, num_node):
+            regn_sch_arr[np.where(regn_sch_arr == i + 1)] = np.sum(adj[i])
         strength_nimg = nib.Nifti1Image(regn_sch_arr, nimg.affine)
         # replace this filename with BIDSy output
-        #nib.save(strength_nimg, f'/Users/katherine.b/Dropbox/{title}predictive-strength.nii')
+        # nib.save(strength_nimg, f'/Users/katherine.b/Dropbox/{title}predictive-strength.nii')
 
         gs = GridSpec(1, 4)
         # plot edge weights on surfaces
-        ax2 = fig2.add_subplot(gs[0], projection='3d')
-        ax3 = fig2.add_subplot(gs[1], projection='3d')
-        ax4 = fig2.add_subplot(gs[2], projection='3d')
-        ax5 = fig2.add_subplot(gs[3], projection='3d')
+        ax2 = fig2.add_subplot(gs[0], projection="3d")
+        ax3 = fig2.add_subplot(gs[1], projection="3d")
+        ax4 = fig2.add_subplot(gs[2], projection="3d")
+        ax5 = fig2.add_subplot(gs[3], projection="3d")
 
-        texture_l = surface.vol_to_surf(strength_nimg, fsaverage.pial_left, interpolation='nearest')
-        texture_r = surface.vol_to_surf(strength_nimg, fsaverage.pial_right, interpolation='nearest')
+        texture_l = surface.vol_to_surf(
+            strength_nimg, fsaverage.pial_left, interpolation="nearest"
+        )
+        texture_r = surface.vol_to_surf(
+            strength_nimg, fsaverage.pial_right, interpolation="nearest"
+        )
 
         plt.tight_layout(w_pad=-1)
-        i = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5,
-                                                cmap=cmap, view='lateral', colorbar=False, axes=ax2)
-        j = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5,
-                                                cmap=cmap, view='medial', colorbar=False, axes=ax3)
-        k = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5,
-                                                cmap=cmap, view='lateral', colorbar=False, axes=ax4)
-        l = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5,
-                                                cmap=cmap, view='medial', colorbar=False, axes=ax5)
+        i = plotting.plot_surf_stat_map(
+            fsaverage.pial_left,
+            texture_l,
+            symmetric_cbar=False,
+            threshold=0.5,
+            cmap=cmap,
+            view="lateral",
+            colorbar=False,
+            axes=ax2,
+        )
+        j = plotting.plot_surf_stat_map(
+            fsaverage.pial_left,
+            texture_l,
+            symmetric_cbar=False,
+            threshold=0.5,
+            cmap=cmap,
+            view="medial",
+            colorbar=False,
+            axes=ax3,
+        )
+        k = plotting.plot_surf_stat_map(
+            fsaverage.pial_right,
+            texture_r,
+            symmetric_cbar=False,
+            threshold=0.5,
+            cmap=cmap,
+            view="lateral",
+            colorbar=False,
+            axes=ax4,
+        )
+        l = plotting.plot_surf_stat_map(
+            fsaverage.pial_right,
+            texture_r,
+            symmetric_cbar=False,
+            threshold=0.5,
+            cmap=cmap,
+            view="medial",
+            colorbar=False,
+            axes=ax5,
+        )
         return fig, fig2, strength_nimg
     else:
-        return fig
\ No newline at end of file
+        return fig
diff --git a/idconn/nbs.py b/idconn/nbs.py
index ad236c0..facf96f 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -5,9 +5,16 @@
 from idconn.io import vectorize_corrmats, undo_vectorize
 from scipy.stats import t, pearsonr, pointbiserialr, spearmanr
 import enlighten
-#import bct
 
-from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, GridSearchCV, StratifiedKFold, KFold
+# import bct
+
+from sklearn.model_selection import (
+    RepeatedStratifiedKFold,
+    RepeatedKFold,
+    GridSearchCV,
+    StratifiedKFold,
+    KFold,
+)
 
 from sklearn.feature_selection import f_regression, f_classif
 from sklearn.linear_model import LogisticRegression, ElasticNet
@@ -15,19 +22,23 @@
 
 from sklearn.metrics import mean_squared_error
 
+
 def calc_number_of_nodes(matrices):
     if matrices.shape[0] != matrices.shape[1]:
         if matrices.shape[1] == matrices.shape[2]:
             num_node = matrices.shape[1]
             matrices = np.moveaxis(matrices, 0, -1)
         else:
-            raise ValueError(f'Matrices of shape {matrices.shape}',
-                             'requires matrices of shape (subject x session) x node x node',
-                             'or node x node x (subject x session).')
+            raise ValueError(
+                f"Matrices of shape {matrices.shape}",
+                "requires matrices of shape (subject x session) x node x node",
+                "or node x node x (subject x session).",
+            )
     else:
         num_node = matrices.shape[0]
     return num_node
 
+
 def residualize(X, y=None, confounds=None):
     # residualize the outcome
     if confounds is not None:
@@ -38,46 +49,45 @@ def residualize(X, y=None, confounds=None):
 
             # residualize features
             resid_X = np.zeros_like(X)
-            #print(X.shape, resid_X.shape)
+            # print(X.shape, resid_X.shape)
             for i in range(0, X.shape[1]):
-                X_temp = X[:,i]
-                #print(X_temp.shape)
+                X_temp = X[:, i]
+                # print(X_temp.shape)
                 X_ = pg.linear_regression(confounds, X_temp)
-                #print(X_.residuals_.shape)
-                resid_X[:,i] = X_.residuals_.flatten()
+                # print(X_.residuals_.shape)
+                resid_X[:, i] = X_.residuals_.flatten()
             return resid_y, resid_X
         else:
             # residualize features
             resid_X = np.zeros_like(X)
-            #print(X.shape, resid_X.shape)
+            # print(X.shape, resid_X.shape)
             for i in range(0, X.shape[1]):
-                X_temp = X[:,i]
-                #print(X_temp.shape)
+                X_temp = X[:, i]
+                # print(X_temp.shape)
                 X_ = pg.linear_regression(confounds, X_temp)
-                #print(X_.residuals_.shape)
-                resid_X[:,i] = X_.residuals_.flatten()
+                # print(X_.residuals_.shape)
+                resid_X[:, i] = X_.residuals_.flatten()
             return resid_X
     else:
-        print('Confound matrix wasn\'t provided, so no confounding was done')
-        
-    
+        print("Confound matrix wasn't provided, so no confounding was done")
+
 
 def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
-    '''
+    """
     Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided
     of shape ((subject x session)x node x node)
     in the network.
     Returns a dataframe containing the results of kfolds cross-validation,
     including the indices of train and test samples, the resulting p-value and largest connected component,
     the accuracy of the network in predicting group belonging in the test samples (using logistic regression),
-    the parameter estimates from each regression, and the model object from each regression. 
+    the parameter estimates from each regression, and the model object from each regression.
     from a BIDS derivative folder. Optionally returns a subject x session dataframe
-    of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) 
+    of confound measures (e.g., motion averages) and/or a node^2 x (subject x session)
     array of vectorized upper triangles of those correlation mat
     Parameters
     ----------
     matrices : numpy array of shape (p, n, n)
-        Represents the link strengths of the graphs (i.e., functional connectivity). 
+        Represents the link strengths of the graphs (i.e., functional connectivity).
         Assumed to be an array of symmetric matrices.
     outcome : list-like of shape (p,)
         Y-value to be predicted with connectivity
@@ -91,7 +101,7 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
     permutations : int
         If `predict=False`, specifies the number of permutations run to create a null distribution
         for estimating the significance of the connected component size. Recommended 10,000.
-    
+
     Returns
     -------
     S1 : Pandas dataframe
@@ -100,14 +110,14 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
         If `predict=False`, denotes the significance of the largest connected component.
     perms : numpy array of shape (permutations,)
         If `predict=False`, largest connected component size per permutation.
-    '''
+    """
     # need to do a mass-univariate test at every edge
     # and retain significant edges
     # then find the largest connected component
     # and, if not predict, build a null distribution
-    #n = matrices.shape[:-1]
+    # n = matrices.shape[:-1]
     ndims = len(matrices.shape)
-    
+
     # vectorize_corrmats returns p x n^2
 
     # turn matrices into vectorized upper triangles
@@ -115,43 +125,42 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
         edges = vectorize_corrmats(matrices)
     else:
         edges = matrices.copy()
-    #print(edges.shape)
-    
-    
-    #edges = edges.T
-    
+    # print(edges.shape)
+
+    # edges = edges.T
+
     # run an ols per edge
     # create significancs matrix for predictor of interest (outcome)
     # 1 if edge is significantly predicted by outcome
     # 0 if it's not
-    
+
     if len(np.unique(outcome)) < 5:
         (f, p) = f_classif(X=edges, y=outcome)
     else:
         (f, p) = f_regression(X=edges, y=outcome, center=False)
     sig_edges = np.where(p < alpha, 1, 0)
-    
+
     # find largest connected component of sig_edges
     # turn sig_edges into an nxn matrix first
-    sig_matrix = undo_vectorize(sig_edges) # need to write this function
+    sig_matrix = undo_vectorize(sig_edges)  # need to write this function
     matrix = nx.from_numpy_array(sig_matrix)
-    
-    #use networkX to find connected components
+
+    # use networkX to find connected components
     largest_cc = max(nx.connected_components(matrix), key=len)
     G0 = matrix.subgraph(largest_cc)
-    #print(G0)
-    
-    # retain size of largest connected component 
+    # print(G0)
+
+    # retain size of largest connected component
     # for NBS permutation-based significance testing
     max_comp = G0.number_of_edges()
-    #print(f'Connected component has {max_comp} edges.')    
+    # print(f'Connected component has {max_comp} edges.')
 
     # pull the subgraph with largest number of nodes
     # i.e., the largest connected component
-    
+
     # grab list of nodes in largest connected component
     nodes = list(G0.nodes)
-    
+
     unused_nodes = list(set(matrix.nodes) - set(nodes))
     S1 = nx.to_pandas_adjacency(G0, nodelist=nodes)
 
@@ -166,7 +175,7 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
 
     S1.sort_index(axis=0, inplace=True)
     S1.sort_index(axis=1, inplace=True)
-    
+
     # permutation testing to create a null distribution of max component size
     # only for regular NBS, -Predict doesn't need this
     if predict == False:
@@ -176,56 +185,60 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
         for i in range(0, permutations):
             # shuffle outcome order
             rng.shuffle(outcome_copy, axis=0)
-            #print(outcome_copy)
-            
+            # print(outcome_copy)
+
             if len(np.unique(outcome)) < 5:
                 (f1, p1) = f_classif(edges, outcome_copy)
             else:
                 (f1, p1) = f_regression(edges, outcome_copy, center=False)
-            
+
             perm_edges = np.where(p1 < alpha, 1, 0)
-            
-            #print(np.sum(perm_edges))
+
+            # print(np.sum(perm_edges))
             # find largest connected component of sig_edges
             # turn sig_edges into an nxn matrix first
-            perm_matrix = undo_vectorize(perm_edges) # need to write this function
+            perm_matrix = undo_vectorize(perm_edges)  # need to write this function
             perm_nx = nx.from_numpy_array(perm_matrix)
 
             largest_cc = max(nx.connected_components(perm_nx), key=len)
             S = perm_nx.subgraph(largest_cc)
 
             perm_comp_size = S.number_of_edges()
-            
 
             # retain for null distribution
             perms[i] = perm_comp_size
             if i == 0:
                 pass
             elif i % 100 == 0:
-                print(f'p-value is {np.round(np.sum(np.where(perms >= max_comp, 1, 0)) / i, 3)} as of permutation {i}')
-            
+                print(
+                    f"p-value is {np.round(np.sum(np.where(perms >= max_comp, 1, 0)) / i, 3)} as of permutation {i}"
+                )
+
             # bctpy nbs code uses hit to mark progress across permutations
             # prob not necessary?
-        
+
         # bctpy calcs pval for all components, not just largest?
         # but I don't think that's relevant for the og implimentation of nbs?
         pval = np.size(np.where(perms >= max_comp)) / permutations
         print(max_comp, permutations, pval)
-        
+
         return pval, S1, perms
     else:
         return S1
 
-def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_splits=10, n_iterations=10):
+
+def kfold_nbs(
+    matrices, outcome, confounds=None, alpha=0.05, groups=None, n_splits=10, n_iterations=10
+):
     """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided
     of shape ((subject x session)x node x node)
     in the network.
     Returns a dataframe containing the results of kfolds cross-validation,
     including the indices of train and test samples, the resulting p-value and largest connected component,
     the accuracy of the network in predicting group belonging in the test samples (using logistic regression),
-    the parameter estimates from each regression, and the model object from each regression. 
+    the parameter estimates from each regression, and the model object from each regression.
     from a BIDS derivative folder. Optionally returns a subject x session dataframe
-    of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) 
+    of confound measures (e.g., motion averages) and/or a node^2 x (subject x session)
     array of vectorized upper triangles of those correlation mat
     Parameters
     ----------
@@ -235,10 +248,10 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli
     outcome : list-like of shape (p,)
         Y-value to be predicted with connectivity
     confounds : list-like
-        Columns in `participants.tsv` to be regressed out of connectivity and outcome 
+        Columns in `participants.tsv` to be regressed out of connectivity and outcome
         data in each CV fold (per recommendation from Snoek et al., 2019).
     alpha : float
-        Proportion of type II errors (i.e., false positives) we're willing to put up with. 
+        Proportion of type I errors (i.e., false positives) we're willing to put up with.
         This is the upper limit for pvalues in the edge detection process.
     groups : list-like of shape (p,)
         Grouping variable - currently only works for 2 groups. Will enforce stratified k-fold CV.
@@ -248,7 +261,7 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli
         Value of K for K-fold cross-validation. Will split data into K chunks, train on K-1 chunks and test on the Kth.
     n_iterations : int
         Number of times to run K-fold cross-validation. More times = more stable results.
-    
+
     Returns
     -------
     weighted_average : Pandas dataframe
@@ -256,56 +269,60 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli
         their prediction performance (i.e., accuracy for binary outcome, correlation for continuous).
         Could be used for out-of-sample prediction, once thresholded and binarized.
     cv_results : Pandas dataframe
-        Includes the results of each cross-validation loop 
+        Includes the results of each cross-validation loop
         (e.g., predictive performance, data split, largest connected component per fold per iteration).
     """
     edges = vectorize_corrmats(matrices)
-    #print(edges.shape)
-    #print(edges.shape)
-    index = list(range(0,n_splits * n_iterations))
-
-    cv_results = pd.DataFrame(index=index, 
-                            columns=['split',  
-                                    #'pval', 
-                                    'score',
-                                    'component',
-                                    'coefficient_matrix',
-                                    'coefficient_vector',
-                                    'model'])
+    # print(edges.shape)
+    # print(edges.shape)
+    index = list(range(0, n_splits * n_iterations))
+
+    cv_results = pd.DataFrame(
+        index=index,
+        columns=[
+            "split",
+            #'pval',
+            "score",
+            "component",
+            "coefficient_matrix",
+            "coefficient_vector",
+            "model",
+        ],
+    )
     if groups is not None:
-        cv = RepeatedStratifiedKFold(n_splits=n_splits,
-                                    n_repeats=n_iterations)
+        cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_iterations)
         split_y = groups
-        
+
     else:
-        cv = RepeatedKFold(n_splits=n_splits, 
-                        n_repeats=n_iterations) 
-        split_y = outcome   
-    
+        cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_iterations)
+        split_y = outcome
+
     num_node = calc_number_of_nodes(matrices)
-    #print(num_node)
-    #if matrices.shape[0] != matrices.shape[1]:
+    # print(num_node)
+    # if matrices.shape[0] != matrices.shape[1]:
     #    if matrices.shape[1] == matrices.shape[2]:
     #        num_node = matrices.shape[1]
-            #matrices = np.moveaxis(matrices, 0, -1)
+    # matrices = np.moveaxis(matrices, 0, -1)
     #    else:
     #        raise ValueError(f'Matrices of shape {matrices.shape}',
-                             #'requires matrices of shape (subject x session) x node x node',
-                             #'or node x node x (subject x session).')
-    #else:
+    #'requires matrices of shape (subject x session) x node x node',
+    #'or node x node x (subject x session).')
+    # else:
     #    num_node = matrices.shape[0]
     upper_tri = np.triu_indices(num_node, k=1)
-    
+
     i = 0
     manager = enlighten.get_manager()
-    ticks = manager.counter(total=n_splits * n_iterations, desc='Progress', unit='folds')
+    ticks = manager.counter(total=n_splits * n_iterations, desc="Progress", unit="folds")
     for train_idx, test_idx in cv.split(edges, split_y):
         scaler = StandardScaler()
-        cv_results.at[i, 'split'] = (train_idx, test_idx)
-        
-        #assert len(train_a_idx) == len(train_b_idx)
+        cv_results.at[i, "split"] = (train_idx, test_idx)
+
+        # assert len(train_a_idx) == len(train_b_idx)
         if np.unique(outcome).shape[0] == 2:
-            regressor = LogisticRegression(l1_ratio=0.25, max_iter=1000, penalty='elasticnet', solver='saga')
+            regressor = LogisticRegression(
+                l1_ratio=0.25, max_iter=1000, penalty="elasticnet", solver="saga"
+            )
         else:
             regressor = ElasticNet(l1_ratio=0.25, max_iter=1000)
 
@@ -314,22 +331,24 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli
 
         train_edges = edges[train_idx, :]
         test_edges = edges[test_idx, :]
-        
+
         if confounds is not None:
             train_confounds = confounds.values[train_idx]
             test_confounds = confounds.values[test_idx]
-            #print(train_edges.shape, train_confounds.shape, train_y.shape)
-            
+            # print(train_edges.shape, train_confounds.shape, train_y.shape)
+
             # residualize the edges and outcome
             if np.unique(outcome).shape[0] == 2:
                 train_edges = residualize(X=train_edges, confounds=train_confounds)
                 test_edges = residualize(X=test_edges, confounds=test_confounds)
             elif np.unique(outcome).shape[0] > 3:
-                train_y, train_edges = residualize(X=train_edges, y=train_y, confounds=train_confounds)
+                train_y, train_edges = residualize(
+                    X=train_edges, y=train_y, confounds=train_confounds
+                )
                 test_y, test_edges = residualize(X=test_edges, y=test_y, confounds=test_confounds)
         else:
             pass
-        
+
         train_edges = scaler.fit_transform(train_edges)
         test_edges = scaler.fit_transform(test_edges)
 
@@ -338,81 +357,81 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli
         else:
             train_y = scaler.fit_transform(train_y.reshape(-1, 1))
             test_y = scaler.fit_transform(test_y.reshape(-1, 1))
-        
+
         # perform NBS wooooooooo
         # note: output is a dataframe :)
         # PYNBS SHOULD NOT DO CONFOUND REGRESSION?
         adj = pynbs(train_edges, train_y, alpha, predict=True)
-        #print(adj.shape, adj.ndim, adj[0].shape, upper_tri)
-        
-        #cv_results.at[i, 'pval'] = pval
-        cv_results.at[i, 'component'] = adj.values
-        
+        # print(adj.shape, adj.ndim, adj[0].shape, upper_tri)
+
+        # cv_results.at[i, 'pval'] = pval
+        cv_results.at[i, "component"] = adj.values
+
         # in the event of no edges significantly related to <outcome>
-        #print(sum(sum(adj.values)), '\n', adj.values.shape)
+        # print(sum(sum(adj.values)), '\n', adj.values.shape)
         if sum(sum(adj.values)) > 0:
             # grab the values of the adjacency matrix that are just in the upper triangle
             # so you don't have repeated edges
             # returns (n_edges, )
             nbs_vector = adj.values[upper_tri]
-            #print(nbs_vector.shape)
+            # print(nbs_vector.shape)
             # use those to make a "significant edges" mask
             mask = nbs_vector == 1.0
 
             # grab only the significant edges from testing and training sets of edges
             # for use as features in the predictive models
             # these are already residualized
-            #print(train_edges.shape)
+            # print(train_edges.shape)
             # returns (n_edges, samples)
             train_features = train_edges.T[mask]
             test_features = test_edges.T[mask]
 
             train_features = scaler.fit_transform(train_features.T)
             test_features = scaler.fit_transform(test_features.T)
-            #print(np.ravel(train_y))
+            # print(np.ravel(train_y))
             # train model predicting outcome from brain (note: no mas covariates)
             model = regressor.fit(X=train_features, y=np.ravel(train_y))
-            cv_results.at[i, 'model'] = model
-            
+            cv_results.at[i, "model"] = model
+
             # score that model on the testing data
             # if logistic regression: score = mean accuracy
             # if linear regression: score = coefficient of determination (R^2)
             # both from 0 (low) to 1 (high)
             score = model.score(X=test_features, y=np.ravel(test_y))
-            cv_results.at[i, 'score'] = score
-            #print(model.coef_.shape)
+            cv_results.at[i, "score"] = score
+            # print(model.coef_.shape)
 
             m = 0
             param_vector = np.zeros_like(nbs_vector)
             for l in range(0, nbs_vector.shape[0]):
-                if nbs_vector[l] == 1.:
+                if nbs_vector[l] == 1.0:
                     ###
                     # NEEDS IF STATEMENT BC LOGISTIC AND LINEAR HAVE DIFFERENT COEF_ SHAPES
                     if np.unique(outcome).shape[0] == 2:
-                        param_vector[l] = model.coef_[0,m]
+                        param_vector[l] = model.coef_[0, m]
                     else:
-                        param_vector[l] = model.coef_[m]   
-                    m+=1
+                        param_vector[l] = model.coef_[m]
+                    m += 1
                 else:
                     pass
             X = undo_vectorize(param_vector, num_node=num_node)
-            cv_results.at[i, 'coefficient_matrix'] = X
-            cv_results.at[i, 'coefficient_vector'] = param_vector
+            cv_results.at[i, "coefficient_matrix"] = X
+            cv_results.at[i, "coefficient_vector"] = param_vector
             i += 1
         else:
             pass
         ticks.update()
     # calculate weighted average
-    #print(cv_results['score'])
-    weighted_stack = cv_results.at[0, 'component'] * cv_results.at[0, 'score']
-    #print(weighted_stack.shape)
+    # print(cv_results['score'])
+    weighted_stack = cv_results.at[0, "component"] * cv_results.at[0, "score"]
+    # print(weighted_stack.shape)
     for j in index[1:]:
-        #print(cv_results.at[j, 'score'])
-        if cv_results.at[j, 'score'] > 0:
-            weighted = cv_results.at[j, 'component'] * cv_results.at[j, 'score']
+        # print(cv_results.at[j, 'score'])
+        if cv_results.at[j, "score"] > 0:
+            weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"]
             weighted_stack = np.dstack([weighted_stack, weighted])
         else:
             pass
-        #print(weighted_stack.shape, weighted.shape)
+        # print(weighted_stack.shape, weighted.shape)
     weighted_average = np.mean(weighted_stack, axis=-1)
-    return weighted_average, cv_results
\ No newline at end of file
+    return weighted_average, cv_results
diff --git a/idconn/networking.py b/idconn/networking.py
index f74ee12..c2ddf39 100644
--- a/idconn/networking.py
+++ b/idconn/networking.py
@@ -4,22 +4,25 @@
 import networkx as nx
 import matplotlib.pyplot as plt
 from os.path import join
-#from nilearn.connectome import ConnectivityMeasure
+
+# from nilearn.connectome import ConnectivityMeasure
 from scipy.sparse.csgraph import minimum_spanning_tree
 from scipy.stats import skew
 import bct
-#import datetime
+
+# import datetime
 
 
 def avg_corrmat(ppt_df):
-    '''
+    """
     Reads in adjacency matrices from the pandas df with ppt info and adj, then computes an average.
-    '''
-    stacked_corrmats = np.array(ppt_df['adj'])
-    print('Stacked corrmats have dimensions', stacked_corrmats.shape)
+    """
+    stacked_corrmats = np.array(ppt_df["adj"])
+    print("Stacked corrmats have dimensions", stacked_corrmats.shape)
     avg_corrmat = np.mean(stacked_corrmats, axis=0)
     return avg_corrmat
 
+
 def null_model(W, bin_swaps=5, wei_freq=0.1, seed=None):
     def get_rng(seed):
         if seed is None or seed == np.random:
@@ -29,7 +32,7 @@ def get_rng(seed):
         try:
             rstate = np.random.RandomState(seed)
         except ValueError:
-            rstate = np.random.RandomState(np.random.Random(seed).randint(0, 2 ** 32 - 1))
+            rstate = np.random.RandomState(np.random.Random(seed).randint(0, 2**32 - 1))
         return rstate
 
     def randmio_und_signed(R, itr, seed=None):
@@ -45,7 +48,6 @@ def randmio_und_signed(R, itr, seed=None):
         for it in range(int(itr)):
             att = 0
             while att <= max_attempts:
-
                 a, b, c, d = pick_four_unique_nodes_quickly(n, rng)
 
                 r0_ab = R[a, b]
@@ -59,7 +61,6 @@ def randmio_und_signed(R, itr, seed=None):
                     and np.sign(r0_ad) == np.sign(r0_cb)
                     and np.sign(r0_ab) != np.sign(r0_ad)
                 ):
-
                     R[a, d] = R[d, a] = r0_ab
                     R[a, b] = R[b, a] = r0_ad
 
@@ -80,11 +81,11 @@ def pick_four_unique_nodes_quickly(n, seed=None):
         clever but still substantially slower.
         """
         rng = get_rng(seed)
-        k = rng.randint(n ** 4)
+        k = rng.randint(n**4)
         a = k % n
         b = k // n % n
-        c = k // n ** 2 % n
-        d = k // n ** 3 % n
+        c = k // n**2 % n
+        d = k // n**3 % n
         if a != b and a != c and a != d and b != c and b != d and c != d:
             return (a, b, c, d)
         else:
@@ -134,9 +135,7 @@ def pick_four_unique_nodes_quickly(n, seed=None):
             W0.flat[Lij[Oind]] = s * Wv  # weight at this index
         else:
             wsize = np.size(Wv)
-            wei_period = np.round(1 / wei_freq).astype(
-                int
-            )  # convert frequency to period
+            wei_period = np.round(1 / wei_freq).astype(int)  # convert frequency to period
             lq = np.arange(wsize, 0, -wei_period, dtype=int)
             for m in lq:  # iteratively explore at this period
                 # get indices of Lij that sort P
@@ -170,16 +169,15 @@ def pick_four_unique_nodes_quickly(n, seed=None):
     W0 = W0 + W0.T
     return W0
 
+
 def generate_null(ppt_df, thresh_arr, measure, permutations=1000):
-    '''
+    """
     Generate a distribution of graph measure values based on a null connectivity matrix
     that is like the average connectivity matrix across participants.
-    
-    '''
-    null_dist = pd.DataFrame(index=range(0,permutations), columns=["mean", "sdev"])
-    avg_corr = avg_corrmat(
-        ppt_df
-    )
+
+    """
+    null_dist = pd.DataFrame(index=range(0, permutations), columns=["mean", "sdev"])
+    avg_corr = avg_corrmat(ppt_df)
     eff_perm = []
     j = 0
     while j < permutations:
@@ -193,19 +191,21 @@ def generate_null(ppt_df, thresh_arr, measure, permutations=1000):
         leff_auc = np.trapz(effs_arr, dx=0.03, axis=0)
         eff_perm.append(leff_auc)
         j += 1
-    
+
     return null_dist
 
+
 def omst(matrix, density=True, plot=False):
-    '''
+    """
     WARNING: THIS IS SLOW AF, REPLACING WITH NETWORKX VERSION IN NEAR FUTURE
-    '''
+    """
     dims = matrix.shape
     if matrix.ndim > 2:
-        raise ValueError("'matrix' should be a 2D array. "
-                         "An array with %d dimension%s was passed"
-                         % (matrix.ndim,
-                            "s" if matrix.ndim > 1 else ""))
+        raise ValueError(
+            "'matrix' should be a 2D array. "
+            "An array with %d dimension%s was passed"
+            % (matrix.ndim, "s" if matrix.ndim > 1 else "")
+        )
     else:
         mst = minimum_spanning_tree(matrix)
         mst_arr = mst.toarray().astype(float)
@@ -217,7 +217,7 @@ def omst(matrix, density=True, plot=False):
         Cost = [cost]
 
         while np.sum(matrix_2) > 1000:
-            #print(np.sum(matrix_2))
+            # print(np.sum(matrix_2))
             mst = minimum_spanning_tree(matrix_2)
             mst_arr = mst.toarray().astype(float)
             matrix_2 = np.where(mst_arr != 0, 0, matrix_2)
@@ -231,26 +231,23 @@ def omst(matrix, density=True, plot=False):
         max_GCE = GCE.index(max_value)
         thresholded = np.sum(trees[:max_GCE, :, :], axis=0)
         if plot == True:
-            fig,ax = plt.subplots()
-            sns.lineplot(Cost, GCE, ax=ax, palette='husl')
-            plt.scatter(Cost[max_GCE], 
-                        GCE[max_GCE], 
-                        marker='x', 
-                        edgecolors=None, 
-                        c='magenta')
-            ax.set_ylabel('Global Cost Efficiency')
-            ax.set_xlabel('Cost')
-            
+            fig, ax = plt.subplots()
+            sns.lineplot(Cost, GCE, ax=ax, palette="husl")
+            plt.scatter(Cost[max_GCE], GCE[max_GCE], marker="x", edgecolors=None, c="magenta")
+            ax.set_ylabel("Global Cost Efficiency")
+            ax.set_xlabel("Cost")
+
         if density == True:
             den = np.sum(thresholded != 0) / (dims[0] * dims[1])
             return thresholded, den
     return thresholded, fig
 
+
 def graph_auc(matrix, thresholds, measure, args):
-    '''
+    """
     matrix : array
     measure : function from bctpy
-    '''
+    """
     from bct import measure, threshold_proportional
 
     metrics = []
@@ -258,11 +255,13 @@ def graph_auc(matrix, thresholds, measure, args):
         thresh = threshold_proportional(matrix, p, copy=True)
         metric = measure(thresh, args)
         metrics.append(metric)
-    auc= np.trapz(metrics, dx=0.01)
+    auc = np.trapz(metrics, dx=0.01)
     return auc
 
+
 def graph_omst(matrix, measure, args):
     from bct import measure
+
     # threshold using orthogonal minimum spanning tree
     thresh_mat = omst(matrix)
 
@@ -270,8 +269,9 @@ def graph_omst(matrix, measure, args):
     metric = measure(thresh_mat, args)
     return metric
 
+
 def scale_free_tau(corrmat, skew_thresh, proportional=True):
-    ''''
+    """
     Calculates threshold at which network becomes scale-free, estimated from the skewness of the networks degree distribution.
     Parameters
     ----------
@@ -285,7 +285,7 @@ def scale_free_tau(corrmat, skew_thresh, proportional=True):
     -------
     tau : float
         Lowest vaue of tau (threshold) at which network is scale-free.
-    '''
+    """
     tau = 0.01
     skewness = 1
     while abs(skewness) > 0.3:
@@ -297,8 +297,9 @@ def scale_free_tau(corrmat, skew_thresh, proportional=True):
         tau += 0.01
     return tau
 
+
 def connected_tau(corrmat, proportional=True):
-    '''
+    """
     Calculates threshold at network becomes node connected, using NetworkX's `is_connected` function.
     Parameters
     ----------
@@ -312,7 +313,7 @@ def connected_tau(corrmat, proportional=True):
     -------
     tau : float
         Highest vaue of tau (threshold) at which network becomes node-connected.
-    '''
+    """
     tau = 0.01
     connected = False
     while connected == False:
@@ -323,4 +324,4 @@ def connected_tau(corrmat, proportional=True):
         w_nx = nx.convert_matrix.from_numpy_array(w)
         connected = nx.algorithms.components.is_connected(w_nx)
         tau += 0.01
-    return tau
\ No newline at end of file
+    return tau
diff --git a/idconn/parser_utils.py b/idconn/parser_utils.py
index 792123e..5872ec8 100644
--- a/idconn/parser_utils.py
+++ b/idconn/parser_utils.py
@@ -5,7 +5,7 @@
 def is_valid_file(parser, arg):
     """Check if argument is existing folder."""
     if not op.isfile(arg) and arg is not None:
-        parser.error(f'The file {arg} does not exist!')
+        parser.error(f"The file {arg} does not exist!")
 
     return arg
 
@@ -13,6 +13,6 @@ def is_valid_file(parser, arg):
 def is_valid_path(parser, arg):
     """Check if argument is existing folder."""
     if not op.isdir(arg) and arg is not None:
-        parser.error(f'The folder {arg} does not exist!')
+        parser.error(f"The folder {arg} does not exist!")
 
     return arg
diff --git a/idconn/pipeline.py b/idconn/pipeline.py
index 8c82eea..08b00bb 100644
--- a/idconn/pipeline.py
+++ b/idconn/pipeline.py
@@ -13,122 +13,191 @@
 Please scroll to bottom to read full license.
 """
 import warnings
-warnings.filterwarnings('ignore')
-#import numpy as np
+
+warnings.filterwarnings("ignore")
+# import numpy as np
 import pandas as pd
 import bids
 import argparse
-#import logging
-#from os import makedirs
+
+# import logging
+# from os import makedirs
 from os.path import exists
-#from glob import glob
-#from nilearn import input_data, connectome, plotting, image
+
+# from glob import glob
+# from nilearn import input_data, connectome, plotting, image
 from idconn.connectivity import rest_connectivity, task_connectivity
 from idconn.parser_utils import is_valid_file, is_valid_path
 
-#from idconn.networking import graph_theory, null_distribution
+# from idconn.networking import graph_theory, null_distribution
 
-#LGR = logging.getLogger(__name__)
-#LGR.setLevel(logging.INFO)
+# LGR = logging.getLogger(__name__)
+# LGR.setLevel(logging.INFO)
 
 
 def _get_parser():
-    parser = argparse.ArgumentParser(description='Make correlation matrices from BOLD data + mask.')
+    parser = argparse.ArgumentParser(
+        description="Make correlation matrices from BOLD data + mask."
+    )
     parser.add_argument(
-        'dset_dir',
+        "dset_dir",
         type=lambda x: is_valid_path(parser, x),
-        help='Path to BIDS dataset containing fmriprep derivatives folder.',
+        help="Path to BIDS dataset containing fmriprep derivatives folder.",
     )
     parser.add_argument(
-        'atlas',
+        "atlas",
         type=lambda x: is_valid_file(parser, x),
-        help='Path to atlas file in space specified by `space`.',
+        help="Path to atlas file in space specified by `space`.",
     )
-    parser.add_argument('task', type=str,
-                        help='Task to be analyzed.')
+    parser.add_argument("task", type=str, help="Task to be analyzed.")
     parser.add_argument(
-        '--space',
+        "--space",
         type=str,
-        help='Space in which to run analyses (must be the space `atlas` is in).',
+        help="Space in which to run analyses (must be the space `atlas` is in).",
         default="MNI152NLin2009cAsym",
     )
     parser.add_argument(
-        '--conn',
-        action='store',
-        choices=['covariance', 'correlation', 'partial correlation', 'tangent', 'precision'],
-        help='Metric used to calculate connectivity.',
-        default='correlation',
+        "--conn",
+        action="store",
+        choices=["covariance", "correlation", "partial correlation", "tangent", "precision"],
+        help="Metric used to calculate connectivity.",
+        default="correlation",
     )
     parser.add_argument(
-        '--bids_db',
+        "--bids_db",
         metavar="PATH",
         type=lambda x: is_valid_path(parser, x),
-        help='Path to saved BIDS dataset layout file.',
+        help="Path to saved BIDS dataset layout file.",
     )
     parser.add_argument(
-        '--confounds',
+        "--confounds",
         nargs="+",
         type=str,
-        help='Names of confound regressors from ',
+        help="Names of confound regressors from ",
         default=None,
     )
 
     return parser
 
 
-def idconn_workflow(dset_dir, atlas, task, out_dir, space="MNI152NLin2009cAsym", conn=None, bids_db=None, confounds=None):
-    print('Getting started!')
+def idconn_workflow(
+    dset_dir,
+    atlas,
+    task,
+    out_dir,
+    space="MNI152NLin2009cAsym",
+    conn=None,
+    bids_db=None,
+    confounds=None,
+):
+    print("Getting started!")
 
     if not confounds:
         confounds = [
-            "cosine00", "cosine01", "cosine02",
-            "trans_x", "trans_x_derivative1", "trans_x_power2", "trans_x_derivative1_power2",
-            "trans_y", "trans_y_derivative1", "trans_y_derivative1_power2", "trans_y_power2",
-            "trans_z", "trans_z_derivative1", "trans_z_power2", "trans_z_derivative1_power2",
-            "rot_x", "rot_x_derivative1", "rot_x_power2", "rot_x_derivative1_power2",
-            "rot_y", "rot_y_derivative1", "rot_y_power2", "rot_y_derivative1_power2",
-            "rot_z", "rot_z_derivative1", "rot_z_derivative1_power2", "rot_z_power2",
-            "a_comp_cor_00", "a_comp_cor_01", "a_comp_cor_02", "a_comp_cor_03", "a_comp_cor_04", "a_comp_cor_05", "a_comp_cor_06"
+            "cosine00",
+            "cosine01",
+            "cosine02",
+            "trans_x",
+            "trans_x_derivative1",
+            "trans_x_power2",
+            "trans_x_derivative1_power2",
+            "trans_y",
+            "trans_y_derivative1",
+            "trans_y_derivative1_power2",
+            "trans_y_power2",
+            "trans_z",
+            "trans_z_derivative1",
+            "trans_z_power2",
+            "trans_z_derivative1_power2",
+            "rot_x",
+            "rot_x_derivative1",
+            "rot_x_power2",
+            "rot_x_derivative1_power2",
+            "rot_y",
+            "rot_y_derivative1",
+            "rot_y_power2",
+            "rot_y_derivative1_power2",
+            "rot_z",
+            "rot_z_derivative1",
+            "rot_z_derivative1_power2",
+            "rot_z_power2",
+            "a_comp_cor_00",
+            "a_comp_cor_01",
+            "a_comp_cor_02",
+            "a_comp_cor_03",
+            "a_comp_cor_04",
+            "a_comp_cor_05",
+            "a_comp_cor_06",
         ]
 
     print(f"Atlas: {atlas}\nConnectivity measure: {conn}")
 
-    assert exists(dset_dir), f"Specified dataset doesn't exist:\n{dset_dir} not found.\n\nPlease check the filepath."
+    assert exists(
+        dset_dir
+    ), f"Specified dataset doesn't exist:\n{dset_dir} not found.\n\nPlease check the filepath."
     layout = bids.BIDSLayout(dset_dir, derivatives=True, database_path=bids_db)
-    subjects = layout.get(return_type='id', target='subject', suffix='bold')
+    subjects = layout.get(return_type="id", target="subject", suffix="bold")
     print(f"Subjects: {subjects}")
-    #runs = layout.get(return_type='id', target='session', suffix='bold')
-    preproc_subjects = layout.get(return_type='id', target='subject', task=task, space=space, desc='preproc', suffix='bold')
+    # runs = layout.get(return_type='id', target='session', suffix='bold')
+    preproc_subjects = layout.get(
+        return_type="id", target="subject", task=task, space=space, desc="preproc", suffix="bold"
+    )
     if len(subjects) != len(preproc_subjects):
-        print(f'{len(subjects)} subjects found in dset, only {len(preproc_subjects)} have preprocessed BOLD data. Pipeline is contniuing anyway, please double check preprocessed data if this doesn\'t seem right.')
+        print(
+            f"{len(subjects)} subjects found in dset, only {len(preproc_subjects)} have preprocessed BOLD data. Pipeline is contniuing anyway, please double check preprocessed data if this doesn't seem right."
+        )
 
-    example_events = layout.get(return_type='filename', suffix='events', task=task, subject=preproc_subjects[0])
-    events_df = pd.read_csv(example_events[0], header=0, index_col=0, sep='\t')
-    conditions = events_df['trial_type'].unique()
+    example_events = layout.get(
+        return_type="filename", suffix="events", task=task, subject=preproc_subjects[0]
+    )
+    events_df = pd.read_csv(example_events[0], header=0, index_col=0, sep="\t")
+    conditions = events_df["trial_type"].unique()
 
     print(f"Computing connectivity matrices using {atlas}")
     for subject in preproc_subjects:
         print(f"Subject {subject}")
-        sessions = layout.get(return_type='id', target='session', task=task, subject=subject, suffix='bold')
+        sessions = layout.get(
+            return_type="id", target="session", task=task, subject=subject, suffix="bold"
+        )
         print(f"Sessions with task-{task} found for {subject}: {sessions}")
         for session in sessions:
             print(f"Session {session}")
-            print(f"here are the inputs: {layout, subject, session, task, atlas, conn, space, confounds}")
-            if 'rest' in task:
+            print(
+                f"here are the inputs: {layout, subject, session, task, atlas, conn, space, confounds}"
+            )
+            if "rest" in task:
                 try:
-                    adj_matrix = rest_connectivity(layout, subject, session, task, atlas, conn, space, confounds)
+                    adj_matrix = rest_connectivity(
+                        layout, subject, session, task, atlas, conn, space, confounds
+                    )
                 except Exception as e:
-                    print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}')
+                    print(
+                        f"Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}"
+                    )
             if len(conditions) < 1:
                 try:
-                    adj_matrix = rest_connectivity(layout, subject, session, task, atlas, conn, space, confounds)
+                    adj_matrix = rest_connectivity(
+                        layout, subject, session, task, atlas, conn, space, confounds
+                    )
                 except Exception as e:
-                    print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}')
+                    print(
+                        f"Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}"
+                    )
             else:
                 try:
-                    adj_matrix = task_connectivity(layout=layout, subject=subject, session=session, task=task, atlas=atlas, confounds=confounds, connectivity_metric=conn)
+                    adj_matrix = task_connectivity(
+                        layout=layout,
+                        subject=subject,
+                        session=session,
+                        task=task,
+                        atlas=atlas,
+                        confounds=confounds,
+                        connectivity_metric=conn,
+                    )
                 except Exception as e:
-                    print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}')
+                    print(
+                        f"Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}"
+                    )
 
 
 def _main(argv=None):
@@ -138,7 +207,7 @@ def _main(argv=None):
     idconn_workflow(**vars(options))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     _main()
 
 """
diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index ed6b664..7a1563b 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -23,60 +23,77 @@
 today = datetime.today()
 today_str = strftime("%m_%d_%Y")
 
-TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/ds002674'
-TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset'
-DERIV_NAME = 'IDConn'
-OUTCOME = 'bc'
-CONFOUNDS = 'framewise_displacement'
-TASK = 'rest'
-ATLAS = 'craddock2012'
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "bc"
+CONFOUNDS = "framewise_displacement"
+TASK = "rest"
+ATLAS = "craddock2012"
 alpha = 0.05
-atlas_fname = '/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz'
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
 
 
 layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
 
-dat = io.read_corrmats(layout, task=TASK, deriv_name='IDConn', atlas=ATLAS, z_score=True)
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=True)
 
-keep = dat['adj'].dropna().index
+keep = dat["adj"].dropna().index
 dat = dat.loc[keep]
-#print(dat['adj'].values.shape)
-num_node = dat.iloc[0]['adj'].shape[0]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
 
-matrices = np.vstack(dat['adj'].values).reshape((len(keep), num_node, num_node))
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
 upper_tri = np.triu_indices(num_node, k=1)
 
-outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]),1))
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
 
 if CONFOUNDS is not None:
     confounds = dat[CONFOUNDS]
-    base_name = f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}'
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
 else:
     confounds = None
-    base_name = f'nbs-predict_outcome-{OUTCOME}'
-#print(dat['bc'])
-
-weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=100)
-
-fig,fig2, nimg = io.plot_edges(weighted_average, 
-                         atlas_fname, 
-                         threshold='computed', 
-                         title=f'{OUTCOME} Precition-Weighted Average', 
-                         strength=True, 
-                         cmap='seismic', 
-                         node_size='strength')
-
-fig.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-{today_str}.png'), dpi=400)
-fig2.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}.png'), dpi=400)
-nib.save(nimg, join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}'))
-
-
-avg_df = pd.DataFrame(weighted_average, 
-                      index=range(0,weighted_average.shape[0]),
-                      columns=range(0,weighted_average.shape[1]))
-
-cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_models-{today_str}.tsv'),sep='\t')
-avg_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-{today_str}.tsv'),sep='\t')
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=dat["bc"], n_splits=10, n_iterations=100
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precition-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
 
 
 # this uses the most predictive subnetwork as features in the model
@@ -91,26 +108,28 @@
 nbs_vector = weighted_average[upper_tri]
 p50 = np.percentile(nbs_vector, 50)
 filter = np.where(nbs_vector >= p50, True, False)
-#print(nbs_vector.shape, filter.shape)
+# print(nbs_vector.shape, filter.shape)
 
-#mask = io.vectorize_corrmats(filter)
-edges_train = np.vstack(dat['edge_vector'].dropna().values)
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)
 
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
 if CONFOUNDS is not None:
     confounds_train = dat[CONFOUNDS].values
     outcome_train = np.reshape(outcome, (outcome.shape[0],))
-    #regress out the confounds from each edge and the outcome variable, 
+    # regress out the confounds from each edge and the outcome variable,
     # use the residuals for the rest of the algorithm
-    #print(confounds.shape, outcome.shape)
+    # print(confounds.shape, outcome.shape)
     if len(np.unique(outcome_train)) <= 2:
         resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
         train_outcome = outcome
     elif len(np.unique(outcome_train)) > 3:
-        train_outcome, resid_edges = nbs.residualize(X=edges_train, y=outcome_train, confounds=confounds_train)
-    train_features = resid_edges[:,filter]
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges[:, filter]
 else:
-    train_features = edges_train[:,filter]
+    train_features = edges_train[:, filter]
     train_outcome = outcome
 
 scaler = StandardScaler()
@@ -126,7 +145,7 @@
 # could be extended to the multiclass case?
 
 if len(np.unique(outcome)) == 2:
-    model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.25)
+    model = LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=0.25)
 else:
     model = ElasticNet(l1_ratio=0.25)
 
@@ -135,16 +154,18 @@
 fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
 in_sample_score = fitted.score(X=train_features, y=np.ravel(train_outcome))
 if len(np.unique(outcome)) == 2:
-    train_metrics['accuracy'] = in_sample_score
+    train_metrics["accuracy"] = in_sample_score
 else:
-    train_metrics['coefficient of determination'] = in_sample_score
+    train_metrics["coefficient of determination"] = in_sample_score
 y_pred = fitted.predict(X=train_features)
 mse = mean_squared_error(train_outcome, y_pred)
-train_metrics['mean squared error'] = mse
-print('In-sample prediction score: ', in_sample_score)
-print('In-sample mean squared error: ', mse)
-#print(np.mean(train_features))
-with open(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp:
+train_metrics["mean squared error"] = mse
+print("In-sample prediction score: ", in_sample_score)
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
     json.dump(train_metrics, fp)
 
 # yoink the coefficients? for a more parsimonious figure?
@@ -153,61 +174,74 @@
 for i in range(0, filter.shape[0]):
     if filter[i] == True:
         if len(np.unique(outcome)) == 2:
-            coeff_vec[i] = fitted.coef_[0,j]
+            coeff_vec[i] = fitted.coef_[0, j]
         else:
             coeff_vec[i] = fitted.coef_[j]
         j += 1
     else:
         pass
 
-#print(coeff_vec)
+# print(coeff_vec)
 
 coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
-#print(coef_mat == coef_mat.T)
-
-fig,fig2, nimg = io.plot_edges(coef_mat, 
-                         atlas_fname, 
-                         threshold='computed',
-                         title=f'{OUTCOME} Coefficients', 
-                         strength=True, 
-                         cmap='seismic', 
-                         node_size='strength')
-
-fig.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-{today_str}.png'), dpi=400)
-fig2.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-strength-{today_str}.png'), dpi=400)
-nib.save(nimg, join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-strength-{today_str}'))
+# print(coef_mat == coef_mat.T)
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
 
 
 layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
 
-test_df = io.read_corrmats(layout, task=TASK, deriv_name='IDConn', atlas=ATLAS, z_score=True)
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=True)
 
-keep = test_df[[OUTCOME, 'adj']].dropna().index
-#print(keep)
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
 
 test_df = test_df.loc[keep]
 outcome_test = test_df[OUTCOME].values
-#print(test_df)
+# print(test_df)
 
-#print(outcome_test)
-matrices_test = np.vstack(test_df['adj'].dropna().values).reshape((len(test_df['adj'].dropna().index),num_node,num_node))
-edges_test = np.vstack(test_df['edge_vector'].dropna().values)
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)
 
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
 if CONFOUNDS is not None:
     confounds_test = test_df[CONFOUNDS].values
-    
-    #regress out the confounds from each edge and the outcome variable, 
+
+    # regress out the confounds from each edge and the outcome variable,
     # use the residuals for the rest of the algorithm
-    #print(confounds.shape, outcome.shape)
+    # print(confounds.shape, outcome.shape)
     if len(np.unique(outcome_test)) <= 2:
         resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
         test_outcome = outcome_test
     elif len(np.unique(outcome_test)) > 3:
-        test_outcome, resid_edges = nbs.residualize(X=edges_test, y=outcome_test, confounds=confounds_test)
-    test_features = resid_edges[:,filter]
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges[:, filter]
 else:
-    test_features = edges_test[:,filter]
+    test_features = edges_test[:, filter]
     test_outcome = outcome_test
 
 # scale after residualizing omg
@@ -216,7 +250,7 @@
     pass
 else:
     test_outcome = scaler.fit_transform(test_outcome.reshape(-1, 1))
-#print(test_features.shape)
+# print(test_features.shape)
 # if the model is a logistic regression, i.e. with a binary outcome
 # then score is prediction accuracy
 # if the model is a linear regression, i.e., with a continuous outcome
@@ -224,29 +258,31 @@
 
 # fit trained ElasticNet, initialized via warm_start
 # prob in CV?
-#fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
-#score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
 test_metrics = {}
 y_pred = fitted.predict(X=test_features)
 score = fitted.score(X=test_features, y=np.ravel(test_outcome))
 if len(np.unique(test_outcome)) == 2:
-    test_metrics['accuracy'] = score
+    test_metrics["accuracy"] = score
 else:
-    test_metrics['coefficient of determination'] = score
+    test_metrics["coefficient of determination"] = score
 mse = mean_squared_error(test_outcome, y_pred)
-test_metrics['mean squared error'] = mse
-print('Out-of-sample prediction score:\t', score)
-print('Out-of-sample mean squared error:\t', mse)
-#print(np.mean(test_features))
-#pred_outcome = fitted.predict(test_features)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
 
 
-print(test_outcome, '\n',y_pred)
-#print(pred_outcome)
+print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
 if len(np.unique(test_outcome)) > 2:
     corr = spearmanr(test_outcome, y_pred)
-    print(f'\nSpearman correlation between predicted and actual {OUTCOME}:\t', corr)
-    test_metrics['spearman correlation'] = corr
-with open(join(TEST_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp:
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
     json.dump(test_metrics, fp)
-np.savetxt(join(TEST_DSET, f'{base_name}_predicted-values_fit-{today_str}.txt'), y_pred)
\ No newline at end of file
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/setup.py b/setup.py
index abab8f5..4d7ed83 100644
--- a/setup.py
+++ b/setup.py
@@ -27,14 +27,14 @@
         "numpy",
         "scipy",
         "nilearn",
-        "sklearn",
+        "scikit-learn",
         "pandas",
         "nibabel",
         "bctpy",
         "pybids",
         "networkx",
-        "matplotlib", # necessary until nilearn includes mpl as a dependency
-        "enlighten",  
+        "matplotlib",  # necessary until nilearn includes mpl as a dependency
+        "enlighten",
     ],
     extras_require={
         "doc": [
@@ -46,7 +46,7 @@
             "sphinx-copybutton",
             "sphinx_gallery==0.10.1",
             "sphinxcontrib-bibtex",
-            ],
+        ],
         "tests": [
             "codecov",
             "coverage",
diff --git a/versioneer.py b/versioneer.py
index 2b54540..b9421e4 100644
--- a/versioneer.py
+++ b/versioneer.py
@@ -1136,9 +1136,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
         pieces["distance"] = int(count_out)  # total number of commits
 
     # commit date: see ISO-8601 comment in git_versions_from_keywords()
-    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[
-        0
-    ].strip()
+    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip()
     pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
 
     return pieces
@@ -1238,13 +1236,9 @@ def versions_from_file(filename):
             contents = f.read()
     except EnvironmentError:
         raise NotThisMethod("unable to read _version.py")
-    mo = re.search(
-        r"version_json = '''\n(.*)'''  # END VERSION_JSON", contents, re.M | re.S
-    )
+    mo = re.search(r"version_json = '''\n(.*)'''  # END VERSION_JSON", contents, re.M | re.S)
     if not mo:
-        mo = re.search(
-            r"version_json = '''\r\n(.*)'''  # END VERSION_JSON", contents, re.M | re.S
-        )
+        mo = re.search(r"version_json = '''\r\n(.*)'''  # END VERSION_JSON", contents, re.M | re.S)
     if not mo:
         raise NotThisMethod("no version_json in _version.py")
     return json.loads(mo.group(1))
@@ -1454,9 +1448,7 @@ def get_versions(verbose=False):
     handlers = HANDLERS.get(cfg.VCS)
     assert handlers, "unrecognized VCS '%s'" % cfg.VCS
     verbose = verbose or cfg.verbose
-    assert (
-        cfg.versionfile_source is not None
-    ), "please set versioneer.versionfile_source"
+    assert cfg.versionfile_source is not None, "please set versioneer.versionfile_source"
     assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"
 
     versionfile_abs = os.path.join(root, cfg.versionfile_source)
@@ -1697,9 +1689,7 @@ def make_release_tree(self, base_dir, files):
             # updated value
             target_versionfile = os.path.join(base_dir, cfg.versionfile_source)
             print("UPDATING %s" % target_versionfile)
-            write_to_version_file(
-                target_versionfile, self._versioneer_generated_versions
-            )
+            write_to_version_file(target_versionfile, self._versioneer_generated_versions)
 
     cmds["sdist"] = cmd_sdist
 
@@ -1823,10 +1813,7 @@ def do_setup():
     else:
         print(" 'versioneer.py' already in MANIFEST.in")
     if cfg.versionfile_source not in simple_includes:
-        print(
-            " appending versionfile_source ('%s') to MANIFEST.in"
-            % cfg.versionfile_source
-        )
+        print(" appending versionfile_source ('%s') to MANIFEST.in" % cfg.versionfile_source)
         with open(manifest_in, "a") as f:
             f.write("include %s\n" % cfg.versionfile_source)
     else:

From a2cdbd4fed72b3fc52cceba416e8e92bee1db153 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 18 Apr 2023 11:19:49 -0700
Subject: [PATCH 38/48] add param tuning to nbs, plot true vs pred

---
 idconn/nbs.py                   |  65 +++++++++++++-----
 idconn/workflows/nbs_predict.py | 115 ++++++++++++++++++++++++++++----
 2 files changed, 150 insertions(+), 30 deletions(-)

diff --git a/idconn/nbs.py b/idconn/nbs.py
index facf96f..c7bfceb 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -7,17 +7,15 @@
 import enlighten
 
 # import bct
-
+from sklearn.experimental import enable_halving_search_cv
 from sklearn.model_selection import (
     RepeatedStratifiedKFold,
     RepeatedKFold,
-    GridSearchCV,
-    StratifiedKFold,
-    KFold,
+    HalvingGridSearchCV
 )
 
 from sklearn.feature_selection import f_regression, f_classif
-from sklearn.linear_model import LogisticRegression, ElasticNet
+from sklearn.linear_model import LogisticRegression, ElasticNet, LogisticRegressionCV, ElasticNetCV
 from sklearn.preprocessing import StandardScaler
 
 from sklearn.metrics import mean_squared_error
@@ -319,12 +317,22 @@ def kfold_nbs(
         cv_results.at[i, "split"] = (train_idx, test_idx)
 
         # assert len(train_a_idx) == len(train_b_idx)
+        l1_ratio_grid = [0.2, 0.4, 0.6, 0.8]
         if np.unique(outcome).shape[0] == 2:
-            regressor = LogisticRegression(
-                l1_ratio=0.25, max_iter=1000, penalty="elasticnet", solver="saga"
+            regressor = LogisticRegressionCV(
+                l1_ratio=l1_ratio_grid, 
+                max_iter=100000, 
+                penalty="elasticnet", 
+                solver="saga", 
+                n_jobs=4
             )
+            
         else:
-            regressor = ElasticNet(l1_ratio=0.25, max_iter=1000)
+            regressor = ElasticNetCV(
+                l1_ratio=l1_ratio_grid, 
+                cv=4, 
+                n_jobs=4
+                )
 
         train_y = outcome[train_idx]
         test_y = outcome[test_idx]
@@ -374,6 +382,7 @@ def kfold_nbs(
             # so you don't have repeated edges
             # returns (n_edges, )
             nbs_vector = adj.values[upper_tri]
+            #print(nbs_vector.shape)
             # print(nbs_vector.shape)
             # use those to make a "significant edges" mask
             mask = nbs_vector == 1.0
@@ -385,12 +394,31 @@ def kfold_nbs(
             # returns (n_edges, samples)
             train_features = train_edges.T[mask]
             test_features = test_edges.T[mask]
+            #print(mask.shape, np.sum(mask), train_edges.shape, train_features.shape)
+
+            train_features = train_features.T
+            test_features = test_features.T
+            
+            #train_features = scaler.fit_transform(train_features.T)
+            #test_features = scaler.fit_transform(test_features.T)
+            #print(train_features.shape, train_y.shape)
 
-            train_features = scaler.fit_transform(train_features.T)
-            test_features = scaler.fit_transform(test_features.T)
+            #print(f"train_edges:\t{train_edges[:10, 0]}\ntrain_features:\t{train_features[:10, 0]}")
             # print(np.ravel(train_y))
             # train model predicting outcome from brain (note: no mas covariates)
+            # use grid search bc I want to know how to tune alpha and l1_ratio
+            
+            #grid = HalvingGridSearchCV(estimator=regressor, 
+            #                           param_grid=param_grid, 
+            #                           n_jobs=8, 
+            #                           cv=4, 
+            #                           factor=2,
+            #                           verbose=0,
+            #                           min_resources=20, 
+            #                           refit=True, 
+            #                           aggressive_elimination=False)
             model = regressor.fit(X=train_features, y=np.ravel(train_y))
+            
             cv_results.at[i, "model"] = model
 
             # score that model on the testing data
@@ -399,7 +427,11 @@ def kfold_nbs(
             # both from 0 (low) to 1 (high)
             score = model.score(X=test_features, y=np.ravel(test_y))
             cv_results.at[i, "score"] = score
-            # print(model.coef_.shape)
+            if i % (n_splits * n_iterations / 10) == 0:
+                mean = cv_results['score'].mean()
+                sdev = cv_results['score'].std()
+                print(f'Iteration {i} out of {n_splits * n_iterations}, average score:\t{mean:.2f} +/- {sdev:.2f}')
+            #print(score)
 
             m = 0
             param_vector = np.zeros_like(nbs_vector)
@@ -427,11 +459,10 @@ def kfold_nbs(
     # print(weighted_stack.shape)
     for j in index[1:]:
         # print(cv_results.at[j, 'score'])
-        if cv_results.at[j, "score"] > 0:
-            weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"]
-            weighted_stack = np.dstack([weighted_stack, weighted])
-        else:
-            pass
+        weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"]
+        weighted_stack = np.dstack([weighted_stack, weighted])
+
         # print(weighted_stack.shape, weighted.shape)
     weighted_average = np.mean(weighted_stack, axis=-1)
-    return weighted_average, cv_results
+    #model = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+    return weighted_average, cv_results, #model
diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 7a1563b..233c284 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 import pandas as pd
 import numpy as np
-import pingouin as pg
 import nibabel as nib
+import seaborn as sns
 import bids
+import matplotlib.pyplot as plt
 from os.path import join
 from datetime import datetime
 from time import strftime
@@ -14,6 +15,9 @@
 from sklearn.linear_model import LogisticRegression, ElasticNet
 from sklearn.preprocessing import StandardScaler
 from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
 
 import warnings
 import json
@@ -64,7 +68,7 @@
     weighted_average,
     atlas_fname,
     threshold="computed",
-    title=f"{OUTCOME} Precition-Weighted Average",
+    title=f"{OUTCOME} Precision-Weighted Average",
     strength=True,
     cmap="seismic",
     node_size="strength",
@@ -95,6 +99,7 @@
     join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
 )
 
+best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
 
 # this uses the most predictive subnetwork as features in the model
 # might replace with thresholded weighted_average
@@ -105,9 +110,10 @@
 # either way, I don't think cv_results is necessary
 
 # here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
 nbs_vector = weighted_average[upper_tri]
-p50 = np.percentile(nbs_vector, 50)
-filter = np.where(nbs_vector >= p50, True, False)
+p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector >= p75, True, False)
 # print(nbs_vector.shape, filter.shape)
 
 # mask = io.vectorize_corrmats(filter)
@@ -127,9 +133,9 @@
         train_outcome, resid_edges = nbs.residualize(
             X=edges_train, y=outcome_train, confounds=confounds_train
         )
-    train_features = resid_edges[:, filter]
+    train_features = resid_edges[:,filter]
 else:
-    train_features = edges_train[:, filter]
+    train_features = edges_train[:,filter]
     train_outcome = outcome
 
 scaler = StandardScaler()
@@ -145,10 +151,18 @@
 # could be extended to the multiclass case?
 
 if len(np.unique(outcome)) == 2:
-    model = LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=0.25)
+    model = LogisticRegression(
+        penalty="elasticnet", 
+        solver="saga", 
+        l1_ratio=best.l1_ratio_
+        )
 else:
-    model = ElasticNet(l1_ratio=0.25)
-
+    model = ElasticNet(
+        l1_ratio=best.l1_ratio_, 
+        alpha=best.alpha_
+        )
+#print(params)
+#model.set_params(**params)
 # train ElasticNet on full train dataset, using feature extraction from NBS-Predict
 train_metrics = {}
 fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
@@ -158,6 +172,35 @@
 else:
     train_metrics["coefficient of determination"] = in_sample_score
 y_pred = fitted.predict(X=train_features)
+dat[f'{OUTCOME}_pred'] = y_pred
+dat[f'{OUTCOME}_scaled'] = train_outcome
+
+Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
+Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+train_colors = ['#a08ad1', #light
+                '#685690', #medium
+                '#3f2d69' #dark
+                ]
+light_cmap = sns.color_palette('dark:#a08ad1')
+dark_cmap = sns.color_palette('dark:#685690')
+
+fig,ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day', 
+                    y=f'{OUTCOME}_pred', 
+                    style='bc', 
+                    data=Ys,  
+                    ax=ax, 
+                    palette=dark_cmap)
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled', 
+                    style='bc', 
+                    data=Ys, 
+                    ax=ax, 
+                    palette=light_cmap)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
 mse = mean_squared_error(train_outcome, y_pred)
 train_metrics["mean squared error"] = mse
 print("In-sample prediction score: ", in_sample_score)
@@ -184,7 +227,8 @@
 # print(coeff_vec)
 
 coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
-# print(coef_mat == coef_mat.T)
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
 
 fig, fig2, nimg = io.plot_edges(
     coef_mat,
@@ -216,6 +260,7 @@
 # print(keep)
 
 test_df = test_df.loc[keep]
+
 outcome_test = test_df[OUTCOME].values
 # print(test_df)
 
@@ -273,9 +318,53 @@
 print("Out-of-sample mean squared error:\t", mse)
 # print(np.mean(test_features))
 # pred_outcome = fitted.predict(test_features)
-
-
-print(test_outcome, "\n", y_pred)
+test_df[f'{OUTCOME}_scaled'] = test_outcome
+test_df[f'{OUTCOME}_pred'] = y_pred
+Ys = test_df[[f'{OUTCOME}_scaled', 
+              f'{OUTCOME}_pred',
+              'cycle_day', 
+              'bc']]
+Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+Ys['ppts'] = Ys.index.get_level_values(0)
+
+
+light_colors = ['#33ACE3', #Bubbles
+                '#EA6964', #Blossom
+                '#4AB62C' #Buttercup
+                ]
+dark_colors = ['#1278a6', 
+               '#a11510', 
+               '#228208']
+light = ListedColormap(light_colors, name='light_powderpuff')
+dark = ListedColormap(dark_colors, name='dark_powderpuff')
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig,ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day', 
+                    y=f'{OUTCOME}_pred', 
+                    style='bc', 
+                    data=Ys, 
+                    hue='ppts',  
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax, 
+                    palette='light_powderpuff'
+                    )
+h = sns.scatterplot(x='cycle_day',
+                     y=f'{OUTCOME}_scaled', 
+                     style='bc', 
+                     data=Ys, 
+                     hue='ppts',
+                     hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], 
+                     ax=ax, 
+                     palette='dark_powderpuff')
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
+fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+
+
+#print(test_outcome, "\n", y_pred)
 # print(pred_outcome)
 if len(np.unique(test_outcome)) > 2:
     corr = spearmanr(test_outcome, y_pred)

From 6f1dd8b5110eca872252bc17a1214d9df8d6493d Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Tue, 18 Apr 2023 11:24:31 -0700
Subject: [PATCH 39/48] add contributor guidelines

---
 CONTRIBUTING.md | 125 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e69de29..11ce204 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,125 @@
+# Contributing to IDConn
+
+Welcome to the ``IDConn`` repository!
+We're excited you're here and want to contribute.
+
+These guidelines are designed to make it as easy as possible to get involved.
+If you have any questions that aren't discussed below, please let us know by opening an [issue][link_issues]!
+
+Before you start you'll need to set up a free [GitHub][link_github] account and sign in.
+Here are some [instructions][link_signupinstructions].
+
+## Governance
+
+Governance is a hugely important part of any project.
+It is especially important to have clear processes and communication channels for open source projects that rely on a distributed network of volunteers, such as ``IDConn``.
+
+``IDConn`` is currently supported by a small group of core developers.
+Even with only a couple of individuals involved in decision making processes, we've found that setting expectations and communicating a shared vision has great value.
+
+By starting the governance structure early in our development, we hope to welcome more people into the contributing team.
+We are committed to continuing to update the governance structures as necessary.
+Every member of the ``IDConn`` community is encouraged to comment on these processes and suggest improvements.
+
+As the first project leader, Katie Bottenhorn is ultimately responsible for any major decisions pertaining to ``IDConn`` development.
+However, all potential changes are explicitly and openly discussed in the described channels of communication, and we strive for consensus amongst all community members.
+
+## Code of conduct
+
+All ``IDConn`` community members are expected to follow our [code of conduct](https://github.com/62442katieb/IDConn/blob/main/CODE_OF_CONDUCT.md) during any interaction with the project.
+That includes, but is not limited to, online conversations, in-person workshops or development sprints, and talks about the software.
+
+As stated in the code, severe or repeated violations by community members may result in exclusion from collective decision-making and rejection of future contributions to the ``IDConn`` project.
+
+## Asking questions about using IDConn
+
+Please direct usage-related questions to [NeuroStars][link_neurostars], using the "Software Support" category and the "IDConn" tag.
+The ``IDConn`` developers follow NeuroStars, and will be able to answer your question there.
+
+## Labels
+
+The current list of labels is [here][link_labels] and includes:
+
+* [![Good First Issue](https://img.shields.io/badge/-good%20first%20issue-7057ff.svg)](https://github.com/62442katieb/IDConn/labels/good%20first%20issue)
+*These issues contain a task that a member of the team has determined should require minimal knowledge of the existing codebase, and should be good for people new to the project.*
+If you are interested in contributing to IDConn, but aren't sure where to start, we encourage you to take a look at these issues in particular.
+
+* [![Help Wanted](https://img.shields.io/badge/-help%20wanted-33aa3f.svg)](https://github.com/62442katieb/IDConn/labels/help%20wanted)
+*These issues contain a task that a member of the team has determined we need additional help with.*
+If you feel that you can contribute to one of these issues, we especially encourage you to do so!
+
+* [![Bug](https://img.shields.io/badge/-bug-ee0701.svg)](https://github.com/62442katieb/IDConn/labels/bug)
+*These issues point to problems in the project.*
+If you find a new bug, please give as much detail as possible in your issue, including steps to recreate the error.
+If you experience the same bug as one already listed, please add any additional information that you have as a comment.
+
+* [![Enhancement](https://img.shields.io/badge/-enhancement-84b6eb.svg)](https://github.com/62442katieb/IDConn/labels/enhancement)
+*These issues are asking for new features to be added to the project.*
+Please try to make sure that your requested feature is distinct from any others that have already been requested or implemented.
+If you find one that's similar but subtly different, please reference the other request in your issue.
+
+## Making a change
+
+We appreciate all contributions to IDConn, but those accepted fastest will follow a workflow similar to the following:
+
+**1. Comment on an existing issue or open a new issue referencing your addition.**
+
+This allows other members of the IDConn development team to confirm that you aren't overlapping with work that's currently underway and that everyone is on the same page with the goal of the work you're going to carry out.
+
+[This blog][link_pushpullblog] is a nice explanation of why putting this work in up front is so useful to everyone involved.
+
+**2. Fork IDConn.**
+
+[Fork][link_fork] the [IDConn repository][link_idconn] to your profile.
+
+This is now your own unique copy of IDConn.
+Changes here won't affect anyone else's work, so it's a safe space to explore edits to the code!
+
+Make sure to [keep your fork up to date][link_updateupstreamwiki] with the main repository.
+
+**3. Make the changes you've discussed.**
+
+Try to keep the changes focused. We've found that working on a [new branch][link_branches] makes it easier to keep your changes targeted.
+
+When you're creating your pull request, please do your best to follow IDConn's preferred style conventions.
+Namely, documentation should follow the [numpydoc](https://numpydoc.readthedocs.io/en/latest/) convention and code should adhere to [PEP8](https://www.python.org/dev/peps/pep-0008/) as much as possible.
+
+**4. Submit a pull request.**
+
+Submit a [pull request][link_pullrequest].
+
+A member of the development team will review your changes to confirm that they can be merged into the main codebase.
+
+Please use a sentence-case title for the pull request, and do not include any prefixes (e.g., ``[ENH]``), as we now use labels to distinguish pull request types.
+The title should summarize the changes proposed in the pull request, with an emphasis on readability, as pull request titles are used directly in our release notes.
+
+## Recognizing contributions
+
+We welcome and recognize all contributions from documentation to testing to code development.
+You can see a list of current contributors in our [zenodo][link_zenodo] file.
+If you are new to the project, don't forget to add your name and affiliation there!
+
+## Thank you!
+
+You're awesome.
+
+> **Note:**
+> These guidelines are based on contributing guidelines from the [STEMMRoleModels][link_stemmrolemodels] project.
+
+[link_github]: https://github.com/
+[link_idconn]: https://github.com/62442katieb/IDConn
+[link_signupinstructions]: https://help.github.com/articles/signing-up-for-a-new-github-account
+[link_react]: https://github.com/blog/2119-add-reactions-to-pull-requests-issues-and-comments
+[link_issues]: https://github.com/62442katieb/IDConn/issues
+[link_labels]: https://github.com/62442katieb/IDConn/labels
+[link_discussingissues]: https://help.github.com/articles/discussing-projects-in-issues-and-pull-requests
+[link_neurostars]: https://neurostars.org
+
+
+[link_pullrequest]: https://help.github.com/articles/creating-a-pull-request/
+[link_fork]: https://help.github.com/articles/fork-a-repo/
+[link_pushpullblog]: https://www.igvita.com/2011/12/19/dont-push-your-pull-requests/
+[link_branches]: https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/
+[link_updateupstreamwiki]: https://help.github.com/articles/syncing-a-fork/
+[link_stemmrolemodels]: https://github.com/KirstieJane/STEMMRoleModels
+[link_zenodo]: https://github.com/62442katieb/IDConn/blob/main/.zenodo.json

From 55d4989e66ef6c43fc25d7081da12487a9a09ffb Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Sun, 3 Sep 2023 20:29:17 -0700
Subject: [PATCH 40/48] used for Flux aim2, corr score

---
 idconn/nbs.py | 116 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 79 insertions(+), 37 deletions(-)

diff --git a/idconn/nbs.py b/idconn/nbs.py
index c7bfceb..3e2b48f 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -15,10 +15,11 @@
 )
 
 from sklearn.feature_selection import f_regression, f_classif
-from sklearn.linear_model import LogisticRegression, ElasticNet, LogisticRegressionCV, ElasticNetCV
-from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LogisticRegression, ElasticNet, LogisticRegressionCV, RidgeCV
+from sklearn.preprocessing import Normalizer
 
-from sklearn.metrics import mean_squared_error
+from sklearn.metrics import mean_squared_log_error, adjusted_mutual_info_score
+from scipy.stats import spearmanr
 
 
 def calc_number_of_nodes(matrices):
@@ -38,6 +39,9 @@ def calc_number_of_nodes(matrices):
 
 
 def residualize(X, y=None, confounds=None):
+    '''
+    all inputs need to be arrays, not dataframes
+    '''
     # residualize the outcome
     if confounds is not None:
         if y is not None:
@@ -70,7 +74,7 @@ def residualize(X, y=None, confounds=None):
         print("Confound matrix wasn't provided, so no confounding was done")
 
 
-def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
+def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=False, permutations=10000):
     """
     Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided
     of shape ((subject x session)x node x node)
@@ -120,11 +124,12 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
 
     # turn matrices into vectorized upper triangles
     if ndims > 2:
-        edges = vectorize_corrmats(matrices)
+        edges = vectorize_corrmats(matrices, diagonal=diagonal)
     else:
         edges = matrices.copy()
     # print(edges.shape)
 
+
     # edges = edges.T
 
     # run an ols per edge
@@ -140,12 +145,14 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
 
     # find largest connected component of sig_edges
     # turn sig_edges into an nxn matrix first
-    sig_matrix = undo_vectorize(sig_edges)  # need to write this function
+    sig_matrix = undo_vectorize(sig_edges, num_node=num_node, diagonal=diagonal)  # need to write this function
     matrix = nx.from_numpy_array(sig_matrix)
 
     # use networkX to find connected components
-    largest_cc = max(nx.connected_components(matrix), key=len)
-    G0 = matrix.subgraph(largest_cc)
+    S = [matrix.subgraph(c).copy() for c in nx.connected_components(matrix)]
+    S.sort(key=len, reverse=True)
+    #largest_cc = max(nx.connected_components(matrix), key=len)
+    G0 = S[0]
     # print(G0)
 
     # retain size of largest connected component
@@ -195,7 +202,7 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
             # print(np.sum(perm_edges))
             # find largest connected component of sig_edges
             # turn sig_edges into an nxn matrix first
-            perm_matrix = undo_vectorize(perm_edges)  # need to write this function
+            perm_matrix = undo_vectorize(perm_edges, num_node=num_node, diagonal=diagonal)  # need to write this function
             perm_nx = nx.from_numpy_array(perm_matrix)
 
             largest_cc = max(nx.connected_components(perm_nx), key=len)
@@ -226,7 +233,7 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000):
 
 
 def kfold_nbs(
-    matrices, outcome, confounds=None, alpha=0.05, groups=None, n_splits=10, n_iterations=10
+    matrices, outcome, confounds=None, alpha=0.05, groups=None, num_node=None, diagonal=False, n_splits=10, n_iterations=10
 ):
     """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided
     of shape ((subject x session)x node x node)
@@ -240,9 +247,9 @@ def kfold_nbs(
     array of vectorized upper triangles of those correlation mat
     Parameters
     ----------
-    matrices : numpy array of shape (p, n, n)
+    matrices : numpy array of shape (p, n, n) or (p, (n^2 / 2)- n)
         Represents the link strengths of the graphs. Assumed to be
-        an array of symmetric matrices.
+        an array of symmetric matrices or a vectorized triangle thereof.
     outcome : list-like of shape (p,)
         Y-value to be predicted with connectivity
     confounds : list-like
@@ -270,7 +277,15 @@ def kfold_nbs(
         Includes the results of each cross-validation loop
         (e.g., predictive performance, data split, largest connected component per fold per iteration).
     """
-    edges = vectorize_corrmats(matrices)
+    ndims = len(matrices.shape)
+
+    # vectorize_corrmats returns p x n^2
+
+    # turn matrices into vectorized upper triangles
+    if ndims > 2:
+        edges = vectorize_corrmats(matrices)
+    else:
+        edges = matrices.copy()
     # print(edges.shape)
     # print(edges.shape)
     index = list(range(0, n_splits * n_iterations))
@@ -282,8 +297,6 @@ def kfold_nbs(
             #'pval',
             "score",
             "component",
-            "coefficient_matrix",
-            "coefficient_vector",
             "model",
         ],
     )
@@ -295,7 +308,10 @@ def kfold_nbs(
         cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_iterations)
         split_y = outcome
 
-    num_node = calc_number_of_nodes(matrices)
+    if num_node is None:
+        num_node = calc_number_of_nodes(matrices)
+    else:
+        pass
     # print(num_node)
     # if matrices.shape[0] != matrices.shape[1]:
     #    if matrices.shape[1] == matrices.shape[2]:
@@ -307,31 +323,41 @@ def kfold_nbs(
     #'or node x node x (subject x session).')
     # else:
     #    num_node = matrices.shape[0]
-    upper_tri = np.triu_indices(num_node, k=1)
+    if diagonal == True:
+        k = 0
+    if diagonal == False:
+        k=1
+    upper_tri = np.triu_indices(num_node, k=k)
 
     i = 0
     manager = enlighten.get_manager()
     ticks = manager.counter(total=n_splits * n_iterations, desc="Progress", unit="folds")
     for train_idx, test_idx in cv.split(edges, split_y):
-        scaler = StandardScaler()
+        x_scaler = Normalizer()
+        y_scaler = Normalizer()
         cv_results.at[i, "split"] = (train_idx, test_idx)
 
         # assert len(train_a_idx) == len(train_b_idx)
-        l1_ratio_grid = [0.2, 0.4, 0.6, 0.8]
+        Cs = np.logspace(-4, 4, 10)
+        #print(len(np.unique(outcome)))
         if np.unique(outcome).shape[0] == 2:
+            #print('binary')
             regressor = LogisticRegressionCV(
-                l1_ratio=l1_ratio_grid, 
+                Cs=Cs, 
+                cv=4,
+                #verbose=2,
                 max_iter=100000, 
-                penalty="elasticnet", 
+                penalty="l2", 
                 solver="saga", 
                 n_jobs=4
             )
             
         else:
-            regressor = ElasticNetCV(
-                l1_ratio=l1_ratio_grid, 
+            #print('continuous')
+            regressor = RidgeCV(
+                alphas=Cs, 
                 cv=4, 
-                n_jobs=4
+                #n_jobs=4
                 )
 
         train_y = outcome[train_idx]
@@ -357,20 +383,20 @@ def kfold_nbs(
         else:
             pass
 
-        train_edges = scaler.fit_transform(train_edges)
-        test_edges = scaler.fit_transform(test_edges)
+        train_edges = x_scaler.fit_transform(train_edges)
+        test_edges = x_scaler.transform(test_edges)
 
         if np.unique(outcome).shape[0] == 2:
             pass
         else:
-            train_y = scaler.fit_transform(train_y.reshape(-1, 1))
-            test_y = scaler.fit_transform(test_y.reshape(-1, 1))
+            train_y = y_scaler.fit_transform(train_y.reshape(-1, 1))
+            test_y = y_scaler.transform(test_y.reshape(-1, 1))
 
         # perform NBS wooooooooo
         # note: output is a dataframe :)
         # PYNBS SHOULD NOT DO CONFOUND REGRESSION?
-        adj = pynbs(train_edges, train_y, alpha, predict=True)
-        # print(adj.shape, adj.ndim, adj[0].shape, upper_tri)
+        adj = pynbs(train_edges, train_y, num_node=num_node, diagonal=diagonal, alpha=alpha, predict=True)
+        #print(adj.shape, adj.ndim, adj[0].shape, upper_tri)
 
         # cv_results.at[i, 'pval'] = pval
         cv_results.at[i, "component"] = adj.values
@@ -425,7 +451,18 @@ def kfold_nbs(
             # if logistic regression: score = mean accuracy
             # if linear regression: score = coefficient of determination (R^2)
             # both from 0 (low) to 1 (high)
-            score = model.score(X=test_features, y=np.ravel(test_y))
+
+            # can't use MSE, which is the default score for ridge
+            # because larger values = worse performance
+            # I go die now
+            if np.unique(outcome).shape[0] == 2:
+                score = model.score(X=test_features, y=np.ravel(test_y))
+                
+            else:
+                predicted_y = model.predict(X=test_features)
+                score,p = spearmanr(predicted_y, np.ravel(test_y))
+                #spearman = spearmanr(predicted_y, np.ravel(test_y))
+            
             cv_results.at[i, "score"] = score
             if i % (n_splits * n_iterations / 10) == 0:
                 mean = cv_results['score'].mean()
@@ -446,21 +483,26 @@ def kfold_nbs(
                     m += 1
                 else:
                     pass
-            X = undo_vectorize(param_vector, num_node=num_node)
-            cv_results.at[i, "coefficient_matrix"] = X
-            cv_results.at[i, "coefficient_vector"] = param_vector
+            X = undo_vectorize(param_vector, num_node=num_node, diagonal=diagonal)
+            #cv_results.at[i, "coefficient_matrix"] = X
+            #cv_results.at[i, "coefficient_vector"] = param_vector
             i += 1
         else:
             pass
         ticks.update()
     # calculate weighted average
     # print(cv_results['score'])
-    weighted_stack = cv_results.at[0, "component"] * cv_results.at[0, "score"]
+    weighted_stack = np.zeros((num_node,num_node))
+    fake = np.zeros((num_node,num_node))
     # print(weighted_stack.shape)
-    for j in index[1:]:
+    for j in index:
         # print(cv_results.at[j, 'score'])
         weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"]
-        weighted_stack = np.dstack([weighted_stack, weighted])
+        
+        if np.sum(weighted) == 0 or np.isnan(np.sum(weighted)) == True:
+            weighted_stack = np.dstack([weighted_stack, fake])
+        else:
+            weighted_stack = np.dstack([weighted_stack, weighted])
 
         # print(weighted_stack.shape, weighted.shape)
     weighted_average = np.mean(weighted_stack, axis=-1)

From 7df266a8f095c5035d07fcbcfeeb063b0e460295 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 2 Nov 2023 10:12:29 -0700
Subject: [PATCH 41/48] checkpoint before integrating neurocombat

---
 idconn/workflows/nbs_predict.py | 126 +++++++++++++++++---------------
 1 file changed, 68 insertions(+), 58 deletions(-)

diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 233c284..508aa74 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -10,10 +10,12 @@
 from time import strftime
 from scipy.stats import spearmanr
 from idconn import nbs, io
+from bct import threshold_proportional
 
 
-from sklearn.linear_model import LogisticRegression, ElasticNet
-from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
 from sklearn.metrics import mean_squared_error
 from matplotlib.colors import ListedColormap
 import matplotlib as mpl
@@ -34,16 +36,18 @@
 CONFOUNDS = "framewise_displacement"
 TASK = "rest"
 ATLAS = "craddock2012"
+THRESH = 0.5
 alpha = 0.05
 atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
 
 
 layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
 
-dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=True)
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
 
 keep = dat["adj"].dropna().index
 dat = dat.loc[keep]
+
 # print(dat['adj'].values.shape)
 num_node = dat.iloc[0]["adj"].shape[0]
 
@@ -51,6 +55,7 @@
 upper_tri = np.triu_indices(num_node, k=1)
 
 outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+groups = dat['bc']
 
 if CONFOUNDS is not None:
     confounds = dat[CONFOUNDS]
@@ -61,7 +66,7 @@
 # print(dat['bc'])
 
 weighted_average, cv_results = nbs.kfold_nbs(
-    matrices, outcome, confounds, alpha, groups=dat["bc"], n_splits=10, n_iterations=100
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000
 )
 
 fig, fig2, nimg = io.plot_edges(
@@ -111,13 +116,17 @@
 
 # here is where we'd threshold the weighted average to use for elastic-net
 weighted_average = np.where(weighted_average > 0, weighted_average, 0)
-nbs_vector = weighted_average[upper_tri]
-p75 = np.percentile(nbs_vector, 75)
-filter = np.where(nbs_vector >= p75, True, False)
+#nbs_vector = weighted_average[upper_tri]
+#p75 = np.percentile(nbs_vector, 75)
+#filter = np.where(nbs_vector >= p75, True, False)
 # print(nbs_vector.shape, filter.shape)
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+#p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
 
 # mask = io.vectorize_corrmats(filter)
-edges_train = np.vstack(dat["edge_vector"].dropna().values)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
 
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
 if CONFOUNDS is not None:
@@ -133,49 +142,65 @@
         train_outcome, resid_edges = nbs.residualize(
             X=edges_train, y=outcome_train, confounds=confounds_train
         )
-    train_features = resid_edges[:,filter]
+    train_features = resid_edges
 else:
-    train_features = edges_train[:,filter]
+    train_features = edges_train
     train_outcome = outcome
 
-scaler = StandardScaler()
-train_features = scaler.fit_transform(train_features)
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
 if len(np.unique(train_outcome)) <= 2:
     pass
 else:
-    train_outcome = scaler.fit_transform(train_outcome.reshape(-1, 1))
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
 
 # run the model on the whole test dataset to get params
 
 # classification if the outcome is binary (for now)
 # could be extended to the multiclass case?
 
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+train_metrics = {}
 if len(np.unique(outcome)) == 2:
     model = LogisticRegression(
-        penalty="elasticnet", 
+        penalty="l2", 
         solver="saga", 
-        l1_ratio=best.l1_ratio_
+        C=best.C_[0]
         )
+    train_metrics["alpha"] = best.C_[0]
+    #train_metrics["l1_ratio"] = best.l1_ratio_
 else:
-    model = ElasticNet(
-        l1_ratio=best.l1_ratio_, 
+    model = Ridge(
+        solver="saga",  
         alpha=best.alpha_
         )
+    train_metrics["alpha"] = best.alpha_
+    #train_metrics["l1_ratio"] = best.l1_ratio_
 #print(params)
 #model.set_params(**params)
 # train ElasticNet on full train dataset, using feature extraction from NBS-Predict
-train_metrics = {}
-fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
-in_sample_score = fitted.score(X=train_features, y=np.ravel(train_outcome))
-if len(np.unique(outcome)) == 2:
-    train_metrics["accuracy"] = in_sample_score
-else:
-    train_metrics["coefficient of determination"] = in_sample_score
+
+scores = cross_validate(
+    model, 
+    train_features, 
+    train_outcome, 
+    groups=groups, 
+    cv=cv,
+    return_estimator=True, 
+    return_train_score=True
+    )
+train_metrics["in_sample_test"] = np.mean(scores['test_score'])
+train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+
+fitted = scores['estimator'][0]
 y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
 dat[f'{OUTCOME}_pred'] = y_pred
 dat[f'{OUTCOME}_scaled'] = train_outcome
 
-Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
+Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled']]
 Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
 
 train_colors = ['#a08ad1', #light
@@ -186,25 +211,20 @@
 dark_cmap = sns.color_palette('dark:#685690')
 
 fig,ax = plt.subplots()
-g = sns.scatterplot(x='cycle_day', 
+g = sns.scatterplot(x=f'{OUTCOME}_scaled',
                     y=f'{OUTCOME}_pred', 
-                    style='bc', 
+                    #style='bc', 
                     data=Ys,  
                     ax=ax, 
                     palette=dark_cmap)
-h = sns.scatterplot(x='cycle_day',
-                    y=f'{OUTCOME}_scaled', 
-                    style='bc', 
-                    data=Ys, 
-                    ax=ax, 
-                    palette=light_cmap)
-ax.legend(bbox_to_anchor=(1.0, 0.5))
+#ax.legend(bbox_to_anchor=(1.0, 0.5))
 fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
 
 mse = mean_squared_error(train_outcome, y_pred)
 train_metrics["mean squared error"] = mse
-print("In-sample prediction score: ", in_sample_score)
+print("In-sample prediction score: ", train_metrics["in_sample_test"])
 print("In-sample mean squared error: ", mse)
+train_metrics["in_sample_mse"] = mse
 # print(np.mean(train_features))
 with open(
     join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
@@ -216,10 +236,8 @@
 j = 0
 for i in range(0, filter.shape[0]):
     if filter[i] == True:
-        if len(np.unique(outcome)) == 2:
-            coeff_vec[i] = fitted.coef_[0, j]
-        else:
-            coeff_vec[i] = fitted.coef_[j]
+        #print(j)
+        coeff_vec[i] = fitted.coef_[0, j]
         j += 1
     else:
         pass
@@ -254,7 +272,7 @@
 
 layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
 
-test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=True)
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
 
 keep = test_df[[OUTCOME, "adj"]].dropna().index
 # print(keep)
@@ -268,7 +286,7 @@
 matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
     (len(test_df["adj"].dropna().index), num_node, num_node)
 )
-edges_test = np.vstack(test_df["edge_vector"].dropna().values)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
 
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
 if CONFOUNDS is not None:
@@ -284,17 +302,17 @@
         test_outcome, resid_edges = nbs.residualize(
             X=edges_test, y=outcome_test, confounds=confounds_test
         )
-    test_features = resid_edges[:, filter]
+    test_features = resid_edges
 else:
-    test_features = edges_test[:, filter]
+    test_features = edges_test
     test_outcome = outcome_test
 
 # scale after residualizing omg
-test_features = scaler.fit_transform(test_features)
+test_features = x_scaler.transform(test_features)
 if len(np.unique(test_outcome)) <= 2:
     pass
 else:
-    test_outcome = scaler.fit_transform(test_outcome.reshape(-1, 1))
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
 # print(test_features.shape)
 # if the model is a logistic regression, i.e. with a binary outcome
 # then score is prediction accuracy
@@ -312,6 +330,8 @@
     test_metrics["accuracy"] = score
 else:
     test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
 mse = mean_squared_error(test_outcome, y_pred)
 test_metrics["mean squared error"] = mse
 print("Out-of-sample prediction score:\t", score)
@@ -321,9 +341,7 @@
 test_df[f'{OUTCOME}_scaled'] = test_outcome
 test_df[f'{OUTCOME}_pred'] = y_pred
 Ys = test_df[[f'{OUTCOME}_scaled', 
-              f'{OUTCOME}_pred',
-              'cycle_day', 
-              'bc']]
+              f'{OUTCOME}_pred']]
 Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
 
 Ys['ppts'] = Ys.index.get_level_values(0)
@@ -342,23 +360,15 @@
 mpl.colormaps.register(cmap=dark)
 
 fig,ax = plt.subplots()
-g = sns.scatterplot(x='cycle_day', 
+g = sns.scatterplot(x=f'{OUTCOME}_scaled', 
                     y=f'{OUTCOME}_pred', 
-                    style='bc', 
+                    #style='bc', 
                     data=Ys, 
                     hue='ppts',  
                     hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
                     ax=ax, 
                     palette='light_powderpuff'
                     )
-h = sns.scatterplot(x='cycle_day',
-                     y=f'{OUTCOME}_scaled', 
-                     style='bc', 
-                     data=Ys, 
-                     hue='ppts',
-                     hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], 
-                     ax=ax, 
-                     palette='dark_powderpuff')
 ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
 fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
 

From 8036f5e498538a466aec0ca8b1c131ebc5b5cdca Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Thu, 2 Nov 2023 10:12:41 -0700
Subject: [PATCH 42/48] checkpoint before integrating neurocombat

---
 idconn/workflows/nbs_predict.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 508aa74..50563e7 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -242,7 +242,6 @@
     else:
         pass
 
-# print(coeff_vec)
 
 coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
 coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)

From f274decfd10bc8b13eaaf32aa0cd70148fcd7a7f Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Mon, 27 Nov 2023 15:43:31 -0800
Subject: [PATCH 43/48] add scaling as an option

---
 idconn/nbs.py | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/idconn/nbs.py b/idconn/nbs.py
index 3e2b48f..26ed551 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -233,7 +233,7 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=
 
 
 def kfold_nbs(
-    matrices, outcome, confounds=None, alpha=0.05, groups=None, num_node=None, diagonal=False, n_splits=10, n_iterations=10
+    matrices, outcome, confounds=None, alpha=0.05, groups=None, num_node=None, diagonal=False, scale_x=False, scale_y=False, n_splits=10, n_iterations=10
 ):
     """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided
     of shape ((subject x session)x node x node)
@@ -333,8 +333,7 @@ def kfold_nbs(
     manager = enlighten.get_manager()
     ticks = manager.counter(total=n_splits * n_iterations, desc="Progress", unit="folds")
     for train_idx, test_idx in cv.split(edges, split_y):
-        x_scaler = Normalizer()
-        y_scaler = Normalizer()
+        
         cv_results.at[i, "split"] = (train_idx, test_idx)
 
         # assert len(train_a_idx) == len(train_b_idx)
@@ -382,15 +381,21 @@ def kfold_nbs(
                 test_y, test_edges = residualize(X=test_edges, y=test_y, confounds=test_confounds)
         else:
             pass
+        if scale_x:
+            x_scaler = Normalizer()
+            train_edges = x_scaler.fit_transform(train_edges)
+            test_edges = x_scaler.transform(test_edges)
+        # Binary outcomes are class labels, so they are never rescaled.
+        if scale_y and np.unique(outcome).shape[0] != 2:
+            # Normalizer rescales each row to unit norm, which on a one-column
+            # y collapses every value to its sign; standardize the column instead.
+            from sklearn.preprocessing import StandardScaler
+
+            y_scaler = StandardScaler()
+            train_y = y_scaler.fit_transform(train_y.reshape(-1, 1))
+            test_y = y_scaler.transform(test_y.reshape(-1, 1))
 
-        train_edges = x_scaler.fit_transform(train_edges)
-        test_edges = x_scaler.transform(test_edges)
-
-        if np.unique(outcome).shape[0] == 2:
-            pass
-        else:
-            train_y = y_scaler.fit_transform(train_y.reshape(-1, 1))
-            test_y = y_scaler.transform(test_y.reshape(-1, 1))
+       
 
         # perform NBS wooooooooo
         # note: output is a dataframe :)

From 68631fbc7aabde35a2f8b798a7e5d54661cd1a5d Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Fri, 12 Jan 2024 15:19:41 -0800
Subject: [PATCH 44/48] nbs workflow works yay

---
 idconn/io.py                             |  43 ++-
 idconn/workflows/nbs_predict-e2.py       | 419 ++++++++++++++++++++++
 idconn/workflows/nbs_predict-e2xp4-bc.py | 422 +++++++++++++++++++++++
 idconn/workflows/nbs_predict-e2xp4.py    | 422 +++++++++++++++++++++++
 idconn/workflows/nbs_predict-p4.py       | 416 ++++++++++++++++++++++
 idconn/workflows/nbs_predict.py          |  90 ++---
 6 files changed, 1754 insertions(+), 58 deletions(-)
 create mode 100644 idconn/workflows/nbs_predict-e2.py
 create mode 100644 idconn/workflows/nbs_predict-e2xp4-bc.py
 create mode 100644 idconn/workflows/nbs_predict-e2xp4.py
 create mode 100644 idconn/workflows/nbs_predict-p4.py

diff --git a/idconn/io.py b/idconn/io.py
index b5f43e1..23b563c 100644
--- a/idconn/io.py
+++ b/idconn/io.py
@@ -35,7 +35,6 @@ def calc_fd(confounds):
     fd = np.sum([delta_x, delta_y, delta_z, delta_alpha, delta_beta, delta_gamma], axis=0)
     return fd
 
-
 def build_statsmodel_json(
     name,
     task,
@@ -132,7 +131,6 @@ def build_statsmodel_json(
         json.dump(statsmodel, outfile)
     return statsmodel_json
 
-
 def atlas_picker(atlas, path, key=None):
     """Takes in atlas name and path to file, if local, returns
     nifti-like object (usually file path to downloaded atlas),
@@ -192,8 +190,7 @@ def atlas_picker(atlas, path, key=None):
 
     return atlas, path
 
-
-def vectorize_corrmats(matrices):
+def vectorize_corrmats(matrices, diagonal=False):
     """Returns the vectorized upper triangles of a 3-dimensional array
     (i.e., node x node x matrix) of matrices. Output will be a 2-dimensional
     array (i.e., matrix x node^2)
@@ -210,11 +207,15 @@ def vectorize_corrmats(matrices):
         the input matrices.
     """
     # print(f'\n\n\n{matrices.shape}, {matrices.ndim}\n\n\n')
+    if diagonal == True:
+        k = 0
+    else:
+        k = 1
     num_node = matrices.shape[1]
-    upper_tri = np.triu_indices(num_node, k=1)
+    upper_tri = np.triu_indices(num_node, k=k)
     if matrices.ndim == 3:
         num_node = matrices.shape[1]
-        upper_tri = np.triu_indices(num_node, k=1)
+        upper_tri = np.triu_indices(num_node, k=k)
         num_matrices = matrices.shape[0]
         edge_vector = []
         for matrix in range(0, num_matrices):
@@ -234,7 +235,7 @@ def vectorize_corrmats(matrices):
     elif matrices.ndim == 1:
         if matrices[0].ndim == 2:
             num_node = matrices[0].shape[0]
-            upper_tri = np.triu_indices(num_node, k=1)
+            upper_tri = np.triu_indices(num_node, k=k)
             edge_vector = []
             for matrix in matrices:
                 vectorized = matrix[upper_tri]
@@ -248,7 +249,6 @@ def vectorize_corrmats(matrices):
     edge_vector = np.asarray(edge_vector)
     return edge_vector
 
-
 def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True, verbose=False):
     """Returns a node x node x (subject x session) matrix of correlation matrices
     from a BIDS derivative folder. Optionally returns a node^2 x (subject x session)
@@ -419,8 +419,7 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True
     ppt_df.replace({"": np.nan}, inplace=True)
     return ppt_df
 
-
-def undo_vectorize(edges, num_node=None):
+def undo_vectorize(edges, num_node=None, diagonal=False):
     """
     Puts an edge vector back into an adjacency matrix.
     Parameters
@@ -439,15 +438,25 @@ def undo_vectorize(edges, num_node=None):
     # num_node = (np.sqrt((8 * j) + 1) + 1) / 2
     if num_node == None:
         j = len(edges)
-        num_node = int((np.sqrt((8 * j) + 1) + 1) / 2)
+        if diagonal == False:
+            num_node = int((np.sqrt((8 * j) + 1) + 1) / 2)
+        else:
+            num_node = int((np.sqrt((8 * j) + 1) - 1) / 2)
     else:
         num_node = int(num_node)
     X = np.zeros((num_node, num_node))
-    X[np.triu_indices(X.shape[0], k=1)] = edges
+    # Same test as the num_node inference above, written once so that k is
+    # always bound: 1 skips the main diagonal, 0 keeps it.
+    k = 1 if diagonal == False else 0
+    X[np.triu_indices(num_node, k=k)] = edges
+    # keep a copy of the diagonal: X + X.T below would double it
+    diag_X = X[np.diag_indices(num_node, 2)]
     X = X + X.T
+    if k == 0:
+        X[np.diag_indices(num_node, 2)] = diag_X
+    # X is symmetric here: its lower triangle mirrors the filled upper one
     return X
 
-
 def plot_edges(
     adj,
     atlas_nii,
@@ -499,7 +508,7 @@ def plot_edges(
     print("edge plotting threshold: ", threshold)
 
     if node_size == "strength":
-        node_strength = np.sum(adj, axis=0)
+        node_strength = np.abs(np.sum(adj, axis=0))
         # node_strength /= np.max(node_strength)
         # node_strength **= 4
         node_strength = node_strength / np.max(node_strength) * 60
@@ -535,7 +544,7 @@ def plot_edges(
         nimg = nib.load(atlas_nii)
         regn_sch_arr = nimg.get_fdata()
         for i in np.arange(0, num_node):
-            regn_sch_arr[np.where(regn_sch_arr == i + 1)] = np.sum(adj[i])
+            regn_sch_arr[np.where(regn_sch_arr == i + 1)] = np.sum((adj[i]))
         strength_nimg = nib.Nifti1Image(regn_sch_arr, nimg.affine)
         # replace this filename with BIDSy output
         # nib.save(strength_nimg, f'/Users/katherine.b/Dropbox/{title}predictive-strength.nii')
@@ -558,6 +567,7 @@ def plot_edges(
         i = plotting.plot_surf_stat_map(
             fsaverage.pial_left,
             texture_l,
+            bg_map=fsaverage.sulc_left,
             symmetric_cbar=False,
             threshold=0.5,
             cmap=cmap,
@@ -568,6 +578,7 @@ def plot_edges(
         j = plotting.plot_surf_stat_map(
             fsaverage.pial_left,
             texture_l,
+            bg_map=fsaverage.sulc_left,
             symmetric_cbar=False,
             threshold=0.5,
             cmap=cmap,
@@ -578,6 +589,7 @@ def plot_edges(
         k = plotting.plot_surf_stat_map(
             fsaverage.pial_right,
             texture_r,
+            bg_map=fsaverage.sulc_right,
             symmetric_cbar=False,
             threshold=0.5,
             cmap=cmap,
@@ -588,6 +600,7 @@ def plot_edges(
         l = plotting.plot_surf_stat_map(
             fsaverage.pial_right,
             texture_r,
+            bg_map=fsaverage.sulc_right,
             symmetric_cbar=False,
             threshold=0.5,
             cmap=cmap,
diff --git a/idconn/workflows/nbs_predict-e2.py b/idconn/workflows/nbs_predict-e2.py
new file mode 100644
index 0000000..c92d274
--- /dev/null
+++ b/idconn/workflows/nbs_predict-e2.py
@@ -0,0 +1,419 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "estradiol"
+CONFOUNDS = ["framewise_displacement"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+#print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=3, n_iterations=3
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+#print(np.sum(weighted_average))
+#nbs_vector = weighted_average[upper_tri]
+#p75 = np.percentile(nbs_vector, 75)
+#filter = np.where(nbs_vector >= p75, True, False)
+#print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+#p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter]
+
+# Residualize against the confounds (only when CONFOUNDS is set).
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # Regress the confounds out of every edge; the outcome is residualized too
+    # unless it is binary (class labels are left as they are).
+    # Plain `else` so a 3-valued outcome is handled and resid_edges is bound.
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+
+# run the model on the whole training dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(
+        penalty="l2", 
+        solver="saga", 
+        C=best.C_[0]
+        )
+    train_metrics["alpha"] = best.C_[0]
+    #train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",  
+        alpha=best.alpha_,
+        fit_intercept=False,
+        )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+    #train_metrics["l1_ratio"] = best.l1_ratio_
+#print(params)
+#model.set_params(**params)
+# train ElasticNet on full train dataset, using feature extraction from NBS-Predict
+#fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model, 
+    train_features, 
+    train_outcome, 
+    groups=groups, 
+    cv=cv,
+    return_estimator=True, 
+    return_train_score=True
+    )
+train_metrics["in_sample_test"] = np.mean(scores['test_score'])
+train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+
+fitted = scores['estimator'][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f'{OUTCOME}_pred'] = y_pred
+dat[f'{OUTCOME}_scaled'] = train_outcome
+
+Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
+Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+train_colors = ['#a08ad1', #light
+                '#685690', #medium
+                '#3f2d69' #dark
+                ]
+light_cmap = sns.color_palette('dark:#a08ad1')
+dark_cmap = sns.color_palette('dark:#685690')
+
+fig,ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day', 
+                    y=f'{OUTCOME}_pred', 
+                    style='bc', 
+                    data=Ys,  
+                    ax=ax, 
+                    palette=dark_cmap)
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled', 
+                    style='bc', 
+                    data=Ys, 
+                    ax=ax, 
+                    palette=light_cmap)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure?
+#print(fitted.coef_.shape)
+#print(fitted.coef_)
+# `filter` is a boolean mask, so zeros_like(filter) would allocate a bool
+# vector and every beta written into it would be coerced to True/False.
+# Allocate floats explicitly so the coefficient values survive.
+coeff_vec = np.zeros(filter.shape, dtype=float)
+# Scatter the betas into the positions of the edges the model was fit on;
+# boolean-mask assignment fills the True slots in order, matching the column
+# order produced by `[:, filter]` when the features were selected.
+# atleast_2d(...)[0] reads row 0 of coef_, and also accepts a 1-D coef_.
+coeff_vec[filter] = np.atleast_2d(fitted.coef_)[0]
+
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# Residualize against the confounds (only when CONFOUNDS is set).
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # Regress the confounds out of every edge; the outcome is residualized too
+    # unless it is binary (class labels are left as they are).
+    # Plain `else` so a 3-valued outcome is handled and test_outcome is bound.
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+#cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f'{OUTCOME}_scaled'] = test_outcome
+test_df[f'{OUTCOME}_pred'] = y_pred
+Ys = test_df[[f'{OUTCOME}_scaled', 
+              f'{OUTCOME}_pred',
+              'cycle_day', 
+              'bc']]
+Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+Ys['ppts'] = Ys.index.get_level_values(0)
+
+
+light_colors = ['#33ACE3', #Bubbles
+                '#EA6964', #Blossom
+                '#4AB62C' #Buttercup
+                ]
+dark_colors = ['#1278a6', 
+               '#a11510', 
+               '#228208']
+light = ListedColormap(light_colors, name='light_powderpuff')
+dark = ListedColormap(dark_colors, name='dark_powderpuff')
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig,ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day', 
+                    y=f'{OUTCOME}_pred', 
+                    style='bc', 
+                    data=Ys, 
+                    hue='ppts',  
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax, 
+                    palette='light_powderpuff'
+                    )
+h = sns.scatterplot(x='cycle_day',
+                     y=f'{OUTCOME}_scaled', 
+                     style='bc', 
+                     data=Ys, 
+                     hue='ppts',
+                     hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], 
+                     ax=ax, 
+                     palette='dark_powderpuff')
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
+fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+
+
+#print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+    
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict-e2xp4-bc.py b/idconn/workflows/nbs_predict-e2xp4-bc.py
new file mode 100644
index 0000000..ad6a6d8
--- /dev/null
+++ b/idconn/workflows/nbs_predict-e2xp4-bc.py
@@ -0,0 +1,422 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "estradiol÷progesterone"
+CONFOUNDS = ["framewise_displacement", "bc"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone']
+
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+#print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+#print(np.sum(weighted_average))
+#nbs_vector = weighted_average[upper_tri]
+#p75 = np.percentile(nbs_vector, 75)
+#filter = np.where(nbs_vector >= p75, True, False)
+#print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+#p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter]
+
+# Residualize against the confounds (only when CONFOUNDS is set).
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # Regress the confounds out of every edge; the outcome is residualized too
+    # unless it is binary (class labels are left as they are).
+    # Plain `else` so a 3-valued outcome is handled and resid_edges is bound.
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+
+# run the model on the whole training dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(
+        penalty="l2", 
+        solver="saga", 
+        C=best.C_[0]
+        )
+    train_metrics["alpha"] = best.C_[0]
+    #train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",  
+        alpha=best.alpha_,
+        fit_intercept=False,
+        )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+    #train_metrics["l1_ratio"] = best.l1_ratio_
+#print(params)
+#model.set_params(**params)
+# train ElasticNet on full train dataset, using feature extraction from NBS-Predict
+#fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model, 
+    train_features, 
+    train_outcome, 
+    groups=groups, 
+    cv=cv,
+    return_estimator=True, 
+    return_train_score=True
+    )
+train_metrics["in_sample_test"] = np.mean(scores['test_score'])
+train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+
+fitted = scores['estimator'][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f'{OUTCOME}_pred'] = y_pred
+dat[f'{OUTCOME}_scaled'] = train_outcome
+
+Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
+Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+train_colors = ['#a08ad1', #light
+                '#685690', #medium
+                '#3f2d69' #dark
+                ]
+light_cmap = sns.color_palette('dark:#a08ad1')
+dark_cmap = sns.color_palette('dark:#685690')
+
+fig,ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day', 
+                    y=f'{OUTCOME}_pred', 
+                    style='bc', 
+                    data=Ys,  
+                    ax=ax, 
+                    palette=dark_cmap)
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled', 
+                    style='bc', 
+                    data=Ys, 
+                    ax=ax, 
+                    palette=light_cmap)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure?
+#print(fitted.coef_.shape)
+#print(fitted.coef_)
+# `filter` is a boolean mask, so zeros_like(filter) would allocate a bool
+# vector and every beta written into it would be coerced to True/False.
+# Allocate floats explicitly so the coefficient values survive.
+coeff_vec = np.zeros(filter.shape, dtype=float)
+# Scatter the betas into the positions of the edges the model was fit on;
+# boolean-mask assignment fills the True slots in order, matching the column
+# order produced by `[:, filter]` when the features were selected.
+# atleast_2d(...)[0] reads row 0 of coef_, and also accepts a 1-D coef_.
+coeff_vec[filter] = np.atleast_2d(fitted.coef_)[0]
+
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone']
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# Residualize the test data against the confounds when any are specified.
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # Outcomes with at most two unique values are left untouched and only the
+    # edges are residualized; any other outcome is residualized with the edges.
+    # Plain `else`: the old `> 3` test left exactly three values unhandled.
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+#cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f'{OUTCOME}_scaled'] = test_outcome
+test_df[f'{OUTCOME}_pred'] = y_pred
+Ys = test_df[[f'{OUTCOME}_scaled', 
+              f'{OUTCOME}_pred',
+              'cycle_day', 
+              'bc']]
+Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+Ys['ppts'] = Ys.index.get_level_values(0)
+
+
+light_colors = ['#33ACE3', #Bubbles
+                '#EA6964', #Blossom
+                '#4AB62C' #Buttercup
+                ]
+dark_colors = ['#1278a6', 
+               '#a11510', 
+               '#228208']
+light = ListedColormap(light_colors, name='light_powderpuff')
+dark = ListedColormap(dark_colors, name='dark_powderpuff')
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig,ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day', 
+                    y=f'{OUTCOME}_pred', 
+                    style='bc', 
+                    data=Ys, 
+                    hue='ppts',  
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax, 
+                    palette='light_powderpuff'
+                    )
+h = sns.scatterplot(x='cycle_day',
+                     y=f'{OUTCOME}_scaled', 
+                     style='bc', 
+                     data=Ys, 
+                     hue='ppts',
+                     hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], 
+                     ax=ax, 
+                     palette='dark_powderpuff')
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
+fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+
+
+#print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+    
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict-e2xp4.py b/idconn/workflows/nbs_predict-e2xp4.py
new file mode 100644
index 0000000..022d8b9
--- /dev/null
+++ b/idconn/workflows/nbs_predict-e2xp4.py
@@ -0,0 +1,422 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "estradiol÷progesterone"
+CONFOUNDS = ["framewise_displacement"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone']
+
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+#print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+#print(np.sum(weighted_average))
+#nbs_vector = weighted_average[upper_tri]
+#p75 = np.percentile(nbs_vector, 75)
+#filter = np.where(nbs_vector >= p75, True, False)
+#print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+#p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter]
+
+# Residualize the training data against the confounds when any are specified.
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # Outcomes with at most two unique values are left untouched and only the
+    # edges are residualized; any other outcome is residualized with the edges.
+    # Plain `else`: the old `> 3` test left exactly three values unhandled.
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+
+# fit the model on the whole training dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(
+        penalty="l2", 
+        solver="saga", 
+        C=best.C_[0]
+        )
+    train_metrics["alpha"] = best.C_[0]
+    #train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",  
+        alpha=best.alpha_,
+        fit_intercept=False,
+        )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+    #train_metrics["l1_ratio"] = best.l1_ratio_
+#print(params)
+#model.set_params(**params)
+# cross-validate the Ridge/logistic model on the train dataset, using feature extraction from NBS-Predict
+#fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model, 
+    train_features, 
+    train_outcome, 
+    groups=groups, 
+    cv=cv,
+    return_estimator=True, 
+    return_train_score=True
+    )
+train_metrics["in_sample_test"] = np.mean(scores['test_score'])
+train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+
+fitted = scores['estimator'][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f'{OUTCOME}_pred'] = y_pred
+dat[f'{OUTCOME}_scaled'] = train_outcome
+
+Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
+Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+train_colors = ['#a08ad1', #light
+                '#685690', #medium
+                '#3f2d69' #dark
+                ]
+light_cmap = sns.color_palette('dark:#a08ad1')
+dark_cmap = sns.color_palette('dark:#685690')
+
+fig,ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day', 
+                    y=f'{OUTCOME}_pred', 
+                    style='bc', 
+                    data=Ys,  
+                    ax=ax, 
+                    palette=dark_cmap)
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled', 
+                    style='bc', 
+                    data=Ys, 
+                    ax=ax, 
+                    palette=light_cmap)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# Map the fitted coefficients back onto the full-length edge vector so they
+# can be reshaped into a node-by-node matrix for export and plotting.
+#
+# `filter` is a boolean mask, so np.zeros_like(filter) would allocate a
+# boolean buffer and cast every coefficient to True/False; allocate floats
+# explicitly. Edges dropped by the mask keep a coefficient of zero.
+coeff_vec = np.zeros(filter.shape, dtype=float)
+# Boolean-mask assignment fills the True positions in ascending order, which
+# is the column order of the training features (edge_vector[:, filter]).
+# Row 0 of coef_ is the row the previous element-wise loop read
+# (coef_[0, j]); np.atleast_2d additionally tolerates a 1-D coef_.
+coeff_vec[filter] = np.atleast_2d(fitted.coef_)[0]
+
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone']
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# Residualize the test data against the confounds when any are specified.
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # Outcomes with at most two unique values are left untouched and only the
+    # edges are residualized; any other outcome is residualized with the edges.
+    # Plain `else`: the old `> 3` test left exactly three values unhandled.
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+#cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f'{OUTCOME}_scaled'] = test_outcome
+test_df[f'{OUTCOME}_pred'] = y_pred
+Ys = test_df[[f'{OUTCOME}_scaled', 
+              f'{OUTCOME}_pred',
+              'cycle_day', 
+              'bc']]
+Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+Ys['ppts'] = Ys.index.get_level_values(0)
+
+
+light_colors = ['#33ACE3', #Bubbles
+                '#EA6964', #Blossom
+                '#4AB62C' #Buttercup
+                ]
+dark_colors = ['#1278a6', 
+               '#a11510', 
+               '#228208']
+light = ListedColormap(light_colors, name='light_powderpuff')
+dark = ListedColormap(dark_colors, name='dark_powderpuff')
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig,ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day', 
+                    y=f'{OUTCOME}_pred', 
+                    style='bc', 
+                    data=Ys, 
+                    hue='ppts',  
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax, 
+                    palette='light_powderpuff'
+                    )
+h = sns.scatterplot(x='cycle_day',
+                     y=f'{OUTCOME}_scaled', 
+                     style='bc', 
+                     data=Ys, 
+                     hue='ppts',
+                     hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], 
+                     ax=ax, 
+                     palette='dark_powderpuff')
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
+fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+
+
+#print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+    
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict-p4.py b/idconn/workflows/nbs_predict-p4.py
new file mode 100644
index 0000000..559b4ff
--- /dev/null
+++ b/idconn/workflows/nbs_predict-p4.py
@@ -0,0 +1,416 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "progesterone"
+CONFOUNDS = ["framewise_displacement"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+#print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+#print(np.sum(weighted_average))
+#nbs_vector = weighted_average[upper_tri]
+#p75 = np.percentile(nbs_vector, 75)
+#filter = np.where(nbs_vector >= p75, True, False)
+#print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+#p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter]
+
+# Residualize the training data against the confounds when any are specified.
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # Outcomes with at most two unique values are left untouched and only the
+    # edges are residualized; any other outcome is residualized with the edges.
+    # Plain `else`: the old `> 3` test left exactly three values unhandled.
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+
+# fit the model on the whole training dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(
+        penalty="l2", 
+        solver="saga", 
+        C=best.C_[0]
+        )
+    train_metrics["alpha"] = best.C_[0]
+    #train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",  
+        alpha=best.alpha_,
+        fit_intercept=False,
+        )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+    #train_metrics["l1_ratio"] = best.l1_ratio_
+#print(params)
+#model.set_params(**params)
+# cross-validate the Ridge/logistic model on the train dataset, using feature extraction from NBS-Predict
+#fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model, 
+    train_features, 
+    train_outcome, 
+    groups=groups, 
+    cv=cv,
+    return_estimator=True, 
+    return_train_score=True
+    )
+train_metrics["in_sample_test"] = np.mean(scores['test_score'])
+train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+
+fitted = scores['estimator'][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f'{OUTCOME}_pred'] = y_pred
+dat[f'{OUTCOME}_scaled'] = train_outcome
+
+Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
+Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+train_colors = ['#a08ad1', #light
+                '#685690', #medium
+                '#3f2d69' #dark
+                ]
+light_cmap = sns.color_palette('dark:#a08ad1')
+dark_cmap = sns.color_palette('dark:#685690')
+
+fig,ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day', 
+                    y=f'{OUTCOME}_pred', 
+                    style='bc', 
+                    data=Ys,  
+                    ax=ax, 
+                    palette=dark_cmap)
+h = sns.scatterplot(x='cycle_day',
+                    y=f'{OUTCOME}_scaled', 
+                    style='bc', 
+                    data=Ys, 
+                    ax=ax, 
+                    palette=light_cmap)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# Map the fitted coefficients back onto the full-length edge vector so they
+# can be reshaped into a node-by-node matrix for export and plotting.
+# `filter` is a boolean mask, so np.zeros_like(filter) would allocate a
+# boolean buffer and cast every coefficient to True/False; allocate floats
+# explicitly. Edges dropped by the mask keep a coefficient of zero.
+coeff_vec = np.zeros(filter.shape, dtype=float)
+# Boolean-mask assignment fills the True positions in ascending order, which
+# is the column order of the training features (edge_vector[:, filter]).
+# Row 0 of coef_ is the row the previous element-wise loop read
+# (coef_[0, j]); np.atleast_2d additionally tolerates a 1-D coef_.
+coeff_vec[filter] = np.atleast_2d(fitted.coef_)[0]
+
+# print(coeff_vec)
+
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# Residualize the test data against the confounds when any are specified.
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # Outcomes with at most two unique values are left untouched and only the
+    # edges are residualized; any other outcome is residualized with the edges.
+    # Plain `else`: the old `> 3` test left exactly three values unhandled.
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+#cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f'{OUTCOME}_scaled'] = test_outcome
+test_df[f'{OUTCOME}_pred'] = y_pred
+Ys = test_df[[f'{OUTCOME}_scaled', 
+              f'{OUTCOME}_pred',
+              'cycle_day', 
+              'bc']]
+Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
+
+Ys['ppts'] = Ys.index.get_level_values(0)
+
+
+light_colors = ['#33ACE3', #Bubbles
+                '#EA6964', #Blossom
+                '#4AB62C' #Buttercup
+                ]
+dark_colors = ['#1278a6', 
+               '#a11510', 
+               '#228208']
+light = ListedColormap(light_colors, name='light_powderpuff')
+dark = ListedColormap(dark_colors, name='dark_powderpuff')
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig,ax = plt.subplots()
+g = sns.scatterplot(x='cycle_day', 
+                    y=f'{OUTCOME}_pred', 
+                    style='bc', 
+                    data=Ys, 
+                    hue='ppts',  
+                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
+                    ax=ax, 
+                    palette='light_powderpuff'
+                    )
+h = sns.scatterplot(x='cycle_day',
+                     y=f'{OUTCOME}_scaled', 
+                     style='bc', 
+                     data=Ys, 
+                     hue='ppts',
+                     hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], 
+                     ax=ax, 
+                     palette='dark_powderpuff')
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
+fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+
+
+
+#print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+    
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 50563e7..46e804c 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -29,21 +29,21 @@
 today = datetime.today()
 today_str = strftime("%m_%d_%Y")
 
-TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
-TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+TRAIN_DSET = ""
+TEST_DSET = ""
 DERIV_NAME = "IDConn"
-OUTCOME = "bc"
+OUTCOME = ""
 CONFOUNDS = "framewise_displacement"
 TASK = "rest"
 ATLAS = "craddock2012"
 THRESH = 0.5
 alpha = 0.05
-atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+atlas_fname = "craddock2012_tcorr05_2level_270_2mm.nii.gz"
 
 
-layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+train_layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
 
-dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+dat = io.read_corrmats(train_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
 
 keep = dat["adj"].dropna().index
 dat = dat.loc[keep]
@@ -65,6 +65,47 @@
     base_name = f"nbs-predict_outcome-{OUTCOME}"
 # print(dat['bc'])
 
+# load in test data
+test_layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(test_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    elif len(np.unique(outcome_test)) > 3:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+
 weighted_average, cv_results = nbs.kfold_nbs(
     matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000
 )
@@ -269,43 +310,6 @@
 )
 
 
-layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
-
-test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
-
-keep = test_df[[OUTCOME, "adj"]].dropna().index
-# print(keep)
-
-test_df = test_df.loc[keep]
-
-outcome_test = test_df[OUTCOME].values
-# print(test_df)
-
-# print(outcome_test)
-matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
-    (len(test_df["adj"].dropna().index), num_node, num_node)
-)
-edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
-
-# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
-if CONFOUNDS is not None:
-    confounds_test = test_df[CONFOUNDS].values
-
-    # regress out the confounds from each edge and the outcome variable,
-    # use the residuals for the rest of the algorithm
-    # print(confounds.shape, outcome.shape)
-    if len(np.unique(outcome_test)) <= 2:
-        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
-        test_outcome = outcome_test
-    elif len(np.unique(outcome_test)) > 3:
-        test_outcome, resid_edges = nbs.residualize(
-            X=edges_test, y=outcome_test, confounds=confounds_test
-        )
-    test_features = resid_edges
-else:
-    test_features = edges_test
-    test_outcome = outcome_test
-
 # scale after residualizing omg
 test_features = x_scaler.transform(test_features)
 if len(np.unique(test_outcome)) <= 2:

From b6efbbd8e904bc83f1982380793c6dbfa4ea7db3 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Fri, 14 Jun 2024 15:40:03 -0700
Subject: [PATCH 45/48] added some to-dos need to rerun checks

---
 idconn/connectivity.py | 2 ++
 idconn/io.py           | 9 ++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/idconn/connectivity.py b/idconn/connectivity.py
index 1e79998..5ac1ae2 100644
--- a/idconn/connectivity.py
+++ b/idconn/connectivity.py
@@ -254,6 +254,8 @@ def task_connectivity(
 def rest_connectivity(
     layout, subject, session, task, atlas, confounds=None, connectivity_metric="correlation"
 ):
+    ###################################################################################
+    ################# Needs an option to keep runs separate. ##########################
     """
     Makes connectivity matrices per subject per session per task per condition.
     Parameters
diff --git a/idconn/io.py b/idconn/io.py
index 23b563c..3e1bca9 100644
--- a/idconn/io.py
+++ b/idconn/io.py
@@ -253,6 +253,8 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True
     """Returns a node x node x (subject x session) matrix of correlation matrices
     from a BIDS derivative folder. Optionally returns a node^2 x (subject x session)
     array of vectorized upper triangles of those correlation matrices.
+
+    ME @ ME: NEEDS AN OPTION TO KEEP RUNS SEPARATE. CURRENTLY IT AVERAGES CONFOUNDS AND 
     Parameters
     ----------
     layout : BIDSLayout or str
@@ -356,7 +358,7 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True
                         )
                         # print(confound_means)
             else:
-                path = path = layout.get(
+                path = layout.get(
                     return_type="filename",
                     session=session,
                     desc="confounds",
@@ -397,7 +399,12 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True
                 pass
             if type(path) == list:
                 # print(len(path))
+                ################################################################
+                ############ EEEEEEEEEEEEEEEEEK ################################
+                ############### DOES THIS ONLY GRAB ONE RUN?!?!?! ##############
+                ################################################################
                 path = path[0]
+                
             else:
                 pass
             assert exists(path), f"Corrmat file not found at {path}"

From f39aba7d13015a3f9c40c22eee68cedcba661ab5 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Fri, 14 Jun 2024 15:47:29 -0700
Subject: [PATCH 46/48] update nbsp workflows for hormones x fc paper

---
 idconn/io.py                                  |  20 +-
 idconn/nbs.py                                 | 144 +++---
 idconn/workflows/nbs_predict-bc.py            | 387 ++++++++++++++++
 .../workflows/nbs_predict-bc_sensitivity.py   | 412 ++++++++++++++++++
 idconn/workflows/nbs_predict-e2.py            | 210 +++++----
 .../workflows/nbs_predict-e2_sensitivity.py   | 412 ++++++++++++++++++
 .../workflows/nbs_predict-e2bc_sensitivity.py | 412 ++++++++++++++++++
 idconn/workflows/nbs_predict-e2xp4-bc.py      | 214 +++++----
 idconn/workflows/nbs_predict-e2xp4.py         | 214 +++++----
 idconn/workflows/nbs_predict-p4.py            | 206 +++++----
 .../workflows/nbs_predict-p4_sensitivity.py   | 412 ++++++++++++++++++
 .../workflows/nbs_predict-p4bc_sensitivity.py | 412 ++++++++++++++++++
 idconn/workflows/nbs_predict.py               | 167 ++++---
 13 files changed, 3027 insertions(+), 595 deletions(-)
 create mode 100644 idconn/workflows/nbs_predict-bc.py
 create mode 100644 idconn/workflows/nbs_predict-bc_sensitivity.py
 create mode 100644 idconn/workflows/nbs_predict-e2_sensitivity.py
 create mode 100644 idconn/workflows/nbs_predict-e2bc_sensitivity.py
 create mode 100644 idconn/workflows/nbs_predict-p4_sensitivity.py
 create mode 100644 idconn/workflows/nbs_predict-p4bc_sensitivity.py

diff --git a/idconn/io.py b/idconn/io.py
index 3e1bca9..55ddc81 100644
--- a/idconn/io.py
+++ b/idconn/io.py
@@ -35,6 +35,7 @@ def calc_fd(confounds):
     fd = np.sum([delta_x, delta_y, delta_z, delta_alpha, delta_beta, delta_gamma], axis=0)
     return fd
 
+
 def build_statsmodel_json(
     name,
     task,
@@ -131,6 +132,7 @@ def build_statsmodel_json(
         json.dump(statsmodel, outfile)
     return statsmodel_json
 
+
 def atlas_picker(atlas, path, key=None):
     """Takes in atlas name and path to file, if local, returns
     nifti-like object (usually file path to downloaded atlas),
@@ -190,6 +192,7 @@ def atlas_picker(atlas, path, key=None):
 
     return atlas, path
 
+
 def vectorize_corrmats(matrices, diagonal=False):
     """Returns the vectorized upper triangles of a 3-dimensional array
     (i.e., node x node x matrix) of matrices. Output will be a 2-dimensional
@@ -249,12 +252,13 @@ def vectorize_corrmats(matrices, diagonal=False):
     edge_vector = np.asarray(edge_vector)
     return edge_vector
 
+
 def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True, verbose=False):
     """Returns a node x node x (subject x session) matrix of correlation matrices
     from a BIDS derivative folder. Optionally returns a node^2 x (subject x session)
     array of vectorized upper triangles of those correlation matrices.
 
-    ME @ ME: NEEDS AN OPTION TO KEEP RUNS SEPARATE. CURRENTLY IT AVERAGES CONFOUNDS AND 
+    ME @ ME: NEEDS AN OPTION TO KEEP RUNS SEPARATE. CURRENTLY IT AVERAGES CONFOUNDS AND
     Parameters
     ----------
     layout : BIDSLayout or str
@@ -404,7 +408,7 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True
                 ############### DOES THIS ONLY GRAB ONE RUN?!?!?! ##############
                 ################################################################
                 path = path[0]
-                
+
             else:
                 pass
             assert exists(path), f"Corrmat file not found at {path}"
@@ -426,6 +430,7 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True
     ppt_df.replace({"": np.nan}, inplace=True)
     return ppt_df
 
+
 def undo_vectorize(edges, num_node=None, diagonal=False):
     """
     Puts an edge vector back into an adjacency matrix.
@@ -453,17 +458,18 @@ def undo_vectorize(edges, num_node=None, diagonal=False):
         num_node = int(num_node)
     X = np.zeros((num_node, num_node))
     if diagonal == False:
-        k=1
+        k = 1
     if diagonal == True:
-        k=0
+        k = 0
     X[np.triu_indices(num_node, k=k)] = edges
-    diag_X = X[np.diag_indices(num_node,2)]
+    diag_X = X[np.diag_indices(num_node, 2)]
     X = X + X.T
     if diagonal == True:
-        X[np.diag_indices(num_node,2)] = diag_X
-    #print('did undo_vectorize work?', np.allclose(X, X.T))
+        X[np.diag_indices(num_node, 2)] = diag_X
+    # print('did undo_vectorize work?', np.allclose(X, X.T))
     return X
 
+
 def plot_edges(
     adj,
     atlas_nii,
diff --git a/idconn/nbs.py b/idconn/nbs.py
index 26ed551..52e9b37 100644
--- a/idconn/nbs.py
+++ b/idconn/nbs.py
@@ -8,11 +8,7 @@
 
 # import bct
 from sklearn.experimental import enable_halving_search_cv
-from sklearn.model_selection import (
-    RepeatedStratifiedKFold,
-    RepeatedKFold,
-    HalvingGridSearchCV
-)
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, HalvingGridSearchCV
 
 from sklearn.feature_selection import f_regression, f_classif
 from sklearn.linear_model import LogisticRegression, ElasticNet, LogisticRegressionCV, RidgeCV
@@ -39,9 +35,9 @@ def calc_number_of_nodes(matrices):
 
 
 def residualize(X, y=None, confounds=None):
-    '''
+    """
     all inputs need to be arrays, not dataframes
-    '''
+    """
     # residualize the outcome
     if confounds is not None:
         if y is not None:
@@ -74,7 +70,9 @@ def residualize(X, y=None, confounds=None):
         print("Confound matrix wasn't provided, so no confounding was done")
 
 
-def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=False, permutations=10000):
+def pynbs(
+    matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=False, permutations=10000
+):
     """
     Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided
     of shape ((subject x session)x node x node)
@@ -129,7 +127,6 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=
         edges = matrices.copy()
     # print(edges.shape)
 
-
     # edges = edges.T
 
     # run an ols per edge
@@ -145,13 +142,15 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=
 
     # find largest connected component of sig_edges
     # turn sig_edges into an nxn matrix first
-    sig_matrix = undo_vectorize(sig_edges, num_node=num_node, diagonal=diagonal)  # need to write this function
+    sig_matrix = undo_vectorize(
+        sig_edges, num_node=num_node, diagonal=diagonal
+    )  # need to write this function
     matrix = nx.from_numpy_array(sig_matrix)
 
     # use networkX to find connected components
     S = [matrix.subgraph(c).copy() for c in nx.connected_components(matrix)]
     S.sort(key=len, reverse=True)
-    #largest_cc = max(nx.connected_components(matrix), key=len)
+    # largest_cc = max(nx.connected_components(matrix), key=len)
     G0 = S[0]
     # print(G0)
 
@@ -202,7 +201,9 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=
             # print(np.sum(perm_edges))
             # find largest connected component of sig_edges
             # turn sig_edges into an nxn matrix first
-            perm_matrix = undo_vectorize(perm_edges, num_node=num_node, diagonal=diagonal)  # need to write this function
+            perm_matrix = undo_vectorize(
+                perm_edges, num_node=num_node, diagonal=diagonal
+            )  # need to write this function
             perm_nx = nx.from_numpy_array(perm_matrix)
 
             largest_cc = max(nx.connected_components(perm_nx), key=len)
@@ -233,7 +234,17 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=
 
 
 def kfold_nbs(
-    matrices, outcome, confounds=None, alpha=0.05, groups=None, num_node=None, diagonal=False, scale_x=False, scale_y=False, n_splits=10, n_iterations=10
+    matrices,
+    outcome,
+    confounds=None,
+    alpha=0.05,
+    groups=None,
+    num_node=None,
+    diagonal=False,
+    scale_x=False,
+    scale_y=False,
+    n_splits=10,
+    n_iterations=10,
 ):
     """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided
     of shape ((subject x session)x node x node)
@@ -326,38 +337,38 @@ def kfold_nbs(
     if diagonal == True:
         k = 0
     if diagonal == False:
-        k=1
+        k = 1
     upper_tri = np.triu_indices(num_node, k=k)
 
     i = 0
     manager = enlighten.get_manager()
     ticks = manager.counter(total=n_splits * n_iterations, desc="Progress", unit="folds")
     for train_idx, test_idx in cv.split(edges, split_y):
-        
+
         cv_results.at[i, "split"] = (train_idx, test_idx)
 
         # assert len(train_a_idx) == len(train_b_idx)
         Cs = np.logspace(-4, 4, 10)
-        #print(len(np.unique(outcome)))
+        # print(len(np.unique(outcome)))
         if np.unique(outcome).shape[0] == 2:
-            #print('binary')
+            # print('binary')
             regressor = LogisticRegressionCV(
-                Cs=Cs, 
+                Cs=Cs,
                 cv=4,
-                #verbose=2,
-                max_iter=100000, 
-                penalty="l2", 
-                solver="saga", 
-                n_jobs=4
+                # verbose=2,
+                max_iter=100000,
+                penalty="l2",
+                solver="saga",
+                n_jobs=4,
             )
-            
+
         else:
-            #print('continuous')
+            # print('continuous')
             regressor = RidgeCV(
-                alphas=Cs, 
-                cv=4, 
-                #n_jobs=4
-                )
+                alphas=Cs,
+                cv=4,
+                # n_jobs=4
+            )
 
         train_y = outcome[train_idx]
         test_y = outcome[test_idx]
@@ -392,16 +403,14 @@ def kfold_nbs(
                 y_scaler = Normalizer()
                 train_y = y_scaler.fit_transform(train_y.reshape(-1, 1))
                 test_y = y_scaler.transform(test_y.reshape(-1, 1))
-        
-        
-
-       
 
         # perform NBS wooooooooo
         # note: output is a dataframe :)
         # PYNBS SHOULD NOT DO CONFOUND REGRESSION?
-        adj = pynbs(train_edges, train_y, num_node=num_node, diagonal=diagonal, alpha=alpha, predict=True)
-        #print(adj.shape, adj.ndim, adj[0].shape, upper_tri)
+        adj = pynbs(
+            train_edges, train_y, num_node=num_node, diagonal=diagonal, alpha=alpha, predict=True
+        )
+        # print(adj.shape, adj.ndim, adj[0].shape, upper_tri)
 
         # cv_results.at[i, 'pval'] = pval
         cv_results.at[i, "component"] = adj.values
@@ -413,7 +422,7 @@ def kfold_nbs(
             # so you don't have repeated edges
             # returns (n_edges, )
             nbs_vector = adj.values[upper_tri]
-            #print(nbs_vector.shape)
+            # print(nbs_vector.shape)
             # print(nbs_vector.shape)
             # use those to make a "significant edges" mask
             mask = nbs_vector == 1.0
@@ -425,31 +434,31 @@ def kfold_nbs(
             # returns (n_edges, samples)
             train_features = train_edges.T[mask]
             test_features = test_edges.T[mask]
-            #print(mask.shape, np.sum(mask), train_edges.shape, train_features.shape)
+            # print(mask.shape, np.sum(mask), train_edges.shape, train_features.shape)
 
             train_features = train_features.T
             test_features = test_features.T
-            
-            #train_features = scaler.fit_transform(train_features.T)
-            #test_features = scaler.fit_transform(test_features.T)
-            #print(train_features.shape, train_y.shape)
 
-            #print(f"train_edges:\t{train_edges[:10, 0]}\ntrain_features:\t{train_features[:10, 0]}")
+            # train_features = scaler.fit_transform(train_features.T)
+            # test_features = scaler.fit_transform(test_features.T)
+            # print(train_features.shape, train_y.shape)
+
+            # print(f"train_edges:\t{train_edges[:10, 0]}\ntrain_features:\t{train_features[:10, 0]}")
             # print(np.ravel(train_y))
             # train model predicting outcome from brain (note: no mas covariates)
             # use grid search bc I want to know how to tune alpha and l1_ratio
-            
-            #grid = HalvingGridSearchCV(estimator=regressor, 
-            #                           param_grid=param_grid, 
-            #                           n_jobs=8, 
-            #                           cv=4, 
+
+            # grid = HalvingGridSearchCV(estimator=regressor,
+            #                           param_grid=param_grid,
+            #                           n_jobs=8,
+            #                           cv=4,
             #                           factor=2,
             #                           verbose=0,
-            #                           min_resources=20, 
-            #                           refit=True, 
+            #                           min_resources=20,
+            #                           refit=True,
             #                           aggressive_elimination=False)
             model = regressor.fit(X=train_features, y=np.ravel(train_y))
-            
+
             cv_results.at[i, "model"] = model
 
             # score that model on the testing data
@@ -462,18 +471,20 @@ def kfold_nbs(
             # I go die now
             if np.unique(outcome).shape[0] == 2:
                 score = model.score(X=test_features, y=np.ravel(test_y))
-                
+
             else:
                 predicted_y = model.predict(X=test_features)
-                score,p = spearmanr(predicted_y, np.ravel(test_y))
-                #spearman = spearmanr(predicted_y, np.ravel(test_y))
-            
+                score, p = spearmanr(predicted_y, np.ravel(test_y))
+                # spearman = spearmanr(predicted_y, np.ravel(test_y))
+
             cv_results.at[i, "score"] = score
             if i % (n_splits * n_iterations / 10) == 0:
-                mean = cv_results['score'].mean()
-                sdev = cv_results['score'].std()
-                print(f'Iteration {i} out of {n_splits * n_iterations}, average score:\t{mean:.2f} +/- {sdev:.2f}')
-            #print(score)
+                mean = cv_results["score"].mean()
+                sdev = cv_results["score"].std()
+                print(
+                    f"Iteration {i} out of {n_splits * n_iterations}, average score:\t{mean:.2f} +/- {sdev:.2f}"
+                )
+            # print(score)
 
             m = 0
             param_vector = np.zeros_like(nbs_vector)
@@ -489,21 +500,21 @@ def kfold_nbs(
                 else:
                     pass
             X = undo_vectorize(param_vector, num_node=num_node, diagonal=diagonal)
-            #cv_results.at[i, "coefficient_matrix"] = X
-            #cv_results.at[i, "coefficient_vector"] = param_vector
+            # cv_results.at[i, "coefficient_matrix"] = X
+            # cv_results.at[i, "coefficient_vector"] = param_vector
             i += 1
         else:
             pass
         ticks.update()
     # calculate weighted average
     # print(cv_results['score'])
-    weighted_stack = np.zeros((num_node,num_node))
-    fake = np.zeros((num_node,num_node))
+    weighted_stack = np.zeros((num_node, num_node))
+    fake = np.zeros((num_node, num_node))
     # print(weighted_stack.shape)
     for j in index:
         # print(cv_results.at[j, 'score'])
         weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"]
-        
+
         if np.sum(weighted) == 0 or np.isnan(np.sum(weighted)) == True:
             weighted_stack = np.dstack([weighted_stack, fake])
         else:
@@ -511,5 +522,8 @@ def kfold_nbs(
 
         # print(weighted_stack.shape, weighted.shape)
     weighted_average = np.mean(weighted_stack, axis=-1)
-    #model = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
-    return weighted_average, cv_results, #model
+    # model = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+    return (
+        weighted_average,
+        cv_results,
+    )  # model
diff --git a/idconn/workflows/nbs_predict-bc.py b/idconn/workflows/nbs_predict-bc.py
new file mode 100644
index 0000000..ec3c559
--- /dev/null
+++ b/idconn/workflows/nbs_predict-bc.py
@@ -0,0 +1,387 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "bc"
+CONFOUNDS = ["framewise_displacement"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+train_layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(train_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+groups = dat["bc"]
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+# load in test data
+test_layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(test_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    elif len(np.unique(outcome_test)) > 3:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(nbs_vector.shape, filter.shape)
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    elif len(np.unique(outcome_train)) > 3:
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+# run the model on the whole train dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
+    train_metrics["alpha"] = best.C_[0]
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(solver="saga", alpha=best.alpha_)
+    train_metrics["alpha"] = best.alpha_
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
+# train Ridge/LogisticRegression (not ElasticNet) on full train dataset, using feature extraction from NBS-Predict
+
+scores = cross_validate(
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
+    cv=cv,
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
+
+fitted = scores["estimator"][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light  # medium  # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x=f"{OUTCOME}_scaled",
+    y=f"{OUTCOME}_pred",
+    # style='bc',
+    data=Ys,
+    ax=ax,
+    palette=dark_cmap,
+)
+# ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample prediction score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+train_metrics["in_sample_mse"] = mse
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure?
+# Scatter the fitted coefficients back into a full-length edge vector.
+# np.zeros_like(filter) inherits the dtype of `filter`; if that is a boolean
+# mask, every coefficient written into the result is cast to True/False, so
+# the vector is allocated as float explicitly.
+edge_mask = np.equal(filter, True)
+coeff_vec = np.zeros(edge_mask.shape, dtype=float)
+# Mask assignment fills the selected positions in order, like the running
+# index of the element-by-element loop it replaces.
+coeff_vec[edge_mask] = fitted.coef_[0]
+
+
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles  # Blossom  # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x=f"{OUTCOME}_scaled",
+    y=f"{OUTCOME}_pred",
+    # style='bc',
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+
+# print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+    corr = spearmanr(test_outcome, y_pred)
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict-bc_sensitivity.py b/idconn/workflows/nbs_predict-bc_sensitivity.py
new file mode 100644
index 0000000..813cf66
--- /dev/null
+++ b/idconn/workflows/nbs_predict-bc_sensitivity.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "bc"
+CONFOUNDS = ["framewise_displacement"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+# NOTE(review): `drop` is computed here but never used in this script - confirm intent
+drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+# print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # (<= 2 unique values: only the edges are residualized; otherwise both are)
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+# run the model on the whole test dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
+    train_metrics["alpha"] = best.C_[0]
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",
+        alpha=best.alpha_,
+        fit_intercept=False,
+    )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
+# train ElasticNet on full train dataset, using feature extraction from NBS-Predict
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
+    cv=cv,
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
+
+fitted = scores["estimator"][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light  # medium  # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap
+)
+h = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure?
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
+# Scatter the fitted coefficients back into a full-length edge vector so that
+# they can be reshaped into a node-by-node matrix for saving and plotting.
+# `filter` is a boolean mask (built with np.where(..., True, False)), so
+# np.zeros_like(filter) would allocate a boolean array and every coefficient
+# written into it would be cast to True/False. Allocate floats explicitly.
+coeff_vec = np.zeros(filter.shape, dtype=float)
+# Boolean-mask assignment fills the True positions in order, which is what the
+# previous element-by-element loop did with its running index; row 0 of coef_
+# is read exactly as that loop read it (coef_[0, j]).
+coeff_vec[filter] = fitted.coef_[0]
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # (<= 2 unique values: only the edges are residualized; otherwise both are)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+# cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles  # Blossom  # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_pred",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+h = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_scaled",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="dark_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+
+# print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict-e2.py b/idconn/workflows/nbs_predict-e2.py
index c92d274..a846b5a 100644
--- a/idconn/workflows/nbs_predict-e2.py
+++ b/idconn/workflows/nbs_predict-e2.py
@@ -58,7 +58,7 @@
 
 outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
 
-#print(len(np.unique(outcome)))
+# print(len(np.unique(outcome)))
 
 if CONFOUNDS is not None:
     confounds = dat[CONFOUNDS]
@@ -107,7 +107,7 @@
     join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
 )
 
-best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
 
 # this uses the most predictive subnetwork as features in the model
 # might replace with thresholded weighted_average
@@ -119,20 +119,20 @@
 
 # here is where we'd threshold the weighted average to use for elastic-net
 weighted_average = np.where(weighted_average > 0, weighted_average, 0)
-#print(np.sum(weighted_average))
-#nbs_vector = weighted_average[upper_tri]
-#p75 = np.percentile(nbs_vector, 75)
-#filter = np.where(nbs_vector >= p75, True, False)
-#print(np.sum(filter))
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
 # print(nbs_vector.shape, filter.shape)
 
 thresh_average = threshold_proportional(weighted_average, THRESH)
 nbs_vector2 = thresh_average[upper_tri]
-#p75 = np.percentile(nbs_vector, 75)
+# p75 = np.percentile(nbs_vector, 75)
 filter = np.where(nbs_vector2 > 0, True, False)
 
 # mask = io.vectorize_corrmats(filter)
-edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter]
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
 
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
 if CONFOUNDS is not None:
@@ -162,79 +162,71 @@
     train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
 
 
-
 # run the model on the whole test dataset to get params
 
 # classification if the outcome is binary (for now)
 # could be extended to the multiclass case?
 train_metrics = {}
 if len(np.unique(outcome)) == 2:
-    model = LogisticRegression(
-        penalty="l2", 
-        solver="saga", 
-        C=best.C_[0]
-        )
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
     train_metrics["alpha"] = best.C_[0]
-    #train_metrics["l1_ratio"] = best.l1_ratio_
+    # train_metrics["l1_ratio"] = best.l1_ratio_
 else:
     model = Ridge(
-        solver="auto",  
+        solver="auto",
         alpha=best.alpha_,
         fit_intercept=False,
-        )
+    )
     train_metrics["alpha"] = best.alpha_
 
 cv = RepeatedKFold(n_splits=5, n_repeats=10)
 
-    #train_metrics["l1_ratio"] = best.l1_ratio_
-#print(params)
-#model.set_params(**params)
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
 # train ElasticNet on full train dataset, using feature extraction from NBS-Predict
-#fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
 scores = cross_validate(
-    model, 
-    train_features, 
-    train_outcome, 
-    groups=groups, 
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
     cv=cv,
-    return_estimator=True, 
-    return_train_score=True
-    )
-train_metrics["in_sample_test"] = np.mean(scores['test_score'])
-train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
 
-fitted = scores['estimator'][0]
+fitted = scores["estimator"][0]
 y_pred = fitted.predict(X=train_features)
 train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
 
-dat[f'{OUTCOME}_pred'] = y_pred
-dat[f'{OUTCOME}_scaled'] = train_outcome
-
-Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
-Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
-
-train_colors = ['#a08ad1', #light
-                '#685690', #medium
-                '#3f2d69' #dark
-                ]
-light_cmap = sns.color_palette('dark:#a08ad1')
-dark_cmap = sns.color_palette('dark:#685690')
-
-fig,ax = plt.subplots()
-g = sns.scatterplot(x='cycle_day', 
-                    y=f'{OUTCOME}_pred', 
-                    style='bc', 
-                    data=Ys,  
-                    ax=ax, 
-                    palette=dark_cmap)
-h = sns.scatterplot(x='cycle_day',
-                    y=f'{OUTCOME}_scaled', 
-                    style='bc', 
-                    data=Ys, 
-                    ax=ax, 
-                    palette=light_cmap)
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light  # medium  # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap
+)
+h = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap
+)
 ax.legend(bbox_to_anchor=(1.0, 0.5))
-fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
 
 mse = mean_squared_error(train_outcome, y_pred)
 train_metrics["mean squared error"] = mse
@@ -248,14 +240,14 @@
     json.dump(train_metrics, fp)
 
 # yoink the coefficients? for a more parsimonious figure?
-#print(fitted.coef_.shape)
-#print(fitted.coef_)
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
 coeff_vec = np.zeros_like(filter)
 j = 0
 for i in range(0, filter.shape[0]):
     if filter[i] == True:
-        #print(j)
-        #print(fitted.coef_[0, j])
+        # print(j)
+        # print(fitted.coef_[0, j])
         coeff_vec[i] = fitted.coef_[0, j]
         j += 1
     else:
@@ -345,7 +337,7 @@
 # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
 test_metrics = {}
 
-#cross_validate(model, )
+# cross_validate(model, )
 y_pred = fitted.predict(X=test_features)
 score = fitted.score(X=test_features, y=np.ravel(test_outcome))
 if len(np.unique(test_outcome)) == 2:
@@ -360,56 +352,56 @@
 print("Out-of-sample mean squared error:\t", mse)
 # print(np.mean(test_features))
 # pred_outcome = fitted.predict(test_features)
-test_df[f'{OUTCOME}_scaled'] = test_outcome
-test_df[f'{OUTCOME}_pred'] = y_pred
-Ys = test_df[[f'{OUTCOME}_scaled', 
-              f'{OUTCOME}_pred',
-              'cycle_day', 
-              'bc']]
-Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
-
-Ys['ppts'] = Ys.index.get_level_values(0)
-
-
-light_colors = ['#33ACE3', #Bubbles
-                '#EA6964', #Blossom
-                '#4AB62C' #Buttercup
-                ]
-dark_colors = ['#1278a6', 
-               '#a11510', 
-               '#228208']
-light = ListedColormap(light_colors, name='light_powderpuff')
-dark = ListedColormap(dark_colors, name='dark_powderpuff')
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles  # Blossom  # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
 mpl.colormaps.register(cmap=light)
 mpl.colormaps.register(cmap=dark)
 
-fig,ax = plt.subplots()
-g = sns.scatterplot(x='cycle_day', 
-                    y=f'{OUTCOME}_pred', 
-                    style='bc', 
-                    data=Ys, 
-                    hue='ppts',  
-                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
-                    ax=ax, 
-                    palette='light_powderpuff'
-                    )
-h = sns.scatterplot(x='cycle_day',
-                     y=f'{OUTCOME}_scaled', 
-                     style='bc', 
-                     data=Ys, 
-                     hue='ppts',
-                     hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], 
-                     ax=ax, 
-                     palette='dark_powderpuff')
-ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
-fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
-
-
-
-#print(test_outcome, "\n", y_pred)
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_pred",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+h = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_scaled",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="dark_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+
+# print(test_outcome, "\n", y_pred)
 # print(pred_outcome)
 if len(np.unique(test_outcome)) > 2:
-    
+
     print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
     test_metrics["spearman correlation"] = corr
 with open(
diff --git a/idconn/workflows/nbs_predict-e2_sensitivity.py b/idconn/workflows/nbs_predict-e2_sensitivity.py
new file mode 100644
index 0000000..13177c7
--- /dev/null
+++ b/idconn/workflows/nbs_predict-e2_sensitivity.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "estradiol"
+CONFOUNDS = ["framewise_displacement"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+# Read the training connectivity table; "adj", "cycle_day", "bc" and OUTCOME are used below.
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+# NOTE(review): `drop` (rows with 11 < cycle_day < 17) is computed but never applied -- confirm.
+drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+# "bc" is handed to nbs.kfold_nbs and cross_validate as `groups` further down.
+groups = dat["bc"]
+# Node count comes from the first adjacency matrix (assumes all share it -- TODO confirm).
+num_node = dat.iloc[0]["adj"].shape[0]
+# Stack the per-row adjacency matrices into one (len(keep), num_node, num_node) array.
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+# Outcome as a column vector with one row per retained entry of `dat`.
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+# print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# Regress the confounds out of the selected edges (and out of a non-binary outcome).
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # A binary outcome is left untouched and only the edges are residualized;
+    # otherwise both edges and outcome are replaced by their residuals.
+    # The split mirrors the `<= 2` / else test used for outcome scaling below.
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:  # was `> 3`: exactly three unique values matched neither branch
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+# cross-validate the model on the training dataset (TRAIN_DSET) to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
+    train_metrics["alpha"] = best.C_[0]
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",
+        alpha=best.alpha_,
+        fit_intercept=False,
+    )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
+# train ElasticNet on full train dataset, using feature extraction from NBS-Predict
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
+    cv=cv,
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
+
+fitted = scores["estimator"][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light  # medium  # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap
+)
+h = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# Scatter the fitted coefficients back into a full-length edge vector so they can
+# be turned into a node-by-node matrix (io.undo_vectorize) for the figure below.
+#
+# `filter` is a boolean mask, so np.zeros_like(filter) was boolean too and every
+# coefficient written into it was coerced to True/False; ask for floats instead.
+coeff_vec = np.zeros_like(filter, dtype=float)
+
+# The model was fit on the `[:, filter]` columns, so its j-th coefficient belongs
+# to the j-th True entry of the mask; boolean-mask assignment fills them in that
+# order and leaves unselected edges at 0.
+# The original loop read fitted.coef_[0, j], i.e. row 0 of coef_; np.atleast_2d
+# keeps that behaviour and also accepts a 1-D coef_.
+coeff_vec[filter] = np.atleast_2d(fitted.coef_)[0]
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# Regress the confounds out of the held-out edges (and out of a non-binary outcome).
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+    # A binary outcome is left untouched and only the edges are residualized;
+    # otherwise both edges and outcome are replaced by their residuals.
+    # With the former `elif ... > 3`, exactly three unique values matched neither
+    # branch, so `resid_edges` still held the training residuals at this point.
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+# cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles  # Blossom  # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_pred",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+h = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_scaled",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="dark_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+
+# print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict-e2bc_sensitivity.py b/idconn/workflows/nbs_predict-e2bc_sensitivity.py
new file mode 100644
index 0000000..8052164
--- /dev/null
+++ b/idconn/workflows/nbs_predict-e2bc_sensitivity.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "estradiol"
+CONFOUNDS = ["framewise_displacement", "bc"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+# Read the training connectivity table; "adj", "cycle_day", "bc" and OUTCOME are used below.
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+# NOTE(review): `drop` (rows with 11 < cycle_day < 17) is computed but never applied -- confirm.
+drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+# "bc" is handed to nbs.kfold_nbs and cross_validate as `groups` further down.
+groups = dat["bc"]
+# Node count comes from the first adjacency matrix (assumes all share it -- TODO confirm).
+num_node = dat.iloc[0]["adj"].shape[0]
+# Stack the per-row adjacency matrices into one (len(keep), num_node, num_node) array.
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+# Outcome as a column vector with one row per retained entry of `dat`.
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+# print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# Regress the confounds out of the selected edges (and out of a non-binary outcome).
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # A binary outcome is left untouched and only the edges are residualized;
+    # otherwise both edges and outcome are replaced by their residuals.
+    # The split mirrors the `<= 2` / else test used for outcome scaling below.
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:  # was `> 3`: exactly three unique values matched neither branch
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+# cross-validate the model on the training dataset (TRAIN_DSET) to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
+    train_metrics["alpha"] = best.C_[0]
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",
+        alpha=best.alpha_,
+        fit_intercept=False,
+    )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
+# train ElasticNet on full train dataset, using feature extraction from NBS-Predict
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
+    cv=cv,
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
+
+fitted = scores["estimator"][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light  # medium  # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap
+)
+h = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# Scatter the fitted coefficients back into a full-length edge vector so they can
+# be turned into a node-by-node matrix (io.undo_vectorize) for the figure below.
+#
+# `filter` is a boolean mask, so np.zeros_like(filter) was boolean too and every
+# coefficient written into it was coerced to True/False; ask for floats instead.
+coeff_vec = np.zeros_like(filter, dtype=float)
+
+# The model was fit on the `[:, filter]` columns, so its j-th coefficient belongs
+# to the j-th True entry of the mask; boolean-mask assignment fills them in that
+# order and leaves unselected edges at 0.
+# The original loop read fitted.coef_[0, j], i.e. row 0 of coef_; np.atleast_2d
+# keeps that behaviour and also accepts a 1-D coef_.
+coeff_vec[filter] = np.atleast_2d(fitted.coef_)[0]
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# Regress the confounds out of the held-out edges (and out of a non-binary outcome).
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+    # A binary outcome is left untouched and only the edges are residualized;
+    # otherwise both edges and outcome are replaced by their residuals.
+    # With the former `elif ... > 3`, exactly three unique values matched neither
+    # branch, so `resid_edges` still held the training residuals at this point.
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+# cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles  # Blossom  # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_pred",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+h = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_scaled",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="dark_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+
+# print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict-e2xp4-bc.py b/idconn/workflows/nbs_predict-e2xp4-bc.py
index ad6a6d8..4b32a85 100644
--- a/idconn/workflows/nbs_predict-e2xp4-bc.py
+++ b/idconn/workflows/nbs_predict-e2xp4-bc.py
@@ -46,7 +46,7 @@
 
 dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
 
-dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone']
+dat["estradiol÷progesterone"] = dat["estradiol"] / dat["progesterone"]
 
 keep = dat["adj"].dropna().index
 dat = dat.loc[keep]
@@ -60,7 +60,7 @@
 
 outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
 
-#print(len(np.unique(outcome)))
+# print(len(np.unique(outcome)))
 
 if CONFOUNDS is not None:
     confounds = dat[CONFOUNDS]
@@ -109,7 +109,7 @@
     join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
 )
 
-best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
 
 # this uses the most predictive subnetwork as features in the model
 # might replace with thresholded weighted_average
@@ -121,20 +121,20 @@
 
 # here is where we'd threshold the weighted average to use for elastic-net
 weighted_average = np.where(weighted_average > 0, weighted_average, 0)
-#print(np.sum(weighted_average))
-#nbs_vector = weighted_average[upper_tri]
-#p75 = np.percentile(nbs_vector, 75)
-#filter = np.where(nbs_vector >= p75, True, False)
-#print(np.sum(filter))
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
 # print(nbs_vector.shape, filter.shape)
 
 thresh_average = threshold_proportional(weighted_average, THRESH)
 nbs_vector2 = thresh_average[upper_tri]
-#p75 = np.percentile(nbs_vector, 75)
+# p75 = np.percentile(nbs_vector, 75)
 filter = np.where(nbs_vector2 > 0, True, False)
 
 # mask = io.vectorize_corrmats(filter)
-edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter]
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
 
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
 if CONFOUNDS is not None:
@@ -164,79 +164,71 @@
     train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
 
 
-
 # run the model on the whole test dataset to get params
 
 # classification if the outcome is binary (for now)
 # could be extended to the multiclass case?
 train_metrics = {}
 if len(np.unique(outcome)) == 2:
-    model = LogisticRegression(
-        penalty="l2", 
-        solver="saga", 
-        C=best.C_[0]
-        )
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
     train_metrics["alpha"] = best.C_[0]
-    #train_metrics["l1_ratio"] = best.l1_ratio_
+    # train_metrics["l1_ratio"] = best.l1_ratio_
 else:
     model = Ridge(
-        solver="auto",  
+        solver="auto",
         alpha=best.alpha_,
         fit_intercept=False,
-        )
+    )
     train_metrics["alpha"] = best.alpha_
 
 cv = RepeatedKFold(n_splits=5, n_repeats=10)
 
-    #train_metrics["l1_ratio"] = best.l1_ratio_
-#print(params)
-#model.set_params(**params)
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
 # train ElasticNet on full train dataset, using feature extraction from NBS-Predict
-#fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
 scores = cross_validate(
-    model, 
-    train_features, 
-    train_outcome, 
-    groups=groups, 
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
     cv=cv,
-    return_estimator=True, 
-    return_train_score=True
-    )
-train_metrics["in_sample_test"] = np.mean(scores['test_score'])
-train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
 
-fitted = scores['estimator'][0]
+fitted = scores["estimator"][0]
 y_pred = fitted.predict(X=train_features)
 train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
 
-dat[f'{OUTCOME}_pred'] = y_pred
-dat[f'{OUTCOME}_scaled'] = train_outcome
-
-Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
-Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
-
-train_colors = ['#a08ad1', #light
-                '#685690', #medium
-                '#3f2d69' #dark
-                ]
-light_cmap = sns.color_palette('dark:#a08ad1')
-dark_cmap = sns.color_palette('dark:#685690')
-
-fig,ax = plt.subplots()
-g = sns.scatterplot(x='cycle_day', 
-                    y=f'{OUTCOME}_pred', 
-                    style='bc', 
-                    data=Ys,  
-                    ax=ax, 
-                    palette=dark_cmap)
-h = sns.scatterplot(x='cycle_day',
-                    y=f'{OUTCOME}_scaled', 
-                    style='bc', 
-                    data=Ys, 
-                    ax=ax, 
-                    palette=light_cmap)
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light  # medium  # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap
+)
+h = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap
+)
 ax.legend(bbox_to_anchor=(1.0, 0.5))
-fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
 
 mse = mean_squared_error(train_outcome, y_pred)
 train_metrics["mean squared error"] = mse
@@ -250,14 +242,14 @@
     json.dump(train_metrics, fp)
 
 # yoink the coefficients? for a more parsimonious figure?
-#print(fitted.coef_.shape)
-#print(fitted.coef_)
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
 coeff_vec = np.zeros_like(filter)
 j = 0
 for i in range(0, filter.shape[0]):
     if filter[i] == True:
-        #print(j)
-        #print(fitted.coef_[0, j])
+        # print(j)
+        # print(fitted.coef_[0, j])
         coeff_vec[i] = fitted.coef_[0, j]
         j += 1
     else:
@@ -295,7 +287,7 @@
 layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
 
 test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
-test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone']
+test_df["estradiol÷progesterone"] = test_df["estradiol"] / test_df["progesterone"]
 
 keep = test_df[[OUTCOME, "adj"]].dropna().index
 # print(keep)
@@ -348,7 +340,7 @@
 # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
 test_metrics = {}
 
-#cross_validate(model, )
+# cross_validate(model, )
 y_pred = fitted.predict(X=test_features)
 score = fitted.score(X=test_features, y=np.ravel(test_outcome))
 if len(np.unique(test_outcome)) == 2:
@@ -363,56 +355,56 @@
 print("Out-of-sample mean squared error:\t", mse)
 # print(np.mean(test_features))
 # pred_outcome = fitted.predict(test_features)
-test_df[f'{OUTCOME}_scaled'] = test_outcome
-test_df[f'{OUTCOME}_pred'] = y_pred
-Ys = test_df[[f'{OUTCOME}_scaled', 
-              f'{OUTCOME}_pred',
-              'cycle_day', 
-              'bc']]
-Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
-
-Ys['ppts'] = Ys.index.get_level_values(0)
-
-
-light_colors = ['#33ACE3', #Bubbles
-                '#EA6964', #Blossom
-                '#4AB62C' #Buttercup
-                ]
-dark_colors = ['#1278a6', 
-               '#a11510', 
-               '#228208']
-light = ListedColormap(light_colors, name='light_powderpuff')
-dark = ListedColormap(dark_colors, name='dark_powderpuff')
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles  # Blossom  # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
 mpl.colormaps.register(cmap=light)
 mpl.colormaps.register(cmap=dark)
 
-fig,ax = plt.subplots()
-g = sns.scatterplot(x='cycle_day', 
-                    y=f'{OUTCOME}_pred', 
-                    style='bc', 
-                    data=Ys, 
-                    hue='ppts',  
-                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
-                    ax=ax, 
-                    palette='light_powderpuff'
-                    )
-h = sns.scatterplot(x='cycle_day',
-                     y=f'{OUTCOME}_scaled', 
-                     style='bc', 
-                     data=Ys, 
-                     hue='ppts',
-                     hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], 
-                     ax=ax, 
-                     palette='dark_powderpuff')
-ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
-fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
-
-
-
-#print(test_outcome, "\n", y_pred)
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_pred",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+h = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_scaled",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="dark_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+
+# print(test_outcome, "\n", y_pred)
 # print(pred_outcome)
 if len(np.unique(test_outcome)) > 2:
-    
+
     print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
     test_metrics["spearman correlation"] = corr
 with open(
diff --git a/idconn/workflows/nbs_predict-e2xp4.py b/idconn/workflows/nbs_predict-e2xp4.py
index 022d8b9..fcd6f40 100644
--- a/idconn/workflows/nbs_predict-e2xp4.py
+++ b/idconn/workflows/nbs_predict-e2xp4.py
@@ -46,7 +46,7 @@
 
 dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
 
-dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone']
+dat["estradiol÷progesterone"] = dat["estradiol"] / dat["progesterone"]
 
 keep = dat["adj"].dropna().index
 dat = dat.loc[keep]
@@ -60,7 +60,7 @@
 
 outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
 
-#print(len(np.unique(outcome)))
+# print(len(np.unique(outcome)))
 
 if CONFOUNDS is not None:
     confounds = dat[CONFOUNDS]
@@ -109,7 +109,7 @@
     join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
 )
 
-best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
 
 # this uses the most predictive subnetwork as features in the model
 # might replace with thresholded weighted_average
@@ -121,20 +121,20 @@
 
 # here is where we'd threshold the weighted average to use for elastic-net
 weighted_average = np.where(weighted_average > 0, weighted_average, 0)
-#print(np.sum(weighted_average))
-#nbs_vector = weighted_average[upper_tri]
-#p75 = np.percentile(nbs_vector, 75)
-#filter = np.where(nbs_vector >= p75, True, False)
-#print(np.sum(filter))
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
 # print(nbs_vector.shape, filter.shape)
 
 thresh_average = threshold_proportional(weighted_average, THRESH)
 nbs_vector2 = thresh_average[upper_tri]
-#p75 = np.percentile(nbs_vector, 75)
+# p75 = np.percentile(nbs_vector, 75)
 filter = np.where(nbs_vector2 > 0, True, False)
 
 # mask = io.vectorize_corrmats(filter)
-edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter]
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
 
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
 if CONFOUNDS is not None:
@@ -164,79 +164,71 @@
     train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
 
 
-
 # run the model on the whole test dataset to get params
 
 # classification if the outcome is binary (for now)
 # could be extended to the multiclass case?
 train_metrics = {}
 if len(np.unique(outcome)) == 2:
-    model = LogisticRegression(
-        penalty="l2", 
-        solver="saga", 
-        C=best.C_[0]
-        )
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
     train_metrics["alpha"] = best.C_[0]
-    #train_metrics["l1_ratio"] = best.l1_ratio_
+    # train_metrics["l1_ratio"] = best.l1_ratio_
 else:
     model = Ridge(
-        solver="auto",  
+        solver="auto",
         alpha=best.alpha_,
         fit_intercept=False,
-        )
+    )
     train_metrics["alpha"] = best.alpha_
 
 cv = RepeatedKFold(n_splits=5, n_repeats=10)
 
-    #train_metrics["l1_ratio"] = best.l1_ratio_
-#print(params)
-#model.set_params(**params)
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
 # train ElasticNet on full train dataset, using feature extraction from NBS-Predict
-#fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
 scores = cross_validate(
-    model, 
-    train_features, 
-    train_outcome, 
-    groups=groups, 
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
     cv=cv,
-    return_estimator=True, 
-    return_train_score=True
-    )
-train_metrics["in_sample_test"] = np.mean(scores['test_score'])
-train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
 
-fitted = scores['estimator'][0]
+fitted = scores["estimator"][0]
 y_pred = fitted.predict(X=train_features)
 train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
 
-dat[f'{OUTCOME}_pred'] = y_pred
-dat[f'{OUTCOME}_scaled'] = train_outcome
-
-Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
-Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
-
-train_colors = ['#a08ad1', #light
-                '#685690', #medium
-                '#3f2d69' #dark
-                ]
-light_cmap = sns.color_palette('dark:#a08ad1')
-dark_cmap = sns.color_palette('dark:#685690')
-
-fig,ax = plt.subplots()
-g = sns.scatterplot(x='cycle_day', 
-                    y=f'{OUTCOME}_pred', 
-                    style='bc', 
-                    data=Ys,  
-                    ax=ax, 
-                    palette=dark_cmap)
-h = sns.scatterplot(x='cycle_day',
-                    y=f'{OUTCOME}_scaled', 
-                    style='bc', 
-                    data=Ys, 
-                    ax=ax, 
-                    palette=light_cmap)
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light  # medium  # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap
+)
+h = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap
+)
 ax.legend(bbox_to_anchor=(1.0, 0.5))
-fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
 
 mse = mean_squared_error(train_outcome, y_pred)
 train_metrics["mean squared error"] = mse
@@ -250,14 +242,14 @@
     json.dump(train_metrics, fp)
 
 # yoink the coefficients? for a more parsimonious figure?
-#print(fitted.coef_.shape)
-#print(fitted.coef_)
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
 coeff_vec = np.zeros_like(filter)
 j = 0
 for i in range(0, filter.shape[0]):
     if filter[i] == True:
-        #print(j)
-        #print(fitted.coef_[0, j])
+        # print(j)
+        # print(fitted.coef_[0, j])
         coeff_vec[i] = fitted.coef_[0, j]
         j += 1
     else:
@@ -295,7 +287,7 @@
 layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
 
 test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
-test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone']
+test_df["estradiol÷progesterone"] = test_df["estradiol"] / test_df["progesterone"]
 
 keep = test_df[[OUTCOME, "adj"]].dropna().index
 # print(keep)
@@ -348,7 +340,7 @@
 # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
 test_metrics = {}
 
-#cross_validate(model, )
+# cross_validate(model, )
 y_pred = fitted.predict(X=test_features)
 score = fitted.score(X=test_features, y=np.ravel(test_outcome))
 if len(np.unique(test_outcome)) == 2:
@@ -363,56 +355,56 @@
 print("Out-of-sample mean squared error:\t", mse)
 # print(np.mean(test_features))
 # pred_outcome = fitted.predict(test_features)
-test_df[f'{OUTCOME}_scaled'] = test_outcome
-test_df[f'{OUTCOME}_pred'] = y_pred
-Ys = test_df[[f'{OUTCOME}_scaled', 
-              f'{OUTCOME}_pred',
-              'cycle_day', 
-              'bc']]
-Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
-
-Ys['ppts'] = Ys.index.get_level_values(0)
-
-
-light_colors = ['#33ACE3', #Bubbles
-                '#EA6964', #Blossom
-                '#4AB62C' #Buttercup
-                ]
-dark_colors = ['#1278a6', 
-               '#a11510', 
-               '#228208']
-light = ListedColormap(light_colors, name='light_powderpuff')
-dark = ListedColormap(dark_colors, name='dark_powderpuff')
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles  # Blossom  # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
 mpl.colormaps.register(cmap=light)
 mpl.colormaps.register(cmap=dark)
 
-fig,ax = plt.subplots()
-g = sns.scatterplot(x='cycle_day', 
-                    y=f'{OUTCOME}_pred', 
-                    style='bc', 
-                    data=Ys, 
-                    hue='ppts',  
-                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
-                    ax=ax, 
-                    palette='light_powderpuff'
-                    )
-h = sns.scatterplot(x='cycle_day',
-                     y=f'{OUTCOME}_scaled', 
-                     style='bc', 
-                     data=Ys, 
-                     hue='ppts',
-                     hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], 
-                     ax=ax, 
-                     palette='dark_powderpuff')
-ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
-fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
-
-
-
-#print(test_outcome, "\n", y_pred)
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_pred",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+h = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_scaled",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="dark_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+
+# print(test_outcome, "\n", y_pred)
 # print(pred_outcome)
 if len(np.unique(test_outcome)) > 2:
-    
+
     print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
     test_metrics["spearman correlation"] = corr
 with open(
diff --git a/idconn/workflows/nbs_predict-p4.py b/idconn/workflows/nbs_predict-p4.py
index 559b4ff..2251179 100644
--- a/idconn/workflows/nbs_predict-p4.py
+++ b/idconn/workflows/nbs_predict-p4.py
@@ -58,7 +58,7 @@
 
 outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
 
-#print(len(np.unique(outcome)))
+# print(len(np.unique(outcome)))
 
 if CONFOUNDS is not None:
     confounds = dat[CONFOUNDS]
@@ -107,7 +107,7 @@
     join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
 )
 
-best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
 
 # this uses the most predictive subnetwork as features in the model
 # might replace with thresholded weighted_average
@@ -119,20 +119,20 @@
 
 # here is where we'd threshold the weighted average to use for elastic-net
 weighted_average = np.where(weighted_average > 0, weighted_average, 0)
-#print(np.sum(weighted_average))
-#nbs_vector = weighted_average[upper_tri]
-#p75 = np.percentile(nbs_vector, 75)
-#filter = np.where(nbs_vector >= p75, True, False)
-#print(np.sum(filter))
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
 # print(nbs_vector.shape, filter.shape)
 
 thresh_average = threshold_proportional(weighted_average, THRESH)
 nbs_vector2 = thresh_average[upper_tri]
-#p75 = np.percentile(nbs_vector, 75)
+# p75 = np.percentile(nbs_vector, 75)
 filter = np.where(nbs_vector2 > 0, True, False)
 
 # mask = io.vectorize_corrmats(filter)
-edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter]
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
 
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
 if CONFOUNDS is not None:
@@ -162,79 +162,71 @@
     train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
 
 
-
 # run the model on the whole test dataset to get params
 
 # classification if the outcome is binary (for now)
 # could be extended to the multiclass case?
 train_metrics = {}
 if len(np.unique(outcome)) == 2:
-    model = LogisticRegression(
-        penalty="l2", 
-        solver="saga", 
-        C=best.C_[0]
-        )
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
     train_metrics["alpha"] = best.C_[0]
-    #train_metrics["l1_ratio"] = best.l1_ratio_
+    # train_metrics["l1_ratio"] = best.l1_ratio_
 else:
     model = Ridge(
-        solver="auto",  
+        solver="auto",
         alpha=best.alpha_,
         fit_intercept=False,
-        )
+    )
     train_metrics["alpha"] = best.alpha_
 
 cv = RepeatedKFold(n_splits=5, n_repeats=10)
 
-    #train_metrics["l1_ratio"] = best.l1_ratio_
-#print(params)
-#model.set_params(**params)
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
 # train ElasticNet on full train dataset, using feature extraction from NBS-Predict
-#fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
 scores = cross_validate(
-    model, 
-    train_features, 
-    train_outcome, 
-    groups=groups, 
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
     cv=cv,
-    return_estimator=True, 
-    return_train_score=True
-    )
-train_metrics["in_sample_test"] = np.mean(scores['test_score'])
-train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
 
-fitted = scores['estimator'][0]
+fitted = scores["estimator"][0]
 y_pred = fitted.predict(X=train_features)
 train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
 
-dat[f'{OUTCOME}_pred'] = y_pred
-dat[f'{OUTCOME}_scaled'] = train_outcome
-
-Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']]
-Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
-
-train_colors = ['#a08ad1', #light
-                '#685690', #medium
-                '#3f2d69' #dark
-                ]
-light_cmap = sns.color_palette('dark:#a08ad1')
-dark_cmap = sns.color_palette('dark:#685690')
-
-fig,ax = plt.subplots()
-g = sns.scatterplot(x='cycle_day', 
-                    y=f'{OUTCOME}_pred', 
-                    style='bc', 
-                    data=Ys,  
-                    ax=ax, 
-                    palette=dark_cmap)
-h = sns.scatterplot(x='cycle_day',
-                    y=f'{OUTCOME}_scaled', 
-                    style='bc', 
-                    data=Ys, 
-                    ax=ax, 
-                    palette=light_cmap)
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light  # medium  # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap
+)
+h = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap
+)
 ax.legend(bbox_to_anchor=(1.0, 0.5))
-fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
 
 mse = mean_squared_error(train_outcome, y_pred)
 train_metrics["mean squared error"] = mse
@@ -248,12 +240,12 @@
     json.dump(train_metrics, fp)
 
 # yoink the coefficients? for a more parsimonious figure?
-#print(fitted.coef_.shape)
+# print(fitted.coef_.shape)
 coeff_vec = np.zeros_like(filter)
 j = 0
 for i in range(0, filter.shape[0]):
     if filter[i] == True:
-        #print(j)
+        # print(j)
         coeff_vec[i] = fitted.coef_[0, j]
         j += 1
     else:
@@ -342,7 +334,7 @@
 # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
 test_metrics = {}
 
-#cross_validate(model, )
+# cross_validate(model, )
 y_pred = fitted.predict(X=test_features)
 score = fitted.score(X=test_features, y=np.ravel(test_outcome))
 if len(np.unique(test_outcome)) == 2:
@@ -357,56 +349,56 @@
 print("Out-of-sample mean squared error:\t", mse)
 # print(np.mean(test_features))
 # pred_outcome = fitted.predict(test_features)
-test_df[f'{OUTCOME}_scaled'] = test_outcome
-test_df[f'{OUTCOME}_pred'] = y_pred
-Ys = test_df[[f'{OUTCOME}_scaled', 
-              f'{OUTCOME}_pred',
-              'cycle_day', 
-              'bc']]
-Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
-
-Ys['ppts'] = Ys.index.get_level_values(0)
-
-
-light_colors = ['#33ACE3', #Bubbles
-                '#EA6964', #Blossom
-                '#4AB62C' #Buttercup
-                ]
-dark_colors = ['#1278a6', 
-               '#a11510', 
-               '#228208']
-light = ListedColormap(light_colors, name='light_powderpuff')
-dark = ListedColormap(dark_colors, name='dark_powderpuff')
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles  # Blossom  # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
 mpl.colormaps.register(cmap=light)
 mpl.colormaps.register(cmap=dark)
 
-fig,ax = plt.subplots()
-g = sns.scatterplot(x='cycle_day', 
-                    y=f'{OUTCOME}_pred', 
-                    style='bc', 
-                    data=Ys, 
-                    hue='ppts',  
-                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
-                    ax=ax, 
-                    palette='light_powderpuff'
-                    )
-h = sns.scatterplot(x='cycle_day',
-                     y=f'{OUTCOME}_scaled', 
-                     style='bc', 
-                     data=Ys, 
-                     hue='ppts',
-                     hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], 
-                     ax=ax, 
-                     palette='dark_powderpuff')
-ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
-fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
-
-
-
-#print(test_outcome, "\n", y_pred)
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_pred",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+h = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_scaled",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="dark_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+
+# print(test_outcome, "\n", y_pred)
 # print(pred_outcome)
 if len(np.unique(test_outcome)) > 2:
-    
+
     print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
     test_metrics["spearman correlation"] = corr
 with open(
diff --git a/idconn/workflows/nbs_predict-p4_sensitivity.py b/idconn/workflows/nbs_predict-p4_sensitivity.py
new file mode 100644
index 0000000..449db27
--- /dev/null
+++ b/idconn/workflows/nbs_predict-p4_sensitivity.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "progesterone"
+CONFOUNDS = ["framewise_displacement"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index  # cycle days 12-16
+keep = dat["adj"].dropna().index.difference(drop, sort=False)  # apply the exclusion; `drop` was previously unused
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+# print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_train)) <= 2:  # binary outcome: residualize the edges only
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:  # non-binary outcome; was `> 3`, which left exactly 3 unique values with no branch
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+# run the model on the whole test dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
+    train_metrics["alpha"] = best.C_[0]
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",
+        alpha=best.alpha_,
+        fit_intercept=False,
+    )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
+# train ElasticNet on full train dataset, using feature extraction from NBS-Predict
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
+    cv=cv,
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
+
+fitted = scores["estimator"][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light  # medium  # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap
+)
+h = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure?
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
+# `filter` is a boolean mask, so the dtype must be set explicitly: a bare
+# np.zeros_like(filter) is boolean and would cast every coefficient written
+# into it to True/False, discarding its sign and magnitude.
+coeff_vec = np.zeros_like(filter, dtype=float)
+
+# The model was fit on the columns selected by `filter`, so its coefficient
+# row has one beta per True entry, in the same order; boolean indexing puts
+# each beta back at its edge's position and leaves unselected edges at zero.
+# (Same positions the element-by-element loop over `filter` would fill.)
+coeff_vec[filter] = fitted.coef_[0, :]
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress the confounds out of each edge and use the residuals downstream;
+    # an outcome with at most two unique values is kept as-is, any other outcome
+    # is residualized too (a bare `else`, so stale train residuals are never reused)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+# cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles  # Blossom  # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_pred",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+h = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_scaled",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="dark_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+
+# print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict-p4bc_sensitivity.py b/idconn/workflows/nbs_predict-p4bc_sensitivity.py
new file mode 100644
index 0000000..8cf6026
--- /dev/null
+++ b/idconn/workflows/nbs_predict-p4bc_sensitivity.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "progesterone"
+CONFOUNDS = ["framewise_displacement", "bc"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+# print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # regress the confounds out of each edge and use the residuals downstream;
+    # an outcome with at most two unique values is kept as-is, any other outcome
+    # is residualized too (a bare `else`, so no case is left without features)
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+# run the model on the whole train dataset to get params
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
+    train_metrics["alpha"] = best.C_[0]
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",
+        alpha=best.alpha_,
+        fit_intercept=False,
+    )
+    train_metrics["alpha"] = best.alpha_
+
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
+# cross-validate the model (Ridge or LogisticRegression) on the train dataset, using NBS-Predict features
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
+    cv=cv,
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
+
+fitted = scores["estimator"][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
+
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light  # medium  # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap
+)
+h = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure?
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
+# `filter` is a boolean mask, so the dtype must be set explicitly: a bare
+# np.zeros_like(filter) is boolean and would cast every coefficient written
+# into it to True/False, discarding its sign and magnitude.
+coeff_vec = np.zeros_like(filter, dtype=float)
+
+# The model was fit on the columns selected by `filter`, so its coefficient
+# row has one beta per True entry, in the same order; boolean indexing puts
+# each beta back at its edge's position and leaves unselected edges at zero.
+# (Same positions the element-by-element loop over `filter` would fill.)
+coeff_vec[filter] = fitted.coef_[0, :]
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress the confounds out of each edge and use the residuals downstream;
+    # an outcome with at most two unique values is kept as-is, any other outcome
+    # is residualized too (a bare `else`, so stale train residuals are never reused)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV?
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+# cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles  # Blossom  # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_pred",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+h = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_scaled",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="dark_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+
+# print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 46e804c..169a5aa 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -55,7 +55,7 @@
 upper_tri = np.triu_indices(num_node, k=1)
 
 outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
-groups = dat['bc']
+groups = dat["bc"]
 
 if CONFOUNDS is not None:
     confounds = dat[CONFOUNDS]
@@ -85,7 +85,6 @@
 edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
 
 
-
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
 if CONFOUNDS is not None:
     confounds_test = test_df[CONFOUNDS].values
@@ -145,7 +144,7 @@
     join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
 )
 
-best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
 
 # this uses the most predictive subnetwork as features in the model
 # might replace with thresholded weighted_average
@@ -157,13 +156,13 @@
 
 # here is where we'd threshold the weighted average to use for elastic-net
 weighted_average = np.where(weighted_average > 0, weighted_average, 0)
-#nbs_vector = weighted_average[upper_tri]
-#p75 = np.percentile(nbs_vector, 75)
-#filter = np.where(nbs_vector >= p75, True, False)
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
 # print(nbs_vector.shape, filter.shape)
 thresh_average = threshold_proportional(weighted_average, THRESH)
 nbs_vector2 = thresh_average[upper_tri]
-#p75 = np.percentile(nbs_vector, 75)
+# p75 = np.percentile(nbs_vector, 75)
 filter = np.where(nbs_vector2 > 0, True, False)
 
 # mask = io.vectorize_corrmats(filter)
@@ -205,61 +204,59 @@
 
 train_metrics = {}
 if len(np.unique(outcome)) == 2:
-    model = LogisticRegression(
-        penalty="l2", 
-        solver="saga", 
-        C=best.C_[0]
-        )
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
     train_metrics["alpha"] = best.C_[0]
-    #train_metrics["l1_ratio"] = best.l1_ratio_
+    # train_metrics["l1_ratio"] = best.l1_ratio_
 else:
-    model = Ridge(
-        solver="saga",  
-        alpha=best.alpha_
-        )
+    model = Ridge(solver="saga", alpha=best.alpha_)
     train_metrics["alpha"] = best.alpha_
-    #train_metrics["l1_ratio"] = best.l1_ratio_
-#print(params)
-#model.set_params(**params)
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
 # train ElasticNet on full train dataset, using feature extraction from NBS-Predict
 
 scores = cross_validate(
-    model, 
-    train_features, 
-    train_outcome, 
-    groups=groups, 
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
     cv=cv,
-    return_estimator=True, 
-    return_train_score=True
-    )
-train_metrics["in_sample_test"] = np.mean(scores['test_score'])
-train_metrics["in_sample_train"] = np.mean(scores['train_score'])
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
 
-fitted = scores['estimator'][0]
+fitted = scores["estimator"][0]
 y_pred = fitted.predict(X=train_features)
 train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)
-dat[f'{OUTCOME}_pred'] = y_pred
-dat[f'{OUTCOME}_scaled'] = train_outcome
-
-Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled']]
-Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
-
-train_colors = ['#a08ad1', #light
-                '#685690', #medium
-                '#3f2d69' #dark
-                ]
-light_cmap = sns.color_palette('dark:#a08ad1')
-dark_cmap = sns.color_palette('dark:#685690')
-
-fig,ax = plt.subplots()
-g = sns.scatterplot(x=f'{OUTCOME}_scaled',
-                    y=f'{OUTCOME}_pred', 
-                    #style='bc', 
-                    data=Ys,  
-                    ax=ax, 
-                    palette=dark_cmap)
-#ax.legend(bbox_to_anchor=(1.0, 0.5))
-fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light  # medium  # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x=f"{OUTCOME}_scaled",
+    y=f"{OUTCOME}_pred",
+    # style='bc',
+    data=Ys,
+    ax=ax,
+    palette=dark_cmap,
+)
+# ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
 
 mse = mean_squared_error(train_outcome, y_pred)
 train_metrics["mean squared error"] = mse
@@ -277,7 +274,7 @@
 j = 0
 for i in range(0, filter.shape[0]):
     if filter[i] == True:
-        #print(j)
+        # print(j)
         coeff_vec[i] = fitted.coef_[0, j]
         j += 1
     else:
@@ -341,43 +338,43 @@
 print("Out-of-sample mean squared error:\t", mse)
 # print(np.mean(test_features))
 # pred_outcome = fitted.predict(test_features)
-test_df[f'{OUTCOME}_scaled'] = test_outcome
-test_df[f'{OUTCOME}_pred'] = y_pred
-Ys = test_df[[f'{OUTCOME}_scaled', 
-              f'{OUTCOME}_pred']]
-Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t')
-
-Ys['ppts'] = Ys.index.get_level_values(0)
-
-
-light_colors = ['#33ACE3', #Bubbles
-                '#EA6964', #Blossom
-                '#4AB62C' #Buttercup
-                ]
-dark_colors = ['#1278a6', 
-               '#a11510', 
-               '#228208']
-light = ListedColormap(light_colors, name='light_powderpuff')
-dark = ListedColormap(dark_colors, name='dark_powderpuff')
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles  # Blossom  # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
 mpl.colormaps.register(cmap=light)
 mpl.colormaps.register(cmap=dark)
 
-fig,ax = plt.subplots()
-g = sns.scatterplot(x=f'{OUTCOME}_scaled', 
-                    y=f'{OUTCOME}_pred', 
-                    #style='bc', 
-                    data=Ys, 
-                    hue='ppts',  
-                    hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'],
-                    ax=ax, 
-                    palette='light_powderpuff'
-                    )
-ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left')
-fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight')
-
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x=f"{OUTCOME}_scaled",
+    y=f"{OUTCOME}_pred",
+    # style='bc',
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
 
 
-#print(test_outcome, "\n", y_pred)
+# print(test_outcome, "\n", y_pred)
 # print(pred_outcome)
 if len(np.unique(test_outcome)) > 2:
     corr = spearmanr(test_outcome, y_pred)

From bdf7527eb0ac6e7d5b1fc1961f08b927bba32667 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Fri, 14 Jun 2024 15:53:03 -0700
Subject: [PATCH 47/48] misspelled pingouin oops

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 4d7ed83..af040ab 100644
--- a/setup.py
+++ b/setup.py
@@ -35,6 +35,7 @@
         "networkx",
         "matplotlib",  # necessary until nilearn includes mpl as a dependency
         "enlighten",
+        'pingouin'
     ],
     extras_require={
         "doc": [

From 602fd72167e538f7820560c162f8cc6e761bd290 Mon Sep 17 00:00:00 2001
From: "Katherine L. Bottenhorn" <kbott006@fiu.edu>
Date: Fri, 14 Jun 2024 16:01:20 -0700
Subject: [PATCH 48/48] add docstring to test function

---
 idconn/tests/test_pipeline.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/idconn/tests/test_pipeline.py b/idconn/tests/test_pipeline.py
index 6d78fae..8322e7f 100644
--- a/idconn/tests/test_pipeline.py
+++ b/idconn/tests/test_pipeline.py
@@ -2,6 +2,9 @@
 
 
 def test_idconn_workflow_smoke():
+    '''
+    this is a docstring bc my tests kept failing and it was annoying
+    '''
     from idconn.pipeline import idconn_workflow
 
     # Check that it's a function ¯\_(ツ)_/¯