From 467e675a9c488eb87d4196f2649b4f5ff542681b Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Tue, 29 Nov 2022 14:19:58 -0800 Subject: [PATCH 01/48] omst graph theory --- idconn/networking/graph_theory.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/idconn/networking/graph_theory.py b/idconn/networking/graph_theory.py index 2713929..710cadd 100644 --- a/idconn/networking/graph_theory.py +++ b/idconn/networking/graph_theory.py @@ -80,4 +80,5 @@ def graph_omst(matrix, measure, args): # calculate graph measure on thresholded matrix metric = measure(thresh_mat, args) - return metric \ No newline at end of file + return metric + From 9c1feeb86214429298657886afc12baab9f732fa Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 1 Dec 2022 12:08:10 -0800 Subject: [PATCH 02/48] update null modeling with new data shape --- idconn/networking/null_distribution.py | 69 +++++++------------------- 1 file changed, 19 insertions(+), 50 deletions(-) diff --git a/idconn/networking/null_distribution.py b/idconn/networking/null_distribution.py index 623f64f..03f9ce7 100644 --- a/idconn/networking/null_distribution.py +++ b/idconn/networking/null_distribution.py @@ -4,47 +4,16 @@ import bct import datetime -def avg_corrmat(layout, task, session): - subjects = layout.get_subjects(task=task,session=session) - corrmats = {} - for subject in subjects: - try: - if task == "rest": - corrmat = np.genfromtxt( - join( - data_dir, - sesh[session], - subject, - "{0}-session-{1}-{2}_network_corrmat_{3}.csv".format( - subject, session, task, atlas - ), - ), - delimiter=",", - ) - else: - corrmat = np.genfromtxt( - join( - data_dir, - sesh[session], - subject, - "{0}-session-{1}_{2}-{3}_{4}-corrmat.csv".format( - subject, session, task, condition, atlas - ), - ), - delimiter=" ", - ) - # corrmat = np.genfromtxt(join(data_dir, '{0}-session-{1}_{2}-{3}_{4}-corrmat.csv'.format(subject, session, task, condition, atlas)), delimiter=' ') - 
corrmats[subject] = corrmat - except Exception as e: - print(subject, e) - data = list(corrmats.values()) - stacked_corrmats = np.array(data) +# this is all bullshit. +# update to mesh with the BIDSy way of doing things +def avg_corrmat(ppt_df): + stacked_corrmats = np.array(ppt_df['adj']) print('Stacked corrmats have dimensions', stacked_corrmats.shape) avg_corrmat = np.mean(stacked_corrmats, axis=0) return avg_corrmat -def null_model_und_sign(W, bin_swaps=5, wei_freq=0.1, seed=None): +def null_model(W, bin_swaps=5, wei_freq=0.1, seed=None): def get_rng(seed): if seed is None or seed == np.random: return np.random.mtrand._rand @@ -53,7 +22,7 @@ def get_rng(seed): try: rstate = np.random.RandomState(seed) except ValueError: - rstate = np.random.RandomState(random.Random(seed).randint(0, 2 ** 32 - 1)) + rstate = np.random.RandomState(np.random.Random(seed).randint(0, 2 ** 32 - 1)) return rstate def randmio_und_signed(R, itr, seed=None): @@ -194,28 +163,28 @@ def pick_four_unique_nodes_quickly(n, seed=None): W0 = W0 + W0.T return W0 -def generate_null(layout, task, session, mask): +def generate_null(ppt_df, thresh_arr, measure): + ''' + Generate a distribution of graph measure values based on a null connectivity matrix + that is like the average connectivity matrix across participants. 
+ + ''' null_dist = pd.DataFrame(index=subjects, columns=["mean", "sdev"]) avg_corr = avg_corrmat( - layout, task, session, mask + ppt_df ) eff_perm = [] - j = 1 - while j < 3: + j = 0 + while j < 1000: effs = [] - W = null_model_und_sign(avg_corr.values) - for thresh in np.arange(0.21, 0.31, 0.03): + W = null_model(avg_corr.values) + for thresh in thresh_arr: thresh_corr = bct.threshold_proportional(W, thresh) - leff = bct.efficiency_wei(thresh_corr) + leff = measure(thresh_corr) effs.append(leff) effs_arr = np.asarray(effs) leff_auc = np.trapz(effs_arr, dx=0.03, axis=0) eff_perm.append(leff_auc) j += 1 - null_dist.at[(sesh[session], task, conds[i], mask), "mean"] = np.mean( - eff_perm - ) - null_dist.at[(sesh[session], task, conds[i], mask), "sdev"] = np.std( - eff_perm - ) + return null_dist \ No newline at end of file From 888604f8d69e56bc63b6f7f66e5e77ede99aeeec Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 1 Dec 2022 12:16:23 -0800 Subject: [PATCH 03/48] draft of pynbs and nbspredict --- idconn/nbs.py | 297 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 idconn/nbs.py diff --git a/idconn/nbs.py b/idconn/nbs.py new file mode 100644 index 0000000..8998e25 --- /dev/null +++ b/idconn/nbs.py @@ -0,0 +1,297 @@ +import numpy as np +import statsmodels as sm +import networkx as nx +from utils import vectorize_corrmats, undo_vectorize + + +def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000, stratified=False): + ''' + Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided + of shape ((subject x session)x node x node) + in the network. 
+ Returns a dataframe containing the results of kfolds cross-validation, + including the indices of train and test samples, the resulting p-value and largest connected component, + the accuracy of the network in predicting group belonging in the test samples (using logistic regression), + the parameter estimates from each regression, and the model object from each regression. + from a BIDS derivative folder. Optionally returns a subject x session dataframe + of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) + array of vectorized upper triangles of those correlation mat + Parameters + ---------- + matrices : numpy array of shape (p, n, n) + Represents the link strengths of the graphs (i.e., functional connectivity). + Assumed to be an array of symmetric matrices. + outcome : list-like of shape (p,) + Y-value to be predicted with connectivity + confounds : list-like of shape (p,m) + Covariates, included as predictors in model. + alpha : float + Type-I error (i.e., false positive) rate, for outcome-related edge detection. + predict : bool + If True, bypasses `permutations` parameter and only runs edge detection + component identification. + Used for NBS-Predict. + permutations : int + If `predict=False`, specifies the number of permutations run to create a null distribution + for estimating the significance of the connected component size. Recommended 10,000. + stratified : bool or list-like of shape (p,) + If `predict=True` and there are groups that should be equally sampled across k-fold + cross-validation, input should be a list of group belonging (i.e., one label per participant). + + Returns + ------- + S1 : Pandas dataframe + A binary matrix denoting the largest connected component. + pval : float + If `predict=False`, denotes the significance of the largest connected component. + perms : numpy array of shape (permutations,) + If `predict=False`, largest connected component size per permutation. 
+ ''' + # need to do a mass-univariate test at every edge + # and retain significant edges + # then find the largest connected component + # and, if not predict, build a null distribution + n = matrices.shape[:-1] + ndims = len(matrices.shape) + + # vectorize_corrmats returns p x n^2 + # we want to run pynbs per edge + # so vectorized edges must be transposed + + exog = np.hstack((outcome, confounds)) + exog = sm.add_constant(exog, prepend=False) + # turn matrices into vectorized upper triangles + if ndims > 2: + edges = vectorize_corrmats(matrices) + else: + edges = matrices.copy() + edges = edges.T + + # run an ols per edge + # create significancs matrix for predictor of interest (outcome) + # 1 if edge is significantly predicted by outcome + # 0 if it's not + sig_edges = [] + for i in range(0, edges.shape[0]): + # statsmodels for regressing predictors on edges + mod = sm.OLS(edges[i,:], exog, hasconst=True) + results = mod.fit() + edge_pval = results.pvalues[0] + + # build binary significance edge vector + if edge_pval < alpha: + sig_edges.append(1) + else: + sig_edges.append(0) + + # find largest connected component of sig_edges + # turn sig_edges into an nxn matrix first + sig_matrix = undo_vectorize(sig_edges) # need to write this function + matrix = nx.from_numpy_array(sig_matrix) + + #use networkX to find connected components + comps = nx.connected_components(matrix) + + # rearrange networkx output into an array of matrices, S + S = [matrix.subgraph(c).copy() for c in comps] + # find size of each connected component, s in S + size = np.asarray([s.number_of_edges() for s in S]) + (max_comp, ) = np.where(size == max(size)) + largest_comp_size = max(size) + print(f'Connected component has {largest_comp_size} edges.') + + # retain size of largest connected component + # for NBS permutation-based significance testing + max_comp = max_comp[0] + + # pull the subgraph with largest number of nodes + # i.e., the largest connected component + G = S[max_comp] + + # 
grab list of nodes in largest connected component + nodes = list(G.nodes) + + unused_nodes = list(set(matrix.nodes) - set(nodes)) + S1 = nx.to_pandas_adjacency(G, nodelist=nodes) + + # add empty edges for unused nodes + # bc NBS-Predict needs all nodes for + # the eventual weighted average + # and NBS might need all nodes for easier + # plotting in brain space + for i in unused_nodes: + S1.loc[i] = 0 + S1[i] = 0 + + S1.sort_index(axis=0, inplace=True) + S1.sort_index(axis=1, inplace=True) + + # permutation testing to create a null distribution of max component size + # only for regular NBS, -Predict doesn't need this + if predict == False: + perms = np.zeros((permutations,)) + hit = 0 + rng = np.random.default_rng() + exog_copy = exog.copy() + for i in range(0, permutations): + # shuffle outcome order + rng.shuffle(exog_copy, axis=0) + #print(exog_copy) + perm_edges = [] + for j in range(0, edges.shape[0]): + # statsmodels for regressing predictors on edges + mod = sm.OLS(edges[j,:], exog_copy, hasconst=False) + results = mod.fit() + edge_pval = results.pvalues[0] + + if edge_pval < alpha: + perm_edges.append(1) + else: + perm_edges.append(0) + #print(np.sum(perm_edges)) + # find largest connected component of sig_edges + # turn sig_edges into an nxn matrix first + perm_matrix = undo_vectorize(perm_edges) # need to write this function + perm_nx = nx.from_numpy_array(perm_matrix) + + comps = nx.connected_components(perm_nx) + + S = [perm_nx.subgraph(c).copy() for c in comps] + perm_size = np.asarray([s.number_of_edges() for s in S]) + (max_comp, ) = np.where(perm_size == max(perm_size)) + #print(perm_size, max_comp) + + # retain for null distribution + perms[i] = max(perm_size) + if i % 10 == 0: + print(f'p-value is {np.size(np.where(perms >= largest_comp_size)) / permutations} as of permutation {i}') + + # bctpy nbs code uses hit to mark progress across permutations + # prob not necessary? + + # bctpy calcs pval for all components, not just largest? 
+ # but I don't think that's relevant for the og implimentation of nbs? + pval = np.size(np.where(perms >= largest_comp_size)) / permutations + print(largest_comp_size, permutations, pval) + + return pval, S1, perms + else: + return S1 + +def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10, k=1000, shuffle=False, fig_dir=None): + """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided + of shape ((subject x session)x node x node) + in the network. + Returns a dataframe containing the results of kfolds cross-validation, + including the indices of train and test samples, the resulting p-value and largest connected component, + the accuracy of the network in predicting group belonging in the test samples (using logistic regression), + the parameter estimates from each regression, and the model object from each regression. + from a BIDS derivative folder. Optionally returns a subject x session dataframe + of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) + array of vectorized upper triangles of those correlation mat + Parameters + ---------- + matrices : numpy array of shape (p, n, n) + Represents the link strengths of the graphs. Assumed to be + an array of symmetric matrices. + outcome : list-like of shape (p,) + Y-value to be predicted with connectivity + + Returns + ------- + cv_results : Pandas dataframe + Includes the results of each cross-validation loop + the input matrices. 
+ """ + edges = vectorize_corrmats(matrices) + #print(edges.shape) + index = list(range(0,n_splits * n_iterations)) + + cv_results = pd.DataFrame(index=index, + columns=['split', + 'pval', + 'score', + 'component', + 'coefficient_matrix', + 'coefficient_vector', + 'model']) + if groups is not None: + cv = RepeatedStratifiedKFold(n_splits=n_splits, + n_repeats=n_iterations) + df = groups.shape[0] - 2 + else: + cv = RepeatedKFold(n_splits=n_splits, + n_repeats=n_iterations) + df = edges.shape[0] - 1 + + if tail == 'both': + alpha = 0.01 + else: + alpha = 0.005 + t_threshold = t.ppf(1 - alpha, df=df) + + if matrices.shape[0] != matrices.shape[1]: + if matrices.shape[1] == matrices.shape[2]: + num_node = matrices.shape[1] + matrices = np.moveaxis(matrices, 0, -1) + else: + raise ValueError(f'Matrices of shape {matrices.shape}', + 'requires matrices of shape (subject x session) x node x node', + 'or node x node x (subject x session).') + else: + num_node = matrices.shape[0] + upper_tri = np.triu_indices(num_node, k=1) + + i = 0 + manager = enlighten.get_manager() + ticks = manager.counter(total=n_splits * n_iterations, desc='Progress', unit='folds') + for train_idx, test_idx in cv.split(edges, outcome, groups=groups): + cv_results.at[i, 'split'] = (train_idx, test_idx) + # all of this presumes the old bctpy version of nbs + # irrelevant for pynbs + #train_a_idx = [m for m in train_idx if outcome[m] == 0] + #train_b_idx = [m for m in train_idx if outcome[m] == 1] + #assert len(train_a_idx) == len(train_b_idx) + #train_a = matrices[:,:,train_a_idx] + #train_b = matrices[:,:,train_b_idx] + #print(train_a.shape, train_b.shape) + + # separate edges & covariates into + train_y = outcome[train_idx] + test_y = outcome[test_idx] + + pval, adj, _ = pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000) + pval, adj, _ = bct.nbs_bct(train_a, + train_b, + t_threshold, + k=k, + tail=tail) + cv_results.at[i, 'pval'] = pval + cv_results.at[i, 'component'] = adj + 
+ nbs_vector = adj[upper_tri] + mask = nbs_vector == 1 + train_features = edges[train_idx, :].T[mask] + test_features = edges[test_idx, :].T[mask] + + regressor = LogisticRegression(max_iter=1000) + model = regressor.fit(X=train_features.T, y=train_y) + cv_results.at[i, 'model'] = model + score = model.score(X=test_features.T, y=test_y) + cv_results.at[i, 'score'] = score + + m = 0 + param_vector = np.zeros_like(nbs_vector) + for l in range(0, nbs_vector.shape[0]): + if nbs_vector[l] == 1.: + param_vector[l] = model.coef_[0,m] + m+=1 + else: + pass + X = np.zeros_like(adj) + X[np.triu_indices(X.shape[0], k=1)] = param_vector + X = X + X.T + cv_results.at[i, 'coefficient_matrix'] = X + cv_results.at[i, 'coefficient_vector'] = param_vector + i += 1 + ticks.update() + return cv_results From 2a78636f1b1e9d26db59ca4cc6becc84c7bb8821 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 1 Dec 2022 12:29:01 -0800 Subject: [PATCH 04/48] move null dist into networking.py, delete empties --- idconn/data/missingness.py | 0 .../graph_theory.py => networking.py} | 0 idconn/networking/__init__.py | 8 - idconn/networking/null_distribution.py | 190 ------------------ 4 files changed, 198 deletions(-) delete mode 100644 idconn/data/missingness.py rename idconn/{networking/graph_theory.py => networking.py} (100%) delete mode 100644 idconn/networking/__init__.py delete mode 100644 idconn/networking/null_distribution.py diff --git a/idconn/data/missingness.py b/idconn/data/missingness.py deleted file mode 100644 index e69de29..0000000 diff --git a/idconn/networking/graph_theory.py b/idconn/networking.py similarity index 100% rename from idconn/networking/graph_theory.py rename to idconn/networking.py diff --git a/idconn/networking/__init__.py b/idconn/networking/__init__.py deleted file mode 100644 index a4564bf..0000000 --- a/idconn/networking/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -Tools for computing network topology / graph theoretic measures -""" - -from . 
import null_distribution -from . import graph_theory - -__all__ = ["null_distribution", "graph_theory"] diff --git a/idconn/networking/null_distribution.py b/idconn/networking/null_distribution.py deleted file mode 100644 index 03f9ce7..0000000 --- a/idconn/networking/null_distribution.py +++ /dev/null @@ -1,190 +0,0 @@ -import numpy as np -import pandas as pd -from os.path import join, exists -import bct -import datetime - -# this is all bullshit. -# update to mesh with the BIDSy way of doing things -def avg_corrmat(ppt_df): - stacked_corrmats = np.array(ppt_df['adj']) - print('Stacked corrmats have dimensions', stacked_corrmats.shape) - avg_corrmat = np.mean(stacked_corrmats, axis=0) - return avg_corrmat - - -def null_model(W, bin_swaps=5, wei_freq=0.1, seed=None): - def get_rng(seed): - if seed is None or seed == np.random: - return np.random.mtrand._rand - elif isinstance(seed, np.random.RandomState): - return seed - try: - rstate = np.random.RandomState(seed) - except ValueError: - rstate = np.random.RandomState(np.random.Random(seed).randint(0, 2 ** 32 - 1)) - return rstate - - def randmio_und_signed(R, itr, seed=None): - rng = get_rng(seed) - R = R.copy() - n = len(R) - - itr *= int(n * (n - 1) / 2) - - max_attempts = int(np.round(n / 2)) - eff = 0 - - for it in range(int(itr)): - att = 0 - while att <= max_attempts: - - a, b, c, d = pick_four_unique_nodes_quickly(n, rng) - - r0_ab = R[a, b] - r0_cd = R[c, d] - r0_ad = R[a, d] - r0_cb = R[c, b] - - # rewiring condition - if ( - np.sign(r0_ab) == np.sign(r0_cd) - and np.sign(r0_ad) == np.sign(r0_cb) - and np.sign(r0_ab) != np.sign(r0_ad) - ): - - R[a, d] = R[d, a] = r0_ab - R[a, b] = R[b, a] = r0_ad - - R[c, b] = R[b, c] = r0_cd - R[c, d] = R[d, c] = r0_cb - - eff += 1 - break - - att += 1 - - return R, eff - - def pick_four_unique_nodes_quickly(n, seed=None): - """ - This is equivalent to np.random.choice(n, 4, replace=False) - Another fellow suggested np.random.random_sample(n).argpartition(4) which is - 
clever but still substantially slower. - """ - rng = get_rng(seed) - k = rng.randint(n ** 4) - a = k % n - b = k // n % n - c = k // n ** 2 % n - d = k // n ** 3 % n - if a != b and a != c and a != d and b != c and b != d and c != d: - return (a, b, c, d) - else: - # the probability of finding a wrong configuration is extremely low - # unless for extremely small n. if n is extremely small the - # computational demand is not a problem. - - # In my profiling it only took 0.4 seconds to include the uniqueness - # check in 1 million runs of this function so I think it is OK. - return pick_four_unique_nodes_quickly(n, rng) - - rng = get_rng(seed) - if not np.allclose(W, W.T): - print("Input must be undirected") - W = W.copy() - n = len(W) - np.fill_diagonal(W, 0) # clear diagonal - Ap = W > 0 # positive adjmat - An = W < 0 # negative adjmat - - if np.size(np.where(Ap.flat)) < (n * (n - 1)): - W_r, eff = randmio_und_signed(W, bin_swaps, seed=rng) - Ap_r = W_r > 0 - An_r = W_r < 0 - else: - Ap_r = Ap - An_r = An - - W0 = np.zeros((n, n)) - for s in (1, -1): - if s == 1: - Acur = Ap - A_rcur = Ap_r - else: - Acur = An - A_rcur = An_r - - S = np.sum(W * Acur, axis=0) # strengths - Wv = np.sort(W[np.where(np.triu(Acur))]) # sorted weights vector - i, j = np.where(np.triu(A_rcur)) - (Lij,) = np.where(np.triu(A_rcur).flat) # weights indices - - P = np.outer(S, S) - - if wei_freq == 0: # get indices of Lij that sort P - Oind = np.argsort(P.flat[Lij]) # assign corresponding sorted - W0.flat[Lij[Oind]] = s * Wv # weight at this index - else: - wsize = np.size(Wv) - wei_period = np.round(1 / wei_freq).astype( - int - ) # convert frequency to period - lq = np.arange(wsize, 0, -wei_period, dtype=int) - for m in lq: # iteratively explore at this period - # get indices of Lij that sort P - Oind = np.argsort(P.flat[Lij]) - R = rng.permutation(m)[: np.min((m, wei_period))] - for q, r in enumerate(R): - # choose random index of sorted expected weight - o = Oind[r] - W0.flat[Lij[o]] = s * 
Wv[r] # assign corresponding weight - - # readjust expected weighted probability for i[o],j[o] - f = 1 - Wv[r] / S[i[o]] - P[i[o], :] *= f - P[:, i[o]] *= f - f = 1 - Wv[r] / S[j[o]] - P[j[o], :] *= f - P[:, j[o]] *= f - - # readjust strength of i[o] - S[i[o]] -= Wv[r] - # readjust strength of j[o] - S[j[o]] -= Wv[r] - - O = Oind[R] - # remove current indices from further consideration - Lij = np.delete(Lij, O) - i = np.delete(i, O) - j = np.delete(j, O) - Wv = np.delete(Wv, R) - - W0 = W0 + W0.T - return W0 - -def generate_null(ppt_df, thresh_arr, measure): - ''' - Generate a distribution of graph measure values based on a null connectivity matrix - that is like the average connectivity matrix across participants. - - ''' - null_dist = pd.DataFrame(index=subjects, columns=["mean", "sdev"]) - avg_corr = avg_corrmat( - ppt_df - ) - eff_perm = [] - j = 0 - while j < 1000: - effs = [] - W = null_model(avg_corr.values) - for thresh in thresh_arr: - thresh_corr = bct.threshold_proportional(W, thresh) - leff = measure(thresh_corr) - effs.append(leff) - effs_arr = np.asarray(effs) - leff_auc = np.trapz(effs_arr, dx=0.03, axis=0) - eff_perm.append(leff_auc) - j += 1 - - return null_dist \ No newline at end of file From 4deb84fafaa1c2a570dd8d5da96198f96e6d65d8 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Thu, 1 Dec 2022 12:29:29 -0800 Subject: [PATCH 05/48] clean up imputation --- idconn/data/iterative_imputation.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/idconn/data/iterative_imputation.py b/idconn/data/iterative_imputation.py index 73505fe..575e7bc 100644 --- a/idconn/data/iterative_imputation.py +++ b/idconn/data/iterative_imputation.py @@ -5,23 +5,17 @@ from sklearn.impute import IterativeImputer -#sink_dir = "/Users/kbottenh/Dropbox/Projects/physics-retrieval/data/rescored" -# sink_dir = '/home/kbott006/physics-retrieval' -# fig_dir = '/Users/kbottenh/Dropbox/Projects/physics-retrieval/figures/' -#data_dir = "/Users/kbottenh/Dropbox/Projects/physics-retrieval/data/rescored" -# roi_dir = '/Users/kbottenh/Dropbox/Data/templates/shen2015/' -# data_dir = '/home/kbott006/physics-retrieval' - -# big_df = pd.read_csv(join(data_dir, 'physics_learning-nonbrain_OLS-missing+fd+local_efficiency.csv'), -# index_col=0, header=0) - -# impute first? -def impute(data, max_iter): +def impute(data, max_iter=10000): + ''' + Fill in missing data with an iterative imputation algorithm from scikit learn. + NOTE: Will not imput connectivity data. + ''' + non_numeric = data.select_dtypes(exclude=['number']).columns dumb = pd.get_dummies(data[non_numeric], prefix='dummy') df = pd.concat([data.drop(non_numeric, axis=1), dumb]) impute_pls = IterativeImputer( - max_iter=10000, skip_complete=True, verbose=1, tol=5e-3, n_nearest_features=1000 + max_iter=max_iter, skip_complete=True, verbose=1, tol=5e-3, n_nearest_features=1000 ) imputed = impute_pls.fit_transform(df) imp_df = pd.DataFrame(imputed,columns=data.drop(non_numeric, axis=1).columns, index=data.index, From 26db135a96a669c56206908f89dd3376c52cba6c Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Thu, 1 Dec 2022 12:29:49 -0800 Subject: [PATCH 06/48] all graph functions in networking.py --- idconn/networking.py | 187 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) diff --git a/idconn/networking.py b/idconn/networking.py index 710cadd..3932e0a 100644 --- a/idconn/networking.py +++ b/idconn/networking.py @@ -8,6 +8,193 @@ import bct #import datetime + +def avg_corrmat(ppt_df): + ''' + Reads in adjacency matrices from the pandas df with ppt info and adj, then computes an average. + ''' + stacked_corrmats = np.array(ppt_df['adj']) + print('Stacked corrmats have dimensions', stacked_corrmats.shape) + avg_corrmat = np.mean(stacked_corrmats, axis=0) + return avg_corrmat + + +def null_model(W, bin_swaps=5, wei_freq=0.1, seed=None): + def get_rng(seed): + if seed is None or seed == np.random: + return np.random.mtrand._rand + elif isinstance(seed, np.random.RandomState): + return seed + try: + rstate = np.random.RandomState(seed) + except ValueError: + rstate = np.random.RandomState(np.random.Random(seed).randint(0, 2 ** 32 - 1)) + return rstate + + def randmio_und_signed(R, itr, seed=None): + rng = get_rng(seed) + R = R.copy() + n = len(R) + + itr *= int(n * (n - 1) / 2) + + max_attempts = int(np.round(n / 2)) + eff = 0 + + for it in range(int(itr)): + att = 0 + while att <= max_attempts: + + a, b, c, d = pick_four_unique_nodes_quickly(n, rng) + + r0_ab = R[a, b] + r0_cd = R[c, d] + r0_ad = R[a, d] + r0_cb = R[c, b] + + # rewiring condition + if ( + np.sign(r0_ab) == np.sign(r0_cd) + and np.sign(r0_ad) == np.sign(r0_cb) + and np.sign(r0_ab) != np.sign(r0_ad) + ): + + R[a, d] = R[d, a] = r0_ab + R[a, b] = R[b, a] = r0_ad + + R[c, b] = R[b, c] = r0_cd + R[c, d] = R[d, c] = r0_cb + + eff += 1 + break + + att += 1 + + return R, eff + + def pick_four_unique_nodes_quickly(n, seed=None): + """ + This is equivalent to np.random.choice(n, 4, replace=False) + Another fellow suggested np.random.random_sample(n).argpartition(4) 
which is + clever but still substantially slower. + """ + rng = get_rng(seed) + k = rng.randint(n ** 4) + a = k % n + b = k // n % n + c = k // n ** 2 % n + d = k // n ** 3 % n + if a != b and a != c and a != d and b != c and b != d and c != d: + return (a, b, c, d) + else: + # the probability of finding a wrong configuration is extremely low + # unless for extremely small n. if n is extremely small the + # computational demand is not a problem. + + # In my profiling it only took 0.4 seconds to include the uniqueness + # check in 1 million runs of this function so I think it is OK. + return pick_four_unique_nodes_quickly(n, rng) + + rng = get_rng(seed) + if not np.allclose(W, W.T): + print("Input must be undirected") + W = W.copy() + n = len(W) + np.fill_diagonal(W, 0) # clear diagonal + Ap = W > 0 # positive adjmat + An = W < 0 # negative adjmat + + if np.size(np.where(Ap.flat)) < (n * (n - 1)): + W_r, eff = randmio_und_signed(W, bin_swaps, seed=rng) + Ap_r = W_r > 0 + An_r = W_r < 0 + else: + Ap_r = Ap + An_r = An + + W0 = np.zeros((n, n)) + for s in (1, -1): + if s == 1: + Acur = Ap + A_rcur = Ap_r + else: + Acur = An + A_rcur = An_r + + S = np.sum(W * Acur, axis=0) # strengths + Wv = np.sort(W[np.where(np.triu(Acur))]) # sorted weights vector + i, j = np.where(np.triu(A_rcur)) + (Lij,) = np.where(np.triu(A_rcur).flat) # weights indices + + P = np.outer(S, S) + + if wei_freq == 0: # get indices of Lij that sort P + Oind = np.argsort(P.flat[Lij]) # assign corresponding sorted + W0.flat[Lij[Oind]] = s * Wv # weight at this index + else: + wsize = np.size(Wv) + wei_period = np.round(1 / wei_freq).astype( + int + ) # convert frequency to period + lq = np.arange(wsize, 0, -wei_period, dtype=int) + for m in lq: # iteratively explore at this period + # get indices of Lij that sort P + Oind = np.argsort(P.flat[Lij]) + R = rng.permutation(m)[: np.min((m, wei_period))] + for q, r in enumerate(R): + # choose random index of sorted expected weight + o = Oind[r] + 
W0.flat[Lij[o]] = s * Wv[r] # assign corresponding weight + + # readjust expected weighted probability for i[o],j[o] + f = 1 - Wv[r] / S[i[o]] + P[i[o], :] *= f + P[:, i[o]] *= f + f = 1 - Wv[r] / S[j[o]] + P[j[o], :] *= f + P[:, j[o]] *= f + + # readjust strength of i[o] + S[i[o]] -= Wv[r] + # readjust strength of j[o] + S[j[o]] -= Wv[r] + + O = Oind[R] + # remove current indices from further consideration + Lij = np.delete(Lij, O) + i = np.delete(i, O) + j = np.delete(j, O) + Wv = np.delete(Wv, R) + + W0 = W0 + W0.T + return W0 + +def generate_null(ppt_df, thresh_arr, measure): + ''' + Generate a distribution of graph measure values based on a null connectivity matrix + that is like the average connectivity matrix across participants. + + ''' + null_dist = pd.DataFrame(index=subjects, columns=["mean", "sdev"]) + avg_corr = avg_corrmat( + ppt_df + ) + eff_perm = [] + j = 0 + while j < 1000: + effs = [] + W = null_model(avg_corr.values) + for thresh in thresh_arr: + thresh_corr = bct.threshold_proportional(W, thresh) + leff = measure(thresh_corr) + effs.append(leff) + effs_arr = np.asarray(effs) + leff_auc = np.trapz(effs_arr, dx=0.03, axis=0) + eff_perm.append(leff_auc) + j += 1 + + return null_dist + def omst(matrix, density=True, plot=False): ''' WARNING: THIS IS SLOW AF, REPLACING WITH NETWORKX VERSION IN NEAR FUTURE From 25992dbfa5340fc66f753c880278f1e5743492f7 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Thu, 1 Dec 2022 12:32:01 -0800 Subject: [PATCH 07/48] convert data folder to script --- idconn/{data/iterative_imputation.py => data.py} | 0 idconn/data/__init__.py | 8 -------- 2 files changed, 8 deletions(-) rename idconn/{data/iterative_imputation.py => data.py} (100%) delete mode 100644 idconn/data/__init__.py diff --git a/idconn/data/iterative_imputation.py b/idconn/data.py similarity index 100% rename from idconn/data/iterative_imputation.py rename to idconn/data.py diff --git a/idconn/data/__init__.py b/idconn/data/__init__.py deleted file mode 100644 index e0be4c5..0000000 --- a/idconn/data/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -Tools for arranging data and addressing missing data -""" - -from . import iterative_imputation -from . import missingness - -__all__ = ["iterative_imputation", "missingness"] From 65eb863bbeccc92c1997c8864cb7e6505342dd64 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 1 Dec 2022 12:32:48 -0800 Subject: [PATCH 08/48] untested versions of pynbs and nbspredict --- idconn/nbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idconn/nbs.py b/idconn/nbs.py index 8998e25..cc5b59a 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -294,4 +294,4 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s cv_results.at[i, 'coefficient_vector'] = param_vector i += 1 ticks.update() - return cv_results + return cv_results \ No newline at end of file From 1b70102c74ade0f600ac78f65c9893b31ced81b5 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Thu, 1 Dec 2022 12:36:02 -0800 Subject: [PATCH 09/48] add utils to io --- .gitignore | 1 - idconn/io.py | 372 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 372 insertions(+), 1 deletion(-) create mode 100644 idconn/io.py diff --git a/.gitignore b/.gitignore index 33bdf27..ce30a02 100644 --- a/.gitignore +++ b/.gitignore @@ -32,7 +32,6 @@ idconn/networking/task-graph-theory-fci.py idconn/networking/task-graph-theory-local-nodal.py idconn/networking/task-graph-theory-local.py idconn/networking/task-graph-theory-nodal.py -idconn/io.py docs/_build/ docs/generated/ diff --git a/idconn/io.py b/idconn/io.py new file mode 100644 index 0000000..6ba41f1 --- /dev/null +++ b/idconn/io.py @@ -0,0 +1,372 @@ +import bids +import json +from nilearn import datasets +import nibabel as nib +from os.path import exists, join, basename + + +import nibabel as nib +import numpy as np +import pandas as pd +import seaborn as sns +#from matplotlib import projections +from matplotlib import pyplot as plt +from matplotlib.gridspec import GridSpec +from nilearn import datasets, plotting, surface + + +def build_statsmodel_json(name, task, contrast, confounds, highpass, + mask, conn_meas, graph_meas=None, exclude=None, outfile=None): + ''' + Creates a BIDS Stats Models json with analysis details for further use. + + Parameters + ---------- + root_dir : str + Location of BIDS dataset root + validate : bool + If true, pybids will check if this is a valid BIDS-format + dataset before continuing. + absolute_paths : bool + If true, will assume paths are absolute, instead of relative. + derivatives : str + Location of preprocessed data (i.e., name of fmriprep dir). + verbose : bool + If true, will narrate finding of dataset and describe it. + Returns + ------- + atlas : str + Name of the atlas chosen. + path : str + File path of atlas. If user-provided, will be copied into + `derivatives/idconn`. 
If using an atlas from Nilearn, will + be path to downloaded nifti. + shape : str + Indicates shape of map (3d, 4d, coords) for choosing appropriate + Nilearn masker for extracting BOLD signals from nifti files. + + ''' + mask_builtins = ['shen270', 'craddock270', 'schaefer400', 'yeo7', 'yeo17'] + if '.nii' in mask: + assert exists(mask), 'Mask file does not exist at {mask}'.format(mask=mask) + if '.gz' in mask: + mask_name = basename(mask).rsplit('.', 2)[0] + else: + mask_name = basename(mask).rsplit('.', 1)[0] + else: + assert mask in mask_builtins, 'Mask {mask} not in built-in mask options. Please provide file path or one of {mask_builtins}'.format(mask=mask, mask_builtins=mask_builtins) + variables = confounds + ["{mask_name}*".format(mask_name=mask_name)] + statsmodel = { + "name": name, + "description": "A functional connectivity analysis of {task}, comparing {contrast}".format(task=task, + contrast=contrast), + "input":{ + "task": task + }, + "blocks":[{ + "level": "run", + "transformations":{ + "name": "load_image_data", + "input": ["bold"], + "aggregate": ["mean"], + "mask": [mask_name], + "output": ["{mask_name}*".format(mask_name=mask_name)] + }, + }, + { + "level": "session", + "model": { + "variables": variables, + "options": { + "confounds": confounds, + "high_pass_filter_cutoff_secs": highpass + }, + "variances": { + "name": "session_level", + "groupBy": "session" + }, + "software": { + "IDConn": { + "ConnectivityMeasure": [conn_meas], + "GraphMetrics": [graph_meas] + } + } + } + + } + ] + } + statsmodel_json = json.dumps(statsmodel, indent = 2) + + outfile = '{name}-statsmodel.json'.format(name=name) + with open(outfile, 'w') as outfile: + json.dump(statsmodel, outfile) + return statsmodel_json + +def atlas_picker(atlas, path, key=None): + """Takes in atlas name and path to file, if local, returns + nifti-like object (usually file path to downloaded atlas), + and atlas name (for tagging output files). 
If atlas is from + Nilearn, will download atlas, **and space must be == 'MNI'. + If atlas is provided by user (path must be specified), then + space of atlas must match space of fMRI data, but that is up + to the user to determine. + Parameters + ---------- + atlas : str + Name of the atlas/parcellation used to define nodes from + voxels. If using an atlas fetchable by Nilearn, atlas name + must match the function `fetch_atlas_[name]`. + path : str + Path to the atlas specified, if not using a dataset from Nilearn. + If using `nilearn.datasets` to fetch an atlas, will revert to + `derivatives/idconn` path. + key : str + Atlas-specific key for denoting which of multiple versions + will be used. Default behavior is described in the "atlases" + section of the docs. NOT IMPLEMENTED + Returns + ------- + atlas : str + Name of the atlas chosen. + path : str + File path of atlas. If user-provided, will be copied into + `derivatives/idconn`. If using an atlas from Nilearn, will + be path to downloaded nifti. + shape : str + Indicates shape of map (3d, 4d, coords) for choosing appropriate + Nilearn masker for extracting BOLD signals from nifti files. 
+ """ + nilearn_3d = ['craddock_2012', 'destrieux_2009', 'harvard_oxford', 'smith_2009', 'yeo_2011', 'aal', 'pauli_2017', 'msdl'] + #nilearn_coord = ['power_2011', 'dosenbach_2010', 'seitzman_2018'] + #nilearn_4d = ['allen_2011', ''] + if atlas in nilearn_3d: + if atlas == 'craddock_2012': + atlas_dict = datasets.fetch_atlas_craddock_2012(data_dir=path) + atlas_path = atlas_dict['tcorr_2level'] + nifti = nib.load(atlas_path) + nifti_arr = nifti.get_fdata() + #selecting one volume of the nifti, each represent different granularity of parcellation + #selecting N = 270, the 27th volume per http://ccraddock.github.io/cluster_roi/atlases.html + nifti = nib.Nifti1Image(nifti_arr[:,:,:,26], nifti.affine) + nifti.to_filename() + + return atlas, path + +def vectorize_corrmats(matrices): + """Returns the vectorized upper triangles of a 3-dimensional array + (i.e., node x node x matrix) of matrices. Output will be a 2-dimensional + array (i.e., matrix x node^2) + Parameters + ---------- + matrices : numpy array of shape (p, n, n) + Represents the link strengths of the graphs. Assumed to be + an array of symmetric matrices. + + Returns + ------- + edge_vector : numpy array of shape (p, n^2) + Represents an array of vectorized upper triangles of + the input matrices. 
+ """ + #print(matrices.shape, matrices.ndim) + num_node = matrices.shape[1] + upper_tri = np.triu_indices(num_node, k=1) + if matrices.ndim == 3: + num_node = matrices.shape[1] + upper_tri = np.triu_indices(num_node, k=1) + num_matrices = matrices.shape[0] + edge_vector = [] + for matrix in range(0,num_matrices): + vectorized = matrices[matrix,:,:][upper_tri] + edge_vector.append(vectorized) + + elif matrices.ndim == 2: + true = matrices[0].T == matrices[0] + if true.all(): + edge_vector = matrices[upper_tri] + else: + print('Matrices of incompatible shape:', matrices.shape, + '\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).') + elif matrices.ndim == 1: + if matrices[0].ndim == 2: + num_node = matrices[0].shape[0] + upper_tri = np.triu_indices(num_node, k=1) + edge_vector = [] + for matrix in matrices: + vectorized = matrix[upper_tri] + edge_vector.append(vectorized) + else: + print('Matrices of incompatible shape:', matrices.shape, + '\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).') + edge_vector = np.asarray(edge_vector) + return edge_vector + +def read_corrmats(layout, task, deriv_name, conf_measures=None, z_score=True, vectorized=True, verbose=False): + """Returns a node x node x (subject x session) matrix of correlation matrices + from a BIDS derivative folder. Optionally returns a subject x session dataframe + of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) + array of vectorized upper triangles of those correlation matrices. + Parameters + ---------- + matrices : numpy array of shape (n, n, p) + Represents the link strengths of the graphs. Assumed to be + an array of symmetric matrices. + + Returns + ------- + edge_vector : numpy array of shape (p, n^2) + Represents an array of vectorized upper triangles of + the input matrices. 
+ """ + subjects = layout.get(return_type='id', + target='subject', + suffix='bold', + scope=deriv_name + ) + all_sesh = layout.get(return_type='id', + target='session', + task=task, + suffix='bold', + scope=deriv_name + ) + ppts_fname = layout.get_file('participants.tsv').path + ppt_df = pd.read_csv(ppts_fname, sep='\t', index_col=[0,1]) + ppt_df['adj'] = '' + if vectorized: + ppt_df['edge_vector'] = '' + + for subject in subjects: + if verbose: + print(subject) + else: + pass + sessions = layout.get(return_type='id', + target='session', + task=task, + suffix='bold', + subject=subject, + scope=deriv_name) + + for session in sessions: + if verbose: + print(session) + else: + pass + path = layout.get(return_type='filename', + task=task, + subject=subject, + session=session, + suffix='bold', + scope='IDConn' + ) + if verbose: + print(f'Corrmat path for sub-{subject}, ses-{session}: \t{path}') + else: + pass + if type(path) == list: + #print(len(path)) + path = path[0] + else: + pass + assert exists(path), f'Corrmat file not found at {path}' + adj_matrix = pd.read_csv(path, sep='\t', header=0, index_col=0) + if z_score == True: + z_adj = np.arctanh(adj_matrix.values) + z_adj = np.where(z_adj == np.inf, 0, z_adj) + #print(z_adj.shape) + ppt_df.at[(f'sub-{subject}', + f'ses-{session}'), + 'adj'] = z_adj + else: + #print(adj_matrix.values.shape) + ppt_df.at[(f'sub-{subject}', + f'ses-{session}'), + 'adj'] = adj_matrix.values + + + if vectorized == True: + edge_vector = vectorize_corrmats(adj_matrix.values) + #print(edge_vector.shape) + ppt_df.at[(f'sub-{subject}', + f'ses-{session}'), + 'edge_vector'] = edge_vector + ppt_df.replace({'': np.nan}, inplace=True) + return ppt_df + +def undo_vectorize(edges): + j = len(edges) + num_node = (np.sqrt((8 * j) + 1) + 1) / 2 + X = np.zeros((num_node,num_node)) + X[np.triu_indices(X.shape[0], k = 1)] = edges + X = X + X.T + return X + +def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='icefire', 
node_size='strength'): + coords = plotting.find_parcellation_cut_coords(atlas_nii) + num_node = adj.shape[0] + # only plot the top t% of edges + if threshold == 'computed': + threshold = f'{(1 - (100 / num_node ** 2)) * 100}%' + elif type(threshold) == float or type(threshold) == int: + threshold = f'{threshold}%' + else: + threshold = '99%' + print('edge plotting threshold: ', threshold) + + if node_size == 'strength': + node_strength = np.sum((np.abs(adj)), axis=0) + node_strength /= np.max(node_strength) + node_strength **= 4 + node_size = node_strength + fig = plt.figure(figsize=(12,4)) + if title is not None: + fig.suptitle(title) + gs = GridSpec(1, 2, width_ratios=[4,2]) + ax0 = fig.add_subplot(gs[0]) + ax1 = fig.add_subplot(gs[1]) + + plt.tight_layout(w_pad=5) + g = plotting.plot_connectome(adj, coords, + node_size=node_size, + edge_threshold=threshold, + edge_cmap=cmap, + figure=fig, + axes=ax0, + colorbar=False, + annotate=False) + h = sns.heatmap(adj, square=True, cmap=cmap, ax=ax1) + if strength: + fig2 = plt.figure(figsize=(12,4)) + if title is not None: + fig2.suptitle(title) + fsaverage = datasets.fetch_surf_fsaverage() + nimg = nib.load(atlas_nii) + regn_sch_arr = nimg.get_fdata() + for i in np.arange(0,num_node): + regn_sch_arr[np.where(regn_sch_arr == i+1)] = np.sum(adj[i]) + strength_nimg = nib.Nifti1Image(regn_sch_arr, nimg.affine) + nib.save(strength_nimg, '/Users/katherine.b/Dropbox/HC_Use_predictive-strength.nii') + + gs = GridSpec(1, 4) + # plot edge weights on surfaces + ax2 = fig2.add_subplot(gs[0], projection='3d') + ax3 = fig2.add_subplot(gs[1], projection='3d') + ax4 = fig2.add_subplot(gs[2], projection='3d') + ax5 = fig2.add_subplot(gs[3], projection='3d') + + texture_l = surface.vol_to_surf(strength_nimg, fsaverage.pial_left, interpolation='nearest') + texture_r = surface.vol_to_surf(strength_nimg, fsaverage.pial_right, interpolation='nearest') + + plt.tight_layout(w_pad=-1) + i = plotting.plot_surf_stat_map(fsaverage.pial_left, 
texture_l, symmetric_cbar=False, threshold=0.5, + cmap=cmap, view='lateral', colorbar=False, axes=ax2) + j = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5, + cmap=cmap, view='medial', colorbar=False, axes=ax3) + k = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5, + cmap=cmap, view='lateral', colorbar=False, axes=ax4) + l = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5, + cmap=cmap, view='medial', colorbar=False, axes=ax5) + return fig, fig2 + else: + return fig \ No newline at end of file From 141140e25441180c7188eb180ee88f2b20e05722 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 1 Dec 2022 12:42:18 -0800 Subject: [PATCH 10/48] add imports to nbs --- idconn/nbs.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/idconn/nbs.py b/idconn/nbs.py index cc5b59a..28420db 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -1,7 +1,14 @@ import numpy as np import statsmodels as sm import networkx as nx -from utils import vectorize_corrmats, undo_vectorize +import pandas as pd +from io import vectorize_corrmats, undo_vectorize +from scipy.stats import t +import enlighten +import bct + +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold +from sklearn.linear_model import LogisticRegression def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000, stratified=False): @@ -260,11 +267,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s test_y = outcome[test_idx] pval, adj, _ = pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000) - pval, adj, _ = bct.nbs_bct(train_a, - train_b, - t_threshold, - k=k, - tail=tail) + cv_results.at[i, 'pval'] = pval cv_results.at[i, 'component'] = adj @@ -273,6 +276,8 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', 
groups=None, n_s train_features = edges[train_idx, :].T[mask] test_features = edges[test_idx, :].T[mask] + # need an IF GROUPS statement + # ELSE statsmodels OLS regressor = LogisticRegression(max_iter=1000) model = regressor.fit(X=train_features.T, y=train_y) cv_results.at[i, 'model'] = model From f2a33e5bc0c163799e09b054d6fc5167b76358a6 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 1 Dec 2022 12:45:37 -0800 Subject: [PATCH 11/48] add thresholding functions to networking --- idconn/networking.py | 66 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/idconn/networking.py b/idconn/networking.py index 3932e0a..273af7c 100644 --- a/idconn/networking.py +++ b/idconn/networking.py @@ -1,10 +1,12 @@ import numpy as np -#import pandas as pd +import pandas as pd import seaborn as sns +import networkx as nx import matplotlib.pyplot as plt from os.path import join #from nilearn.connectome import ConnectivityMeasure from scipy.sparse.csgraph import minimum_spanning_tree +from scipy.stats import skew import bct #import datetime @@ -18,7 +20,6 @@ def avg_corrmat(ppt_df): avg_corrmat = np.mean(stacked_corrmats, axis=0) return avg_corrmat - def null_model(W, bin_swaps=5, wei_freq=0.1, seed=None): def get_rng(seed): if seed is None or seed == np.random: @@ -169,19 +170,19 @@ def pick_four_unique_nodes_quickly(n, seed=None): W0 = W0 + W0.T return W0 -def generate_null(ppt_df, thresh_arr, measure): +def generate_null(ppt_df, thresh_arr, measure, permutations=1000): ''' Generate a distribution of graph measure values based on a null connectivity matrix that is like the average connectivity matrix across participants. 
''' - null_dist = pd.DataFrame(index=subjects, columns=["mean", "sdev"]) + null_dist = pd.DataFrame(index=range(0,permutations), columns=["mean", "sdev"]) avg_corr = avg_corrmat( ppt_df ) eff_perm = [] j = 0 - while j < 1000: + while j < permutations: effs = [] W = null_model(avg_corr.values) for thresh in thresh_arr: @@ -269,3 +270,58 @@ def graph_omst(matrix, measure, args): metric = measure(thresh_mat, args) return metric + +def scale_free_tau(corrmat, skew_thresh, proportional=True): + '''' + Calculates threshold at which network becomes scale-free, estimated from the skewness of the networks degree distribution. + Parameters + ---------- + corrmat : numpy.array + Correlation or other connectivity matrix from which tau_connected will be estimated. + Should be values between 0 and 1. + proportional : bool + Determines whether connectivity matrix is thresholded proportionally or absolutely. + Default is proportional as maintaining network density across participants is a priority + Returns + ------- + tau : float + Lowest vaue of tau (threshold) at which network is scale-free. + ''' + tau = 0.01 + skewness = 1 + while abs(skewness) > 0.3: + if proportional: + w = bct.threshold_proportional(corrmat, tau) + else: + w = bct.threshold_absolute(corrmat, tau) + skewness = skew(bct.degrees_und(w)) + tau += 0.01 + return tau + +def connected_tau(corrmat, proportional=True): + ''' + Calculates threshold at network becomes node connected, using NetworkX's `is_connected` function. + Parameters + ---------- + corrmat : numpy.array + Correlation or other connectivity matrix from which tau_connected will be estimated. + Should be values between 0 and 1. + proportional : bool + Determines whether connectivity matrix is thresholded proportionally or absolutely. + Default is proportional as maintaining network density across participants is a priority + Returns + ------- + tau : float + Highest vaue of tau (threshold) at which network becomes node-connected. 
+ ''' + tau = 0.01 + connected = False + while connected == False: + if proportional: + w = bct.threshold_proportional(corrmat, tau) + else: + w = bct.threshold_absolute(corrmat, tau) + w_nx = nx.convert_matrix.from_numpy_array(w) + connected = nx.algorithms.components.is_connected(w_nx) + tau += 0.01 + return tau \ No newline at end of file From 745c206e21037399b4d924b6507d31b04ced8fe5 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 1 Dec 2022 12:47:18 -0800 Subject: [PATCH 12/48] move connectivity estimting to connectivity.py --- .../build_networks.py => connectivity.py} | 0 idconn/connectivity/__init__.py | 8 --- .../__pycache__/__init__.cpython-37.pyc | Bin 348 -> 0 bytes idconn/connectivity/estimate_thresh.py | 60 ------------------ 4 files changed, 68 deletions(-) rename idconn/{connectivity/build_networks.py => connectivity.py} (100%) delete mode 100644 idconn/connectivity/__init__.py delete mode 100644 idconn/connectivity/__pycache__/__init__.cpython-37.pyc delete mode 100644 idconn/connectivity/estimate_thresh.py diff --git a/idconn/connectivity/build_networks.py b/idconn/connectivity.py similarity index 100% rename from idconn/connectivity/build_networks.py rename to idconn/connectivity.py diff --git a/idconn/connectivity/__init__.py b/idconn/connectivity/__init__.py deleted file mode 100644 index afe46b9..0000000 --- a/idconn/connectivity/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -Tools for computing connectivity matrices/graphs -""" - -from . import build_networks -from . 
import estimate_thresh - -__all__ = ["build_networks", "estimate_thresh"] diff --git a/idconn/connectivity/__pycache__/__init__.cpython-37.pyc b/idconn/connectivity/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index 587b05d21a7b91244269ae5024ce1fb989e55580..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 348 zcmYk0!Ait16h+gtGnF!6GK zEfYSyCPn&mlr4=>F26H2uT9qvUg`!(Yf*c-k$#(ZmAA4Mjy1OGTbF%C_(c(oyQ@Je z4qEsJW7iIwJ>k5>bpgJ$!nHP7*%;%pn2ad^H#NXW{`yc>N&&nPn}+2OU$^YW3G3K; zWxN-0S^1^xO3bYJ}7Y# Nq}&uEp|fa4zXASXVy6HA diff --git a/idconn/connectivity/estimate_thresh.py b/idconn/connectivity/estimate_thresh.py deleted file mode 100644 index 7dd99a4..0000000 --- a/idconn/connectivity/estimate_thresh.py +++ /dev/null @@ -1,60 +0,0 @@ -import numpy as np -import networkx as nx -import pandas as pd -import bct - - -def scale_free_tau(corrmat, skew_thresh, proportional=True): - '''' - Calculates threshold at which network becomes scale-free, estimated from the skewness of the networks degree distribution. - Parameters - ---------- - corrmat : numpy.array - Correlation or other connectivity matrix from which tau_connected will be estimated. - Should be values between 0 and 1. - proportional : bool - Determines whether connectivity matrix is thresholded proportionally or absolutely. - Default is proportional as maintaining network density across participants is a priority - Returns - ------- - tau : float - Lowest vaue of tau (threshold) at which network is scale-free. - ''' - tau = 0.01 - skewness = 1 - while abs(skewness) > 0.3: - if proportional: - w = bct.threshold_proportional(corrmat, tau) - else: - w = bct.threshold_absolute(corrmat, tau) - skewness = skew(bct.degrees_und(w)) - tau += 0.01 - return tau - -def connected_tau(corrmat, proportional=True): - ''' - Calculates threshold at network becomes node connected, using NetworkX's `is_connected` function. 
- Parameters - ---------- - corrmat : numpy.array - Correlation or other connectivity matrix from which tau_connected will be estimated. - Should be values between 0 and 1. - proportional : bool - Determines whether connectivity matrix is thresholded proportionally or absolutely. - Default is proportional as maintaining network density across participants is a priority - Returns - ------- - tau : float - Highest vaue of tau (threshold) at which network becomes node-connected. - ''' - tau = 0.01 - connected = False - while connected == False: - if proportional: - w = bct.threshold_proportional(corrmat, tau) - else: - w = bct.threshold_absolute(corrmat, tau) - w_nx = nx.convert_matrix.from_numpy_array(w) - connected = nx.algorithms.components.is_connected(w_nx) - tau += 0.01 - return tau \ No newline at end of file From 6c57950f14fcc8eec019c48cc2d3c6886737c521 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 1 Dec 2022 12:49:51 -0800 Subject: [PATCH 13/48] add nbs to init --- idconn/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/idconn/__init__.py b/idconn/__init__.py index 83ad6d8..75cdf18 100644 --- a/idconn/__init__.py +++ b/idconn/__init__.py @@ -31,7 +31,8 @@ # "preprocessing", #"statistics", # "utils", - # "io", + "io", + "nbs", "__version__", ] From bc5da00874eb3da698131171b6dc0e271d3f6ec0 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Thu, 1 Dec 2022 13:09:03 -0800 Subject: [PATCH 14/48] update imports in connectivity and pipeline --- idconn/connectivity.py | 9 ++++----- idconn/pipeline.py | 8 ++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/idconn/connectivity.py b/idconn/connectivity.py index 15e3017..6f15e51 100644 --- a/idconn/connectivity.py +++ b/idconn/connectivity.py @@ -1,11 +1,11 @@ from posixpath import sep import numpy as np import pandas as pd -import idconn.connectivity.build_networks +#import idconn.connectivity.build_networks from os import makedirs from os.path import join, exists, basename from nilearn import input_data, datasets, connectome, image, plotting - +from . import __version__ #from .utils import contrast def _check_dims(matrix): @@ -18,7 +18,6 @@ def _check_dims(matrix): if matrix.ndim != 2: raise ValueError('Expected a square matrix, got array of shape' ' {0}.'.format(matrix.shape)) - def task_connectivity(layout, subject, session, task, atlas, confounds, connectivity_metric='correlation', out_dir=None): """ @@ -52,7 +51,7 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti """ #version = '0.1.1' try: - version = idconn.__version__ + version = __version__ except: version = 'test' if '.nii' in atlas: @@ -191,7 +190,7 @@ def connectivity(layout, subject, session, task, atlas, connectivity_metric='cor adjacency_matrix """ try: - version = idconn.__version__ + version = __version__ except: version = 'test' if '.nii' in atlas: diff --git a/idconn/pipeline.py b/idconn/pipeline.py index 667870f..38c0ccd 100644 --- a/idconn/pipeline.py +++ b/idconn/pipeline.py @@ -23,7 +23,7 @@ from os.path import exists #from glob import glob #from nilearn import input_data, connectome, plotting, image -from idconn.connectivity import build_networks +from idconn.connectivity import connectivity, task_connectivity from idconn.parser_utils import is_valid_file, is_valid_path #from idconn.networking import graph_theory, 
null_distribution @@ -116,17 +116,17 @@ def idconn_workflow(dset_dir, atlas, task, out_dir, space="MNI152NLin2009cAsym", print(f"here are the inputs: {layout, subject, session, task, atlas, conn, space, confounds}") if 'rest' in task: try: - adj_matrix = build_networks.connectivity(layout, subject, session, task, atlas, conn, space, confounds) + adj_matrix = connectivity(layout, subject, session, task, atlas, conn, space, confounds) except Exception as e: print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}') if len(conditions) < 1: try: - adj_matrix = build_networks.connectivity(layout, subject, session, task, atlas, conn, space, confounds) + adj_matrix = connectivity(layout, subject, session, task, atlas, conn, space, confounds) except Exception as e: print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}') else: try: - adj_matrix = build_networks.task_connectivity(layout=layout, subject=subject, session=session, task=task, atlas=atlas, confounds=confounds, connectivity_metric=conn) + adj_matrix = task_connectivity(layout=layout, subject=subject, session=session, task=task, atlas=atlas, confounds=confounds, connectivity_metric=conn) except Exception as e: print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}') From 6fe67219329eaa589f1e05ea5e7dd3d77e11fcd0 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Tue, 6 Dec 2022 13:16:57 -0800 Subject: [PATCH 15/48] fixed version import --- idconn/connectivity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/idconn/connectivity.py b/idconn/connectivity.py index 6f15e51..3746433 100644 --- a/idconn/connectivity.py +++ b/idconn/connectivity.py @@ -5,7 +5,7 @@ from os import makedirs from os.path import join, exists, basename from nilearn import input_data, datasets, connectome, image, plotting -from . 
import __version__ +from ._version import get_versions #from .utils import contrast def _check_dims(matrix): @@ -51,7 +51,7 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti """ #version = '0.1.1' try: - version = __version__ + version = get_versions()["version"] except: version = 'test' if '.nii' in atlas: From 423cb45d3bd1c58c83bac3e3fcb9482112f4ee56 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Tue, 6 Dec 2022 21:34:02 -0800 Subject: [PATCH 16/48] bug fixing, creating nbspredict wf --- idconn/__init__.py | 2 +- idconn/connectivity.py | 4 +- idconn/io.py | 19 +++--- idconn/nbs.py | 117 +++++++++++++++++++++----------- idconn/networking.py | 1 - idconn/workflows/nbs_predict.py | 60 ++++++++++++++++ setup.py | 3 +- 7 files changed, 154 insertions(+), 52 deletions(-) create mode 100644 idconn/workflows/nbs_predict.py diff --git a/idconn/__init__.py b/idconn/__init__.py index 75cdf18..000932b 100644 --- a/idconn/__init__.py +++ b/idconn/__init__.py @@ -12,7 +12,7 @@ warnings.simplefilter("ignore") from . import connectivity from . import data - #from . import figures + from . import nbs from . import networking # from . import preprocessing diff --git a/idconn/connectivity.py b/idconn/connectivity.py index 3746433..e54914b 100644 --- a/idconn/connectivity.py +++ b/idconn/connectivity.py @@ -163,7 +163,7 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti print('saving corrmat...', e) return files, avg_corrmats -def connectivity(layout, subject, session, task, atlas, connectivity_metric='correlation', confounds=None, out_dir=None): +def rest_connectivity(layout, subject, session, task, atlas, connectivity_metric='correlation', confounds=None, out_dir=None): """ Makes connectivity matrices per subject per session per task per condition. 
@@ -190,7 +190,7 @@ def connectivity(layout, subject, session, task, atlas, connectivity_metric='cor adjacency_matrix """ try: - version = __version__ + version = get_versions()["version"] except: version = 'test' if '.nii' in atlas: diff --git a/idconn/io.py b/idconn/io.py index 6ba41f1..487844b 100644 --- a/idconn/io.py +++ b/idconn/io.py @@ -161,7 +161,7 @@ def vectorize_corrmats(matrices): ---------- matrices : numpy array of shape (p, n, n) Represents the link strengths of the graphs. Assumed to be - an array of symmetric matrices. + an array of symmetric nxn matrices per participant and/or timepoint (p). Returns ------- @@ -169,7 +169,7 @@ def vectorize_corrmats(matrices): Represents an array of vectorized upper triangles of the input matrices. """ - #print(matrices.shape, matrices.ndim) + #print(f'\n\n\n{matrices.shape}, {matrices.ndim}\n\n\n') num_node = matrices.shape[1] upper_tri = np.triu_indices(num_node, k=1) if matrices.ndim == 3: @@ -202,7 +202,7 @@ def vectorize_corrmats(matrices): edge_vector = np.asarray(edge_vector) return edge_vector -def read_corrmats(layout, task, deriv_name, conf_measures=None, z_score=True, vectorized=True, verbose=False): +def read_corrmats(layout, task, deriv_name='IDConn', atlas=None, conf_measures=None, z_score=True, vectorized=True, verbose=False): """Returns a node x node x (subject x session) matrix of correlation matrices from a BIDS derivative folder. 
Optionally returns a subject x session dataframe of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) @@ -258,14 +258,15 @@ def read_corrmats(layout, task, deriv_name, conf_measures=None, z_score=True, ve subject=subject, session=session, suffix='bold', - scope='IDConn' + scope='IDConn', + atlas=atlas, ) if verbose: print(f'Corrmat path for sub-{subject}, ses-{session}: \t{path}') else: pass if type(path) == list: - #print(len(path)) + #print(path) path = path[0] else: pass @@ -294,9 +295,9 @@ def read_corrmats(layout, task, deriv_name, conf_measures=None, z_score=True, ve ppt_df.replace({'': np.nan}, inplace=True) return ppt_df -def undo_vectorize(edges): - j = len(edges) - num_node = (np.sqrt((8 * j) + 1) + 1) / 2 +def undo_vectorize(edges, num_node): + #j = len(edges) + #num_node = (np.sqrt((8 * j) + 1) + 1) / 2 X = np.zeros((num_node,num_node)) X[np.triu_indices(X.shape[0], k = 1)] = edges X = X + X.T @@ -335,7 +336,7 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap= axes=ax0, colorbar=False, annotate=False) - h = sns.heatmap(adj, square=True, cmap=cmap, ax=ax1) + h = sns.heatmap(adj, square=True, cmap=cmap, ax=ax1, center=0) if strength: fig2 = plt.figure(figsize=(12,4)) if title is not None: diff --git a/idconn/nbs.py b/idconn/nbs.py index 28420db..c5b4caa 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -1,15 +1,27 @@ import numpy as np -import statsmodels as sm +import statsmodels.api as sm import networkx as nx import pandas as pd -from io import vectorize_corrmats, undo_vectorize +from idconn.io import vectorize_corrmats, undo_vectorize from scipy.stats import t import enlighten -import bct +#import bct from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold -from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import LogisticRegression, LinearRegression +def calc_number_of_nodes(matrices): + if matrices.shape[0] != matrices.shape[1]: + if 
matrices.shape[1] == matrices.shape[2]: + num_node = matrices.shape[1] + matrices = np.moveaxis(matrices, 0, -1) + else: + raise ValueError(f'Matrices of shape {matrices.shape}', + 'requires matrices of shape (subject x session) x node x node', + 'or node x node x (subject x session).') + else: + num_node = matrices.shape[0] + return num_node def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000, stratified=False): ''' @@ -59,6 +71,9 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000 # and, if not predict, build a null distribution n = matrices.shape[:-1] ndims = len(matrices.shape) + #print(ndims) + #if ndims >=2 + num_node = calc_number_of_nodes(matrices) # vectorize_corrmats returns p x n^2 # we want to run pynbs per edge @@ -70,8 +85,10 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000 if ndims > 2: edges = vectorize_corrmats(matrices) else: - edges = matrices.copy() + raise ValueError(f'Input matrices have shape {matrices.shape},', + 'pyNBS requires matrices of shape (subject x session) x node x node.') edges = edges.T + #print(f'\n\n\n{edges.shape}\n\n\n') # run an ols per edge # create significancs matrix for predictor of interest (outcome) @@ -92,7 +109,7 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000 # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - sig_matrix = undo_vectorize(sig_edges) # need to write this function + sig_matrix = undo_vectorize(sig_edges, num_node) # need to write this function matrix = nx.from_numpy_array(sig_matrix) #use networkX to find connected components @@ -104,7 +121,10 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000 size = np.asarray([s.number_of_edges() for s in S]) (max_comp, ) = np.where(size == max(size)) largest_comp_size = max(size) - print(f'Connected component has {largest_comp_size} edges.') + if predict == False: 
+ print(f'Connected component has {largest_comp_size} edges.') + else: + pass # retain size of largest connected component # for NBS permutation-based significance testing @@ -127,8 +147,10 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000 # plotting in brain space for i in unused_nodes: S1.loc[i] = 0 - S1[i] = 0 - + temp = S1.copy() + temp[i] = 0 + S1 = temp.copy() + S1.sort_index(axis=0, inplace=True) S1.sort_index(axis=1, inplace=True) @@ -157,7 +179,7 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000 #print(np.sum(perm_edges)) # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - perm_matrix = undo_vectorize(perm_edges) # need to write this function + perm_matrix = undo_vectorize(perm_edges, num_node) # need to write this function perm_nx = nx.from_numpy_array(perm_matrix) comps = nx.connected_components(perm_nx) @@ -202,6 +224,8 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s an array of symmetric matrices. 
outcome : list-like of shape (p,) Y-value to be predicted with connectivity + groups : list-like of shape (p,) + Grouping variable - currently only works for 2 groups Returns ------- @@ -215,7 +239,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s cv_results = pd.DataFrame(index=index, columns=['split', - 'pval', + #'pval', 'score', 'component', 'coefficient_matrix', @@ -224,28 +248,32 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s if groups is not None: cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_iterations) - df = groups.shape[0] - 2 + dof = groups.shape[0] - 2 else: cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_iterations) - df = edges.shape[0] - 1 + dof = edges.shape[0] - 1 if tail == 'both': alpha = 0.01 else: alpha = 0.005 - t_threshold = t.ppf(1 - alpha, df=df) + t_threshold = t.ppf(1 - alpha, df=dof) - if matrices.shape[0] != matrices.shape[1]: - if matrices.shape[1] == matrices.shape[2]: - num_node = matrices.shape[1] - matrices = np.moveaxis(matrices, 0, -1) - else: - raise ValueError(f'Matrices of shape {matrices.shape}', - 'requires matrices of shape (subject x session) x node x node', - 'or node x node x (subject x session).') - else: - num_node = matrices.shape[0] + # really can't remember why tf I did this? + # maybe it's an artifact of permuted_ols? 
+ num_node = calc_number_of_nodes(matrices) + #print(num_node) + #if matrices.shape[0] != matrices.shape[1]: + # if matrices.shape[1] == matrices.shape[2]: + # num_node = matrices.shape[1] + #matrices = np.moveaxis(matrices, 0, -1) + # else: + # raise ValueError(f'Matrices of shape {matrices.shape}', + #'requires matrices of shape (subject x session) x node x node', + #'or node x node x (subject x session).') + #else: + # num_node = matrices.shape[0] upper_tri = np.triu_indices(num_node, k=1) i = 0 @@ -255,32 +283,47 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s cv_results.at[i, 'split'] = (train_idx, test_idx) # all of this presumes the old bctpy version of nbs # irrelevant for pynbs - #train_a_idx = [m for m in train_idx if outcome[m] == 0] - #train_b_idx = [m for m in train_idx if outcome[m] == 1] + #assert len(train_a_idx) == len(train_b_idx) - #train_a = matrices[:,:,train_a_idx] - #train_b = matrices[:,:,train_b_idx] + if groups is not None: + train_a_idx = [m for m in train_idx if groups[m] == 0] + train_b_idx = [m for m in train_idx if groups[m] == 1] + regressor = LogisticRegression(max_iter=1000) + else: + regressor = LinearRegression() + train_mats = matrices[train_idx,:,:] #print(train_a.shape, train_b.shape) # separate edges & covariates into train_y = outcome[train_idx] test_y = outcome[test_idx] - pval, adj, _ = pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000) + train_confounds = confounds.values[train_idx] + #test_confounds = confounds.values[test_idx] + + # perform NBS wooooooooo + # note: output is a dataframe :) + adj = pynbs(train_mats, train_y, train_confounds, alpha, predict=True) + #print(adj.shape, adj.ndim, adj[0].shape, upper_tri) - cv_results.at[i, 'pval'] = pval - cv_results.at[i, 'component'] = adj + #cv_results.at[i, 'pval'] = pval + cv_results.at[i, 'component'] = adj.values - nbs_vector = adj[upper_tri] + # grab the values of the adjacency matrix that are just in the 
upper triangle + # so you don't have repeated edges + nbs_vector = adj.values[upper_tri] + # use those to make a "significant edges" mask mask = nbs_vector == 1 + + # grab only the significant edges from testing and training sets of edges + # for use as features in the predictive models train_features = edges[train_idx, :].T[mask] test_features = edges[test_idx, :].T[mask] - # need an IF GROUPS statement - # ELSE statsmodels OLS - regressor = LogisticRegression(max_iter=1000) + # train model predicting outcome from brain (note: no mas covariates) model = regressor.fit(X=train_features.T, y=train_y) cv_results.at[i, 'model'] = model + # score that model on the testing data score = model.score(X=test_features.T, y=test_y) cv_results.at[i, 'score'] = score @@ -292,9 +335,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s m+=1 else: pass - X = np.zeros_like(adj) - X[np.triu_indices(X.shape[0], k=1)] = param_vector - X = X + X.T + X = undo_vectorize(param_vector, num_node=num_node) cv_results.at[i, 'coefficient_matrix'] = X cv_results.at[i, 'coefficient_vector'] = param_vector i += 1 diff --git a/idconn/networking.py b/idconn/networking.py index 273af7c..f74ee12 100644 --- a/idconn/networking.py +++ b/idconn/networking.py @@ -270,7 +270,6 @@ def graph_omst(matrix, measure, args): metric = measure(thresh_mat, args) return metric - def scale_free_tau(corrmat, skew_thresh, proportional=True): '''' Calculates threshold at which network becomes scale-free, estimated from the skewness of the networks degree distribution. 
diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py new file mode 100644 index 0000000..169531a --- /dev/null +++ b/idconn/workflows/nbs_predict.py @@ -0,0 +1,60 @@ +from idconn import nbs, io +import pandas as pd +import numpy as np +import bids +from os.path import join +from datetime import datetime +from time import strftime + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset' +TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset' +DERIV_NAME = 'IDConn' +OUTCOME = 'Mean E2 (pg/mL)' +atlas_fname = '/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz' + +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task='rest', atlas='craddock2012', z_score=False) + +keep = dat['adj'].dropna().index +dat = dat.loc[keep] +#print(dat['adj'].values.shape) +num_node = dat.iloc[0]['adj'].shape[0] + +matrices = np.vstack(dat['adj'].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]),1)) +confounds = dat[['bc', 'menst_cycle-day']] +alpha = 0.1 +fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn' + +cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=4, n_iterations=2, k=1000, shuffle=False, fig_dir=fig_dir) + +cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t') +best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0] +subnetwork = cv_results.loc[best]['component'] +subnetwork_df = pd.DataFrame(subnetwork, + index=range(0,num_node), + columns=range(0,num_node)) + +subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_edge_parameters-{today_str}.tsv'),sep='\t') + +nbs_vector = subnetwork[upper_tri] +mask = nbs_vector == 1 
+edges = np.vstack(dat['edge_vector'].values) +features = edges[:,mask] +#plot the parameters +param_mat = cv_results.loc[best]['coefficient_matrix'] +odds = 10 ** param_mat +prob = odds / (1 + odds) + +# run the model on the whole 28andMe dataset to get params +model = cv_results.loc[best]['model'] +model.fit(features, outcome) +fig,fig2 = io.plot_edges(param_mat, atlas_fname, title=None, strength=True, cmap='icefire', node_size='strength') +fig.savefig('/Users/katherine.b/Dropbox/Projects/IDConn/test1.png') +fig2.savefig('/Users/katherine.b/Dropbox/Projects/IDConn/test2.png') \ No newline at end of file diff --git a/setup.py b/setup.py index 6d42c12..abab8f5 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,8 @@ "bctpy", "pybids", "networkx", - "matplotlib", # necessary until nilearn includes mpl as a dependency + "matplotlib", # necessary until nilearn includes mpl as a dependency + "enlighten", ], extras_require={ "doc": [ From 49ae2746abb8d0ce9dcb6b1c7c2da181868e9c2f Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Tue, 6 Dec 2022 21:52:12 -0800 Subject: [PATCH 17/48] changed default color palette --- idconn/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idconn/io.py b/idconn/io.py index 487844b..0930d06 100644 --- a/idconn/io.py +++ b/idconn/io.py @@ -303,7 +303,7 @@ def undo_vectorize(edges, num_node): X = X + X.T return X -def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='icefire', node_size='strength'): +def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='vlag', node_size='strength'): coords = plotting.find_parcellation_cut_coords(atlas_nii) num_node = adj.shape[0] # only plot the top t% of edges From 5dd8ba725b13b571fcf489a344a29228fa994223 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Tue, 6 Dec 2022 22:05:32 -0800 Subject: [PATCH 18/48] nbs-predict runsgit add idconn/workflows/nbs_predict.py --- idconn/workflows/nbs_predict.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 169531a..1436528 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -9,15 +9,18 @@ today = datetime.today() today_str = strftime("%m_%d_%Y") -TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset' +TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/ds002674' TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset' DERIV_NAME = 'IDConn' -OUTCOME = 'Mean E2 (pg/mL)' +OUTCOME = 'estradiol' +CONFOUNDS = ['bc'] +TASK = 'rest' +ATLAS = 'craddock2012' atlas_fname = '/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz' layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) -dat = io.read_corrmats(layout, task='rest', atlas='craddock2012', z_score=False) +dat = io.read_corrmats(layout, task=TASK, atlas=ATLAS, z_score=False) keep = dat['adj'].dropna().index dat = dat.loc[keep] @@ -28,7 +31,7 @@ upper_tri = np.triu_indices(num_node, k=1) outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]),1)) -confounds = dat[['bc', 'menst_cycle-day']] +confounds = dat[CONFOUNDS] alpha = 0.1 fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn' @@ -56,5 +59,22 @@ model = cv_results.loc[best]['model'] model.fit(features, outcome) fig,fig2 = io.plot_edges(param_mat, atlas_fname, title=None, strength=True, cmap='icefire', node_size='strength') -fig.savefig('/Users/katherine.b/Dropbox/Projects/IDConn/test1.png') -fig2.savefig('/Users/katherine.b/Dropbox/Projects/IDConn/test2.png') \ No newline at end of file +fig.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_betas-{today_str}.png'), dpi=400) +fig2.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, 
f'nbs-predict_betas-strength-{today_str}.png'), dpi=400) + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, atlas=ATLAS, z_score=False) + +test_df.dropna(inplace=True) + +outcome_test = test_df[OUTCOME].values +groups_test = outcome +matrices_test = np.vstack(test_df['adj'].dropna().values).reshape((len(test_df['adj'].dropna().index),num_node,num_node)) +edges_test = np.vstack(test_df['edge_vector'].dropna().values) + +test_features = edges_test.T[mask,:] +test_outcome = test_df[OUTCOME].values +accuracy = model.score(test_features.T, test_outcome) +print('Independent prediction accuracy:\t', accuracy) +np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'accuracy-{today_str}.txt'), [accuracy]) \ No newline at end of file From 2aa185feccff6a9d968cbc13a9796ca164682546 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Tue, 6 Dec 2022 22:11:34 -0800 Subject: [PATCH 19/48] removve unused options' --- idconn/nbs.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/idconn/nbs.py b/idconn/nbs.py index c5b4caa..41b63ca 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -78,8 +78,10 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000 # vectorize_corrmats returns p x n^2 # we want to run pynbs per edge # so vectorized edges must be transposed - - exog = np.hstack((outcome, confounds)) + if confounds: + exog = np.hstack((outcome, confounds)) + else: + exog = outcome exog = sm.add_constant(exog, prepend=False) # turn matrices into vectorized upper triangles if ndims > 2: @@ -206,7 +208,7 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000 else: return S1 -def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10, k=1000, shuffle=False, fig_dir=None): +def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10): """Calculates 
the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided of shape ((subject x session)x node x node) in the network. @@ -289,6 +291,8 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s train_a_idx = [m for m in train_idx if groups[m] == 0] train_b_idx = [m for m in train_idx if groups[m] == 1] regressor = LogisticRegression(max_iter=1000) + elif np.unique(outcome).shape[0] >2: + regressor = LogisticRegression(max_iter=1000) else: regressor = LinearRegression() train_mats = matrices[train_idx,:,:] From f4b8531d48deb97de8aad5952d8fed24666dde23 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Tue, 6 Dec 2022 22:12:00 -0800 Subject: [PATCH 20/48] remove unused options --- idconn/workflows/nbs_predict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 1436528..d3bfbb8 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -35,7 +35,7 @@ alpha = 0.1 fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn' -cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=4, n_iterations=2, k=1000, shuffle=False, fig_dir=fig_dir) +cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10000) cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t') best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0] From 8228235495b169cf937c2a9479f4620a15f082ff Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Tue, 6 Dec 2022 22:15:15 -0800 Subject: [PATCH 21/48] auto-detect binary outcome and select logistic regression --- idconn/nbs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/idconn/nbs.py b/idconn/nbs.py index 41b63ca..164eb68 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -23,7 +23,7 @@ def calc_number_of_nodes(matrices): num_node = matrices.shape[0] return num_node -def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000, stratified=False): +def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutations=10000, stratified=False): ''' Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided of shape ((subject x session)x node x node) @@ -78,7 +78,7 @@ def pynbs(matrices, outcome, confounds, alpha, predict=False, permutations=10000 # vectorize_corrmats returns p x n^2 # we want to run pynbs per edge # so vectorized edges must be transposed - if confounds: + if confounds is not None: exog = np.hstack((outcome, confounds)) else: exog = outcome @@ -291,7 +291,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s train_a_idx = [m for m in train_idx if groups[m] == 0] train_b_idx = [m for m in train_idx if groups[m] == 1] regressor = LogisticRegression(max_iter=1000) - elif np.unique(outcome).shape[0] >2: + elif np.unique(outcome).shape[0] == 2: regressor = LogisticRegression(max_iter=1000) else: regressor = LinearRegression() From 37f29b7f7a233b77c213f58f0946ff8f5b82cc89 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Wed, 7 Dec 2022 09:15:44 -0800 Subject: [PATCH 22/48] bypass regression if no significant edges --- idconn/nbs.py | 59 ++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/idconn/nbs.py b/idconn/nbs.py index 164eb68..9513eef 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -312,36 +312,41 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s #cv_results.at[i, 'pval'] = pval cv_results.at[i, 'component'] = adj.values + + # in the event of no edges significantly related to + if sum(adj) > 0: + # grab the values of the adjacency matrix that are just in the upper triangle + # so you don't have repeated edges + nbs_vector = adj.values[upper_tri] + # use those to make a "significant edges" mask + mask = nbs_vector == 1 - # grab the values of the adjacency matrix that are just in the upper triangle - # so you don't have repeated edges - nbs_vector = adj.values[upper_tri] - # use those to make a "significant edges" mask - mask = nbs_vector == 1 + # grab only the significant edges from testing and training sets of edges + # for use as features in the predictive models + train_features = edges[train_idx, :].T[mask] + test_features = edges[test_idx, :].T[mask] - # grab only the significant edges from testing and training sets of edges - # for use as features in the predictive models - train_features = edges[train_idx, :].T[mask] - test_features = edges[test_idx, :].T[mask] - # train model predicting outcome from brain (note: no mas covariates) - model = regressor.fit(X=train_features.T, y=train_y) - cv_results.at[i, 'model'] = model - # score that model on the testing data - score = model.score(X=test_features.T, y=test_y) - cv_results.at[i, 'score'] = score + # train model predicting outcome from brain (note: no mas covariates) + model = regressor.fit(X=train_features.T, y=train_y) + cv_results.at[i, 'model'] = model + # score that model on the testing data + score 
= model.score(X=test_features.T, y=test_y) + cv_results.at[i, 'score'] = score - m = 0 - param_vector = np.zeros_like(nbs_vector) - for l in range(0, nbs_vector.shape[0]): - if nbs_vector[l] == 1.: - param_vector[l] = model.coef_[0,m] - m+=1 - else: - pass - X = undo_vectorize(param_vector, num_node=num_node) - cv_results.at[i, 'coefficient_matrix'] = X - cv_results.at[i, 'coefficient_vector'] = param_vector - i += 1 + m = 0 + param_vector = np.zeros_like(nbs_vector) + for l in range(0, nbs_vector.shape[0]): + if nbs_vector[l] == 1.: + param_vector[l] = model.coef_[0,m] + m+=1 + else: + pass + X = undo_vectorize(param_vector, num_node=num_node) + cv_results.at[i, 'coefficient_matrix'] = X + cv_results.at[i, 'coefficient_vector'] = param_vector + i += 1 + else: + pass ticks.update() return cv_results \ No newline at end of file From dfc470d87a182c5689f724b1f09327b901f35801 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Wed, 7 Dec 2022 09:25:41 -0800 Subject: [PATCH 23/48] testing nbs-predict (it runs) --- idconn/workflows/nbs_predict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index d3bfbb8..0ab55c0 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -35,7 +35,7 @@ alpha = 0.1 fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn' -cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10000) +cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=1000) cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t') best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0] From 26f69187b4c2b920a0d4d53f58c70252ad353570 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Wed, 7 Dec 2022 10:43:48 -0800 Subject: [PATCH 24/48] standardize output file names --- idconn/workflows/nbs_predict.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 0ab55c0..dfdae87 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -37,14 +37,14 @@ cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=1000) -cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t') +cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t') best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0] subnetwork = cv_results.loc[best]['component'] subnetwork_df = pd.DataFrame(subnetwork, index=range(0,num_node), columns=range(0,num_node)) -subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_edge_parameters-{today_str}.tsv'),sep='\t') +subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_edge-parameters-{today_str}.tsv'),sep='\t') nbs_vector = subnetwork[upper_tri] mask = nbs_vector == 1 @@ -59,8 +59,8 @@ model = cv_results.loc[best]['model'] model.fit(features, outcome) fig,fig2 = io.plot_edges(param_mat, atlas_fname, title=None, strength=True, cmap='icefire', node_size='strength') -fig.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_betas-{today_str}.png'), dpi=400) -fig2.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_betas-strength-{today_str}.png'), dpi=400) +fig.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_betas-{today_str}.png'), dpi=400) +fig2.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_betas-strength-{today_str}.png'), 
dpi=400) layout = bids.BIDSLayout(TEST_DSET, derivatives=True) @@ -77,4 +77,4 @@ test_outcome = test_df[OUTCOME].values accuracy = model.score(test_features.T, test_outcome) print('Independent prediction accuracy:\t', accuracy) -np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'accuracy-{today_str}.txt'), [accuracy]) \ No newline at end of file +np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_accuracy-{today_str}.txt'), [accuracy]) \ No newline at end of file From c8ec46f4bce0dcf749ade1aa7aa418b0d254e113 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Wed, 7 Dec 2022 20:35:58 -0800 Subject: [PATCH 25/48] add correlation, fix logistic conditional --- idconn/workflows/nbs_predict.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index dfdae87..b09fac7 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -5,6 +5,7 @@ from os.path import join from datetime import datetime from time import strftime +from scipy.stats import spearmanr today = datetime.today() today_str = strftime("%m_%d_%Y") @@ -31,7 +32,10 @@ upper_tri = np.triu_indices(num_node, k=1) outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]),1)) -confounds = dat[CONFOUNDS] +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] +else: + confounds = None alpha = 0.1 fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn' @@ -55,7 +59,7 @@ odds = 10 ** param_mat prob = odds / (1 + odds) -# run the model on the whole 28andMe dataset to get params +# run the model on the whole test dataset to get params model = cv_results.loc[best]['model'] model.fit(features, outcome) fig,fig2 = io.plot_edges(param_mat, atlas_fname, title=None, strength=True, cmap='icefire', node_size='strength') @@ -75,6 +79,17 @@ test_features = edges_test.T[mask,:] test_outcome = test_df[OUTCOME].values -accuracy = 
model.score(test_features.T, test_outcome) -print('Independent prediction accuracy:\t', accuracy) -np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_accuracy-{today_str}.txt'), [accuracy]) \ No newline at end of file +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) +score = model.score(test_features.T, test_outcome) +print('Independent prediction accuracy:\t', score) +pred_outcome = model.predict(test_features.T) +if len(np.unique(test_outcome)) > 2: + corr = spearmanr(test_outcome, pred_outcome) + print('\nSpearman correlation:\t', corr) + np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score, corr[0], corr[1]]) +else: + np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score]) + From e83affa503ecb01a654f9f4e989e4d2a5c2a49e7 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 8 Dec 2022 17:02:12 -0800 Subject: [PATCH 26/48] removve out_dir, add docs --- idconn/connectivity.py | 54 ++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/idconn/connectivity.py b/idconn/connectivity.py index e54914b..cf14137 100644 --- a/idconn/connectivity.py +++ b/idconn/connectivity.py @@ -19,35 +19,37 @@ def _check_dims(matrix): raise ValueError('Expected a square matrix, got array of shape' ' {0}.'.format(matrix.shape)) -def task_connectivity(layout, subject, session, task, atlas, confounds, connectivity_metric='correlation', out_dir=None): +def task_connectivity(layout, subject, session, task, atlas, confounds, connectivity_metric='correlation'): """ Makes connectivity matrices per subject per session per task per condition. 
Parameters ---------- - dset_dir : str - BIDS-formatted dataset path (top-level, in which a 'derivatives/' directory will be made if one does not exist) + layout : BIDSLayout object + BIDSLayout (i.e., pybids layout object) for directory containing data for analysis (with `derivative=True`, as we're using fmriprep output). subject : str Subject ID for which the networks will be calculated. session : str, optional - Session of data collection. If there's only one session, we'll find it. + Session of data collection for which networks will be calculated. If there's only one session, we'll find it. task : str - Name of task fMRI scan from which networks will be calculated. + Name of task fMRI scan (can be "rest") from which networks will be calculated. connectivity_metric : {"correlation", "partial correlation", "tangent",\ "covariance", "precision"}, optional - The matrix kind. Passed to Nilearn's `ConnectivityMeasure`. + The matrix kind. Passed to Nilearn's `ConnectivityMeasure`. Default is product-moment correlation, "correlation". space : str - 'native' if analyses will be performed in subjects' functional native space (atlas(es) should be transformed) - 'mni152-2mm' if analyses will be performed in MNI125 2mm isotropic space (fMRI data should already be transformed) + 'native' if analyses will be performed in subjects' functional native space (atlas(es) should be transformed into this space already). + 'mni152-2mm' if analyses will be performed in MNI125 2mm isotropic space (fMRI data should already be transformed into MNI space). atlas : str If you want to grab an atlas using Nilearn, this is the name of the atlas and must match the corresponding function `fetch_atlas_[name]` in `nilearn.datasets`. - If you have your own atlas, this is the path to that nifti file.` + If you have your own atlas, this is the path to that nifti file. Currently: only works with paths. confounds : list-like - Filenames of confounds files. 
+ Columns from fMRIPrep confounds output to be regressed out of fMRI data before correlation matrices are made. Returns ------- - confounds_file : str - Filename of merged confounds .tsv file + avg_corrmats: numpy array + Average corrmat (per condition, if applicable). + files : list + Filenames of computed correlation matrices. """ #version = '0.1.1' try: @@ -57,10 +59,8 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti if '.nii' in atlas: assert exists(atlas), f'Mask file does not exist at {atlas}' - if not out_dir: - deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}') - else: - deriv_dir = out_dir + deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}') + space = 'MNI152NLin2009cAsym' atlas_name = basename(atlas).rsplit('.', 2)[0] # use pybids here to grab # of runs and preproc bold filenames @@ -163,14 +163,14 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti print('saving corrmat...', e) return files, avg_corrmats -def rest_connectivity(layout, subject, session, task, atlas, connectivity_metric='correlation', confounds=None, out_dir=None): +def rest_connectivity(layout, subject, session, task, atlas, confounds=None,connectivity_metric='correlation'): """ Makes connectivity matrices per subject per session per task per condition. Parameters ---------- layout : str - BIDS layout with derivatives indexed from pyBIDS + BIDS layout with fMRIPrep derivatives indexed from pyBIDS subject : str Subject ID for which the networks will be calculated. session : str, optional @@ -178,16 +178,17 @@ def rest_connectivity(layout, subject, session, task, atlas, connectivity_metric connectivity_metric : {"correlation", "partial correlation", "tangent",\ "covariance", "precision"}, optional The matrix kind. Passed to Nilearn's `ConnectivityMeasure`. 
- space : str - 'native' if analyses will be performed in subjects' functional native space (atlas(es) should be transformed) - 'mni152-2mm' if analyses will be performed in MNI125 2mm isotropic space (fMRI data should already be transformed) atlas : str - Name of atlas for parcellating voxels into nodes, must be in the same `space` given above. + Name of atlas for parcellating voxels into nodes, must be in the same `space` as preprocessed rsfMRI data from fMRIPrep. confounds : list-like Names of confounds (should be columns in fmriprep output confounds.tsv). Returns ------- - adjacency_matrix + corrmat_df : Pandas dataframe + Functional connectivity matrix with labeled nodes (i.e., rows, columns) and weighted edges (i.e., elements) based on + the connectivity metric selected. If multiple runs, represents average across runs. + corrmat_file : str + Path to saved correlation matrix. """ try: version = get_versions()["version"] @@ -196,16 +197,13 @@ def rest_connectivity(layout, subject, session, task, atlas, connectivity_metric if '.nii' in atlas: assert exists(atlas), f'Mask file does not exist at {atlas}' - if not out_dir: - deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}') - else: - deriv_dir = out_dir + deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}') atlas_name = basename(atlas).rsplit('.', 2)[0] # use pybids here to grab # of runs and preproc bold filenames connectivity_measure = connectome.ConnectivityMeasure(kind=connectivity_metric) bold_files = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz') # should be preprocessed BOLD file from fmriprep, grabbed with pybids print(f'BOLD files found at {bold_files}') - confounds_files = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task) + #confounds_files = layout.get(scope='derivatives', return_type='file', 
desc='confounds',subject=subject,session=session, task=task) runs = [] if len(bold_files) > 1: From 8e119f622cdf5c68dbd1fb8e01717eecb6100d71 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 8 Dec 2022 17:13:53 -0800 Subject: [PATCH 27/48] removed unused params, added docs --- idconn/nbs.py | 75 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/idconn/nbs.py b/idconn/nbs.py index 9513eef..e9d59fe 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -3,7 +3,7 @@ import networkx as nx import pandas as pd from idconn.io import vectorize_corrmats, undo_vectorize -from scipy.stats import t +from scipy.stats import t, pearsonr, pointbiserialr, spearmanr import enlighten #import bct @@ -23,7 +23,7 @@ def calc_number_of_nodes(matrices): num_node = matrices.shape[0] return num_node -def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutations=10000, stratified=False): +def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutations=10000): ''' Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided of shape ((subject x session)x node x node) @@ -52,10 +52,7 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat permutations : int If `predict=False`, specifies the number of permutations run to create a null distribution for estimating the significance of the connected component size. Recommended 10,000. - stratified : bool or list-like of shape (p,) - If `predict=True` and there are groups that should be equally sampled across k-fold - cross-validation, input should be a list of group belonging (i.e., one label per participant). 
- + Returns ------- S1 : Pandas dataframe @@ -69,7 +66,7 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat # and retain significant edges # then find the largest connected component # and, if not predict, build a null distribution - n = matrices.shape[:-1] + n = matrices.shape[0] ndims = len(matrices.shape) #print(ndims) #if ndims >=2 @@ -98,10 +95,17 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat # 0 if it's not sig_edges = [] for i in range(0, edges.shape[0]): + y = edges[i,:] # statsmodels for regressing predictors on edges - mod = sm.OLS(edges[i,:], exog, hasconst=True) - results = mod.fit() - edge_pval = results.pvalues[0] + #mod = sm.OLS(y, exog, hasconst=True) + #results = mod.fit() + #edge_pval = results.pvalues[0] + + # let's try straight up correlations? + if len(np.unique(outcome)) > 2: + r, edge_pval = pearsonr(outcome.reshape(n,), y.reshape(n,)) + else: + r, edge_pval = pointbiserialr(outcome.reshape(n,), y.reshape(n,)) # build binary significance edge vector if edge_pval < alpha: @@ -148,9 +152,9 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat # and NBS might need all nodes for easier # plotting in brain space for i in unused_nodes: - S1.loc[i] = 0 + S1.loc[i] = 0.0 temp = S1.copy() - temp[i] = 0 + temp[i] = 0.0 S1 = temp.copy() S1.sort_index(axis=0, inplace=True) @@ -160,7 +164,6 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat # only for regular NBS, -Predict doesn't need this if predict == False: perms = np.zeros((permutations,)) - hit = 0 rng = np.random.default_rng() exog_copy = exog.copy() for i in range(0, permutations): @@ -208,7 +211,7 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat else: return S1 -def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10): +def kfold_nbs(matrices, outcome, confounds, alpha=0.05, 
groups=None, n_splits=10, n_iterations=10): """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided of shape ((subject x session)x node x node) in the network. @@ -226,14 +229,28 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s an array of symmetric matrices. outcome : list-like of shape (p,) Y-value to be predicted with connectivity + confounds : list-like + Names of columns in `participants.tsv` to be regressed out of connectivity and outcome + data in each CV fold (per recommendation from Snoek et al., 2019). + alpha : float + Proportion of type II errors (i.e., false positives) we're willing to put up with. + This is the upper limit for pvalues in the edge detection process. groups : list-like of shape (p,) - Grouping variable - currently only works for 2 groups + Grouping variable - currently only works for 2 groups. Will enforce stratified k-fold CV. + n_splits : int + Value of K for K-fold cross-validation. Will split data into K chunks, train on K-1 chunks and test on the Kth. + n_iterations : int + Number of times to run K-fold cross-validation. More times = more stable results. Returns ------- + weighted_average : Pandas dataframe + Includes the average of all largest components across folds and iterations, weighted by + their prediction performance (i.e., accuracy for binary outcome, correlation for continuous). + Could be used for out-of-sample prediction, once thresholded and binarized. cv_results : Pandas dataframe - Includes the results of each cross-validation loop - the input matrices. + Includes the results of each cross-validation loop + (e.g., predictive performance, data split, largest connected component per fold per iteration). 
""" edges = vectorize_corrmats(matrices) #print(edges.shape) @@ -256,11 +273,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s n_repeats=n_iterations) dof = edges.shape[0] - 1 - if tail == 'both': - alpha = 0.01 - else: - alpha = 0.005 - t_threshold = t.ppf(1 - alpha, df=dof) + #t_threshold = t.ppf(1 - alpha, df=dof) # really can't remember why tf I did this? # maybe it's an artifact of permuted_ols? @@ -302,7 +315,10 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s train_y = outcome[train_idx] test_y = outcome[test_idx] - train_confounds = confounds.values[train_idx] + if confounds is not None: + train_confounds = confounds.values[train_idx] + else: + train_confounds = None #test_confounds = confounds.values[test_idx] # perform NBS wooooooooo @@ -314,24 +330,25 @@ def kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_s cv_results.at[i, 'component'] = adj.values # in the event of no edges significantly related to - if sum(adj) > 0: + #print(sum(sum(adj.values)), '\n', adj.values.shape) + if sum(sum(adj.values)) > 0: # grab the values of the adjacency matrix that are just in the upper triangle # so you don't have repeated edges nbs_vector = adj.values[upper_tri] # use those to make a "significant edges" mask - mask = nbs_vector == 1 + mask = nbs_vector == 1.0 # grab only the significant edges from testing and training sets of edges # for use as features in the predictive models train_features = edges[train_idx, :].T[mask] test_features = edges[test_idx, :].T[mask] - # train model predicting outcome from brain (note: no mas covariates) - model = regressor.fit(X=train_features.T, y=train_y) - cv_results.at[i, 'model'] = model + #print(train_features.T.shape, train_y.shape) + model = regressor.fit(X=train_features.T, y=train_y.ravel()) + #cv_results.at[i, 'model'] = model # score that model on the testing data - score = model.score(X=test_features.T, y=test_y) + score = 
model.score(X=test_features.T, y=test_y.ravel()) cv_results.at[i, 'score'] = score m = 0 From 8336a8b09560bf7ca7328671deab7f8338bb2f45 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 9 Mar 2023 19:50:18 -0800 Subject: [PATCH 28/48] commit before overhauling nbs.py --- idconn/nbs.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/idconn/nbs.py b/idconn/nbs.py index e9d59fe..cbcc395 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -8,6 +8,7 @@ #import bct from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold +from sklearn.feature_selection import f_regression, f_classif from sklearn.linear_model import LogisticRegression, LinearRegression def calc_number_of_nodes(matrices): @@ -115,17 +116,17 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - sig_matrix = undo_vectorize(sig_edges, num_node) # need to write this function + sig_matrix = undo_vectorize(sig_edges, num_node) + + # turn it into a networkx matrix matrix = nx.from_numpy_array(sig_matrix) #use networkX to find connected components - comps = nx.connected_components(matrix) + largest_cc = max(nx.connected_components(matrix), key=len) + G0 = G.subgraph(largest_cc) + + # grab number of edges from G0 - # rearrange networkx output into an array of matrices, S - S = [matrix.subgraph(c).copy() for c in comps] - # find size of each connected component, s in S - size = np.asarray([s.number_of_edges() for s in S]) - (max_comp, ) = np.where(size == max(size)) largest_comp_size = max(size) if predict == False: print(f'Connected component has {largest_comp_size} edges.') @@ -187,7 +188,9 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat perm_matrix = undo_vectorize(perm_edges, num_node) # need to write this function perm_nx = nx.from_numpy_array(perm_matrix) - comps = 
nx.connected_components(perm_nx) + #comps = nx.connected_components(perm_nx) + + S = [perm_nx.subgraph(c).copy() for c in comps] perm_size = np.asarray([s.number_of_edges() for s in S]) From a3869a07ae48e44ca5031510c8c209fa19171eeb Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 9 Mar 2023 19:51:04 -0800 Subject: [PATCH 29/48] add docstrings to io --- idconn/io.py | 79 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 12 deletions(-) diff --git a/idconn/io.py b/idconn/io.py index 0930d06..7690615 100644 --- a/idconn/io.py +++ b/idconn/io.py @@ -19,6 +19,7 @@ def build_statsmodel_json(name, task, contrast, confounds, highpass, mask, conn_meas, graph_meas=None, exclude=None, outfile=None): ''' Creates a BIDS Stats Models json with analysis details for further use. + DOES NOT WORK YET. Parameters ---------- @@ -202,16 +203,25 @@ def vectorize_corrmats(matrices): edge_vector = np.asarray(edge_vector) return edge_vector -def read_corrmats(layout, task, deriv_name='IDConn', atlas=None, conf_measures=None, z_score=True, vectorized=True, verbose=False): +def read_corrmats(layout, task, deriv_name='IDConn', z_score=True, vectorized=True, verbose=False): """Returns a node x node x (subject x session) matrix of correlation matrices from a BIDS derivative folder. Optionally returns a subject x session dataframe of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) array of vectorized upper triangles of those correlation matrices. Parameters ---------- - matrices : numpy array of shape (n, n, p) - Represents the link strengths of the graphs. Assumed to be - an array of symmetric matrices. + layout : BIDSLayout object + BIDSLayout (i.e., pybids layout object) for directory containing data for analysis (with `derivative=True`, as we're using fmriprep output). + task : str + Name of task fMRI scan (can be "rest") from which networks will be calculated. 
+ deriv_name : str + Name of the package used to generate the correlation matrices to be read. Could be IDConn, could be something else. + z_score : bool + If True, assumes computed connectivity matrices are product-moment correlations, uses Fisher's r-to-Z. + vectorized : bool + Would you also like this function to return the vectorized upper triangles of all your matrices? + verbose : bool + Print statements? Y/N? Returns ------- @@ -296,6 +306,20 @@ def read_corrmats(layout, task, deriv_name='IDConn', atlas=None, conf_measures=N return ppt_df def undo_vectorize(edges, num_node): + ''' + Puts an edge vector back into an adjacency matrix. + Parameters + ---------- + edges : list-like of shape ((n^2-n)/2,) + Vectorized upper triangle of an adjacency matrix. + num_node : int + The number of nodes in the graph. I would calculate this myself, but I'd rather not. + + Returns + ------- + matrix : numpy array of size (n,n) + Symmetric array of connectivity values. + ''' #j = len(edges) #num_node = (np.sqrt((8 * j) + 1) + 1) / 2 X = np.zeros((num_node,num_node)) @@ -303,7 +327,37 @@ def undo_vectorize(edges, num_node): X = X + X.T return X -def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='vlag', node_size='strength'): +def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='coolwarm', node_size='strength'): + ''' + Plots the edges of a connectivity/adjacency matrix both in a heatmap and in brain space, with the option to include + a surface plot of node strength. + Parameters + ---------- + adj : array-like of shape (n, n) + Adjacency matrix to be plotted. Can be numpy array or Pandas dataframe. + atlas_nii : str + Path to the atlas used to define nodes in the adjacency matrix. + Should be one value per node, with the same number of values as rows and columns in adj (i.e., n). + Background should be 0, should be in MNI space. 
+ threshold : int + Percentile of edges to plot, between 0 and 100 such that 0 plots all the edges and 100 plots none. + If not specified, default is 99, which plots the top 1% of edges. + title : str + Title for plots. + strength : bool + If True, plots surface maps of node strength (i.e., the sum of all a node's edge weights) + cmap : str + One of the matplotlib colormaps. + node_size : int or 'strength' + Size to plot nodes in brain space. If 'strength', node size varies according to a node's summed edges (i.e., strength). + + Returns + ------- + fig1 : Matplotlib figure object + Connectivity figure. + fig2 : Matplotlib figure object + If `strength=True`, the surface node strength plot. + ''' coords = plotting.find_parcellation_cut_coords(atlas_nii) num_node = adj.shape[0] # only plot the top t% of edges @@ -331,12 +385,12 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap= g = plotting.plot_connectome(adj, coords, node_size=node_size, edge_threshold=threshold, - edge_cmap=cmap, + edge_cmap='coolwarm', figure=fig, axes=ax0, colorbar=False, annotate=False) - h = sns.heatmap(adj, square=True, cmap=cmap, ax=ax1, center=0) + h = sns.heatmap(adj, square=True, cmap='coolwarm', ax=ax1, center=0) if strength: fig2 = plt.figure(figsize=(12,4)) if title is not None: @@ -347,7 +401,8 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap= for i in np.arange(0,num_node): regn_sch_arr[np.where(regn_sch_arr == i+1)] = np.sum(adj[i]) strength_nimg = nib.Nifti1Image(regn_sch_arr, nimg.affine) - nib.save(strength_nimg, '/Users/katherine.b/Dropbox/HC_Use_predictive-strength.nii') + # replace this filename with BIDSy output + #nib.save(strength_nimg, f'/Users/katherine.b/Dropbox/{title}predictive-strength.nii') gs = GridSpec(1, 4) # plot edge weights on surfaces @@ -361,13 +416,13 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap= plt.tight_layout(w_pad=-1) i = 
plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5, - cmap=cmap, view='lateral', colorbar=False, axes=ax2) + cmap='coolwarm', view='lateral', colorbar=False, axes=ax2) j = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5, - cmap=cmap, view='medial', colorbar=False, axes=ax3) + cmap='coolwarm', view='medial', colorbar=False, axes=ax3) k = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5, - cmap=cmap, view='lateral', colorbar=False, axes=ax4) + cmap='coolwarm', view='lateral', colorbar=False, axes=ax4) l = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5, - cmap=cmap, view='medial', colorbar=False, axes=ax5) + cmap='coolwarm', view='medial', colorbar=False, axes=ax5) return fig, fig2 else: return fig \ No newline at end of file From cadd0d40811500b9f3da296384956e77a4e982c8 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Thu, 9 Mar 2023 19:52:50 -0800 Subject: [PATCH 30/48] rename task/rest conn modules --- idconn/pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/idconn/pipeline.py b/idconn/pipeline.py index 38c0ccd..8c82eea 100644 --- a/idconn/pipeline.py +++ b/idconn/pipeline.py @@ -23,7 +23,7 @@ from os.path import exists #from glob import glob #from nilearn import input_data, connectome, plotting, image -from idconn.connectivity import connectivity, task_connectivity +from idconn.connectivity import rest_connectivity, task_connectivity from idconn.parser_utils import is_valid_file, is_valid_path #from idconn.networking import graph_theory, null_distribution @@ -116,12 +116,12 @@ def idconn_workflow(dset_dir, atlas, task, out_dir, space="MNI152NLin2009cAsym", print(f"here are the inputs: {layout, subject, session, task, atlas, conn, space, confounds}") if 'rest' in task: try: - adj_matrix = connectivity(layout, subject, session, task, atlas, conn, space, confounds) + adj_matrix = rest_connectivity(layout, subject, session, task, atlas, conn, space, confounds) except Exception as e: print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}') if len(conditions) < 1: try: - adj_matrix = connectivity(layout, subject, session, task, atlas, conn, space, confounds) + adj_matrix = rest_connectivity(layout, subject, session, task, atlas, conn, space, confounds) except Exception as e: print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}') else: From adc613968c0537ebc6349caf794467d90b5826b3 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Thu, 9 Mar 2023 19:53:45 -0800 Subject: [PATCH 31/48] update nbs_predict script --- idconn/workflows/nbs_predict.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index b09fac7..01b5e8f 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -11,12 +11,13 @@ today_str = strftime("%m_%d_%Y") TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/ds002674' -TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset' +TEST_DSET = '/Users/katherine.b/Dropbox/Data/ds002674' DERIV_NAME = 'IDConn' OUTCOME = 'estradiol' -CONFOUNDS = ['bc'] +CONFOUNDS = None TASK = 'rest' ATLAS = 'craddock2012' +alpha = 0.01 atlas_fname = '/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz' layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) @@ -36,10 +37,9 @@ confounds = dat[CONFOUNDS] else: confounds = None -alpha = 0.1 -fig_dir = '/Users/katherine.b/Dropbox/Projects/IDConn' -cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=1000) + +cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10) cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t') best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0] @@ -84,11 +84,11 @@ # if the model is a linear regression, i.e., with a continuous outcome # then the score is R^2 (coefficient of determination) score = model.score(test_features.T, test_outcome) -print('Independent prediction accuracy:\t', score) +print('Out-of-sample prediction score:\t', score) pred_outcome = model.predict(test_features.T) if len(np.unique(test_outcome)) > 2: corr = spearmanr(test_outcome, pred_outcome) - print('\nSpearman correlation:\t', corr) + 
print(f'\nSpearman correlation between predicted and actual {OUTCOME}:\t', corr) np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score, corr[0], corr[1]]) else: np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score]) From afb2308cf4453203a7a41baa0b284b6ffc4f8fce Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 9 Mar 2023 19:56:57 -0800 Subject: [PATCH 32/48] update nbspy, use f_classif/f_regression, so fast! --- idconn/nbs.py | 128 ++++++++++++++++++++------------------------------ 1 file changed, 51 insertions(+), 77 deletions(-) diff --git a/idconn/nbs.py b/idconn/nbs.py index cbcc395..abf6885 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -24,7 +24,7 @@ def calc_number_of_nodes(matrices): num_node = matrices.shape[0] return num_node -def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutations=10000): +def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations=10000): ''' Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided of shape ((subject x session)x node x node) @@ -46,7 +46,7 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat confounds : list-like of shape (p,m) Covariates, included as predictors in model. alpha : float - Type-I error (i.e., false positive) rate, for outcome-related edge detection. + Type-I error (i.e., false positive) rate, for outcome-related edge detection. Default = 0.05 predict : bool If True, bypasses `permutations` parameter and only runs edge detection + component identification. Used for NBS-Predict. 
@@ -67,85 +67,64 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat # and retain significant edges # then find the largest connected component # and, if not predict, build a null distribution - n = matrices.shape[0] + n = matrices.shape[:-1] ndims = len(matrices.shape) - #print(ndims) - #if ndims >=2 - num_node = calc_number_of_nodes(matrices) # vectorize_corrmats returns p x n^2 # we want to run pynbs per edge # so vectorized edges must be transposed + if confounds is not None: - exog = np.hstack((outcome, confounds)) + #regress out the confounds, use the residuals for the rest of the algorithm + pass else: - exog = outcome - exog = sm.add_constant(exog, prepend=False) + pass + exog = outcome + # turn matrices into vectorized upper triangles if ndims > 2: edges = vectorize_corrmats(matrices) else: - raise ValueError(f'Input matrices have shape {matrices.shape},', - 'pyNBS requires matrices of shape (subject x session) x node x node.') - edges = edges.T - #print(f'\n\n\n{edges.shape}\n\n\n') + edges = matrices.copy() + #edges = edges.T # run an ols per edge # create significancs matrix for predictor of interest (outcome) # 1 if edge is significantly predicted by outcome # 0 if it's not - sig_edges = [] - for i in range(0, edges.shape[0]): - y = edges[i,:] - # statsmodels for regressing predictors on edges - #mod = sm.OLS(y, exog, hasconst=True) - #results = mod.fit() - #edge_pval = results.pvalues[0] - - # let's try straight up correlations? 
- if len(np.unique(outcome)) > 2: - r, edge_pval = pearsonr(outcome.reshape(n,), y.reshape(n,)) - else: - r, edge_pval = pointbiserialr(outcome.reshape(n,), y.reshape(n,)) - - # build binary significance edge vector - if edge_pval < alpha: - sig_edges.append(1) - else: - sig_edges.append(0) + + if len(np.unique(exog)) < 5: + (f, p) = f_classif(edges, exog) + else: + (f, p) = f_regression(edges, exog, center=False) + sig_edges = np.where(p < alpha, 1, 0) # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - sig_matrix = undo_vectorize(sig_edges, num_node) - - # turn it into a networkx matrix + sig_matrix = undo_vectorize(sig_edges) # need to write this function matrix = nx.from_numpy_array(sig_matrix) #use networkX to find connected components largest_cc = max(nx.connected_components(matrix), key=len) - G0 = G.subgraph(largest_cc) + G0 = matrix.subgraph(largest_cc) + print(G0) - # grab number of edges from G0 - - largest_comp_size = max(size) - if predict == False: - print(f'Connected component has {largest_comp_size} edges.') - else: - pass - # retain size of largest connected component # for NBS permutation-based significance testing - max_comp = max_comp[0] + max_comp = G0.number_of_edges() + print(f'Connected component has {max_comp} edges.') + + + # pull the subgraph with largest number of nodes # i.e., the largest connected component - G = S[max_comp] - + # grab list of nodes in largest connected component - nodes = list(G.nodes) + nodes = list(G0.nodes) unused_nodes = list(set(matrix.nodes) - set(nodes)) - S1 = nx.to_pandas_adjacency(G, nodelist=nodes) + S1 = nx.to_pandas_adjacency(G0, nodelist=nodes) # add empty edges for unused nodes # bc NBS-Predict needs all nodes for @@ -153,11 +132,9 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat # and NBS might need all nodes for easier # plotting in brain space for i in unused_nodes: - S1.loc[i] = 0.0 - temp = S1.copy() - temp[i] = 0.0 - S1 = 
temp.copy() - + S1.loc[i] = 0 + S1[i] = 0 + S1.sort_index(axis=0, inplace=True) S1.sort_index(axis=1, inplace=True) @@ -165,50 +142,47 @@ def pynbs(matrices, outcome, confounds=None, alpha=0.05, predict=False, permutat # only for regular NBS, -Predict doesn't need this if predict == False: perms = np.zeros((permutations,)) + hit = 0 rng = np.random.default_rng() exog_copy = exog.copy() for i in range(0, permutations): # shuffle outcome order rng.shuffle(exog_copy, axis=0) #print(exog_copy) - perm_edges = [] - for j in range(0, edges.shape[0]): - # statsmodels for regressing predictors on edges - mod = sm.OLS(edges[j,:], exog_copy, hasconst=False) - results = mod.fit() - edge_pval = results.pvalues[0] - - if edge_pval < alpha: - perm_edges.append(1) - else: - perm_edges.append(0) + + if len(np.unique(exog)) < 5: + (f1, p1) = f_classif(edges, exog_copy) + else: + (f1, p1) = f_regression(edges, exog_copy, center=False) + + perm_edges = np.where(p1 < alpha, 1, 0) + #print(np.sum(perm_edges)) # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - perm_matrix = undo_vectorize(perm_edges, num_node) # need to write this function + perm_matrix = undo_vectorize(perm_edges) # need to write this function perm_nx = nx.from_numpy_array(perm_matrix) - #comps = nx.connected_components(perm_nx) + largest_cc = max(nx.connected_components(perm_nx), key=len) + S = perm_nx.subgraph(largest_cc) + perm_comp_size = S.number_of_edges() - S = [perm_nx.subgraph(c).copy() for c in comps] - perm_size = np.asarray([s.number_of_edges() for s in S]) - (max_comp, ) = np.where(perm_size == max(perm_size)) - #print(perm_size, max_comp) - # retain for null distribution - perms[i] = max(perm_size) - if i % 10 == 0: - print(f'p-value is {np.size(np.where(perms >= largest_comp_size)) / permutations} as of permutation {i}') + perms[i] = perm_comp_size + if i == 0: + pass + elif i % 100 == 0: + print(f'p-value is {np.round(np.sum(np.where(perms >= max_comp, 1, 0)) / i, 
3)} as of permutation {i}') # bctpy nbs code uses hit to mark progress across permutations # prob not necessary? # bctpy calcs pval for all components, not just largest? # but I don't think that's relevant for the og implimentation of nbs? - pval = np.size(np.where(perms >= largest_comp_size)) / permutations - print(largest_comp_size, permutations, pval) + pval = np.size(np.where(perms >= max_comp)) / permutations + print(max_comp, permutations, pval) return pval, S1, perms else: From d49847e49e52fa4a1f0d6e8ff1a871bfbe8d96e2 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Wed, 29 Mar 2023 09:37:55 -0700 Subject: [PATCH 33/48] nbs-predict works now --- CONTRIBUTING.md | 0 idconn/__init__.py | 2 +- idconn/io.py | 95 +++++++----- idconn/nbs.py | 200 +++++++++++++++++-------- idconn/workflows/nbs_predict.py | 257 +++++++++++++++++++++++++++----- 5 files changed, 412 insertions(+), 142 deletions(-) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..e69de29 diff --git a/idconn/__init__.py b/idconn/__init__.py index 000932b..6915dae 100644 --- a/idconn/__init__.py +++ b/idconn/__init__.py @@ -18,7 +18,7 @@ # from . import preprocessing # from . import statistics # from . import utils - # from . import io + from . 
import io __version__ = get_versions()["version"] diff --git a/idconn/io.py b/idconn/io.py index 7690615..b14abb6 100644 --- a/idconn/io.py +++ b/idconn/io.py @@ -15,6 +15,7 @@ from nilearn import datasets, plotting, surface + def build_statsmodel_json(name, task, contrast, confounds, highpass, mask, conn_meas, graph_meas=None, exclude=None, outfile=None): ''' @@ -203,43 +204,47 @@ def vectorize_corrmats(matrices): edge_vector = np.asarray(edge_vector) return edge_vector -def read_corrmats(layout, task, deriv_name='IDConn', z_score=True, vectorized=True, verbose=False): +def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True, verbose=False): """Returns a node x node x (subject x session) matrix of correlation matrices - from a BIDS derivative folder. Optionally returns a subject x session dataframe - of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) + from a BIDS derivative folder. Optionally returns a node^2 x (subject x session) array of vectorized upper triangles of those correlation matrices. Parameters ---------- - layout : BIDSLayout object - BIDSLayout (i.e., pybids layout object) for directory containing data for analysis (with `derivative=True`, as we're using fmriprep output). + layout : BIDSLayout or str + A valid BIDSLayout or directory. If BIDSLayout, must be generated with derivatives=True, + in order to find the derivatives folder containing the relevant correlation matrices. task : str - Name of task fMRI scan (can be "rest") from which networks will be calculated. + The task used to collect fMRI data from which correlation matrices were computed. deriv_name : str - Name of the package used to generate the correlation matrices to be read. Could be IDConn, could be something else. - z_score : bool - If True, assumes computed connectivity matrices are product-moment correlations, uses Fisher's r-to-Z. 
- vectorized : bool - Would you also like this function to return the vectorized upper triangles of all your matrices? - verbose : bool - Print statements? Y/N? + The name of the derivatives subdirectory in which correlation matrices can be found + atlas: str + The name of the atlas used to make the correlation matrix. Must match the string in corrmat filename. + z_score : Bool + Would you like the correlation matrices z-scored? (Uses Fishers r-to-z, + thus assumes elements/edges of corrmats are product-moment correlations). + vectorized : Bool + If True, returns the vectorized upper triangles of correlation matrices in a p x (n^2 - n)/2 array. + If false, returns the full correlation matrices in a p x n x n array. + verbose : Bool + If True, prints out subjects/sessions as their correlationmatrices are being read. + If False, prints nothing. Returns ------- - edge_vector : numpy array of shape (p, n^2) + # NOT TRUE CURRENTLY RETURNS DATAFRAME + edge_vector : numpy array of shape (p, (n^2-n)/2) Represents an array of vectorized upper triangles of - the input matrices. + the input nxn matrices if vectorized=True. + edge_cube : numpy array of shape (p, n^2) + Represents an array of the input nxn matrices + if vectorized=False. 
""" subjects = layout.get(return_type='id', target='subject', suffix='bold', scope=deriv_name ) - all_sesh = layout.get(return_type='id', - target='session', - task=task, - suffix='bold', - scope=deriv_name - ) + ppts_fname = layout.get_file('participants.tsv').path ppt_df = pd.read_csv(ppts_fname, sep='\t', index_col=[0,1]) ppt_df['adj'] = '' @@ -258,7 +263,9 @@ def read_corrmats(layout, task, deriv_name='IDConn', z_score=True, vectorized=Tr subject=subject, scope=deriv_name) + for session in sessions: + if verbose: print(session) else: @@ -267,21 +274,22 @@ def read_corrmats(layout, task, deriv_name='IDConn', z_score=True, vectorized=Tr task=task, subject=subject, session=session, + atlas=atlas, suffix='bold', - scope='IDConn', - atlas=atlas, + scope='IDConn' ) if verbose: print(f'Corrmat path for sub-{subject}, ses-{session}: \t{path}') else: pass if type(path) == list: - #print(path) + #print(len(path)) path = path[0] else: pass assert exists(path), f'Corrmat file not found at {path}' adj_matrix = pd.read_csv(path, sep='\t', header=0, index_col=0) + if z_score == True: z_adj = np.arctanh(adj_matrix.values) z_adj = np.where(z_adj == np.inf, 0, z_adj) @@ -305,7 +313,7 @@ def read_corrmats(layout, task, deriv_name='IDConn', z_score=True, vectorized=Tr ppt_df.replace({'': np.nan}, inplace=True) return ppt_df -def undo_vectorize(edges, num_node): +def undo_vectorize(edges, num_node=None): ''' Puts an edge vector back into an adjacency matrix. 
Parameters @@ -322,12 +330,17 @@ def undo_vectorize(edges, num_node): ''' #j = len(edges) #num_node = (np.sqrt((8 * j) + 1) + 1) / 2 + if num_node == None: + j = len(edges) + num_node = int((np.sqrt((8 * j) + 1) + 1) / 2) + else: + num_node = int(num_node) X = np.zeros((num_node,num_node)) X[np.triu_indices(X.shape[0], k = 1)] = edges X = X + X.T return X -def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='coolwarm', node_size='strength'): +def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='seismic', node_size='strength'): ''' Plots the edges of a connectivity/adjacency matrix both in a heatmap and in brain space, with the option to include a surface plot of node strength. @@ -366,18 +379,20 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap= elif type(threshold) == float or type(threshold) == int: threshold = f'{threshold}%' else: - threshold = '99%' + threshold = '99.99%' print('edge plotting threshold: ', threshold) if node_size == 'strength': - node_strength = np.sum((np.abs(adj)), axis=0) - node_strength /= np.max(node_strength) - node_strength **= 4 + node_strength = np.sum(adj, axis=0) + #node_strength /= np.max(node_strength) + #node_strength **= 4 + node_strength = node_strength / np.max(node_strength) * 60 node_size = node_strength + fig = plt.figure(figsize=(12,4)) if title is not None: fig.suptitle(title) - gs = GridSpec(1, 2, width_ratios=[4,2]) + gs = GridSpec(1, 2, width_ratios=[3,1]) ax0 = fig.add_subplot(gs[0]) ax1 = fig.add_subplot(gs[1]) @@ -385,12 +400,14 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap= g = plotting.plot_connectome(adj, coords, node_size=node_size, edge_threshold=threshold, - edge_cmap='coolwarm', + edge_cmap=cmap, + edge_kwargs={'alpha': 0.4}, + display_mode='lyrz', figure=fig, axes=ax0, colorbar=False, - annotate=False) - h = sns.heatmap(adj, square=True, cmap='coolwarm', ax=ax1, center=0) + 
annotate=True) + h = sns.heatmap(adj, square=True, linewidths=0, cmap=cmap, ax=ax1, center=0) if strength: fig2 = plt.figure(figsize=(12,4)) if title is not None: @@ -416,13 +433,13 @@ def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap= plt.tight_layout(w_pad=-1) i = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5, - cmap='coolwarm', view='lateral', colorbar=False, axes=ax2) + cmap=cmap, view='lateral', colorbar=False, axes=ax2) j = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5, - cmap='coolwarm', view='medial', colorbar=False, axes=ax3) + cmap=cmap, view='medial', colorbar=False, axes=ax3) k = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5, - cmap='coolwarm', view='lateral', colorbar=False, axes=ax4) + cmap=cmap, view='lateral', colorbar=False, axes=ax4) l = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5, - cmap='coolwarm', view='medial', colorbar=False, axes=ax5) - return fig, fig2 + cmap=cmap, view='medial', colorbar=False, axes=ax5) + return fig, fig2, strength_nimg else: return fig \ No newline at end of file diff --git a/idconn/nbs.py b/idconn/nbs.py index abf6885..ea7025b 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -1,5 +1,5 @@ import numpy as np -import statsmodels.api as sm +import pingouin as pg import networkx as nx import pandas as pd from idconn.io import vectorize_corrmats, undo_vectorize @@ -7,9 +7,13 @@ import enlighten #import bct -from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, GridSearchCV, StratifiedKFold, KFold + from sklearn.feature_selection import f_regression, f_classif -from sklearn.linear_model import LogisticRegression, LinearRegression +from sklearn.linear_model import LogisticRegression, ElasticNet +from 
sklearn.preprocessing import StandardScaler + +from sklearn.metrics import mean_squared_error def calc_number_of_nodes(matrices): if matrices.shape[0] != matrices.shape[1]: @@ -24,7 +28,41 @@ def calc_number_of_nodes(matrices): num_node = matrices.shape[0] return num_node -def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations=10000): +def residualize(X, y=None, confounds=None): + # residualize the outcome + if confounds is not None: + if y is not None: + temp_y = np.reshape(y, (y.shape[0],)) + y = pg.linear_regression(confounds, temp_y) + resid_y = y.residuals_ + + # residualize features + resid_X = np.zeros_like(X) + #print(X.shape, resid_X.shape) + for i in range(0, X.shape[1]): + X_temp = X[:,i] + #print(X_temp.shape) + X_ = pg.linear_regression(confounds, X_temp) + #print(X_.residuals_.shape) + resid_X[:,i] = X_.residuals_.flatten() + return resid_y, resid_X + else: + # residualize features + resid_X = np.zeros_like(X) + #print(X.shape, resid_X.shape) + for i in range(0, X.shape[1]): + X_temp = X[:,i] + #print(X_temp.shape) + X_ = pg.linear_regression(confounds, X_temp) + #print(X_.residuals_.shape) + resid_X[:,i] = X_.residuals_.flatten() + return resid_X + else: + print('Confound matrix wasn\'t provided, so no confounding was done') + + + +def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000): ''' Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided of shape ((subject x session)x node x node) @@ -67,25 +105,19 @@ def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations= # and retain significant edges # then find the largest connected component # and, if not predict, build a null distribution - n = matrices.shape[:-1] + #n = matrices.shape[:-1] ndims = len(matrices.shape) # vectorize_corrmats returns p x n^2 - # we want to run pynbs per edge - # so vectorized edges must be transposed - - if confounds is not None: - #regress out the confounds, use 
the residuals for the rest of the algorithm - pass - else: - pass - exog = outcome - + # turn matrices into vectorized upper triangles if ndims > 2: edges = vectorize_corrmats(matrices) else: edges = matrices.copy() + #print(edges.shape) + + #edges = edges.T # run an ols per edge @@ -93,10 +125,10 @@ def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations= # 1 if edge is significantly predicted by outcome # 0 if it's not - if len(np.unique(exog)) < 5: - (f, p) = f_classif(edges, exog) + if len(np.unique(outcome)) < 5: + (f, p) = f_classif(X=edges, y=outcome) else: - (f, p) = f_regression(edges, exog, center=False) + (f, p) = f_regression(X=edges, y=outcome, center=False) sig_edges = np.where(p < alpha, 1, 0) # find largest connected component of sig_edges @@ -107,15 +139,12 @@ def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations= #use networkX to find connected components largest_cc = max(nx.connected_components(matrix), key=len) G0 = matrix.subgraph(largest_cc) - print(G0) + #print(G0) # retain size of largest connected component # for NBS permutation-based significance testing max_comp = G0.number_of_edges() - print(f'Connected component has {max_comp} edges.') - - - + #print(f'Connected component has {max_comp} edges.') # pull the subgraph with largest number of nodes # i.e., the largest connected component @@ -142,18 +171,17 @@ def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations= # only for regular NBS, -Predict doesn't need this if predict == False: perms = np.zeros((permutations,)) - hit = 0 rng = np.random.default_rng() - exog_copy = exog.copy() + outcome_copy = outcome.copy() for i in range(0, permutations): # shuffle outcome order - rng.shuffle(exog_copy, axis=0) - #print(exog_copy) + rng.shuffle(outcome_copy, axis=0) + #print(outcome_copy) - if len(np.unique(exog)) < 5: - (f1, p1) = f_classif(edges, exog_copy) + if len(np.unique(outcome)) < 5: + (f1, p1) = f_classif(edges, 
outcome_copy) else: - (f1, p1) = f_regression(edges, exog_copy, center=False) + (f1, p1) = f_regression(edges, outcome_copy, center=False) perm_edges = np.where(p1 < alpha, 1, 0) @@ -188,7 +216,7 @@ def pynbs(matrices, outcome, confounds, alpha=0.05, predict=False, permutations= else: return S1 -def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10, n_iterations=10): +def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_splits=10, n_iterations=10): """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided of shape ((subject x session)x node x node) in the network. @@ -207,13 +235,15 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10 outcome : list-like of shape (p,) Y-value to be predicted with connectivity confounds : list-like - Names of columns in `participants.tsv` to be regressed out of connectivity and outcome + Columns in `participants.tsv` to be regressed out of connectivity and outcome data in each CV fold (per recommendation from Snoek et al., 2019). alpha : float Proportion of type II errors (i.e., false positives) we're willing to put up with. This is the upper limit for pvalues in the edge detection process. groups : list-like of shape (p,) Grouping variable - currently only works for 2 groups. Will enforce stratified k-fold CV. + Currently intended for use where grouping variable is the outcome of interest, assumed by StratifiedKFold. + NEED TO FIX THIS: ALLOW THE CASE WHERE GROUPING VAR != OUTCOME VAR n_splits : int Value of K for K-fold cross-validation. Will split data into K chunks, train on K-1 chunks and test on the Kth. 
n_iterations : int @@ -231,6 +261,7 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10 """ edges = vectorize_corrmats(matrices) #print(edges.shape) + #print(edges.shape) index = list(range(0,n_splits * n_iterations)) cv_results = pd.DataFrame(index=index, @@ -244,16 +275,13 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10 if groups is not None: cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_iterations) - dof = groups.shape[0] - 2 + split_y = groups + else: cv = RepeatedKFold(n_splits=n_splits, - n_repeats=n_iterations) - dof = edges.shape[0] - 1 - - #t_threshold = t.ppf(1 - alpha, df=dof) + n_repeats=n_iterations) + split_y = outcome - # really can't remember why tf I did this? - # maybe it's an artifact of permuted_ols? num_node = calc_number_of_nodes(matrices) #print(num_node) #if matrices.shape[0] != matrices.shape[1]: @@ -271,36 +299,50 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10 i = 0 manager = enlighten.get_manager() ticks = manager.counter(total=n_splits * n_iterations, desc='Progress', unit='folds') - for train_idx, test_idx in cv.split(edges, outcome, groups=groups): + for train_idx, test_idx in cv.split(edges, split_y): + scaler = StandardScaler() cv_results.at[i, 'split'] = (train_idx, test_idx) - # all of this presumes the old bctpy version of nbs - # irrelevant for pynbs #assert len(train_a_idx) == len(train_b_idx) - if groups is not None: - train_a_idx = [m for m in train_idx if groups[m] == 0] - train_b_idx = [m for m in train_idx if groups[m] == 1] - regressor = LogisticRegression(max_iter=1000) - elif np.unique(outcome).shape[0] == 2: - regressor = LogisticRegression(max_iter=1000) + if np.unique(outcome).shape[0] == 2: + regressor = LogisticRegression(l1_ratio=0.25, max_iter=1000, penalty='elasticnet', solver='saga') else: - regressor = LinearRegression() - train_mats = matrices[train_idx,:,:] - #print(train_a.shape, 
train_b.shape) - - # separate edges & covariates into + regressor = ElasticNet(l1_ratio=0.25, max_iter=1000) + train_y = outcome[train_idx] test_y = outcome[test_idx] + train_edges = edges[train_idx, :] + test_edges = edges[test_idx, :] + if confounds is not None: train_confounds = confounds.values[train_idx] + test_confounds = confounds.values[test_idx] + #print(train_edges.shape, train_confounds.shape, train_y.shape) + + # residualize the edges and outcome + if np.unique(outcome).shape[0] == 2: + train_edges = residualize(train_edges,train_confounds) + test_edges = residualize(test_edges, test_confounds) + elif np.unique(outcome).shape[0] > 3: + train_y, train_edges = residualize(train_edges, train_y, train_confounds) + test_y, test_edges = residualize(test_edges, test_y, test_confounds) else: - train_confounds = None - #test_confounds = confounds.values[test_idx] + pass + + train_edges = scaler.fit_transform(train_edges) + test_edges = scaler.fit_transform(test_edges) + + if np.unique(outcome).shape[0] == 2: + pass + else: + train_y = scaler.fit_transform(train_y.reshape(-1, 1)) + test_y = scaler.fit_transform(test_y.reshape(-1, 1)) # perform NBS wooooooooo # note: output is a dataframe :) - adj = pynbs(train_mats, train_y, train_confounds, alpha, predict=True) + # PYNBS SHOULD NOT DO CONFOUND REGRESSION? 
+ adj = pynbs(train_edges, train_y, alpha, predict=True) #print(adj.shape, adj.ndim, adj[0].shape, upper_tri) #cv_results.at[i, 'pval'] = pval @@ -311,28 +353,45 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10 if sum(sum(adj.values)) > 0: # grab the values of the adjacency matrix that are just in the upper triangle # so you don't have repeated edges + # returns (n_edges, ) nbs_vector = adj.values[upper_tri] + #print(nbs_vector.shape) # use those to make a "significant edges" mask mask = nbs_vector == 1.0 # grab only the significant edges from testing and training sets of edges # for use as features in the predictive models - train_features = edges[train_idx, :].T[mask] - test_features = edges[test_idx, :].T[mask] + # these are already residualized + #print(train_edges.shape) + # returns (n_edges, samples) + train_features = train_edges.T[mask] + test_features = test_edges.T[mask] + train_features = scaler.fit_transform(train_features.T) + test_features = scaler.fit_transform(test_features.T) + #print(np.ravel(train_y)) # train model predicting outcome from brain (note: no mas covariates) - #print(train_features.T.shape, train_y.shape) - model = regressor.fit(X=train_features.T, y=train_y.ravel()) - #cv_results.at[i, 'model'] = model + model = regressor.fit(X=train_features, y=np.ravel(train_y)) + cv_results.at[i, 'model'] = model + # score that model on the testing data - score = model.score(X=test_features.T, y=test_y.ravel()) + # if logistic regression: score = mean accuracy + # if linear regression: score = coefficient of determination (R^2) + # both from 0 (low) to 1 (high) + score = model.score(X=test_features, y=np.ravel(test_y)) cv_results.at[i, 'score'] = score + #print(model.coef_.shape) m = 0 param_vector = np.zeros_like(nbs_vector) for l in range(0, nbs_vector.shape[0]): if nbs_vector[l] == 1.: - param_vector[l] = model.coef_[0,m] + ### + # NEEDS IF STATEMENT BC LOGISTIC AND LINEAR HAVE DIFFERENT COEF_ SHAPES + if 
np.unique(outcome).shape[0] == 2: + param_vector[l] = model.coef_[0,m] + else: + param_vector[l] = model.coef_[m] m+=1 else: pass @@ -343,4 +402,17 @@ def kfold_nbs(matrices, outcome, confounds, alpha=0.05, groups=None, n_splits=10 else: pass ticks.update() - return cv_results \ No newline at end of file + # calculate weighted average + #print(cv_results['score']) + weighted_stack = cv_results.at[0, 'component'] * cv_results.at[0, 'score'] + #print(weighted_stack.shape) + for j in index[1:]: + #print(cv_results.at[j, 'score']) + if cv_results.at[j, 'score'] > 0: + weighted = cv_results.at[j, 'component'] * cv_results.at[j, 'score'] + weighted_stack = np.dstack([weighted_stack, weighted]) + else: + pass + #print(weighted_stack.shape, weighted.shape) + weighted_average = np.mean(weighted_stack, axis=-1) + return weighted_average, cv_results \ No newline at end of file diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 01b5e8f..9d20d0d 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -1,28 +1,42 @@ -from idconn import nbs, io +#!/usr/bin/env python3 import pandas as pd import numpy as np +import pingouin as pg +import nibabel as nib import bids from os.path import join from datetime import datetime from time import strftime from scipy.stats import spearmanr +from idconn import nbs, io + + +from sklearn.linear_model import LogisticRegression, ElasticNet +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import mean_squared_error + +import warnings +import json + +warnings.simplefilter("ignore") today = datetime.today() today_str = strftime("%m_%d_%Y") TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/ds002674' -TEST_DSET = '/Users/katherine.b/Dropbox/Data/ds002674' +TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset' DERIV_NAME = 'IDConn' -OUTCOME = 'estradiol' -CONFOUNDS = None +OUTCOME = 'bc' +CONFOUNDS = 'fd' TASK = 'rest' ATLAS = 'craddock2012' -alpha = 0.01 +alpha = 0.05 
atlas_fname = '/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz' + layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) -dat = io.read_corrmats(layout, task=TASK, atlas=ATLAS, z_score=False) +dat = io.read_corrmats(layout, task=TASK, deriv_name='IDConn', atlas=ATLAS, z_score=True) keep = dat['adj'].dropna().index dat = dat.loc[keep] @@ -33,63 +47,230 @@ upper_tri = np.triu_indices(num_node, k=1) outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]),1)) + if CONFOUNDS is not None: confounds = dat[CONFOUNDS] else: confounds = None +#print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=100) + +fig,fig2, nimg = io.plot_edges(weighted_average, + atlas_fname, + threshold='computed', + title=f'{OUTCOME} Precition-Weighted Average', + strength=True, + cmap='seismic', + node_size='strength') + +if CONFOUNDS is not None: + base_name = f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}' +else: + base_name = f'nbs-predict_outcome-{OUTCOME}' +fig.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-{today_str}.png'), dpi=400) +fig2.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}.png'), dpi=400) +nib.save(nimg, join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}')) + + +avg_df = pd.DataFrame(weighted_average, + index=range(0,weighted_average.shape[0]), + columns=range(0,weighted_average.shape[1])) + +cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_models-{today_str}.tsv'),sep='\t') +avg_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-{today_str}.tsv'),sep='\t') + + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... 
+# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +#best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0] +#subnetwork = cv_results.loc[best]['component'] +#subnetwork_df = pd.DataFrame(subnetwork, +# index=range(0,num_node), +# columns=range(0,num_node)) + +#if CONFOUNDS is not None: +# subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}_edge-parameters-{today_str}.tsv'),sep='\t') +#else: +# subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_edge-parameters-{today_str}.tsv'),sep='\t') + +# here is where we'd threshold the weighted average to use for elastic-net + +nbs_vector = weighted_average[upper_tri] +p50 = np.percentile(nbs_vector, 50) +filter = np.where(nbs_vector >= p50, True, False) +#print(nbs_vector.shape, filter.shape) + +#mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat['edge_vector'].dropna().values) + +#print(features.shape) + +scaler = StandardScaler() +edges_train = scaler.fit_transform(edges_train) +if len(np.unique(outcome)) <= 2: + pass +else: + outcome = scaler.fit_transform(outcome) -cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, tail='both', groups=None, n_splits=10, n_iterations=10) -cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_models-{today_str}.tsv'),sep='\t') -best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0] -subnetwork = cv_results.loc[best]['component'] -subnetwork_df = pd.DataFrame(subnetwork, - index=range(0,num_node), - columns=range(0,num_node)) +#edges = np.vstack(dat['edge_vector'].values) +#features = edges[:,mask] -subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, 
f'nbs-predict__outcome-{OUTCOME}_edge-parameters-{today_str}.tsv'),sep='\t') +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + #regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + #print(confounds.shape, outcome.shape) + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + y = pg.linear_regression(confounds, outcome_train) + train_outcome = y.residuals_ -nbs_vector = subnetwork[upper_tri] -mask = nbs_vector == 1 -edges = np.vstack(dat['edge_vector'].values) -features = edges[:,mask] -#plot the parameters -param_mat = cv_results.loc[best]['coefficient_matrix'] -odds = 10 ** param_mat -prob = odds / (1 + odds) + resid_edges = np.zeros_like(edges_train) + for i in range(0, edges_train.shape[1]): + x = pg.linear_regression(confounds, edges_train[:,i]) + resid_edges[:,i] = x.residuals_ + train_features = resid_edges[:,filter] +else: + train_features = edges_train[:,filter] + train_outcome = outcome # run the model on the whole test dataset to get params -model = cv_results.loc[best]['model'] -model.fit(features, outcome) -fig,fig2 = io.plot_edges(param_mat, atlas_fname, title=None, strength=True, cmap='icefire', node_size='strength') -fig.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_betas-{today_str}.png'), dpi=400) -fig2.savefig(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_betas-strength-{today_str}.png'), dpi=400) + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? 
+ +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.25, warm_start=True) +else: + model = ElasticNet(l1_ratio=0.25, warm_start=True) + +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +train_metrics = {} +fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +in_sample_score = fitted.score(X=train_features, y=np.ravel(train_outcome)) +if len(np.unique(outcome)) == 2: + train_metrics['accuracy'] = in_sample_score +else: + train_metrics['coefficient of determination'] = in_sample_score +y_pred = fitted.predict(X=train_features) +mse = mean_squared_error(train_outcome, y_pred) +train_metrics['mean squared error'] = mse +print('In-sample prediction score: ', in_sample_score) +print('In-sample mean squared error: ', mse) +#print(np.mean(train_features)) +with open(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp: + json.dump(train_metrics, fp) + + + +# yoink the coefficients? for a more parsimonious figure? 
+coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + if len(np.unique(outcome)) == 2: + coeff_vec[i] = fitted.coef_[0,j] + else: + coeff_vec[i] = fitted.coef_[j] + j += 1 + else: + pass + +#print(coeff_vec) + +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) +#print(coef_mat == coef_mat.T) + +fig,fig2, nimg = io.plot_edges(coef_mat, + atlas_fname, + threshold='computed', + title=f'{OUTCOME} Coefficients', + strength=True, + cmap='seismic', + node_size='strength') + +fig.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-{today_str}.png'), dpi=400) +fig2.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-strength-{today_str}.png'), dpi=400) +nib.save(nimg, join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-strength-{today_str}')) + layout = bids.BIDSLayout(TEST_DSET, derivatives=True) -test_df = io.read_corrmats(layout, task=TASK, atlas=ATLAS, z_score=False) +test_df = io.read_corrmats(layout, task=TASK, deriv_name='IDConn', atlas=ATLAS, z_score=True) -test_df.dropna(inplace=True) +keep = test_df[[OUTCOME, 'adj']].dropna().index +#print(keep) + +test_df = test_df.loc[keep] outcome_test = test_df[OUTCOME].values -groups_test = outcome + +if len(np.unique(outcome_test)) <= 2: + pass +else: + outcome_test = scaler.fit_transform(outcome_test.reshape(-1, 1)) + +#print(outcome_test) matrices_test = np.vstack(test_df['adj'].dropna().values).reshape((len(test_df['adj'].dropna().index),num_node,num_node)) edges_test = np.vstack(test_df['edge_vector'].dropna().values) +edges_test = scaler.fit_transform(edges_test) + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if confounds is not None: + confounds_test = test_df[CONFOUNDS].values + #regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + #print(confounds.shape, outcome.shape) + outcome_test = np.reshape(outcome_test, (outcome_test.shape[0],)) 
+ y = pg.linear_regression(confounds_test, outcome_test) + test_outcome = y.residuals_ -test_features = edges_test.T[mask,:] -test_outcome = test_df[OUTCOME].values + resid_edges = np.zeros_like(edges_test) + for i in range(0, edges_test.shape[1]): + x = pg.linear_regression(confounds_test, edges_test[:,i]) + resid_edges[:,i] = x.residuals_ + test_features = resid_edges[:,filter] +else: + test_features = edges_test[:,filter] + test_outcome = outcome_test + +#print(test_features.shape) # if the model is a logistic regression, i.e. with a binary outcome # then score is prediction accuracy # if the model is a linear regression, i.e., with a continuous outcome # then the score is R^2 (coefficient of determination) -score = model.score(test_features.T, test_outcome) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? +#fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +#score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics['accuracy'] = score +else: + test_metrics['coefficient of determination'] = score +mse = mean_squared_error(test_outcome, y_pred) +test_metrics['mean squared error'] = mse print('Out-of-sample prediction score:\t', score) -pred_outcome = model.predict(test_features.T) +print('Out-of-sample mean squared error:\t', mse) +#print(np.mean(test_features)) +pred_outcome = fitted.predict(test_features) + +#print(test_outcome, '\n',pred_outcome) +#print(pred_outcome) if len(np.unique(test_outcome)) > 2: corr = spearmanr(test_outcome, pred_outcome) print(f'\nSpearman correlation between predicted and actual {OUTCOME}:\t', corr) - np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score, corr[0], corr[1]]) -else: - np.savetxt(join(TEST_DSET, 'derivatives', DERIV_NAME, 
f'nbs-predict__outcome-{OUTCOME}_score-{today_str}.txt'), [score]) - + test_metrics['spearman correlation'] = corr +with open(join(TEST_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp: + json.dump(test_metrics, fp) From d5f557f4cc9cc8afda345b1add05232ecf9c3ec8 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Thu, 30 Mar 2023 12:55:37 -0700 Subject: [PATCH 34/48] add auto-FD comp, residualizing --- idconn/io.py | 79 ++++++++++++++++++++++++++++++++- idconn/nbs.py | 8 ++-- idconn/workflows/nbs_predict.py | 79 ++++++++++++--------------------- 3 files changed, 111 insertions(+), 55 deletions(-) diff --git a/idconn/io.py b/idconn/io.py index b14abb6..61d5d93 100644 --- a/idconn/io.py +++ b/idconn/io.py @@ -14,7 +14,24 @@ from matplotlib.gridspec import GridSpec from nilearn import datasets, plotting, surface +def calc_fd(confounds): + x = confounds['trans_x'].values + y = confounds['trans_y'].values + z = confounds['trans_z'].values + alpha = confounds['rot_x'].values + beta = confounds['rot_y'].values + gamma = confounds['rot_z'].values + + delta_x = [np.abs(t - s) for s, t in zip(x, x[1:])] + delta_y = [np.abs(t - s) for s, t in zip(y, y[1:])] + delta_z = [np.abs(t - s) for s, t in zip(z, z[1:])] + + delta_alpha = [np.abs(t - s) for s, t in zip(alpha, alpha[1:])] + delta_beta = [np.abs(t - s) for s, t in zip(beta, beta[1:])] + delta_gamma = [np.abs(t - s) for s, t in zip(gamma, gamma[1:])] + fd = np.sum([delta_x, delta_y, delta_z, delta_alpha, delta_beta, delta_gamma], axis=0) + return fd def build_statsmodel_json(name, task, contrast, confounds, highpass, mask, conn_meas, graph_meas=None, exclude=None, outfile=None): @@ -265,7 +282,67 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True for session in sessions: - + runs = layout.get(return_type='id', + session=session, + target='run', + task=task, + suffix='timeseries', + subject=subject, + scope=deriv_name) + if len(runs) > 0: + path = 
layout.get(return_type='filename', + session=session, + run=runs[0], + task=task, + suffix='timeseries', + subject=subject, + scope=deriv_name) + confounds = pd.read_table(path[0], header=0, index_col=0) + if not 'framewise_displacement' in confounds.columns: + fd = calc_fd(confounds) + #fd.append(0) + fd = np.append(fd, [0]) + confounds['framewise_displacement'] = fd + confound_means = confounds.mean(axis=0) + if len(runs) > 1: + for run in runs[1:]: + path = layout.get(return_type='filename', + session=session, + run=run, + task=task, + suffix='timeseries', + subject=subject, + scope=deriv_name) + confounds = pd.read_table(path[0], header=0, index_col=0) + if not 'framewise_displacement' in confounds.columns: + fd = calc_fd(confounds) + #fd.append(0) + fd = np.append(fd, [0]) + confounds['framewise_displacement'] = fd + confound_means_temp = confounds.mean(axis=0) + confound_means = np.mean(pd.concat([confound_means, confound_means_temp], axis=1), axis=1) + #print(confound_means) + else: + path = path = layout.get(return_type='filename', + session=session, + desc='confounds', + task=task, + suffix='timeseries', + subject=subject, + scope=deriv_name) + + confounds = pd.read_table(path[0], header=0, index_col=0) + if not 'framewise_displacement' in confounds.columns: + fd = calc_fd(confounds) + fd = np.append(fd, [0]) + confounds['framewise_displacement'] = fd + confound_means = confounds.mean(axis=0) + #print(confound_means) + for confound in confound_means.index: + ppt_df.at[(f'sub-{subject}', + f'ses-{session}'), + confound] = confound_means[confound] + if verbose: print(session) else: diff --git a/idconn/nbs.py b/idconn/nbs.py index ea7025b..ad236c0 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -322,11 +322,11 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli # residualize the edges and outcome if np.unique(outcome).shape[0] == 2: - train_edges = residualize(train_edges,train_confounds) - test_edges = 
residualize(test_edges, test_confounds) + train_edges = residualize(X=train_edges, confounds=train_confounds) + test_edges = residualize(X=test_edges, confounds=test_confounds) elif np.unique(outcome).shape[0] > 3: - train_y, train_edges = residualize(train_edges, train_y, train_confounds) - test_y, test_edges = residualize(test_edges, test_y, test_confounds) + train_y, train_edges = residualize(X=train_edges, y=train_y, confounds=train_confounds) + test_y, test_edges = residualize(X=test_edges, y=test_y, confounds=test_confounds) else: pass diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 9d20d0d..1830a82 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -27,7 +27,7 @@ TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset' DERIV_NAME = 'IDConn' OUTCOME = 'bc' -CONFOUNDS = 'fd' +CONFOUNDS = 'framewise_displacement' TASK = 'rest' ATLAS = 'craddock2012' alpha = 0.05 @@ -90,19 +90,7 @@ # - increases parsimony while handling multicollinearity... 
# either way, I don't think cv_results is necessary -#best = cv_results[cv_results['score'] == cv_results['score'].max()].index[0] -#subnetwork = cv_results.loc[best]['component'] -#subnetwork_df = pd.DataFrame(subnetwork, -# index=range(0,num_node), -# columns=range(0,num_node)) - -#if CONFOUNDS is not None: -# subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}_edge-parameters-{today_str}.tsv'),sep='\t') -#else: -# subnetwork_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'nbs-predict_outcome-{OUTCOME}_edge-parameters-{today_str}.tsv'),sep='\t') - # here is where we'd threshold the weighted average to use for elastic-net - nbs_vector = weighted_average[upper_tri] p50 = np.percentile(nbs_vector, 50) filter = np.where(nbs_vector >= p50, True, False) @@ -111,37 +99,30 @@ #mask = io.vectorize_corrmats(filter) edges_train = np.vstack(dat['edge_vector'].dropna().values) -#print(features.shape) - -scaler = StandardScaler() -edges_train = scaler.fit_transform(edges_train) -if len(np.unique(outcome)) <= 2: - pass -else: - outcome = scaler.fit_transform(outcome) - - -#edges = np.vstack(dat['edge_vector'].values) -#features = edges[:,mask] - # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) #regress out the confounds from each edge and the outcome variable, # use the residuals for the rest of the algorithm #print(confounds.shape, outcome.shape) - outcome_train = np.reshape(outcome, (outcome.shape[0],)) - y = pg.linear_regression(confounds, outcome_train) - train_outcome = y.residuals_ - - resid_edges = np.zeros_like(edges_train) - for i in range(0, edges_train.shape[1]): - x = pg.linear_regression(confounds, edges_train[:,i]) - resid_edges[:,i] = x.residuals_ + if np.unique(outcome).shape[0] == 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + 
train_outcome = outcome + elif np.unique(outcome).shape[0] > 3: + train_outcome, resid_edges = nbs.residualize(X=edges_train, y=outcome_train, confounds=confounds_train) train_features = resid_edges[:,filter] else: train_features = edges_train[:,filter] train_outcome = outcome +scaler = StandardScaler() +train_features = scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + outcome_test = scaler.fit_transform(train_outcome.reshape(-1, 1)) + # run the model on the whole test dataset to get params # classification if the outcome is binary (for now) @@ -210,38 +191,36 @@ #print(keep) test_df = test_df.loc[keep] - outcome_test = test_df[OUTCOME].values - -if len(np.unique(outcome_test)) <= 2: - pass -else: - outcome_test = scaler.fit_transform(outcome_test.reshape(-1, 1)) +#print(test_df) #print(outcome_test) matrices_test = np.vstack(test_df['adj'].dropna().values).reshape((len(test_df['adj'].dropna().index),num_node,num_node)) edges_test = np.vstack(test_df['edge_vector'].dropna().values) -edges_test = scaler.fit_transform(edges_test) # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE -if confounds is not None: +if CONFOUNDS is not None: confounds_test = test_df[CONFOUNDS].values + #regress out the confounds from each edge and the outcome variable, # use the residuals for the rest of the algorithm #print(confounds.shape, outcome.shape) - outcome_test = np.reshape(outcome_test, (outcome_test.shape[0],)) - y = pg.linear_regression(confounds_test, outcome_test) - test_outcome = y.residuals_ - - resid_edges = np.zeros_like(edges_test) - for i in range(0, edges_test.shape[1]): - x = pg.linear_regression(confounds_test, edges_test[:,i]) - resid_edges[:,i] = x.residuals_ + if np.unique(outcome_test).shape[0] == 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif np.unique(outcome_test).shape[0] > 3: + test_outcome, resid_edges = nbs.residualize(X=edges_test, y=outcome_test, 
confounds=confounds_test) test_features = resid_edges[:,filter] else: test_features = edges_test[:,filter] test_outcome = outcome_test +# scale after residualizing omg +test_features = scaler.fit_transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = scaler.fit_transform(test_outcome.reshape(-1, 1)) #print(test_features.shape) # if the model is a logistic regression, i.e. with a binary outcome # then score is prediction accuracy From c1a7878c76783b2e7ded1750b5bacf88a846da4c Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Fri, 31 Mar 2023 17:29:38 -0700 Subject: [PATCH 35/48] fix scaling bug in training data --- idconn/workflows/nbs_predict.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 1830a82..7444bfa 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -50,11 +50,13 @@ if CONFOUNDS is not None: confounds = dat[CONFOUNDS] + base_name = f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}' else: confounds = None + base_name = f'nbs-predict_outcome-{OUTCOME}' #print(dat['bc']) -weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=100) +weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=1000) fig,fig2, nimg = io.plot_edges(weighted_average, atlas_fname, @@ -64,11 +66,6 @@ cmap='seismic', node_size='strength') -if CONFOUNDS is not None: - base_name = f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}' -else: - base_name = f'nbs-predict_outcome-{OUTCOME}' - fig.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-{today_str}.png'), dpi=400) fig2.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}.png'), dpi=400) nib.save(nimg, 
join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}')) @@ -106,10 +103,10 @@ #regress out the confounds from each edge and the outcome variable, # use the residuals for the rest of the algorithm #print(confounds.shape, outcome.shape) - if np.unique(outcome).shape[0] == 2: + if len(np.unique(outcome_train)) <= 2: resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) train_outcome = outcome - elif np.unique(outcome).shape[0] > 3: + elif len(np.unique(outcome_train)) > 3: train_outcome, resid_edges = nbs.residualize(X=edges_train, y=outcome_train, confounds=confounds_train) train_features = resid_edges[:,filter] else: @@ -121,7 +118,7 @@ if len(np.unique(train_outcome)) <= 2: pass else: - outcome_test = scaler.fit_transform(train_outcome.reshape(-1, 1)) + train_outcome = scaler.fit_transform(train_outcome.reshape(-1, 1)) # run the model on the whole test dataset to get params @@ -129,9 +126,9 @@ # could be extended to the multiclass case? if len(np.unique(outcome)) == 2: - model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.25, warm_start=True) + model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.25) else: - model = ElasticNet(l1_ratio=0.25, warm_start=True) + model = ElasticNet(l1_ratio=0.25) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict train_metrics = {} @@ -150,8 +147,6 @@ with open(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp: json.dump(train_metrics, fp) - - # yoink the coefficients? for a more parsimonious figure? 
coeff_vec = np.zeros_like(filter) j = 0 @@ -205,10 +200,10 @@ #regress out the confounds from each edge and the outcome variable, # use the residuals for the rest of the algorithm #print(confounds.shape, outcome.shape) - if np.unique(outcome_test).shape[0] == 2: + if len(np.unique(outcome_test)) <= 2: resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) test_outcome = outcome_test - elif np.unique(outcome_test).shape[0] > 3: + elif len(np.unique(outcome_test)) > 3: test_outcome, resid_edges = nbs.residualize(X=edges_test, y=outcome_test, confounds=confounds_test) test_features = resid_edges[:,filter] else: @@ -243,13 +238,15 @@ print('Out-of-sample prediction score:\t', score) print('Out-of-sample mean squared error:\t', mse) #print(np.mean(test_features)) -pred_outcome = fitted.predict(test_features) +#pred_outcome = fitted.predict(test_features) + -#print(test_outcome, '\n',pred_outcome) +print(test_outcome, '\n',y_pred) #print(pred_outcome) if len(np.unique(test_outcome)) > 2: - corr = spearmanr(test_outcome, pred_outcome) + corr = spearmanr(test_outcome, y_pred) print(f'\nSpearman correlation between predicted and actual {OUTCOME}:\t', corr) test_metrics['spearman correlation'] = corr with open(join(TEST_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp: json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f'{base_name}_predicted-values_fit-{today_str}.txt'), y_pred) \ No newline at end of file From fa48a1745eb191bce5b045706f67a78fb75a4adf Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Tue, 4 Apr 2023 10:45:12 -0700 Subject: [PATCH 36/48] just changed number of iterations --- idconn/workflows/nbs_predict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 7444bfa..ed6b664 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -56,7 +56,7 @@ base_name = f'nbs-predict_outcome-{OUTCOME}' #print(dat['bc']) -weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=1000) +weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=100) fig,fig2, nimg = io.plot_edges(weighted_average, atlas_fname, From b33e24fad88803e009a57828a145d664fcce5b1f Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Tue, 4 Apr 2023 11:03:46 -0700 Subject: [PATCH 37/48] linted --- idconn/__init__.py | 4 +- idconn/connectivity.py | 436 ++++++++++++++++++-------- idconn/data.py | 17 +- idconn/io.py | 538 ++++++++++++++++++-------------- idconn/nbs.py | 269 ++++++++-------- idconn/networking.py | 97 +++--- idconn/parser_utils.py | 4 +- idconn/pipeline.py | 179 +++++++---- idconn/workflows/nbs_predict.py | 234 ++++++++------ setup.py | 8 +- versioneer.py | 25 +- 11 files changed, 1088 insertions(+), 723 deletions(-) diff --git a/idconn/__init__.py b/idconn/__init__.py index 6915dae..79ab307 100644 --- a/idconn/__init__.py +++ b/idconn/__init__.py @@ -26,10 +26,10 @@ "idconn", "connectivity", "data", - #"figures", + # "figures", "networking", # "preprocessing", - #"statistics", + # "statistics", # "utils", "io", "nbs", diff --git a/idconn/connectivity.py b/idconn/connectivity.py index cf14137..1e79998 100644 --- a/idconn/connectivity.py +++ b/idconn/connectivity.py @@ -1,12 +1,15 @@ from posixpath import sep import numpy as np import pandas as pd -#import idconn.connectivity.build_networks + +# import 
idconn.connectivity.build_networks from os import makedirs from os.path import join, exists, basename from nilearn import input_data, datasets, connectome, image, plotting from ._version import get_versions -#from .utils import contrast + +# from .utils import contrast + def _check_dims(matrix): """Raise a ValueError if the input matrix has more than two square. @@ -16,10 +19,14 @@ def _check_dims(matrix): Input array. """ if matrix.ndim != 2: - raise ValueError('Expected a square matrix, got array of shape' - ' {0}.'.format(matrix.shape)) + raise ValueError( + "Expected a square matrix, got array of shape" " {0}.".format(matrix.shape) + ) + -def task_connectivity(layout, subject, session, task, atlas, confounds, connectivity_metric='correlation'): +def task_connectivity( + layout, subject, session, task, atlas, confounds, connectivity_metric="correlation" +): """ Makes connectivity matrices per subject per session per task per condition. Parameters @@ -51,120 +58,202 @@ def task_connectivity(layout, subject, session, task, atlas, confounds, connecti files : list Filenames of computed correlation matrices. 
""" - #version = '0.1.1' + # version = '0.1.1' try: version = get_versions()["version"] except: - version = 'test' - if '.nii' in atlas: - assert exists(atlas), f'Mask file does not exist at {atlas}' - - deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}') - - space = 'MNI152NLin2009cAsym' - atlas_name = basename(atlas).rsplit('.', 2)[0] + version = "test" + if ".nii" in atlas: + assert exists(atlas), f"Mask file does not exist at {atlas}" + + deriv_dir = join(layout.root, "derivatives", f"idconn-{version}") + + space = "MNI152NLin2009cAsym" + atlas_name = basename(atlas).rsplit(".", 2)[0] # use pybids here to grab # of runs and preproc bold filenames connectivity_measure = connectome.ConnectivityMeasure(kind=connectivity_metric) - bold_files = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space=space,subject=subject, session=session, extension='nii.gz') # should be preprocessed BOLD file from fmriprep, grabbed with pybids - print(f'BOLD files found at {bold_files}') + bold_files = layout.get( + scope="derivatives", + return_type="file", + suffix="bold", + task=task, + space=space, + subject=subject, + session=session, + extension="nii.gz", + ) # should be preprocessed BOLD file from fmriprep, grabbed with pybids + print(f"BOLD files found at {bold_files}") runs = [] if len(bold_files) > 1: for i in range(0, len(bold_files)): - assert exists(bold_files[i]), "Preprocessed bold file(s) does not exist at {0}".format(bold_files) - runs.append(layout.parse_file_entities(bold_files[i])['run']) + assert exists(bold_files[i]), "Preprocessed bold file(s) does not exist at {0}".format( + bold_files + ) + runs.append(layout.parse_file_entities(bold_files[i])["run"]) else: runs = None - print(f'Found runs: {runs}') + print(f"Found runs: {runs}") - out = join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func') + out = join(deriv_dir, f"sub-{subject}", f"ses-{session}", "func") if not exists(out): - makedirs(out) - - event_files = 
layout.get(return_type='filename', suffix='events', task=task, subject=subject) - timing = pd.read_csv(event_files[0], header=0, index_col=0, sep='\t') - conditions = timing['trial_type'].unique() + makedirs(out) + + event_files = layout.get(return_type="filename", suffix="events", task=task, subject=subject) + timing = pd.read_csv(event_files[0], header=0, index_col=0, sep="\t") + conditions = timing["trial_type"].unique() run_cond = {} corrmats = {} for run in runs: - bold_file = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz', run=run) - assert len(bold_file) == 1, f'BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}' + bold_file = layout.get( + scope="derivatives", + return_type="file", + suffix="bold", + task=task, + space="MNI152NLin2009cAsym", + subject=subject, + session=session, + extension="nii.gz", + run=run, + ) + assert ( + len(bold_file) == 1 + ), f"BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}" tr = layout.get_tr(bold_file) - - #load timing file - #update to use pyBIDS + layout - event_file = layout.get(return_type='filename', suffix='events', task=task, subject=subject, run=run, session=session) - print('# of event files =', len(event_file), '\nfilename = ', event_file[0]) + + # load timing file + # update to use pyBIDS + layout + event_file = layout.get( + return_type="filename", + suffix="events", + task=task, + subject=subject, + run=run, + session=session, + ) + print("# of event files =", len(event_file), "\nfilename = ", event_file[0]) the_file = str(event_file[0]) - assert exists(the_file), 'file really does not exist' - timing = pd.read_csv(the_file, header=0, index_col=0, sep='\t') - timing.sort_values('onset') + assert exists(the_file), "file really does not exist" + timing = pd.read_csv(the_file, 
header=0, index_col=0, sep="\t") + timing.sort_values("onset") - confounds_file = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task, run=run, extension='tsv') - print(f'Confounds file located at: {confounds_file}') - confounds_df = pd.read_csv(confounds_file[0], header=0, sep='\t') + confounds_file = layout.get( + scope="derivatives", + return_type="file", + desc="confounds", + subject=subject, + session=session, + task=task, + run=run, + extension="tsv", + ) + print(f"Confounds file located at: {confounds_file}") + confounds_df = pd.read_csv(confounds_file[0], header=0, sep="\t") confounds_df = confounds_df[confounds].fillna(0) - confounds_fname = join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv') - confounds_df.to_csv(confounds_fname, sep='\t') + confounds_fname = join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv", + ) + confounds_df.to_csv(confounds_fname, sep="\t") masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2) ex_bold = image.index_img(bold_file[0], 2) display = plotting.plot_epi(ex_bold) display.add_contours(atlas) - display.savefig(join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_space-MNI152NLin2009cAsym_space-{atlas_name}_overlay.png')) - - print(f'BOLD file located at {bold_file}\nTR = {tr}s') - + display.savefig( + join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_run-{run}_space-MNI152NLin2009cAsym_space-{atlas_name}_overlay.png", + ) + ) + + print(f"BOLD file located at {bold_file}\nTR = {tr}s") + masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=1) timeseries = masker.fit_transform(bold_file[0], 
confounds=confounds_fname) - #load timing file - #update to use pyBIDS + layout + # load timing file + # update to use pyBIDS + layout try: - #and now we slice into conditions + # and now we slice into conditions for condition in conditions: run_cond[condition] = {} corrmats[condition] = {} blocks = [] - cond_timing = timing[timing['trial_type'] == condition] + cond_timing = timing[timing["trial_type"] == condition] for i in cond_timing.index: - blocks.append((cond_timing.loc[i]['onset'] / tr, ((cond_timing.loc[i]['onset'] + cond_timing.loc[i]['duration']) / tr) + 1)) + blocks.append( + ( + cond_timing.loc[i]["onset"] / tr, + ((cond_timing.loc[i]["onset"] + cond_timing.loc[i]["duration"]) / tr) + + 1, + ) + ) if len(blocks) > 1: - run_cond[condition][run] = np.vstack((timeseries[int(blocks[0][0]):int(blocks[0][1]), :], timeseries[int(blocks[1][0]):int(blocks[1][1]), :])) + run_cond[condition][run] = np.vstack( + ( + timeseries[int(blocks[0][0]) : int(blocks[0][1]), :], + timeseries[int(blocks[1][0]) : int(blocks[1][1]), :], + ) + ) if len(blocks) > 2: - for i in np.arange(2,len(blocks)): - run_cond[condition][run] = np.vstack((timeseries[int(blocks[0][0]):int(blocks[0][1]), :], timeseries[int(blocks[1][0]):int(blocks[1][1]), :])) - #print('extracted signals for {0}, {1}, {2}'.format(task, run, condition), run_cond['{0}-{1}'.format(run, condition)].shape) + for i in np.arange(2, len(blocks)): + run_cond[condition][run] = np.vstack( + ( + timeseries[int(blocks[0][0]) : int(blocks[0][1]), :], + timeseries[int(blocks[1][0]) : int(blocks[1][1]), :], + ) + ) + # print('extracted signals for {0}, {1}, {2}'.format(task, run, condition), run_cond['{0}-{1}'.format(run, condition)].shape) else: pass - print(f'Making correlation matrix for {run}, {condition}.') - corrmats[condition][run] = connectivity_measure.fit_transform([run_cond[condition][run]])[0] - print('And that correlation matrix is', corrmats[condition][run].shape) + print(f"Making correlation matrix for {run}, 
{condition}.") + corrmats[condition][run] = connectivity_measure.fit_transform( + [run_cond[condition][run]] + )[0] + print("And that correlation matrix is", corrmats[condition][run].shape) except Exception as e: - print('trying to slice and dice, but', e) - #and paste together the timeseries from each run together per condition + print("trying to slice and dice, but", e) + # and paste together the timeseries from each run together per condition files = [] avg_corrmats = {} - print('Corrmats per run per condition have been made!') + print("Corrmats per run per condition have been made!") for condition in conditions: - print(f'Merging corrmats for {task}-{condition}...') + print(f"Merging corrmats for {task}-{condition}...") data = list(corrmats[condition].values()) stacked_corrmats = np.array(data) - print('Stacked corrmats have dimensions', stacked_corrmats.shape) + print("Stacked corrmats have dimensions", stacked_corrmats.shape) avg_corrmat = np.mean(stacked_corrmats, axis=0) - corrmat_df = pd.DataFrame(index=np.arange(1, avg_corrmat.shape[0]+1), columns=np.arange(1, avg_corrmat.shape[0]+1),data=avg_corrmat) + corrmat_df = pd.DataFrame( + index=np.arange(1, avg_corrmat.shape[0] + 1), + columns=np.arange(1, avg_corrmat.shape[0] + 1), + data=avg_corrmat, + ) avg_corrmats[condition] = corrmat_df - corrmat_file = join(deriv_dir, - f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_desc-{condition}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_corrmat.tsv') + corrmat_file = join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_desc-{condition}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_corrmat.tsv", + ) try: - corrmat_df.to_csv(corrmat_file, sep='\t') + corrmat_df.to_csv(corrmat_file, sep="\t") files.append(corrmat_file) except Exception as e: - print('saving corrmat...', e) + print("saving corrmat...", e) return files, avg_corrmats -def rest_connectivity(layout, subject, 
session, task, atlas, confounds=None,connectivity_metric='correlation'): +def rest_connectivity( + layout, subject, session, task, atlas, confounds=None, connectivity_metric="correlation" +): """ Makes connectivity matrices per subject per session per task per condition. Parameters @@ -193,116 +282,201 @@ def rest_connectivity(layout, subject, session, task, atlas, confounds=None,conn try: version = get_versions()["version"] except: - version = 'test' - if '.nii' in atlas: - assert exists(atlas), f'Mask file does not exist at {atlas}' - - deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}') - atlas_name = basename(atlas).rsplit('.', 2)[0] + version = "test" + if ".nii" in atlas: + assert exists(atlas), f"Mask file does not exist at {atlas}" + + deriv_dir = join(layout.root, "derivatives", f"idconn-{version}") + atlas_name = basename(atlas).rsplit(".", 2)[0] # use pybids here to grab # of runs and preproc bold filenames connectivity_measure = connectome.ConnectivityMeasure(kind=connectivity_metric) - bold_files = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz') # should be preprocessed BOLD file from fmriprep, grabbed with pybids - print(f'BOLD files found at {bold_files}') - #confounds_files = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task) + bold_files = layout.get( + scope="derivatives", + return_type="file", + suffix="bold", + task=task, + space="MNI152NLin2009cAsym", + subject=subject, + session=session, + extension="nii.gz", + ) # should be preprocessed BOLD file from fmriprep, grabbed with pybids + print(f"BOLD files found at {bold_files}") + # confounds_files = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task) runs = [] if len(bold_files) > 1: for i in range(0, len(bold_files)): - assert exists(bold_files[i]), 
"Preprocessed bold file(s) does not exist at {0}".format(bold_files) - runs.append(layout.parse_file_entities(bold_files[i])['run']) + assert exists(bold_files[i]), "Preprocessed bold file(s) does not exist at {0}".format( + bold_files + ) + runs.append(layout.parse_file_entities(bold_files[i])["run"]) else: runs = None - print(f'Found runs: {runs}') + print(f"Found runs: {runs}") - out = join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func') + out = join(deriv_dir, f"sub-{subject}", f"ses-{session}", "func") if not exists(out): - makedirs(out) - - - #event_files = layout.get(return_type='filename', suffix='events', task=task, subject=subject) - #timing = pd.read_csv(event_files[0], header=0, index_col=0, sep='\t') - #conditions = timing['trial_type'].unique() + makedirs(out) + + # event_files = layout.get(return_type='filename', suffix='events', task=task, subject=subject) + # timing = pd.read_csv(event_files[0], header=0, index_col=0, sep='\t') + # conditions = timing['trial_type'].unique() if runs: corrmats = {} for run in runs: - print('run = ', run) + print("run = ", run) # read in events file for this subject, task, and run - - confounds_file = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task, run=run, extension='tsv') - print(f'Confounds file located at: {confounds_file}') - confounds_df = pd.read_csv(confounds_file[0], header=0, sep='\t') + confounds_file = layout.get( + scope="derivatives", + return_type="file", + desc="confounds", + subject=subject, + session=session, + task=task, + run=run, + extension="tsv", + ) + print(f"Confounds file located at: {confounds_file}") + confounds_df = pd.read_csv(confounds_file[0], header=0, sep="\t") confounds_df = confounds_df[confounds].fillna(0) - confounds_fname = join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv') - confounds_df.to_csv(confounds_fname, 
sep='\t') + confounds_fname = join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv", + ) + confounds_df.to_csv(confounds_fname, sep="\t") - bold_file = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz', run=run) - assert len(bold_file) == 1, f'BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}' + bold_file = layout.get( + scope="derivatives", + return_type="file", + suffix="bold", + task=task, + space="MNI152NLin2009cAsym", + subject=subject, + session=session, + extension="nii.gz", + run=run, + ) + assert ( + len(bold_file) == 1 + ), f"BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}" tr = layout.get_tr(bold_file) masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2) ex_bold = image.index_img(bold_file[0], 2) display = plotting.plot_epi(ex_bold) display.add_contours(atlas) - display.savefig(join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-atlas_overlay.png')) - - print(f'BOLD file located at {bold_file}\nTR = {tr}s') + display.savefig( + join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-atlas_overlay.png", + ) + ) + + print(f"BOLD file located at {bold_file}\nTR = {tr}s") try: - #for each parcellation, extract BOLD timeseries - print(f'Extracting bold signal for sub-{subject}, ses-{session}, run-{run}...') - timeseries = masker.fit_transform(bold_file[0], confounds_fname) + # for each parcellation, extract BOLD timeseries + print(f"Extracting bold signal for sub-{subject}, ses-{session}, run-{run}...") + timeseries = masker.fit_transform(bold_file[0], 
confounds_fname) except Exception as e: - print('ERROR: Trying to extract BOLD signals, but', e) + print("ERROR: Trying to extract BOLD signals, but", e) try: - print(f'Making correlation matrix for for sub-{subject}, ses-{session}, task-{task}, run-{run}...') + print( + f"Making correlation matrix for for sub-{subject}, ses-{session}, task-{task}, run-{run}..." + ) corrmats[run] = connectivity_measure.fit_transform([timeseries])[0] except Exception as e: - print('ERROR: Trying to make corrmat, but', e) + print("ERROR: Trying to make corrmat, but", e) data = list(corrmats.values()) stacked_corrmats = np.array(data) - print('Stacked corrmats have dimensions', stacked_corrmats.shape) + print("Stacked corrmats have dimensions", stacked_corrmats.shape) avg_corrmat = np.mean(stacked_corrmats, axis=0) else: - confounds_file = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task, extension='tsv') - print(f'Confounds file located at: {confounds_file}') - confounds_df = pd.read_csv(confounds_file[0], header=0, sep='\t') + confounds_file = layout.get( + scope="derivatives", + return_type="file", + desc="confounds", + subject=subject, + session=session, + task=task, + extension="tsv", + ) + print(f"Confounds file located at: {confounds_file}") + confounds_df = pd.read_csv(confounds_file[0], header=0, sep="\t") confounds_df = confounds_df[confounds].fillna(0) - confounds_fname = join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_desc-confounds_timeseries.tsv') - confounds_df.to_csv(confounds_fname, sep='\t') + confounds_fname = join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_desc-confounds_timeseries.tsv", + ) + confounds_df.to_csv(confounds_fname, sep="\t") - bold_file = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, 
session=session, extension='nii.gz') - assert len(bold_file) == 1, f'BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}: {bold_file}' + bold_file = layout.get( + scope="derivatives", + return_type="file", + suffix="bold", + task=task, + space="MNI152NLin2009cAsym", + subject=subject, + session=session, + extension="nii.gz", + ) + assert ( + len(bold_file) == 1 + ), f"BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}: {bold_file}" tr = layout.get_tr(bold_file) masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2) - + ex_bold = image.index_img(bold_file[0], 2) display = plotting.plot_epi(ex_bold) display.add_contours(atlas) - display.savefig(join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_desc-atlas_overlay.png')) - - print(f'BOLD file located at {bold_file}\nTR = {tr}s') + display.savefig( + join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_desc-atlas_overlay.png", + ) + ) + + print(f"BOLD file located at {bold_file}\nTR = {tr}s") try: - #for each parcellation, extract BOLD timeseries - print(f'Extracting bold signal for sub-{subject}, ses-{session}...') - timeseries = masker.fit_transform(bold_file[0], confounds_fname) + # for each parcellation, extract BOLD timeseries + print(f"Extracting bold signal for sub-{subject}, ses-{session}...") + timeseries = masker.fit_transform(bold_file[0], confounds_fname) except Exception as e: - print('ERROR: Trying to extract BOLD signals, but', e) + print("ERROR: Trying to extract BOLD signals, but", e) try: - print(f'Making correlation matrix for for sub-{subject}, ses-{session}...') + print(f"Making correlation matrix for for sub-{subject}, ses-{session}...") avg_corrmat = connectivity_measure.fit_transform([timeseries])[0] except Exception as e: - print('ERROR: Trying to make corrmat, but', e) + 
print("ERROR: Trying to make corrmat, but", e) - print('Correlation matrix created, dimensions:', avg_corrmat.shape) + print("Correlation matrix created, dimensions:", avg_corrmat.shape) try: - corrmat_df = pd.DataFrame(index=np.arange(1, avg_corrmat.shape[0]+1), columns=np.arange(1, avg_corrmat.shape[0]+1),data=avg_corrmat) - corrmat_file = join(deriv_dir, - f'sub-{subject}', - f'ses-{session}', - 'func', - f'sub-{subject}_ses-{session}_task-{task}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_desc-corrmat_bold.tsv') - corrmat_df.to_csv(corrmat_file, sep='\t') + corrmat_df = pd.DataFrame( + index=np.arange(1, avg_corrmat.shape[0] + 1), + columns=np.arange(1, avg_corrmat.shape[0] + 1), + data=avg_corrmat, + ) + corrmat_file = join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_desc-corrmat_bold.tsv", + ) + corrmat_df.to_csv(corrmat_file, sep="\t") except Exception as e: - print('ERROR saving corrmat...', e) + print("ERROR saving corrmat...", e) return corrmat_df, corrmat_file diff --git a/idconn/data.py b/idconn/data.py index 575e7bc..0e18186 100644 --- a/idconn/data.py +++ b/idconn/data.py @@ -6,20 +6,21 @@ def impute(data, max_iter=10000): - ''' + """ Fill in missing data with an iterative imputation algorithm from scikit learn. NOTE: Will not imput connectivity data. 
- ''' - - non_numeric = data.select_dtypes(exclude=['number']).columns - dumb = pd.get_dummies(data[non_numeric], prefix='dummy') + """ + + non_numeric = data.select_dtypes(exclude=["number"]).columns + dumb = pd.get_dummies(data[non_numeric], prefix="dummy") df = pd.concat([data.drop(non_numeric, axis=1), dumb]) impute_pls = IterativeImputer( max_iter=max_iter, skip_complete=True, verbose=1, tol=5e-3, n_nearest_features=1000 ) imputed = impute_pls.fit_transform(df) - imp_df = pd.DataFrame(imputed,columns=data.drop(non_numeric, axis=1).columns, index=data.index, + imp_df = pd.DataFrame( + imputed, + columns=data.drop(non_numeric, axis=1).columns, + index=data.index, ) return imp_df - - diff --git a/idconn/io.py b/idconn/io.py index 61d5d93..b5f43e1 100644 --- a/idconn/io.py +++ b/idconn/io.py @@ -9,19 +9,21 @@ import numpy as np import pandas as pd import seaborn as sns -#from matplotlib import projections + +# from matplotlib import projections from matplotlib import pyplot as plt from matplotlib.gridspec import GridSpec from nilearn import datasets, plotting, surface + def calc_fd(confounds): - x = confounds['trans_x'].values - y = confounds['trans_y'].values - z = confounds['trans_z'].values - alpha = confounds['rot_x'].values - beta = confounds['rot_y'].values - gamma = confounds['rot_z'].values - + x = confounds["trans_x"].values + y = confounds["trans_y"].values + z = confounds["trans_z"].values + alpha = confounds["rot_x"].values + beta = confounds["rot_y"].values + gamma = confounds["rot_z"].values + delta_x = [np.abs(t - s) for s, t in zip(x, x[1:])] delta_y = [np.abs(t - s) for s, t in zip(y, y[1:])] delta_z = [np.abs(t - s) for s, t in zip(z, z[1:])] @@ -33,9 +35,20 @@ def calc_fd(confounds): fd = np.sum([delta_x, delta_y, delta_z, delta_alpha, delta_beta, delta_gamma], axis=0) return fd -def build_statsmodel_json(name, task, contrast, confounds, highpass, - mask, conn_meas, graph_meas=None, exclude=None, outfile=None): - ''' + +def 
build_statsmodel_json( + name, + task, + contrast, + confounds, + highpass, + mask, + conn_meas, + graph_meas=None, + exclude=None, + outfile=None, +): + """ Creates a BIDS Stats Models json with analysis details for further use. DOES NOT WORK YET. @@ -63,65 +76,63 @@ def build_statsmodel_json(name, task, contrast, confounds, highpass, shape : str Indicates shape of map (3d, 4d, coords) for choosing appropriate Nilearn masker for extracting BOLD signals from nifti files. - - ''' - mask_builtins = ['shen270', 'craddock270', 'schaefer400', 'yeo7', 'yeo17'] - if '.nii' in mask: - assert exists(mask), 'Mask file does not exist at {mask}'.format(mask=mask) - if '.gz' in mask: - mask_name = basename(mask).rsplit('.', 2)[0] + + """ + mask_builtins = ["shen270", "craddock270", "schaefer400", "yeo7", "yeo17"] + if ".nii" in mask: + assert exists(mask), "Mask file does not exist at {mask}".format(mask=mask) + if ".gz" in mask: + mask_name = basename(mask).rsplit(".", 2)[0] else: - mask_name = basename(mask).rsplit('.', 1)[0] + mask_name = basename(mask).rsplit(".", 1)[0] else: - assert mask in mask_builtins, 'Mask {mask} not in built-in mask options. Please provide file path or one of {mask_builtins}'.format(mask=mask, mask_builtins=mask_builtins) + assert ( + mask in mask_builtins + ), "Mask {mask} not in built-in mask options. 
Please provide file path or one of {mask_builtins}".format( + mask=mask, mask_builtins=mask_builtins + ) variables = confounds + ["{mask_name}*".format(mask_name=mask_name)] statsmodel = { "name": name, - "description": "A functional connectivity analysis of {task}, comparing {contrast}".format(task=task, - contrast=contrast), - "input":{ - "task": task - }, - "blocks":[{ + "description": "A functional connectivity analysis of {task}, comparing {contrast}".format( + task=task, contrast=contrast + ), + "input": {"task": task}, + "blocks": [ + { "level": "run", - "transformations":{ - "name": "load_image_data", - "input": ["bold"], - "aggregate": ["mean"], - "mask": [mask_name], - "output": ["{mask_name}*".format(mask_name=mask_name)] - }, - }, + "transformations": { + "name": "load_image_data", + "input": ["bold"], + "aggregate": ["mean"], + "mask": [mask_name], + "output": ["{mask_name}*".format(mask_name=mask_name)], + }, + }, { "level": "session", "model": { "variables": variables, - "options": { - "confounds": confounds, - "high_pass_filter_cutoff_secs": highpass - }, - "variances": { - "name": "session_level", - "groupBy": "session" - }, + "options": {"confounds": confounds, "high_pass_filter_cutoff_secs": highpass}, + "variances": {"name": "session_level", "groupBy": "session"}, "software": { "IDConn": { "ConnectivityMeasure": [conn_meas], - "GraphMetrics": [graph_meas] + "GraphMetrics": [graph_meas], } - } - } - - } - ] + }, + }, + }, + ], } - statsmodel_json = json.dumps(statsmodel, indent = 2) - - outfile = '{name}-statsmodel.json'.format(name=name) - with open(outfile, 'w') as outfile: + statsmodel_json = json.dumps(statsmodel, indent=2) + + outfile = "{name}-statsmodel.json".format(name=name) + with open(outfile, "w") as outfile: json.dump(statsmodel, outfile) return statsmodel_json + def atlas_picker(atlas, path, key=None): """Takes in atlas name and path to file, if local, returns nifti-like object (usually file path to downloaded atlas), @@ -133,12 
+144,12 @@ def atlas_picker(atlas, path, key=None): Parameters ---------- atlas : str - Name of the atlas/parcellation used to define nodes from - voxels. If using an atlas fetchable by Nilearn, atlas name + Name of the atlas/parcellation used to define nodes from + voxels. If using an atlas fetchable by Nilearn, atlas name must match the function `fetch_atlas_[name]`. path : str - Path to the atlas specified, if not using a dataset from Nilearn. - If using `nilearn.datasets` to fetch an atlas, will revert to + Path to the atlas specified, if not using a dataset from Nilearn. + If using `nilearn.datasets` to fetch an atlas, will revert to `derivatives/idconn` path. key : str Atlas-specific key for denoting which of multiple versions @@ -156,22 +167,32 @@ def atlas_picker(atlas, path, key=None): Indicates shape of map (3d, 4d, coords) for choosing appropriate Nilearn masker for extracting BOLD signals from nifti files. """ - nilearn_3d = ['craddock_2012', 'destrieux_2009', 'harvard_oxford', 'smith_2009', 'yeo_2011', 'aal', 'pauli_2017', 'msdl'] - #nilearn_coord = ['power_2011', 'dosenbach_2010', 'seitzman_2018'] - #nilearn_4d = ['allen_2011', ''] + nilearn_3d = [ + "craddock_2012", + "destrieux_2009", + "harvard_oxford", + "smith_2009", + "yeo_2011", + "aal", + "pauli_2017", + "msdl", + ] + # nilearn_coord = ['power_2011', 'dosenbach_2010', 'seitzman_2018'] + # nilearn_4d = ['allen_2011', ''] if atlas in nilearn_3d: - if atlas == 'craddock_2012': + if atlas == "craddock_2012": atlas_dict = datasets.fetch_atlas_craddock_2012(data_dir=path) - atlas_path = atlas_dict['tcorr_2level'] + atlas_path = atlas_dict["tcorr_2level"] nifti = nib.load(atlas_path) nifti_arr = nifti.get_fdata() - #selecting one volume of the nifti, each represent different granularity of parcellation - #selecting N = 270, the 27th volume per http://ccraddock.github.io/cluster_roi/atlases.html - nifti = nib.Nifti1Image(nifti_arr[:,:,:,26], nifti.affine) + # selecting one volume of the nifti, each 
represent different granularity of parcellation + # selecting N = 270, the 27th volume per http://ccraddock.github.io/cluster_roi/atlases.html + nifti = nib.Nifti1Image(nifti_arr[:, :, :, 26], nifti.affine) nifti.to_filename() return atlas, path + def vectorize_corrmats(matrices): """Returns the vectorized upper triangles of a 3-dimensional array (i.e., node x node x matrix) of matrices. Output will be a 2-dimensional @@ -181,14 +202,14 @@ def vectorize_corrmats(matrices): matrices : numpy array of shape (p, n, n) Represents the link strengths of the graphs. Assumed to be an array of symmetric nxn matrices per participant and/or timepoint (p). - + Returns ------- edge_vector : numpy array of shape (p, n^2) - Represents an array of vectorized upper triangles of + Represents an array of vectorized upper triangles of the input matrices. """ - #print(f'\n\n\n{matrices.shape}, {matrices.ndim}\n\n\n') + # print(f'\n\n\n{matrices.shape}, {matrices.ndim}\n\n\n') num_node = matrices.shape[1] upper_tri = np.triu_indices(num_node, k=1) if matrices.ndim == 3: @@ -196,17 +217,20 @@ def vectorize_corrmats(matrices): upper_tri = np.triu_indices(num_node, k=1) num_matrices = matrices.shape[0] edge_vector = [] - for matrix in range(0,num_matrices): - vectorized = matrices[matrix,:,:][upper_tri] + for matrix in range(0, num_matrices): + vectorized = matrices[matrix, :, :][upper_tri] edge_vector.append(vectorized) - + elif matrices.ndim == 2: true = matrices[0].T == matrices[0] if true.all(): edge_vector = matrices[upper_tri] else: - print('Matrices of incompatible shape:', matrices.shape, - '\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).') + print( + "Matrices of incompatible shape:", + matrices.shape, + "\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).", + ) elif matrices.ndim == 1: if matrices[0].ndim == 2: num_node = matrices[0].shape[0] @@ -216,14 +240,18 @@ def vectorize_corrmats(matrices): vectorized = 
matrix[upper_tri] edge_vector.append(vectorized) else: - print('Matrices of incompatible shape:', matrices.shape, - '\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).') + print( + "Matrices of incompatible shape:", + matrices.shape, + "\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node).", + ) edge_vector = np.asarray(edge_vector) return edge_vector + def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True, verbose=False): - """Returns a node x node x (subject x session) matrix of correlation matrices - from a BIDS derivative folder. Optionally returns a node^2 x (subject x session) + """Returns a node x node x (subject x session) matrix of correlation matrices + from a BIDS derivative folder. Optionally returns a node^2 x (subject x session) array of vectorized upper triangles of those correlation matrices. Parameters ---------- @@ -237,286 +265,336 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True atlas: str The name of the atlas used to make the correlation matrix. Must match the string in corrmat filename. z_score : Bool - Would you like the correlation matrices z-scored? (Uses Fishers r-to-z, + Would you like the correlation matrices z-scored? (Uses Fishers r-to-z, thus assumes elements/edges of corrmats are product-moment correlations). vectorized : Bool - If True, returns the vectorized upper triangles of correlation matrices in a p x (n^2 - n)/2 array. + If True, returns the vectorized upper triangles of correlation matrices in a p x (n^2 - n)/2 array. If false, returns the full correlation matrices in a p x n x n array. verbose : Bool - If True, prints out subjects/sessions as their correlationmatrices are being read. + If True, prints out subjects/sessions as their correlationmatrices are being read. If False, prints nothing. 
- + Returns ------- # NOT TRUE CURRENTLY RETURNS DATAFRAME edge_vector : numpy array of shape (p, (n^2-n)/2) - Represents an array of vectorized upper triangles of + Represents an array of vectorized upper triangles of the input nxn matrices if vectorized=True. edge_cube : numpy array of shape (p, n^2) - Represents an array of the input nxn matrices + Represents an array of the input nxn matrices if vectorized=False. """ - subjects = layout.get(return_type='id', - target='subject', - suffix='bold', - scope=deriv_name - ) - - ppts_fname = layout.get_file('participants.tsv').path - ppt_df = pd.read_csv(ppts_fname, sep='\t', index_col=[0,1]) - ppt_df['adj'] = '' + subjects = layout.get(return_type="id", target="subject", suffix="bold", scope=deriv_name) + + ppts_fname = layout.get_file("participants.tsv").path + ppt_df = pd.read_csv(ppts_fname, sep="\t", index_col=[0, 1]) + ppt_df["adj"] = "" if vectorized: - ppt_df['edge_vector'] = '' - + ppt_df["edge_vector"] = "" + for subject in subjects: if verbose: print(subject) else: pass - sessions = layout.get(return_type='id', - target='session', - task=task, - suffix='bold', - subject=subject, - scope=deriv_name) - - + sessions = layout.get( + return_type="id", + target="session", + task=task, + suffix="bold", + subject=subject, + scope=deriv_name, + ) + for session in sessions: - runs = layout.get(return_type='id', - session=session, - target='run', - task=task, - suffix='timeseries', - subject=subject, - scope=deriv_name) + runs = layout.get( + return_type="id", + session=session, + target="run", + task=task, + suffix="timeseries", + subject=subject, + scope=deriv_name, + ) if len(runs) > 0: - path = layout.get(return_type='filename', - session=session, - run=runs[0], - task=task, - suffix='timeseries', - subject=subject, - scope=deriv_name) + path = layout.get( + return_type="filename", + session=session, + run=runs[0], + task=task, + suffix="timeseries", + subject=subject, + scope=deriv_name, + ) confounds = 
pd.read_table(path[0], header=0, index_col=0) - if not 'framewise_displacement' in confounds.columns: + if not "framewise_displacement" in confounds.columns: fd = calc_fd(confounds) - #fd.append(0) + # fd.append(0) fd = np.append(fd, [0]) - confounds['framewise_displacement'] = fd + confounds["framewise_displacement"] = fd confound_means = confounds.mean(axis=0) if len(runs) > 1: for run in runs[1:]: - path = layout.get(return_type='filename', - session=session, - run=run, - task=task, - suffix='timeseries', - subject=subject, - scope=deriv_name) + path = layout.get( + return_type="filename", + session=session, + run=run, + task=task, + suffix="timeseries", + subject=subject, + scope=deriv_name, + ) confounds = pd.read_table(path[0], header=0, index_col=0) - if not 'framewise_displacement' in confounds.columns: + if not "framewise_displacement" in confounds.columns: fd = calc_fd(confounds) - #fd.append(0) + # fd.append(0) fd = np.append(fd, [0]) - confounds['framewise_displacement'] = fd + confounds["framewise_displacement"] = fd confound_means_temp = confounds.mean(axis=0) - confound_means = np.mean(pd.concat([confound_means, confound_means_temp], axis=1), axis=1) - #print(confound_means) + confound_means = np.mean( + pd.concat([confound_means, confound_means_temp], axis=1), axis=1 + ) + # print(confound_means) else: - path = path = layout.get(return_type='filename', - session=session, - desc='confounds', - task=task, - suffix='timeseries', - subject=subject, - scope=deriv_name) - + path = path = layout.get( + return_type="filename", + session=session, + desc="confounds", + task=task, + suffix="timeseries", + subject=subject, + scope=deriv_name, + ) + confounds = pd.read_table(path[0], header=0, index_col=0) - if not 'framewise_displacement' in confounds.columns: + if not "framewise_displacement" in confounds.columns: fd = calc_fd(confounds) fd = np.append(fd, [0]) - confounds['framewise_displacement'] = fd + confounds["framewise_displacement"] = fd confound_means 
= confounds.mean(axis=0) - #print(confound_means) + # print(confound_means) for confound in confound_means.index: - ppt_df.at[(f'sub-{subject}', - f'ses-{session}'), - confound] = confound_means[confound] + ppt_df.at[(f"sub-{subject}", f"ses-{session}"), confound] = confound_means[ + confound + ] if verbose: print(session) else: pass - path = layout.get(return_type='filename', - task=task, - subject=subject, - session=session, - atlas=atlas, - suffix='bold', - scope='IDConn' - ) + path = layout.get( + return_type="filename", + task=task, + subject=subject, + session=session, + atlas=atlas, + suffix="bold", + scope="IDConn", + ) if verbose: - print(f'Corrmat path for sub-{subject}, ses-{session}: \t{path}') + print(f"Corrmat path for sub-{subject}, ses-{session}: \t{path}") else: pass if type(path) == list: - #print(len(path)) + # print(len(path)) path = path[0] else: pass - assert exists(path), f'Corrmat file not found at {path}' - adj_matrix = pd.read_csv(path, sep='\t', header=0, index_col=0) - + assert exists(path), f"Corrmat file not found at {path}" + adj_matrix = pd.read_csv(path, sep="\t", header=0, index_col=0) + if z_score == True: z_adj = np.arctanh(adj_matrix.values) z_adj = np.where(z_adj == np.inf, 0, z_adj) - #print(z_adj.shape) - ppt_df.at[(f'sub-{subject}', - f'ses-{session}'), - 'adj'] = z_adj + # print(z_adj.shape) + ppt_df.at[(f"sub-{subject}", f"ses-{session}"), "adj"] = z_adj else: - #print(adj_matrix.values.shape) - ppt_df.at[(f'sub-{subject}', - f'ses-{session}'), - 'adj'] = adj_matrix.values - - + # print(adj_matrix.values.shape) + ppt_df.at[(f"sub-{subject}", f"ses-{session}"), "adj"] = adj_matrix.values + if vectorized == True: edge_vector = vectorize_corrmats(adj_matrix.values) - #print(edge_vector.shape) - ppt_df.at[(f'sub-{subject}', - f'ses-{session}'), - 'edge_vector'] = edge_vector - ppt_df.replace({'': np.nan}, inplace=True) + # print(edge_vector.shape) + ppt_df.at[(f"sub-{subject}", f"ses-{session}"), "edge_vector"] = edge_vector + 
ppt_df.replace({"": np.nan}, inplace=True) return ppt_df + def undo_vectorize(edges, num_node=None): - ''' + """ Puts an edge vector back into an adjacency matrix. Parameters ---------- - edges : list-like of shape ((n^2-n)/2,) + edges : list-like of shape ((n^2-n)/2,) Vectorized upper triangle of an adjacency matrix. num_node : int The number of nodes in the graph. I would calculate this myself, but I'd rather not. - + Returns ------- matrix : numpy array of size (n,n) Symmetric array of connectivity values. - ''' - #j = len(edges) - #num_node = (np.sqrt((8 * j) + 1) + 1) / 2 + """ + # j = len(edges) + # num_node = (np.sqrt((8 * j) + 1) + 1) / 2 if num_node == None: j = len(edges) num_node = int((np.sqrt((8 * j) + 1) + 1) / 2) else: num_node = int(num_node) - X = np.zeros((num_node,num_node)) - X[np.triu_indices(X.shape[0], k = 1)] = edges + X = np.zeros((num_node, num_node)) + X[np.triu_indices(X.shape[0], k=1)] = edges X = X + X.T return X -def plot_edges(adj, atlas_nii, threshold=None, title=None, strength=False, cmap='seismic', node_size='strength'): - ''' + +def plot_edges( + adj, + atlas_nii, + threshold=None, + title=None, + strength=False, + cmap="seismic", + node_size="strength", +): + """ Plots the edges of a connectivity/adjacency matrix both in a heatmap and in brain space, with the option to include a surface plot of node strength. Parameters ---------- - adj : array-like of shape (n, n) + adj : array-like of shape (n, n) Adjacency matrix to be plotted. Can be numpy array or Pandas dataframe. atlas_nii : str - Path to the atlas used to define nodes in the adjacency matrix. + Path to the atlas used to define nodes in the adjacency matrix. Should be one value per node, with the same number of values as rows and columns in adj (i.e., n). Background should be 0, should be in MNI space. threshold : int - Percentile of edges to plot, between 0 and 100 such that 0 plots all the edges and 100 plots none. 
+ Percentile of edges to plot, between 0 and 100 such that 0 plots all the edges and 100 plots none. If not specified, default is 99, which plots the top 1% of edges. title : str - Title for plots. + Title for plots. strength : bool - If True, plots surface maps of node strength (i.e., the sum of all a node's edge weights) + If True, plots surface maps of node strength (i.e., the sum of all a node's edge weights) cmap : str - One of the matplotlib colormaps. + One of the matplotlib colormaps. node_size : int or 'strength' Size to plot nodes in brain space. If 'strength', node size varies according to a node's summed edges (i.e., strength). - + Returns ------- fig1 : Matplotlib figure object Connectivity figure. fig2 : Matplotlib figure object If `strength=True`, the surface node strength plot. - ''' + """ coords = plotting.find_parcellation_cut_coords(atlas_nii) num_node = adj.shape[0] # only plot the top t% of edges - if threshold == 'computed': - threshold = f'{(1 - (100 / num_node ** 2)) * 100}%' + if threshold == "computed": + threshold = f"{(1 - (100 / num_node ** 2)) * 100}%" elif type(threshold) == float or type(threshold) == int: - threshold = f'{threshold}%' + threshold = f"{threshold}%" else: - threshold = '99.99%' - print('edge plotting threshold: ', threshold) + threshold = "99.99%" + print("edge plotting threshold: ", threshold) - if node_size == 'strength': + if node_size == "strength": node_strength = np.sum(adj, axis=0) - #node_strength /= np.max(node_strength) - #node_strength **= 4 + # node_strength /= np.max(node_strength) + # node_strength **= 4 node_strength = node_strength / np.max(node_strength) * 60 node_size = node_strength - - fig = plt.figure(figsize=(12,4)) + + fig = plt.figure(figsize=(12, 4)) if title is not None: fig.suptitle(title) - gs = GridSpec(1, 2, width_ratios=[3,1]) + gs = GridSpec(1, 2, width_ratios=[3, 1]) ax0 = fig.add_subplot(gs[0]) ax1 = fig.add_subplot(gs[1]) plt.tight_layout(w_pad=5) - g = plotting.plot_connectome(adj, 
coords, - node_size=node_size, - edge_threshold=threshold, - edge_cmap=cmap, - edge_kwargs={'alpha': 0.4}, - display_mode='lyrz', - figure=fig, - axes=ax0, - colorbar=False, - annotate=True) + g = plotting.plot_connectome( + adj, + coords, + node_size=node_size, + edge_threshold=threshold, + edge_cmap=cmap, + edge_kwargs={"alpha": 0.4}, + display_mode="lyrz", + figure=fig, + axes=ax0, + colorbar=False, + annotate=True, + ) h = sns.heatmap(adj, square=True, linewidths=0, cmap=cmap, ax=ax1, center=0) if strength: - fig2 = plt.figure(figsize=(12,4)) + fig2 = plt.figure(figsize=(12, 4)) if title is not None: fig2.suptitle(title) fsaverage = datasets.fetch_surf_fsaverage() nimg = nib.load(atlas_nii) regn_sch_arr = nimg.get_fdata() - for i in np.arange(0,num_node): - regn_sch_arr[np.where(regn_sch_arr == i+1)] = np.sum(adj[i]) + for i in np.arange(0, num_node): + regn_sch_arr[np.where(regn_sch_arr == i + 1)] = np.sum(adj[i]) strength_nimg = nib.Nifti1Image(regn_sch_arr, nimg.affine) # replace this filename with BIDSy output - #nib.save(strength_nimg, f'/Users/katherine.b/Dropbox/{title}predictive-strength.nii') + # nib.save(strength_nimg, f'/Users/katherine.b/Dropbox/{title}predictive-strength.nii') gs = GridSpec(1, 4) # plot edge weights on surfaces - ax2 = fig2.add_subplot(gs[0], projection='3d') - ax3 = fig2.add_subplot(gs[1], projection='3d') - ax4 = fig2.add_subplot(gs[2], projection='3d') - ax5 = fig2.add_subplot(gs[3], projection='3d') + ax2 = fig2.add_subplot(gs[0], projection="3d") + ax3 = fig2.add_subplot(gs[1], projection="3d") + ax4 = fig2.add_subplot(gs[2], projection="3d") + ax5 = fig2.add_subplot(gs[3], projection="3d") - texture_l = surface.vol_to_surf(strength_nimg, fsaverage.pial_left, interpolation='nearest') - texture_r = surface.vol_to_surf(strength_nimg, fsaverage.pial_right, interpolation='nearest') + texture_l = surface.vol_to_surf( + strength_nimg, fsaverage.pial_left, interpolation="nearest" + ) + texture_r = surface.vol_to_surf( + 
strength_nimg, fsaverage.pial_right, interpolation="nearest" + ) plt.tight_layout(w_pad=-1) - i = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5, - cmap=cmap, view='lateral', colorbar=False, axes=ax2) - j = plotting.plot_surf_stat_map(fsaverage.pial_left, texture_l, symmetric_cbar=False, threshold=0.5, - cmap=cmap, view='medial', colorbar=False, axes=ax3) - k = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5, - cmap=cmap, view='lateral', colorbar=False, axes=ax4) - l = plotting.plot_surf_stat_map(fsaverage.pial_right, texture_r, symmetric_cbar=False, threshold=0.5, - cmap=cmap, view='medial', colorbar=False, axes=ax5) + i = plotting.plot_surf_stat_map( + fsaverage.pial_left, + texture_l, + symmetric_cbar=False, + threshold=0.5, + cmap=cmap, + view="lateral", + colorbar=False, + axes=ax2, + ) + j = plotting.plot_surf_stat_map( + fsaverage.pial_left, + texture_l, + symmetric_cbar=False, + threshold=0.5, + cmap=cmap, + view="medial", + colorbar=False, + axes=ax3, + ) + k = plotting.plot_surf_stat_map( + fsaverage.pial_right, + texture_r, + symmetric_cbar=False, + threshold=0.5, + cmap=cmap, + view="lateral", + colorbar=False, + axes=ax4, + ) + l = plotting.plot_surf_stat_map( + fsaverage.pial_right, + texture_r, + symmetric_cbar=False, + threshold=0.5, + cmap=cmap, + view="medial", + colorbar=False, + axes=ax5, + ) return fig, fig2, strength_nimg else: - return fig \ No newline at end of file + return fig diff --git a/idconn/nbs.py b/idconn/nbs.py index ad236c0..facf96f 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -5,9 +5,16 @@ from idconn.io import vectorize_corrmats, undo_vectorize from scipy.stats import t, pearsonr, pointbiserialr, spearmanr import enlighten -#import bct -from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, GridSearchCV, StratifiedKFold, KFold +# import bct + +from sklearn.model_selection import ( + RepeatedStratifiedKFold, 
+ RepeatedKFold, + GridSearchCV, + StratifiedKFold, + KFold, +) from sklearn.feature_selection import f_regression, f_classif from sklearn.linear_model import LogisticRegression, ElasticNet @@ -15,19 +22,23 @@ from sklearn.metrics import mean_squared_error + def calc_number_of_nodes(matrices): if matrices.shape[0] != matrices.shape[1]: if matrices.shape[1] == matrices.shape[2]: num_node = matrices.shape[1] matrices = np.moveaxis(matrices, 0, -1) else: - raise ValueError(f'Matrices of shape {matrices.shape}', - 'requires matrices of shape (subject x session) x node x node', - 'or node x node x (subject x session).') + raise ValueError( + f"Matrices of shape {matrices.shape}", + "requires matrices of shape (subject x session) x node x node", + "or node x node x (subject x session).", + ) else: num_node = matrices.shape[0] return num_node + def residualize(X, y=None, confounds=None): # residualize the outcome if confounds is not None: @@ -38,46 +49,45 @@ def residualize(X, y=None, confounds=None): # residualize features resid_X = np.zeros_like(X) - #print(X.shape, resid_X.shape) + # print(X.shape, resid_X.shape) for i in range(0, X.shape[1]): - X_temp = X[:,i] - #print(X_temp.shape) + X_temp = X[:, i] + # print(X_temp.shape) X_ = pg.linear_regression(confounds, X_temp) - #print(X_.residuals_.shape) - resid_X[:,i] = X_.residuals_.flatten() + # print(X_.residuals_.shape) + resid_X[:, i] = X_.residuals_.flatten() return resid_y, resid_X else: # residualize features resid_X = np.zeros_like(X) - #print(X.shape, resid_X.shape) + # print(X.shape, resid_X.shape) for i in range(0, X.shape[1]): - X_temp = X[:,i] - #print(X_temp.shape) + X_temp = X[:, i] + # print(X_temp.shape) X_ = pg.linear_regression(confounds, X_temp) - #print(X_.residuals_.shape) - resid_X[:,i] = X_.residuals_.flatten() + # print(X_.residuals_.shape) + resid_X[:, i] = X_.residuals_.flatten() return resid_X else: - print('Confound matrix wasn\'t provided, so no confounding was done') - - + print("Confound 
matrix wasn't provided, so no confounding was done") + def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000): - ''' + """ Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided of shape ((subject x session)x node x node) in the network. Returns a dataframe containing the results of kfolds cross-validation, including the indices of train and test samples, the resulting p-value and largest connected component, the accuracy of the network in predicting group belonging in the test samples (using logistic regression), - the parameter estimates from each regression, and the model object from each regression. + the parameter estimates from each regression, and the model object from each regression. from a BIDS derivative folder. Optionally returns a subject x session dataframe - of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) + of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) array of vectorized upper triangles of those correlation mat Parameters ---------- matrices : numpy array of shape (p, n, n) - Represents the link strengths of the graphs (i.e., functional connectivity). + Represents the link strengths of the graphs (i.e., functional connectivity). Assumed to be an array of symmetric matrices. outcome : list-like of shape (p,) Y-value to be predicted with connectivity @@ -91,7 +101,7 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000): permutations : int If `predict=False`, specifies the number of permutations run to create a null distribution for estimating the significance of the connected component size. Recommended 10,000. - + Returns ------- S1 : Pandas dataframe @@ -100,14 +110,14 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000): If `predict=False`, denotes the significance of the largest connected component. 
perms : numpy array of shape (permutations,) If `predict=False`, largest connected component size per permutation. - ''' + """ # need to do a mass-univariate test at every edge # and retain significant edges # then find the largest connected component # and, if not predict, build a null distribution - #n = matrices.shape[:-1] + # n = matrices.shape[:-1] ndims = len(matrices.shape) - + # vectorize_corrmats returns p x n^2 # turn matrices into vectorized upper triangles @@ -115,43 +125,42 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000): edges = vectorize_corrmats(matrices) else: edges = matrices.copy() - #print(edges.shape) - - - #edges = edges.T - + # print(edges.shape) + + # edges = edges.T + # run an ols per edge # create significancs matrix for predictor of interest (outcome) # 1 if edge is significantly predicted by outcome # 0 if it's not - + if len(np.unique(outcome)) < 5: (f, p) = f_classif(X=edges, y=outcome) else: (f, p) = f_regression(X=edges, y=outcome, center=False) sig_edges = np.where(p < alpha, 1, 0) - + # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - sig_matrix = undo_vectorize(sig_edges) # need to write this function + sig_matrix = undo_vectorize(sig_edges) # need to write this function matrix = nx.from_numpy_array(sig_matrix) - - #use networkX to find connected components + + # use networkX to find connected components largest_cc = max(nx.connected_components(matrix), key=len) G0 = matrix.subgraph(largest_cc) - #print(G0) - - # retain size of largest connected component + # print(G0) + + # retain size of largest connected component # for NBS permutation-based significance testing max_comp = G0.number_of_edges() - #print(f'Connected component has {max_comp} edges.') + # print(f'Connected component has {max_comp} edges.') # pull the subgraph with largest number of nodes # i.e., the largest connected component - + # grab list of nodes in largest connected component nodes = 
list(G0.nodes) - + unused_nodes = list(set(matrix.nodes) - set(nodes)) S1 = nx.to_pandas_adjacency(G0, nodelist=nodes) @@ -166,7 +175,7 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000): S1.sort_index(axis=0, inplace=True) S1.sort_index(axis=1, inplace=True) - + # permutation testing to create a null distribution of max component size # only for regular NBS, -Predict doesn't need this if predict == False: @@ -176,56 +185,60 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000): for i in range(0, permutations): # shuffle outcome order rng.shuffle(outcome_copy, axis=0) - #print(outcome_copy) - + # print(outcome_copy) + if len(np.unique(outcome)) < 5: (f1, p1) = f_classif(edges, outcome_copy) else: (f1, p1) = f_regression(edges, outcome_copy, center=False) - + perm_edges = np.where(p1 < alpha, 1, 0) - - #print(np.sum(perm_edges)) + + # print(np.sum(perm_edges)) # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - perm_matrix = undo_vectorize(perm_edges) # need to write this function + perm_matrix = undo_vectorize(perm_edges) # need to write this function perm_nx = nx.from_numpy_array(perm_matrix) largest_cc = max(nx.connected_components(perm_nx), key=len) S = perm_nx.subgraph(largest_cc) perm_comp_size = S.number_of_edges() - # retain for null distribution perms[i] = perm_comp_size if i == 0: pass elif i % 100 == 0: - print(f'p-value is {np.round(np.sum(np.where(perms >= max_comp, 1, 0)) / i, 3)} as of permutation {i}') - + print( + f"p-value is {np.round(np.sum(np.where(perms >= max_comp, 1, 0)) / i, 3)} as of permutation {i}" + ) + # bctpy nbs code uses hit to mark progress across permutations # prob not necessary? - + # bctpy calcs pval for all components, not just largest? # but I don't think that's relevant for the og implimentation of nbs? 
pval = np.size(np.where(perms >= max_comp)) / permutations print(max_comp, permutations, pval) - + return pval, S1, perms else: return S1 -def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_splits=10, n_iterations=10): + +def kfold_nbs( + matrices, outcome, confounds=None, alpha=0.05, groups=None, n_splits=10, n_iterations=10 +): """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided of shape ((subject x session)x node x node) in the network. Returns a dataframe containing the results of kfolds cross-validation, including the indices of train and test samples, the resulting p-value and largest connected component, the accuracy of the network in predicting group belonging in the test samples (using logistic regression), - the parameter estimates from each regression, and the model object from each regression. + the parameter estimates from each regression, and the model object from each regression. from a BIDS derivative folder. Optionally returns a subject x session dataframe - of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) + of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) array of vectorized upper triangles of those correlation mat Parameters ---------- @@ -235,10 +248,10 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli outcome : list-like of shape (p,) Y-value to be predicted with connectivity confounds : list-like - Columns in `participants.tsv` to be regressed out of connectivity and outcome + Columns in `participants.tsv` to be regressed out of connectivity and outcome data in each CV fold (per recommendation from Snoek et al., 2019). alpha : float - Proportion of type II errors (i.e., false positives) we're willing to put up with. + Proportion of type II errors (i.e., false positives) we're willing to put up with. This is the upper limit for pvalues in the edge detection process. 
groups : list-like of shape (p,) Grouping variable - currently only works for 2 groups. Will enforce stratified k-fold CV. @@ -248,7 +261,7 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli Value of K for K-fold cross-validation. Will split data into K chunks, train on K-1 chunks and test on the Kth. n_iterations : int Number of times to run K-fold cross-validation. More times = more stable results. - + Returns ------- weighted_average : Pandas dataframe @@ -256,56 +269,60 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli their prediction performance (i.e., accuracy for binary outcome, correlation for continuous). Could be used for out-of-sample prediction, once thresholded and binarized. cv_results : Pandas dataframe - Includes the results of each cross-validation loop + Includes the results of each cross-validation loop (e.g., predictive performance, data split, largest connected component per fold per iteration). """ edges = vectorize_corrmats(matrices) - #print(edges.shape) - #print(edges.shape) - index = list(range(0,n_splits * n_iterations)) - - cv_results = pd.DataFrame(index=index, - columns=['split', - #'pval', - 'score', - 'component', - 'coefficient_matrix', - 'coefficient_vector', - 'model']) + # print(edges.shape) + # print(edges.shape) + index = list(range(0, n_splits * n_iterations)) + + cv_results = pd.DataFrame( + index=index, + columns=[ + "split", + #'pval', + "score", + "component", + "coefficient_matrix", + "coefficient_vector", + "model", + ], + ) if groups is not None: - cv = RepeatedStratifiedKFold(n_splits=n_splits, - n_repeats=n_iterations) + cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_iterations) split_y = groups - + else: - cv = RepeatedKFold(n_splits=n_splits, - n_repeats=n_iterations) - split_y = outcome - + cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_iterations) + split_y = outcome + num_node = calc_number_of_nodes(matrices) - #print(num_node) - 
#if matrices.shape[0] != matrices.shape[1]: + # print(num_node) + # if matrices.shape[0] != matrices.shape[1]: # if matrices.shape[1] == matrices.shape[2]: # num_node = matrices.shape[1] - #matrices = np.moveaxis(matrices, 0, -1) + # matrices = np.moveaxis(matrices, 0, -1) # else: # raise ValueError(f'Matrices of shape {matrices.shape}', - #'requires matrices of shape (subject x session) x node x node', - #'or node x node x (subject x session).') - #else: + #'requires matrices of shape (subject x session) x node x node', + #'or node x node x (subject x session).') + # else: # num_node = matrices.shape[0] upper_tri = np.triu_indices(num_node, k=1) - + i = 0 manager = enlighten.get_manager() - ticks = manager.counter(total=n_splits * n_iterations, desc='Progress', unit='folds') + ticks = manager.counter(total=n_splits * n_iterations, desc="Progress", unit="folds") for train_idx, test_idx in cv.split(edges, split_y): scaler = StandardScaler() - cv_results.at[i, 'split'] = (train_idx, test_idx) - - #assert len(train_a_idx) == len(train_b_idx) + cv_results.at[i, "split"] = (train_idx, test_idx) + + # assert len(train_a_idx) == len(train_b_idx) if np.unique(outcome).shape[0] == 2: - regressor = LogisticRegression(l1_ratio=0.25, max_iter=1000, penalty='elasticnet', solver='saga') + regressor = LogisticRegression( + l1_ratio=0.25, max_iter=1000, penalty="elasticnet", solver="saga" + ) else: regressor = ElasticNet(l1_ratio=0.25, max_iter=1000) @@ -314,22 +331,24 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli train_edges = edges[train_idx, :] test_edges = edges[test_idx, :] - + if confounds is not None: train_confounds = confounds.values[train_idx] test_confounds = confounds.values[test_idx] - #print(train_edges.shape, train_confounds.shape, train_y.shape) - + # print(train_edges.shape, train_confounds.shape, train_y.shape) + # residualize the edges and outcome if np.unique(outcome).shape[0] == 2: train_edges = residualize(X=train_edges, 
confounds=train_confounds) test_edges = residualize(X=test_edges, confounds=test_confounds) elif np.unique(outcome).shape[0] > 3: - train_y, train_edges = residualize(X=train_edges, y=train_y, confounds=train_confounds) + train_y, train_edges = residualize( + X=train_edges, y=train_y, confounds=train_confounds + ) test_y, test_edges = residualize(X=test_edges, y=test_y, confounds=test_confounds) else: pass - + train_edges = scaler.fit_transform(train_edges) test_edges = scaler.fit_transform(test_edges) @@ -338,81 +357,81 @@ def kfold_nbs(matrices, outcome, confounds=None, alpha=0.05, groups=None, n_spli else: train_y = scaler.fit_transform(train_y.reshape(-1, 1)) test_y = scaler.fit_transform(test_y.reshape(-1, 1)) - + # perform NBS wooooooooo # note: output is a dataframe :) # PYNBS SHOULD NOT DO CONFOUND REGRESSION? adj = pynbs(train_edges, train_y, alpha, predict=True) - #print(adj.shape, adj.ndim, adj[0].shape, upper_tri) - - #cv_results.at[i, 'pval'] = pval - cv_results.at[i, 'component'] = adj.values - + # print(adj.shape, adj.ndim, adj[0].shape, upper_tri) + + # cv_results.at[i, 'pval'] = pval + cv_results.at[i, "component"] = adj.values + # in the event of no edges significantly related to - #print(sum(sum(adj.values)), '\n', adj.values.shape) + # print(sum(sum(adj.values)), '\n', adj.values.shape) if sum(sum(adj.values)) > 0: # grab the values of the adjacency matrix that are just in the upper triangle # so you don't have repeated edges # returns (n_edges, ) nbs_vector = adj.values[upper_tri] - #print(nbs_vector.shape) + # print(nbs_vector.shape) # use those to make a "significant edges" mask mask = nbs_vector == 1.0 # grab only the significant edges from testing and training sets of edges # for use as features in the predictive models # these are already residualized - #print(train_edges.shape) + # print(train_edges.shape) # returns (n_edges, samples) train_features = train_edges.T[mask] test_features = test_edges.T[mask] train_features = 
scaler.fit_transform(train_features.T) test_features = scaler.fit_transform(test_features.T) - #print(np.ravel(train_y)) + # print(np.ravel(train_y)) # train model predicting outcome from brain (note: no mas covariates) model = regressor.fit(X=train_features, y=np.ravel(train_y)) - cv_results.at[i, 'model'] = model - + cv_results.at[i, "model"] = model + # score that model on the testing data # if logistic regression: score = mean accuracy # if linear regression: score = coefficient of determination (R^2) # both from 0 (low) to 1 (high) score = model.score(X=test_features, y=np.ravel(test_y)) - cv_results.at[i, 'score'] = score - #print(model.coef_.shape) + cv_results.at[i, "score"] = score + # print(model.coef_.shape) m = 0 param_vector = np.zeros_like(nbs_vector) for l in range(0, nbs_vector.shape[0]): - if nbs_vector[l] == 1.: + if nbs_vector[l] == 1.0: ### # NEEDS IF STATEMENT BC LOGISTIC AND LINEAR HAVE DIFFERENT COEF_ SHAPES if np.unique(outcome).shape[0] == 2: - param_vector[l] = model.coef_[0,m] + param_vector[l] = model.coef_[0, m] else: - param_vector[l] = model.coef_[m] - m+=1 + param_vector[l] = model.coef_[m] + m += 1 else: pass X = undo_vectorize(param_vector, num_node=num_node) - cv_results.at[i, 'coefficient_matrix'] = X - cv_results.at[i, 'coefficient_vector'] = param_vector + cv_results.at[i, "coefficient_matrix"] = X + cv_results.at[i, "coefficient_vector"] = param_vector i += 1 else: pass ticks.update() # calculate weighted average - #print(cv_results['score']) - weighted_stack = cv_results.at[0, 'component'] * cv_results.at[0, 'score'] - #print(weighted_stack.shape) + # print(cv_results['score']) + weighted_stack = cv_results.at[0, "component"] * cv_results.at[0, "score"] + # print(weighted_stack.shape) for j in index[1:]: - #print(cv_results.at[j, 'score']) - if cv_results.at[j, 'score'] > 0: - weighted = cv_results.at[j, 'component'] * cv_results.at[j, 'score'] + # print(cv_results.at[j, 'score']) + if cv_results.at[j, "score"] > 0: + 
weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"] weighted_stack = np.dstack([weighted_stack, weighted]) else: pass - #print(weighted_stack.shape, weighted.shape) + # print(weighted_stack.shape, weighted.shape) weighted_average = np.mean(weighted_stack, axis=-1) - return weighted_average, cv_results \ No newline at end of file + return weighted_average, cv_results diff --git a/idconn/networking.py b/idconn/networking.py index f74ee12..c2ddf39 100644 --- a/idconn/networking.py +++ b/idconn/networking.py @@ -4,22 +4,25 @@ import networkx as nx import matplotlib.pyplot as plt from os.path import join -#from nilearn.connectome import ConnectivityMeasure + +# from nilearn.connectome import ConnectivityMeasure from scipy.sparse.csgraph import minimum_spanning_tree from scipy.stats import skew import bct -#import datetime + +# import datetime def avg_corrmat(ppt_df): - ''' + """ Reads in adjacency matrices from the pandas df with ppt info and adj, then computes an average. - ''' - stacked_corrmats = np.array(ppt_df['adj']) - print('Stacked corrmats have dimensions', stacked_corrmats.shape) + """ + stacked_corrmats = np.array(ppt_df["adj"]) + print("Stacked corrmats have dimensions", stacked_corrmats.shape) avg_corrmat = np.mean(stacked_corrmats, axis=0) return avg_corrmat + def null_model(W, bin_swaps=5, wei_freq=0.1, seed=None): def get_rng(seed): if seed is None or seed == np.random: @@ -29,7 +32,7 @@ def get_rng(seed): try: rstate = np.random.RandomState(seed) except ValueError: - rstate = np.random.RandomState(np.random.Random(seed).randint(0, 2 ** 32 - 1)) + rstate = np.random.RandomState(np.random.Random(seed).randint(0, 2**32 - 1)) return rstate def randmio_und_signed(R, itr, seed=None): @@ -45,7 +48,6 @@ def randmio_und_signed(R, itr, seed=None): for it in range(int(itr)): att = 0 while att <= max_attempts: - a, b, c, d = pick_four_unique_nodes_quickly(n, rng) r0_ab = R[a, b] @@ -59,7 +61,6 @@ def randmio_und_signed(R, itr, seed=None): and 
np.sign(r0_ad) == np.sign(r0_cb) and np.sign(r0_ab) != np.sign(r0_ad) ): - R[a, d] = R[d, a] = r0_ab R[a, b] = R[b, a] = r0_ad @@ -80,11 +81,11 @@ def pick_four_unique_nodes_quickly(n, seed=None): clever but still substantially slower. """ rng = get_rng(seed) - k = rng.randint(n ** 4) + k = rng.randint(n**4) a = k % n b = k // n % n - c = k // n ** 2 % n - d = k // n ** 3 % n + c = k // n**2 % n + d = k // n**3 % n if a != b and a != c and a != d and b != c and b != d and c != d: return (a, b, c, d) else: @@ -134,9 +135,7 @@ def pick_four_unique_nodes_quickly(n, seed=None): W0.flat[Lij[Oind]] = s * Wv # weight at this index else: wsize = np.size(Wv) - wei_period = np.round(1 / wei_freq).astype( - int - ) # convert frequency to period + wei_period = np.round(1 / wei_freq).astype(int) # convert frequency to period lq = np.arange(wsize, 0, -wei_period, dtype=int) for m in lq: # iteratively explore at this period # get indices of Lij that sort P @@ -170,16 +169,15 @@ def pick_four_unique_nodes_quickly(n, seed=None): W0 = W0 + W0.T return W0 + def generate_null(ppt_df, thresh_arr, measure, permutations=1000): - ''' + """ Generate a distribution of graph measure values based on a null connectivity matrix that is like the average connectivity matrix across participants. 
- - ''' - null_dist = pd.DataFrame(index=range(0,permutations), columns=["mean", "sdev"]) - avg_corr = avg_corrmat( - ppt_df - ) + + """ + null_dist = pd.DataFrame(index=range(0, permutations), columns=["mean", "sdev"]) + avg_corr = avg_corrmat(ppt_df) eff_perm = [] j = 0 while j < permutations: @@ -193,19 +191,21 @@ def generate_null(ppt_df, thresh_arr, measure, permutations=1000): leff_auc = np.trapz(effs_arr, dx=0.03, axis=0) eff_perm.append(leff_auc) j += 1 - + return null_dist + def omst(matrix, density=True, plot=False): - ''' + """ WARNING: THIS IS SLOW AF, REPLACING WITH NETWORKX VERSION IN NEAR FUTURE - ''' + """ dims = matrix.shape if matrix.ndim > 2: - raise ValueError("'matrix' should be a 2D array. " - "An array with %d dimension%s was passed" - % (matrix.ndim, - "s" if matrix.ndim > 1 else "")) + raise ValueError( + "'matrix' should be a 2D array. " + "An array with %d dimension%s was passed" + % (matrix.ndim, "s" if matrix.ndim > 1 else "") + ) else: mst = minimum_spanning_tree(matrix) mst_arr = mst.toarray().astype(float) @@ -217,7 +217,7 @@ def omst(matrix, density=True, plot=False): Cost = [cost] while np.sum(matrix_2) > 1000: - #print(np.sum(matrix_2)) + # print(np.sum(matrix_2)) mst = minimum_spanning_tree(matrix_2) mst_arr = mst.toarray().astype(float) matrix_2 = np.where(mst_arr != 0, 0, matrix_2) @@ -231,26 +231,23 @@ def omst(matrix, density=True, plot=False): max_GCE = GCE.index(max_value) thresholded = np.sum(trees[:max_GCE, :, :], axis=0) if plot == True: - fig,ax = plt.subplots() - sns.lineplot(Cost, GCE, ax=ax, palette='husl') - plt.scatter(Cost[max_GCE], - GCE[max_GCE], - marker='x', - edgecolors=None, - c='magenta') - ax.set_ylabel('Global Cost Efficiency') - ax.set_xlabel('Cost') - + fig, ax = plt.subplots() + sns.lineplot(Cost, GCE, ax=ax, palette="husl") + plt.scatter(Cost[max_GCE], GCE[max_GCE], marker="x", edgecolors=None, c="magenta") + ax.set_ylabel("Global Cost Efficiency") + ax.set_xlabel("Cost") + if density == True: den = 
np.sum(thresholded != 0) / (dims[0] * dims[1]) return thresholded, den return thresholded, fig + def graph_auc(matrix, thresholds, measure, args): - ''' + """ matrix : array measure : function from bctpy - ''' + """ from bct import measure, threshold_proportional metrics = [] @@ -258,11 +255,13 @@ def graph_auc(matrix, thresholds, measure, args): thresh = threshold_proportional(matrix, p, copy=True) metric = measure(thresh, args) metrics.append(metric) - auc= np.trapz(metrics, dx=0.01) + auc = np.trapz(metrics, dx=0.01) return auc + def graph_omst(matrix, measure, args): from bct import measure + # threshold using orthogonal minimum spanning tree thresh_mat = omst(matrix) @@ -270,8 +269,9 @@ def graph_omst(matrix, measure, args): metric = measure(thresh_mat, args) return metric + def scale_free_tau(corrmat, skew_thresh, proportional=True): - '''' + """' Calculates threshold at which network becomes scale-free, estimated from the skewness of the networks degree distribution. Parameters ---------- @@ -285,7 +285,7 @@ def scale_free_tau(corrmat, skew_thresh, proportional=True): ------- tau : float Lowest vaue of tau (threshold) at which network is scale-free. - ''' + """ tau = 0.01 skewness = 1 while abs(skewness) > 0.3: @@ -297,8 +297,9 @@ def scale_free_tau(corrmat, skew_thresh, proportional=True): tau += 0.01 return tau + def connected_tau(corrmat, proportional=True): - ''' + """ Calculates threshold at network becomes node connected, using NetworkX's `is_connected` function. Parameters ---------- @@ -312,7 +313,7 @@ def connected_tau(corrmat, proportional=True): ------- tau : float Highest vaue of tau (threshold) at which network becomes node-connected. 
- ''' + """ tau = 0.01 connected = False while connected == False: @@ -323,4 +324,4 @@ def connected_tau(corrmat, proportional=True): w_nx = nx.convert_matrix.from_numpy_array(w) connected = nx.algorithms.components.is_connected(w_nx) tau += 0.01 - return tau \ No newline at end of file + return tau diff --git a/idconn/parser_utils.py b/idconn/parser_utils.py index 792123e..5872ec8 100644 --- a/idconn/parser_utils.py +++ b/idconn/parser_utils.py @@ -5,7 +5,7 @@ def is_valid_file(parser, arg): """Check if argument is existing folder.""" if not op.isfile(arg) and arg is not None: - parser.error(f'The file {arg} does not exist!') + parser.error(f"The file {arg} does not exist!") return arg @@ -13,6 +13,6 @@ def is_valid_file(parser, arg): def is_valid_path(parser, arg): """Check if argument is existing folder.""" if not op.isdir(arg) and arg is not None: - parser.error(f'The folder {arg} does not exist!') + parser.error(f"The folder {arg} does not exist!") return arg diff --git a/idconn/pipeline.py b/idconn/pipeline.py index 8c82eea..08b00bb 100644 --- a/idconn/pipeline.py +++ b/idconn/pipeline.py @@ -13,122 +13,191 @@ Please scroll to bottom to read full license. 
""" import warnings -warnings.filterwarnings('ignore') -#import numpy as np + +warnings.filterwarnings("ignore") +# import numpy as np import pandas as pd import bids import argparse -#import logging -#from os import makedirs + +# import logging +# from os import makedirs from os.path import exists -#from glob import glob -#from nilearn import input_data, connectome, plotting, image + +# from glob import glob +# from nilearn import input_data, connectome, plotting, image from idconn.connectivity import rest_connectivity, task_connectivity from idconn.parser_utils import is_valid_file, is_valid_path -#from idconn.networking import graph_theory, null_distribution +# from idconn.networking import graph_theory, null_distribution -#LGR = logging.getLogger(__name__) -#LGR.setLevel(logging.INFO) +# LGR = logging.getLogger(__name__) +# LGR.setLevel(logging.INFO) def _get_parser(): - parser = argparse.ArgumentParser(description='Make correlation matrices from BOLD data + mask.') + parser = argparse.ArgumentParser( + description="Make correlation matrices from BOLD data + mask." 
+ ) parser.add_argument( - 'dset_dir', + "dset_dir", type=lambda x: is_valid_path(parser, x), - help='Path to BIDS dataset containing fmriprep derivatives folder.', + help="Path to BIDS dataset containing fmriprep derivatives folder.", ) parser.add_argument( - 'atlas', + "atlas", type=lambda x: is_valid_file(parser, x), - help='Path to atlas file in space specified by `space`.', + help="Path to atlas file in space specified by `space`.", ) - parser.add_argument('task', type=str, - help='Task to be analyzed.') + parser.add_argument("task", type=str, help="Task to be analyzed.") parser.add_argument( - '--space', + "--space", type=str, - help='Space in which to run analyses (must be the space `atlas` is in).', + help="Space in which to run analyses (must be the space `atlas` is in).", default="MNI152NLin2009cAsym", ) parser.add_argument( - '--conn', - action='store', - choices=['covariance', 'correlation', 'partial correlation', 'tangent', 'precision'], - help='Metric used to calculate connectivity.', - default='correlation', + "--conn", + action="store", + choices=["covariance", "correlation", "partial correlation", "tangent", "precision"], + help="Metric used to calculate connectivity.", + default="correlation", ) parser.add_argument( - '--bids_db', + "--bids_db", metavar="PATH", type=lambda x: is_valid_path(parser, x), - help='Path to saved BIDS dataset layout file.', + help="Path to saved BIDS dataset layout file.", ) parser.add_argument( - '--confounds', + "--confounds", nargs="+", type=str, - help='Names of confound regressors from ', + help="Names of confound regressors from ", default=None, ) return parser -def idconn_workflow(dset_dir, atlas, task, out_dir, space="MNI152NLin2009cAsym", conn=None, bids_db=None, confounds=None): - print('Getting started!') +def idconn_workflow( + dset_dir, + atlas, + task, + out_dir, + space="MNI152NLin2009cAsym", + conn=None, + bids_db=None, + confounds=None, +): + print("Getting started!") if not confounds: confounds = [ - 
"cosine00", "cosine01", "cosine02", - "trans_x", "trans_x_derivative1", "trans_x_power2", "trans_x_derivative1_power2", - "trans_y", "trans_y_derivative1", "trans_y_derivative1_power2", "trans_y_power2", - "trans_z", "trans_z_derivative1", "trans_z_power2", "trans_z_derivative1_power2", - "rot_x", "rot_x_derivative1", "rot_x_power2", "rot_x_derivative1_power2", - "rot_y", "rot_y_derivative1", "rot_y_power2", "rot_y_derivative1_power2", - "rot_z", "rot_z_derivative1", "rot_z_derivative1_power2", "rot_z_power2", - "a_comp_cor_00", "a_comp_cor_01", "a_comp_cor_02", "a_comp_cor_03", "a_comp_cor_04", "a_comp_cor_05", "a_comp_cor_06" + "cosine00", + "cosine01", + "cosine02", + "trans_x", + "trans_x_derivative1", + "trans_x_power2", + "trans_x_derivative1_power2", + "trans_y", + "trans_y_derivative1", + "trans_y_derivative1_power2", + "trans_y_power2", + "trans_z", + "trans_z_derivative1", + "trans_z_power2", + "trans_z_derivative1_power2", + "rot_x", + "rot_x_derivative1", + "rot_x_power2", + "rot_x_derivative1_power2", + "rot_y", + "rot_y_derivative1", + "rot_y_power2", + "rot_y_derivative1_power2", + "rot_z", + "rot_z_derivative1", + "rot_z_derivative1_power2", + "rot_z_power2", + "a_comp_cor_00", + "a_comp_cor_01", + "a_comp_cor_02", + "a_comp_cor_03", + "a_comp_cor_04", + "a_comp_cor_05", + "a_comp_cor_06", ] print(f"Atlas: {atlas}\nConnectivity measure: {conn}") - assert exists(dset_dir), f"Specified dataset doesn't exist:\n{dset_dir} not found.\n\nPlease check the filepath." + assert exists( + dset_dir + ), f"Specified dataset doesn't exist:\n{dset_dir} not found.\n\nPlease check the filepath." 
layout = bids.BIDSLayout(dset_dir, derivatives=True, database_path=bids_db) - subjects = layout.get(return_type='id', target='subject', suffix='bold') + subjects = layout.get(return_type="id", target="subject", suffix="bold") print(f"Subjects: {subjects}") - #runs = layout.get(return_type='id', target='session', suffix='bold') - preproc_subjects = layout.get(return_type='id', target='subject', task=task, space=space, desc='preproc', suffix='bold') + # runs = layout.get(return_type='id', target='session', suffix='bold') + preproc_subjects = layout.get( + return_type="id", target="subject", task=task, space=space, desc="preproc", suffix="bold" + ) if len(subjects) != len(preproc_subjects): - print(f'{len(subjects)} subjects found in dset, only {len(preproc_subjects)} have preprocessed BOLD data. Pipeline is contniuing anyway, please double check preprocessed data if this doesn\'t seem right.') + print( + f"{len(subjects)} subjects found in dset, only {len(preproc_subjects)} have preprocessed BOLD data. Pipeline is contniuing anyway, please double check preprocessed data if this doesn't seem right." 
+ ) - example_events = layout.get(return_type='filename', suffix='events', task=task, subject=preproc_subjects[0]) - events_df = pd.read_csv(example_events[0], header=0, index_col=0, sep='\t') - conditions = events_df['trial_type'].unique() + example_events = layout.get( + return_type="filename", suffix="events", task=task, subject=preproc_subjects[0] + ) + events_df = pd.read_csv(example_events[0], header=0, index_col=0, sep="\t") + conditions = events_df["trial_type"].unique() print(f"Computing connectivity matrices using {atlas}") for subject in preproc_subjects: print(f"Subject {subject}") - sessions = layout.get(return_type='id', target='session', task=task, subject=subject, suffix='bold') + sessions = layout.get( + return_type="id", target="session", task=task, subject=subject, suffix="bold" + ) print(f"Sessions with task-{task} found for {subject}: {sessions}") for session in sessions: print(f"Session {session}") - print(f"here are the inputs: {layout, subject, session, task, atlas, conn, space, confounds}") - if 'rest' in task: + print( + f"here are the inputs: {layout, subject, session, task, atlas, conn, space, confounds}" + ) + if "rest" in task: try: - adj_matrix = rest_connectivity(layout, subject, session, task, atlas, conn, space, confounds) + adj_matrix = rest_connectivity( + layout, subject, session, task, atlas, conn, space, confounds + ) except Exception as e: - print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}') + print( + f"Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}" + ) if len(conditions) < 1: try: - adj_matrix = rest_connectivity(layout, subject, session, task, atlas, conn, space, confounds) + adj_matrix = rest_connectivity( + layout, subject, session, task, atlas, conn, space, confounds + ) except Exception as e: - print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}') + print( + f"Error building corrmat for sub-{subject}, ses-{session}, task-{task}: 
{e}" + ) else: try: - adj_matrix = task_connectivity(layout=layout, subject=subject, session=session, task=task, atlas=atlas, confounds=confounds, connectivity_metric=conn) + adj_matrix = task_connectivity( + layout=layout, + subject=subject, + session=session, + task=task, + atlas=atlas, + confounds=confounds, + connectivity_metric=conn, + ) except Exception as e: - print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}') + print( + f"Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}" + ) def _main(argv=None): @@ -138,7 +207,7 @@ def _main(argv=None): idconn_workflow(**vars(options)) -if __name__ == '__main__': +if __name__ == "__main__": _main() """ diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index ed6b664..7a1563b 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -23,60 +23,77 @@ today = datetime.today() today_str = strftime("%m_%d_%Y") -TRAIN_DSET = '/Users/katherine.b/Dropbox/Data/ds002674' -TEST_DSET = '/Users/katherine.b/Dropbox/Data/diva-dset' -DERIV_NAME = 'IDConn' -OUTCOME = 'bc' -CONFOUNDS = 'framewise_displacement' -TASK = 'rest' -ATLAS = 'craddock2012' +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "bc" +CONFOUNDS = "framewise_displacement" +TASK = "rest" +ATLAS = "craddock2012" alpha = 0.05 -atlas_fname = '/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz' +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) -dat = io.read_corrmats(layout, task=TASK, deriv_name='IDConn', atlas=ATLAS, z_score=True) +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=True) -keep = dat['adj'].dropna().index +keep = dat["adj"].dropna().index 
dat = dat.loc[keep] -#print(dat['adj'].values.shape) -num_node = dat.iloc[0]['adj'].shape[0] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] -matrices = np.vstack(dat['adj'].values).reshape((len(keep), num_node, num_node)) +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) upper_tri = np.triu_indices(num_node, k=1) -outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]),1)) +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) if CONFOUNDS is not None: confounds = dat[CONFOUNDS] - base_name = f'nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}' + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" else: confounds = None - base_name = f'nbs-predict_outcome-{OUTCOME}' -#print(dat['bc']) - -weighted_average, cv_results = nbs.kfold_nbs(matrices, outcome, confounds, alpha, groups=dat['bc'], n_splits=10, n_iterations=100) - -fig,fig2, nimg = io.plot_edges(weighted_average, - atlas_fname, - threshold='computed', - title=f'{OUTCOME} Precition-Weighted Average', - strength=True, - cmap='seismic', - node_size='strength') - -fig.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-{today_str}.png'), dpi=400) -fig2.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}.png'), dpi=400) -nib.save(nimg, join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-strength-{today_str}')) - - -avg_df = pd.DataFrame(weighted_average, - index=range(0,weighted_average.shape[0]), - columns=range(0,weighted_average.shape[1])) - -cv_results.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_models-{today_str}.tsv'),sep='\t') -avg_df.to_csv(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_weighted-{today_str}.tsv'),sep='\t') + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=dat["bc"], n_splits=10, 
n_iterations=100 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precition-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) # this uses the most predictive subnetwork as features in the model @@ -91,26 +108,28 @@ nbs_vector = weighted_average[upper_tri] p50 = np.percentile(nbs_vector, 50) filter = np.where(nbs_vector >= p50, True, False) -#print(nbs_vector.shape, filter.shape) +# print(nbs_vector.shape, filter.shape) -#mask = io.vectorize_corrmats(filter) -edges_train = np.vstack(dat['edge_vector'].dropna().values) +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values) # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: confounds_train = dat[CONFOUNDS].values outcome_train = np.reshape(outcome, (outcome.shape[0],)) - #regress out the confounds from each edge and the outcome variable, + # regress out the confounds from each edge and the outcome variable, # use the residuals for the rest of the algorithm - #print(confounds.shape, outcome.shape) + # print(confounds.shape, outcome.shape) if len(np.unique(outcome_train)) <= 2: resid_edges = nbs.residualize(X=edges_train, 
confounds=confounds_train) train_outcome = outcome elif len(np.unique(outcome_train)) > 3: - train_outcome, resid_edges = nbs.residualize(X=edges_train, y=outcome_train, confounds=confounds_train) - train_features = resid_edges[:,filter] + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges[:, filter] else: - train_features = edges_train[:,filter] + train_features = edges_train[:, filter] train_outcome = outcome scaler = StandardScaler() @@ -126,7 +145,7 @@ # could be extended to the multiclass case? if len(np.unique(outcome)) == 2: - model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.25) + model = LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=0.25) else: model = ElasticNet(l1_ratio=0.25) @@ -135,16 +154,18 @@ fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) in_sample_score = fitted.score(X=train_features, y=np.ravel(train_outcome)) if len(np.unique(outcome)) == 2: - train_metrics['accuracy'] = in_sample_score + train_metrics["accuracy"] = in_sample_score else: - train_metrics['coefficient of determination'] = in_sample_score + train_metrics["coefficient of determination"] = in_sample_score y_pred = fitted.predict(X=train_features) mse = mean_squared_error(train_outcome, y_pred) -train_metrics['mean squared error'] = mse -print('In-sample prediction score: ', in_sample_score) -print('In-sample mean squared error: ', mse) -#print(np.mean(train_features)) -with open(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp: +train_metrics["mean squared error"] = mse +print("In-sample prediction score: ", in_sample_score) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: json.dump(train_metrics, fp) # yoink the coefficients? 
for a more parsimonious figure? @@ -153,61 +174,74 @@ for i in range(0, filter.shape[0]): if filter[i] == True: if len(np.unique(outcome)) == 2: - coeff_vec[i] = fitted.coef_[0,j] + coeff_vec[i] = fitted.coef_[0, j] else: coeff_vec[i] = fitted.coef_[j] j += 1 else: pass -#print(coeff_vec) +# print(coeff_vec) coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) -#print(coef_mat == coef_mat.T) - -fig,fig2, nimg = io.plot_edges(coef_mat, - atlas_fname, - threshold='computed', - title=f'{OUTCOME} Coefficients', - strength=True, - cmap='seismic', - node_size='strength') - -fig.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-{today_str}.png'), dpi=400) -fig2.savefig(join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-strength-{today_str}.png'), dpi=400) -nib.save(nimg, join(TRAIN_DSET, 'derivatives', DERIV_NAME, f'{base_name}_betas-strength-{today_str}')) +# print(coef_mat == coef_mat.T) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) layout = bids.BIDSLayout(TEST_DSET, derivatives=True) -test_df = io.read_corrmats(layout, task=TASK, deriv_name='IDConn', atlas=ATLAS, z_score=True) +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=True) -keep = test_df[[OUTCOME, 'adj']].dropna().index -#print(keep) +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) test_df = test_df.loc[keep] outcome_test = test_df[OUTCOME].values -#print(test_df) +# print(test_df) -#print(outcome_test) -matrices_test = 
np.vstack(test_df['adj'].dropna().values).reshape((len(test_df['adj'].dropna().index),num_node,num_node)) -edges_test = np.vstack(test_df['edge_vector'].dropna().values) +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values) # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: confounds_test = test_df[CONFOUNDS].values - - #regress out the confounds from each edge and the outcome variable, + + # regress out the confounds from each edge and the outcome variable, # use the residuals for the rest of the algorithm - #print(confounds.shape, outcome.shape) + # print(confounds.shape, outcome.shape) if len(np.unique(outcome_test)) <= 2: resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) test_outcome = outcome_test elif len(np.unique(outcome_test)) > 3: - test_outcome, resid_edges = nbs.residualize(X=edges_test, y=outcome_test, confounds=confounds_test) - test_features = resid_edges[:,filter] + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges[:, filter] else: - test_features = edges_test[:,filter] + test_features = edges_test[:, filter] test_outcome = outcome_test # scale after residualizing omg @@ -216,7 +250,7 @@ pass else: test_outcome = scaler.fit_transform(test_outcome.reshape(-1, 1)) -#print(test_features.shape) +# print(test_features.shape) # if the model is a logistic regression, i.e. with a binary outcome # then score is prediction accuracy # if the model is a linear regression, i.e., with a continuous outcome @@ -224,29 +258,31 @@ # fit trained ElasticNet, initialized via warm_start # prob in CV? 
-#fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) -#score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) test_metrics = {} y_pred = fitted.predict(X=test_features) score = fitted.score(X=test_features, y=np.ravel(test_outcome)) if len(np.unique(test_outcome)) == 2: - test_metrics['accuracy'] = score + test_metrics["accuracy"] = score else: - test_metrics['coefficient of determination'] = score + test_metrics["coefficient of determination"] = score mse = mean_squared_error(test_outcome, y_pred) -test_metrics['mean squared error'] = mse -print('Out-of-sample prediction score:\t', score) -print('Out-of-sample mean squared error:\t', mse) -#print(np.mean(test_features)) -#pred_outcome = fitted.predict(test_features) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) -print(test_outcome, '\n',y_pred) -#print(pred_outcome) +print(test_outcome, "\n", y_pred) +# print(pred_outcome) if len(np.unique(test_outcome)) > 2: corr = spearmanr(test_outcome, y_pred) - print(f'\nSpearman correlation between predicted and actual {OUTCOME}:\t', corr) - test_metrics['spearman correlation'] = corr -with open(join(TEST_DSET, 'derivatives', DERIV_NAME, f'{base_name}_fit-{today_str}.json'), 'w') as fp: + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: json.dump(test_metrics, fp) -np.savetxt(join(TEST_DSET, f'{base_name}_predicted-values_fit-{today_str}.txt'), y_pred) \ No newline at end of file +np.savetxt(join(TEST_DSET, 
f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/setup.py b/setup.py index abab8f5..4d7ed83 100644 --- a/setup.py +++ b/setup.py @@ -27,14 +27,14 @@ "numpy", "scipy", "nilearn", - "sklearn", + "scikit-learn", "pandas", "nibabel", "bctpy", "pybids", "networkx", - "matplotlib", # necessary until nilearn includes mpl as a dependency - "enlighten", + "matplotlib", # necessary until nilearn includes mpl as a dependency + "enlighten", ], extras_require={ "doc": [ @@ -46,7 +46,7 @@ "sphinx-copybutton", "sphinx_gallery==0.10.1", "sphinxcontrib-bibtex", - ], + ], "tests": [ "codecov", "coverage", diff --git a/versioneer.py b/versioneer.py index 2b54540..b9421e4 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1136,9 +1136,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ - 0 - ].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -1238,13 +1236,9 @@ def versions_from_file(filename): contents = f.read() except EnvironmentError: raise NotThisMethod("unable to read _version.py") - mo = re.search( - r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S) if not mo: - mo = re.search( - r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1454,9 +1448,7 @@ def get_versions(verbose=False): handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % 
cfg.VCS verbose = verbose or cfg.verbose - assert ( - cfg.versionfile_source is not None - ), "please set versioneer.versionfile_source" + assert cfg.versionfile_source is not None, "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1697,9 +1689,7 @@ def make_release_tree(self, base_dir, files): # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file( - target_versionfile, self._versioneer_generated_versions - ) + write_to_version_file(target_versionfile, self._versioneer_generated_versions) cmds["sdist"] = cmd_sdist @@ -1823,10 +1813,7 @@ def do_setup(): else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print( - " appending versionfile_source ('%s') to MANIFEST.in" - % cfg.versionfile_source - ) + print(" appending versionfile_source ('%s') to MANIFEST.in" % cfg.versionfile_source) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: From a2cdbd4fed72b3fc52cceba416e8e92bee1db153 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Tue, 18 Apr 2023 11:19:49 -0700 Subject: [PATCH 38/48] add param tuning to nbs, plot true vs pred --- idconn/nbs.py | 65 +++++++++++++----- idconn/workflows/nbs_predict.py | 115 ++++++++++++++++++++++++++++---- 2 files changed, 150 insertions(+), 30 deletions(-) diff --git a/idconn/nbs.py b/idconn/nbs.py index facf96f..c7bfceb 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -7,17 +7,15 @@ import enlighten # import bct - +from sklearn.experimental import enable_halving_search_cv from sklearn.model_selection import ( RepeatedStratifiedKFold, RepeatedKFold, - GridSearchCV, - StratifiedKFold, - KFold, + HalvingGridSearchCV ) from sklearn.feature_selection import f_regression, f_classif -from sklearn.linear_model import LogisticRegression, ElasticNet +from sklearn.linear_model import LogisticRegression, ElasticNet, LogisticRegressionCV, ElasticNetCV from sklearn.preprocessing import StandardScaler from sklearn.metrics import mean_squared_error @@ -319,12 +317,22 @@ def kfold_nbs( cv_results.at[i, "split"] = (train_idx, test_idx) # assert len(train_a_idx) == len(train_b_idx) + l1_ratio_grid = [0.2, 0.4, 0.6, 0.8] if np.unique(outcome).shape[0] == 2: - regressor = LogisticRegression( - l1_ratio=0.25, max_iter=1000, penalty="elasticnet", solver="saga" + regressor = LogisticRegressionCV( + l1_ratio=l1_ratio_grid, + max_iter=100000, + penalty="elasticnet", + solver="saga", + n_jobs=4 ) + else: - regressor = ElasticNet(l1_ratio=0.25, max_iter=1000) + regressor = ElasticNetCV( + l1_ratio=l1_ratio_grid, + cv=4, + n_jobs=4 + ) train_y = outcome[train_idx] test_y = outcome[test_idx] @@ -374,6 +382,7 @@ def kfold_nbs( # so you don't have repeated edges # returns (n_edges, ) nbs_vector = adj.values[upper_tri] + #print(nbs_vector.shape) # print(nbs_vector.shape) # use those to make a "significant edges" mask mask = nbs_vector == 1.0 @@ -385,12 +394,31 @@ def kfold_nbs( # returns (n_edges, samples) train_features = train_edges.T[mask] test_features = 
test_edges.T[mask] + #print(mask.shape, np.sum(mask), train_edges.shape, train_features.shape) + + train_features = train_features.T + test_features = test_features.T + + #train_features = scaler.fit_transform(train_features.T) + #test_features = scaler.fit_transform(test_features.T) + #print(train_features.shape, train_y.shape) - train_features = scaler.fit_transform(train_features.T) - test_features = scaler.fit_transform(test_features.T) + #print(f"train_edges:\t{train_edges[:10, 0]}\ntrain_features:\t{train_features[:10, 0]}") # print(np.ravel(train_y)) # train model predicting outcome from brain (note: no mas covariates) + # use grid search bc I want to know how to tune alpha and l1_ratio + + #grid = HalvingGridSearchCV(estimator=regressor, + # param_grid=param_grid, + # n_jobs=8, + # cv=4, + # factor=2, + # verbose=0, + # min_resources=20, + # refit=True, + # aggressive_elimination=False) model = regressor.fit(X=train_features, y=np.ravel(train_y)) + cv_results.at[i, "model"] = model # score that model on the testing data @@ -399,7 +427,11 @@ def kfold_nbs( # both from 0 (low) to 1 (high) score = model.score(X=test_features, y=np.ravel(test_y)) cv_results.at[i, "score"] = score - # print(model.coef_.shape) + if i % (n_splits * n_iterations / 10) == 0: + mean = cv_results['score'].mean() + sdev = cv_results['score'].std() + print(f'Iteration {i} out of {n_splits * n_iterations}, average score:\t{mean:.2f} +/- {sdev:.2f}') + #print(score) m = 0 param_vector = np.zeros_like(nbs_vector) @@ -427,11 +459,10 @@ def kfold_nbs( # print(weighted_stack.shape) for j in index[1:]: # print(cv_results.at[j, 'score']) - if cv_results.at[j, "score"] > 0: - weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"] - weighted_stack = np.dstack([weighted_stack, weighted]) - else: - pass + weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"] + weighted_stack = np.dstack([weighted_stack, weighted]) + # print(weighted_stack.shape, weighted.shape) 
weighted_average = np.mean(weighted_stack, axis=-1) - return weighted_average, cv_results + #model = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + return weighted_average, cv_results, #model diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 7a1563b..233c284 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 import pandas as pd import numpy as np -import pingouin as pg import nibabel as nib +import seaborn as sns import bids +import matplotlib.pyplot as plt from os.path import join from datetime import datetime from time import strftime @@ -14,6 +15,9 @@ from sklearn.linear_model import LogisticRegression, ElasticNet from sklearn.preprocessing import StandardScaler from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + import warnings import json @@ -64,7 +68,7 @@ weighted_average, atlas_fname, threshold="computed", - title=f"{OUTCOME} Precition-Weighted Average", + title=f"{OUTCOME} Precision-Weighted Average", strength=True, cmap="seismic", node_size="strength", @@ -95,6 +99,7 @@ join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" ) +best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] # this uses the most predictive subnetwork as features in the model # might replace with thresholded weighted_average @@ -105,9 +110,10 @@ # either way, I don't think cv_results is necessary # here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) nbs_vector = weighted_average[upper_tri] -p50 = np.percentile(nbs_vector, 50) -filter = np.where(nbs_vector >= p50, True, False) +p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector >= p75, True, False) # print(nbs_vector.shape, filter.shape) # mask = 
io.vectorize_corrmats(filter) @@ -127,9 +133,9 @@ train_outcome, resid_edges = nbs.residualize( X=edges_train, y=outcome_train, confounds=confounds_train ) - train_features = resid_edges[:, filter] + train_features = resid_edges[:,filter] else: - train_features = edges_train[:, filter] + train_features = edges_train[:,filter] train_outcome = outcome scaler = StandardScaler() @@ -145,10 +151,18 @@ # could be extended to the multiclass case? if len(np.unique(outcome)) == 2: - model = LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=0.25) + model = LogisticRegression( + penalty="elasticnet", + solver="saga", + l1_ratio=best.l1_ratio_ + ) else: - model = ElasticNet(l1_ratio=0.25) - + model = ElasticNet( + l1_ratio=best.l1_ratio_, + alpha=best.alpha_ + ) +#print(params) +#model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict train_metrics = {} fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) @@ -158,6 +172,35 @@ else: train_metrics["coefficient of determination"] = in_sample_score y_pred = fitted.predict(X=train_features) +dat[f'{OUTCOME}_pred'] = y_pred +dat[f'{OUTCOME}_scaled'] = train_outcome + +Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] +Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') + +train_colors = ['#a08ad1', #light + '#685690', #medium + '#3f2d69' #dark + ] +light_cmap = sns.color_palette('dark:#a08ad1') +dark_cmap = sns.color_palette('dark:#685690') + +fig,ax = plt.subplots() +g = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_pred', + style='bc', + data=Ys, + ax=ax, + palette=dark_cmap) +h = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_scaled', + style='bc', + data=Ys, + ax=ax, + palette=light_cmap) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') + mse = 
mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse print("In-sample prediction score: ", in_sample_score) @@ -184,7 +227,8 @@ # print(coeff_vec) coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) -# print(coef_mat == coef_mat.T) +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) fig, fig2, nimg = io.plot_edges( coef_mat, @@ -216,6 +260,7 @@ # print(keep) test_df = test_df.loc[keep] + outcome_test = test_df[OUTCOME].values # print(test_df) @@ -273,9 +318,53 @@ print("Out-of-sample mean squared error:\t", mse) # print(np.mean(test_features)) # pred_outcome = fitted.predict(test_features) - - -print(test_outcome, "\n", y_pred) +test_df[f'{OUTCOME}_scaled'] = test_outcome +test_df[f'{OUTCOME}_pred'] = y_pred +Ys = test_df[[f'{OUTCOME}_scaled', + f'{OUTCOME}_pred', + 'cycle_day', + 'bc']] +Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') + +Ys['ppts'] = Ys.index.get_level_values(0) + + +light_colors = ['#33ACE3', #Bubbles + '#EA6964', #Blossom + '#4AB62C' #Buttercup + ] +dark_colors = ['#1278a6', + '#a11510', + '#228208'] +light = ListedColormap(light_colors, name='light_powderpuff') +dark = ListedColormap(dark_colors, name='dark_powderpuff') +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig,ax = plt.subplots() +g = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_pred', + style='bc', + data=Ys, + hue='ppts', + hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], + ax=ax, + palette='light_powderpuff' + ) +h = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_scaled', + style='bc', + data=Ys, + hue='ppts', + hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], + ax=ax, + palette='dark_powderpuff') +ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') +fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') + + + +#print(test_outcome, "\n", y_pred) # print(pred_outcome) if len(np.unique(test_outcome)) > 2: corr = spearmanr(test_outcome, y_pred) From 6f1dd8b5110eca872252bc17a1214d9df8d6493d Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Tue, 18 Apr 2023 11:24:31 -0700 Subject: [PATCH 39/48] add contributor guidelines --- CONTRIBUTING.md | 125 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e69de29..11ce204 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -0,0 +1,125 @@ +# Contributing to IDConn + +Welcome to the ``IDConn`` repository! +We're excited you're here and want to contribute. + +These guidelines are designed to make it as easy as possible to get involved. +If you have any questions that aren't discussed below, please let us know by opening an [issue][link_issues]! + +Before you start you'll need to set up a free [GitHub][link_github] account and sign in. +Here are some [instructions][link_signupinstructions]. + +## Governance + +Governance is a hugely important part of any project. +It is especially important to have clear process and communication channels for open source projects that rely on a distributed network of volunteers, such as ``IDConn``. + +``IDConn`` is currently supported by a small group of core developers. +Even with only a couple of individuals involved in decision making processes, we've found that setting expectations and communicating a shared vision has great value. + +By starting the governance structure early in our development, we hope to welcome more people into the contributing team. +We are committed to continuing to update the governance structures as necessary. +Every member of the ``IDConn`` community is encouraged to comment on these processes and suggest improvements. 
+ +As the first project leader, Katie Bottenhorn is ultimately responsible for any major decisions pertaining to ``IDConn`` development. +However, all potential changes are explicitly and openly discussed in the described channels of communication, and we strive for consensus amongst all community members. + +## Code of conduct + +All ``IDConn`` community members are expected to follow our [code of conduct](https://github.com/62442katieb/IDConn/blob/main/CODE_OF_CONDUCT.md) during any interaction with the project. +That includes, but is not limited to, online conversations, in-person workshops or development sprints, and when giving talks about the software. + +As stated in the code, severe or repeated violations by community members may result in exclusion from collective decision-making and rejection of future contributions to the ``IDConn`` project. + +## Asking questions about using IDConn + +Please direct usage-related questions to [NeuroStars][link_neurostars], with [the "Software Support" category and the "IDConn" tag][link_neurostars_IDConn]. +The ``IDConn`` developers follow NeuroStars, and will be able to answer your question there. + +## Labels + +The current list of labels is [here][link_labels] and includes: + +* [![Good First Issue](https://img.shields.io/badge/-good%20first%20issue-7057ff.svg)](https://github.com/62442katieb/IDConn/labels/good%20first%20issue) +*These issues contain a task that a member of the team has determined should require minimal knowledge of the existing codebase, and should be good for people new to the project.* +If you are interested in contributing to IDConn, but aren't sure where to start, we encourage you to take a look at these issues in particular.
+ +* [![Help Wanted](https://img.shields.io/badge/-help%20wanted-33aa3f.svg)](https://github.com/62442katieb/IDConn/labels/help%20wanted) +*These issues contain a task that a member of the team has determined we need additional help with.* +If you feel that you can contribute to one of these issues, we especially encourage you to do so! + +* [![Bug](https://img.shields.io/badge/-bug-ee0701.svg)](https://github.com/62442katieb/IDConn/labels/bug) +*These issues point to problems in the project.* +If you find a new bug, please give as much detail as possible in your issue, including steps to recreate the error. +If you experience the same bug as one already listed, please add any additional information that you have as a comment. + +* [![Enhancement](https://img.shields.io/badge/-enhancement-84b6eb.svg)](https://github.com/62442katieb/IDConn/labels/enhancement) +*These issues are asking for new features to be added to the project.* +Please try to make sure that your requested feature is distinct from any others that have already been requested or implemented. +If you find one that's similar but there are subtle differences, please reference the other request in your issue. + +## Making a change + +We appreciate all contributions to IDConn, but those accepted fastest will follow a workflow similar to the following: + +**1. Comment on an existing issue or open a new issue referencing your addition.** + +This allows other members of the IDConn development team to confirm that you aren't overlapping with work that's currently underway and that everyone is on the same page with the goal of the work you're going to carry out. + +[This blog][link_pushpullblog] is a nice explanation of why putting this work in up front is so useful to everyone involved. + +**2. Fork IDConn.** + +[Fork][link_fork] the [IDConn repository][link_idconn] to your profile. + +This is now your own unique copy of IDConn.
+Changes here won't affect anyone else's work, so it's a safe space to explore edits to the code! + +Make sure to [keep your fork up to date][link_updateupstreamwiki] with the main repository. + +**3. Make the changes you've discussed.** + +Try to keep the changes focused. We've found that working on a [new branch][link_branches] makes it easier to keep your changes targeted. + +When you're creating your pull request, please do your best to follow IDConn's preferred style conventions. +Namely, documentation should follow the [numpydoc](https://numpydoc.readthedocs.io/en/latest/) convention and code should adhere to [PEP8](https://www.python.org/dev/peps/pep-0008/) as much as possible. + +**4. Submit a pull request.** + +Submit a [pull request][link_pullrequest]. + +A member of the development team will review your changes to confirm that they can be merged into the main codebase. + +Please use a sentence-case title for the pull request, and do not include any prefixes (e.g., ``[ENH]``), as we now use labels to distinguish pull request types. +The title should summarize the changes proposed in the pull request, with an emphasis on readability, as pull request titles are used directly in our release notes. + +## Recognizing contributions + +We welcome and recognize all contributions from documentation to testing to code development. +You can see a list of current contributors in our [zenodo][link_zenodo] file. +If you are new to the project, don't forget to add your name and affiliation there! + +## Thank you! + +You're awesome. + +.. note:: + These guidelines are based on contributing guidelines from the [STEMMRoleModels][link_stemmrolemodels] project.
+ +[link_github]: https://github.com/ +[link_idconn]: https://github.com/62442katieb/IDConn +[link_signupinstructions]: https://help.github.com/articles/signing-up-for-a-new-github-account +[link_react]: https://github.com/blog/2119-add-reactions-to-pull-requests-issues-and-comments +[link_issues]: https://github.com/62442katieb/IDConn/issues +[link_labels]: https://github.com/62442katieb/IDConn/labels +[link_discussingissues]: https://help.github.com/articles/discussing-projects-in-issues-and-pull-requests +[link_neurostars]: https://neurostars.org + + +[link_pullrequest]: https://help.github.com/articles/creating-a-pull-request/ +[link_fork]: https://help.github.com/articles/fork-a-repo/ +[link_pushpullblog]: https://www.igvita.com/2011/12/19/dont-push-your-pull-requests/ +[link_branches]: https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/ +[link_updateupstreamwiki]: https://help.github.com/articles/syncing-a-fork/ +[link_stemmrolemodels]: https://github.com/KirstieJane/STEMMRoleModels +[link_zenodo]: https://github.com/62442katieb/IDConn/blob/main/.zenodo.json From 55d4989e66ef6c43fc25d7081da12487a9a09ffb Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Sun, 3 Sep 2023 20:29:17 -0700 Subject: [PATCH 40/48] used for Flux aim2, corr score --- idconn/nbs.py | 116 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 79 insertions(+), 37 deletions(-) diff --git a/idconn/nbs.py b/idconn/nbs.py index c7bfceb..3e2b48f 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -15,10 +15,11 @@ ) from sklearn.feature_selection import f_regression, f_classif -from sklearn.linear_model import LogisticRegression, ElasticNet, LogisticRegressionCV, ElasticNetCV -from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression, ElasticNet, LogisticRegressionCV, RidgeCV +from sklearn.preprocessing import Normalizer -from sklearn.metrics import mean_squared_error +from sklearn.metrics import mean_squared_log_error, adjusted_mutual_info_score +from scipy.stats import spearmanr def calc_number_of_nodes(matrices): @@ -38,6 +39,9 @@ def calc_number_of_nodes(matrices): def residualize(X, y=None, confounds=None): + ''' + all inputs need to be arrays, not dataframes + ''' # residualize the outcome if confounds is not None: if y is not None: @@ -70,7 +74,7 @@ def residualize(X, y=None, confounds=None): print("Confound matrix wasn't provided, so no confounding was done") -def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000): +def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=False, permutations=10000): """ Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided of shape ((subject x session)x node x node) @@ -120,11 +124,12 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000): # turn matrices into vectorized upper triangles if ndims > 2: - edges = vectorize_corrmats(matrices) + edges = vectorize_corrmats(matrices, diagonal=diagonal) else: edges = matrices.copy() # print(edges.shape) + # edges = edges.T # run an ols per edge @@ -140,12 +145,14 @@ def pynbs(matrices, 
outcome, alpha=0.05, predict=False, permutations=10000): # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - sig_matrix = undo_vectorize(sig_edges) # need to write this function + sig_matrix = undo_vectorize(sig_edges, num_node=num_node, diagonal=diagonal) # need to write this function matrix = nx.from_numpy_array(sig_matrix) # use networkX to find connected components - largest_cc = max(nx.connected_components(matrix), key=len) - G0 = matrix.subgraph(largest_cc) + S = [matrix.subgraph(c).copy() for c in nx.connected_components(matrix)] + S.sort(key=len, reverse=True) + #largest_cc = max(nx.connected_components(matrix), key=len) + G0 = S[0] # print(G0) # retain size of largest connected component @@ -195,7 +202,7 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000): # print(np.sum(perm_edges)) # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - perm_matrix = undo_vectorize(perm_edges) # need to write this function + perm_matrix = undo_vectorize(perm_edges, num_node=num_node, diagonal=diagonal) # need to write this function perm_nx = nx.from_numpy_array(perm_matrix) largest_cc = max(nx.connected_components(perm_nx), key=len) @@ -226,7 +233,7 @@ def pynbs(matrices, outcome, alpha=0.05, predict=False, permutations=10000): def kfold_nbs( - matrices, outcome, confounds=None, alpha=0.05, groups=None, n_splits=10, n_iterations=10 + matrices, outcome, confounds=None, alpha=0.05, groups=None, num_node=None, diagonal=False, n_splits=10, n_iterations=10 ): """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided of shape ((subject x session)x node x node) @@ -240,9 +247,9 @@ def kfold_nbs( array of vectorized upper triangles of those correlation mat Parameters ---------- - matrices : numpy array of shape (p, n, n) + matrices : numpy array of shape (p, n, n) or (p, (n^2 / 2)- n) Represents the link strengths of the graphs. 
Assumed to be - an array of symmetric matrices. + an array of symmetric matrices or a vectorized triangle thereof. outcome : list-like of shape (p,) Y-value to be predicted with connectivity confounds : list-like @@ -270,7 +277,15 @@ def kfold_nbs( Includes the results of each cross-validation loop (e.g., predictive performance, data split, largest connected component per fold per iteration). """ - edges = vectorize_corrmats(matrices) + ndims = len(matrices.shape) + + # vectorize_corrmats returns p x n^2 + + # turn matrices into vectorized upper triangles + if ndims > 2: + edges = vectorize_corrmats(matrices) + else: + edges = matrices.copy() # print(edges.shape) # print(edges.shape) index = list(range(0, n_splits * n_iterations)) @@ -282,8 +297,6 @@ def kfold_nbs( #'pval', "score", "component", - "coefficient_matrix", - "coefficient_vector", "model", ], ) @@ -295,7 +308,10 @@ def kfold_nbs( cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_iterations) split_y = outcome - num_node = calc_number_of_nodes(matrices) + if num_node is None: + num_node = calc_number_of_nodes(matrices) + else: + pass # print(num_node) # if matrices.shape[0] != matrices.shape[1]: # if matrices.shape[1] == matrices.shape[2]: @@ -307,31 +323,41 @@ def kfold_nbs( #'or node x node x (subject x session).') # else: # num_node = matrices.shape[0] - upper_tri = np.triu_indices(num_node, k=1) + if diagonal == True: + k = 0 + if diagonal == False: + k=1 + upper_tri = np.triu_indices(num_node, k=k) i = 0 manager = enlighten.get_manager() ticks = manager.counter(total=n_splits * n_iterations, desc="Progress", unit="folds") for train_idx, test_idx in cv.split(edges, split_y): - scaler = StandardScaler() + x_scaler = Normalizer() + y_scaler = Normalizer() cv_results.at[i, "split"] = (train_idx, test_idx) # assert len(train_a_idx) == len(train_b_idx) - l1_ratio_grid = [0.2, 0.4, 0.6, 0.8] + Cs = np.logspace(-4, 4, 10) + #print(len(np.unique(outcome))) if np.unique(outcome).shape[0] == 2: + 
#print('binary') regressor = LogisticRegressionCV( - l1_ratio=l1_ratio_grid, + Cs=Cs, + cv=4, + #verbose=2, max_iter=100000, - penalty="elasticnet", + penalty="l2", solver="saga", n_jobs=4 ) else: - regressor = ElasticNetCV( - l1_ratio=l1_ratio_grid, + #print('continuous') + regressor = RidgeCV( + alphas=Cs, cv=4, - n_jobs=4 + #n_jobs=4 ) train_y = outcome[train_idx] @@ -357,20 +383,20 @@ def kfold_nbs( else: pass - train_edges = scaler.fit_transform(train_edges) - test_edges = scaler.fit_transform(test_edges) + train_edges = x_scaler.fit_transform(train_edges) + test_edges = x_scaler.transform(test_edges) if np.unique(outcome).shape[0] == 2: pass else: - train_y = scaler.fit_transform(train_y.reshape(-1, 1)) - test_y = scaler.fit_transform(test_y.reshape(-1, 1)) + train_y = y_scaler.fit_transform(train_y.reshape(-1, 1)) + test_y = y_scaler.transform(test_y.reshape(-1, 1)) # perform NBS wooooooooo # note: output is a dataframe :) # PYNBS SHOULD NOT DO CONFOUND REGRESSION? - adj = pynbs(train_edges, train_y, alpha, predict=True) - # print(adj.shape, adj.ndim, adj[0].shape, upper_tri) + adj = pynbs(train_edges, train_y, num_node=num_node, diagonal=diagonal, alpha=alpha, predict=True) + #print(adj.shape, adj.ndim, adj[0].shape, upper_tri) # cv_results.at[i, 'pval'] = pval cv_results.at[i, "component"] = adj.values @@ -425,7 +451,18 @@ def kfold_nbs( # if logistic regression: score = mean accuracy # if linear regression: score = coefficient of determination (R^2) # both from 0 (low) to 1 (high) - score = model.score(X=test_features, y=np.ravel(test_y)) + + # can't use MSE, which is the default score for ridge + # because larger values = worse performance + # I go die now + if np.unique(outcome).shape[0] == 2: + score = model.score(X=test_features, y=np.ravel(test_y)) + + else: + predicted_y = model.predict(X=test_features) + score,p = spearmanr(predicted_y, np.ravel(test_y)) + #spearman = spearmanr(predicted_y, np.ravel(test_y)) + cv_results.at[i, "score"] = score if i 
% (n_splits * n_iterations / 10) == 0: mean = cv_results['score'].mean() @@ -446,21 +483,26 @@ def kfold_nbs( m += 1 else: pass - X = undo_vectorize(param_vector, num_node=num_node) - cv_results.at[i, "coefficient_matrix"] = X - cv_results.at[i, "coefficient_vector"] = param_vector + X = undo_vectorize(param_vector, num_node=num_node, diagonal=diagonal) + #cv_results.at[i, "coefficient_matrix"] = X + #cv_results.at[i, "coefficient_vector"] = param_vector i += 1 else: pass ticks.update() # calculate weighted average # print(cv_results['score']) - weighted_stack = cv_results.at[0, "component"] * cv_results.at[0, "score"] + weighted_stack = np.zeros((num_node,num_node)) + fake = np.zeros((num_node,num_node)) # print(weighted_stack.shape) - for j in index[1:]: + for j in index: # print(cv_results.at[j, 'score']) weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"] - weighted_stack = np.dstack([weighted_stack, weighted]) + + if np.sum(weighted) == 0 or np.isnan(np.sum(weighted)) == True: + weighted_stack = np.dstack([weighted_stack, fake]) + else: + weighted_stack = np.dstack([weighted_stack, weighted]) # print(weighted_stack.shape, weighted.shape) weighted_average = np.mean(weighted_stack, axis=-1) From 7df266a8f095c5035d07fcbcfeeb063b0e460295 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Thu, 2 Nov 2023 10:12:29 -0700 Subject: [PATCH 41/48] checkpoint before integrating neurocombat --- idconn/workflows/nbs_predict.py | 126 +++++++++++++++++--------------- 1 file changed, 68 insertions(+), 58 deletions(-) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 233c284..508aa74 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -10,10 +10,12 @@ from time import strftime from scipy.stats import spearmanr from idconn import nbs, io +from bct import threshold_proportional -from sklearn.linear_model import LogisticRegression, ElasticNet -from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler from sklearn.metrics import mean_squared_error from matplotlib.colors import ListedColormap import matplotlib as mpl @@ -34,16 +36,18 @@ CONFOUNDS = "framewise_displacement" TASK = "rest" ATLAS = "craddock2012" +THRESH = 0.5 alpha = 0.05 atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) -dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=True) +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) keep = dat["adj"].dropna().index dat = dat.loc[keep] + # print(dat['adj'].values.shape) num_node = dat.iloc[0]["adj"].shape[0] @@ -51,6 +55,7 @@ upper_tri = np.triu_indices(num_node, k=1) outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) +groups = dat['bc'] if CONFOUNDS is not None: confounds = dat[CONFOUNDS] @@ -61,7 +66,7 @@ # print(dat['bc']) weighted_average, cv_results = nbs.kfold_nbs( - matrices, outcome, confounds, alpha, groups=dat["bc"], n_splits=10, n_iterations=100 + 
matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 ) fig, fig2, nimg = io.plot_edges( @@ -111,13 +116,17 @@ # here is where we'd threshold the weighted average to use for elastic-net weighted_average = np.where(weighted_average > 0, weighted_average, 0) -nbs_vector = weighted_average[upper_tri] -p75 = np.percentile(nbs_vector, 75) -filter = np.where(nbs_vector >= p75, True, False) +#nbs_vector = weighted_average[upper_tri] +#p75 = np.percentile(nbs_vector, 75) +#filter = np.where(nbs_vector >= p75, True, False) # print(nbs_vector.shape, filter.shape) +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +#p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) # mask = io.vectorize_corrmats(filter) -edges_train = np.vstack(dat["edge_vector"].dropna().values) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: @@ -133,49 +142,65 @@ train_outcome, resid_edges = nbs.residualize( X=edges_train, y=outcome_train, confounds=confounds_train ) - train_features = resid_edges[:,filter] + train_features = resid_edges else: - train_features = edges_train[:,filter] + train_features = edges_train train_outcome = outcome -scaler = StandardScaler() -train_features = scaler.fit_transform(train_features) +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) if len(np.unique(train_outcome)) <= 2: pass else: - train_outcome = scaler.fit_transform(train_outcome.reshape(-1, 1)) + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) # run the model on the whole test dataset to get params # classification if the outcome is binary (for now) # could be extended to the multiclass case? 
+cv = RepeatedKFold(n_splits=5, n_repeats=10) + +train_metrics = {} if len(np.unique(outcome)) == 2: model = LogisticRegression( - penalty="elasticnet", + penalty="l2", solver="saga", - l1_ratio=best.l1_ratio_ + C=best.C_[0] ) + train_metrics["alpha"] = best.C_[0] + #train_metrics["l1_ratio"] = best.l1_ratio_ else: - model = ElasticNet( - l1_ratio=best.l1_ratio_, + model = Ridge( + solver="saga", alpha=best.alpha_ ) + train_metrics["alpha"] = best.alpha_ + #train_metrics["l1_ratio"] = best.l1_ratio_ #print(params) #model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict -train_metrics = {} -fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) -in_sample_score = fitted.score(X=train_features, y=np.ravel(train_outcome)) -if len(np.unique(outcome)) == 2: - train_metrics["accuracy"] = in_sample_score -else: - train_metrics["coefficient of determination"] = in_sample_score + +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True + ) +train_metrics["in_sample_test"] = np.mean(scores['test_score']) +train_metrics["in_sample_train"] = np.mean(scores['train_score']) + +fitted = scores['estimator'][0] y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) dat[f'{OUTCOME}_pred'] = y_pred dat[f'{OUTCOME}_scaled'] = train_outcome -Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] +Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled']] Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') train_colors = ['#a08ad1', #light @@ -186,25 +211,20 @@ dark_cmap = sns.color_palette('dark:#685690') fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', +g = sns.scatterplot(x=f'{OUTCOME}_scaled', y=f'{OUTCOME}_pred', - style='bc', + #style='bc', data=Ys, ax=ax, palette=dark_cmap) -h = 
sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - ax=ax, - palette=light_cmap) -ax.legend(bbox_to_anchor=(1.0, 0.5)) +#ax.legend(bbox_to_anchor=(1.0, 0.5)) fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') mse = mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse -print("In-sample prediction score: ", in_sample_score) +print("In-sample prediction score: ", train_metrics["in_sample_test"]) print("In-sample mean squared error: ", mse) +train_metrics["in_sample_mse"] = mse # print(np.mean(train_features)) with open( join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" @@ -216,10 +236,8 @@ j = 0 for i in range(0, filter.shape[0]): if filter[i] == True: - if len(np.unique(outcome)) == 2: - coeff_vec[i] = fitted.coef_[0, j] - else: - coeff_vec[i] = fitted.coef_[j] + #print(j) + coeff_vec[i] = fitted.coef_[0, j] j += 1 else: pass @@ -254,7 +272,7 @@ layout = bids.BIDSLayout(TEST_DSET, derivatives=True) -test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=True) +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) keep = test_df[[OUTCOME, "adj"]].dropna().index # print(keep) @@ -268,7 +286,7 @@ matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( (len(test_df["adj"].dropna().index), num_node, num_node) ) -edges_test = np.vstack(test_df["edge_vector"].dropna().values) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: @@ -284,17 +302,17 @@ test_outcome, resid_edges = nbs.residualize( X=edges_test, y=outcome_test, confounds=confounds_test ) - test_features = resid_edges[:, filter] + test_features = resid_edges else: - test_features = edges_test[:, filter] + test_features = edges_test test_outcome = outcome_test # scale 
after residualizing omg -test_features = scaler.fit_transform(test_features) +test_features = x_scaler.transform(test_features) if len(np.unique(test_outcome)) <= 2: pass else: - test_outcome = scaler.fit_transform(test_outcome.reshape(-1, 1)) + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) # print(test_features.shape) # if the model is a logistic regression, i.e. with a binary outcome # then score is prediction accuracy @@ -312,6 +330,8 @@ test_metrics["accuracy"] = score else: test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr mse = mean_squared_error(test_outcome, y_pred) test_metrics["mean squared error"] = mse print("Out-of-sample prediction score:\t", score) @@ -321,9 +341,7 @@ test_df[f'{OUTCOME}_scaled'] = test_outcome test_df[f'{OUTCOME}_pred'] = y_pred Ys = test_df[[f'{OUTCOME}_scaled', - f'{OUTCOME}_pred', - 'cycle_day', - 'bc']] + f'{OUTCOME}_pred']] Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') Ys['ppts'] = Ys.index.get_level_values(0) @@ -342,23 +360,15 @@ mpl.colormaps.register(cmap=dark) fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', +g = sns.scatterplot(x=f'{OUTCOME}_scaled', y=f'{OUTCOME}_pred', - style='bc', + #style='bc', data=Ys, hue='ppts', hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], ax=ax, palette='light_powderpuff' ) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='dark_powderpuff') ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') From 8036f5e498538a466aec0ca8b1c131ebc5b5cdca Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Thu, 2 Nov 2023 10:12:41 -0700 Subject: [PATCH 42/48] checkpoint before integrating neurocombat --- idconn/workflows/nbs_predict.py | 1 - 1 file changed, 1 deletion(-) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 508aa74..50563e7 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -242,7 +242,6 @@ else: pass -# print(coeff_vec) coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) From f274decfd10bc8b13eaaf32aa0cd70148fcd7a7f Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Mon, 27 Nov 2023 15:43:31 -0800 Subject: [PATCH 43/48] add scaling as an option --- idconn/nbs.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/idconn/nbs.py b/idconn/nbs.py index 3e2b48f..26ed551 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -233,7 +233,7 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict= def kfold_nbs( - matrices, outcome, confounds=None, alpha=0.05, groups=None, num_node=None, diagonal=False, n_splits=10, n_iterations=10 + matrices, outcome, confounds=None, alpha=0.05, groups=None, num_node=None, diagonal=False, scale_x=False, scale_y=False, n_splits=10, n_iterations=10 ): """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided of shape ((subject x session)x node x node) @@ -333,8 +333,7 @@ def kfold_nbs( manager = enlighten.get_manager() ticks = manager.counter(total=n_splits * n_iterations, desc="Progress", unit="folds") for train_idx, test_idx in cv.split(edges, split_y): - x_scaler = Normalizer() - y_scaler = Normalizer() + cv_results.at[i, "split"] = (train_idx, test_idx) # assert len(train_a_idx) == len(train_b_idx) @@ -382,15 +381,21 @@ def kfold_nbs( test_y, test_edges = residualize(X=test_edges, y=test_y, confounds=test_confounds) else: pass + if scale_x: 
+ x_scaler = Normalizer() + train_edges = x_scaler.fit_transform(train_edges) + test_edges = x_scaler.transform(test_edges) + if scale_y: + if np.unique(outcome).shape[0] == 2: + pass + else: + y_scaler = Normalizer() + train_y = y_scaler.fit_transform(train_y.reshape(-1, 1)) + test_y = y_scaler.transform(test_y.reshape(-1, 1)) + + - train_edges = x_scaler.fit_transform(train_edges) - test_edges = x_scaler.transform(test_edges) - - if np.unique(outcome).shape[0] == 2: - pass - else: - train_y = y_scaler.fit_transform(train_y.reshape(-1, 1)) - test_y = y_scaler.transform(test_y.reshape(-1, 1)) + # perform NBS wooooooooo # note: output is a dataframe :) From 68631fbc7aabde35a2f8b798a7e5d54661cd1a5d Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Fri, 12 Jan 2024 15:19:41 -0800 Subject: [PATCH 44/48] nbs workflow works yay --- idconn/io.py | 43 ++- idconn/workflows/nbs_predict-e2.py | 419 ++++++++++++++++++++++ idconn/workflows/nbs_predict-e2xp4-bc.py | 422 +++++++++++++++++++++++ idconn/workflows/nbs_predict-e2xp4.py | 422 +++++++++++++++++++++++ idconn/workflows/nbs_predict-p4.py | 416 ++++++++++++++++++++++ idconn/workflows/nbs_predict.py | 90 ++--- 6 files changed, 1754 insertions(+), 58 deletions(-) create mode 100644 idconn/workflows/nbs_predict-e2.py create mode 100644 idconn/workflows/nbs_predict-e2xp4-bc.py create mode 100644 idconn/workflows/nbs_predict-e2xp4.py create mode 100644 idconn/workflows/nbs_predict-p4.py diff --git a/idconn/io.py b/idconn/io.py index b5f43e1..23b563c 100644 --- a/idconn/io.py +++ b/idconn/io.py @@ -35,7 +35,6 @@ def calc_fd(confounds): fd = np.sum([delta_x, delta_y, delta_z, delta_alpha, delta_beta, delta_gamma], axis=0) return fd - def build_statsmodel_json( name, task, @@ -132,7 +131,6 @@ def build_statsmodel_json( json.dump(statsmodel, outfile) return statsmodel_json - def atlas_picker(atlas, path, key=None): """Takes in atlas name and path to file, if local, returns nifti-like object (usually file path to 
downloaded atlas), @@ -192,8 +190,7 @@ def atlas_picker(atlas, path, key=None): return atlas, path - -def vectorize_corrmats(matrices): +def vectorize_corrmats(matrices, diagonal=False): """Returns the vectorized upper triangles of a 3-dimensional array (i.e., node x node x matrix) of matrices. Output will be a 2-dimensional array (i.e., matrix x node^2) @@ -210,11 +207,15 @@ def vectorize_corrmats(matrices): the input matrices. """ # print(f'\n\n\n{matrices.shape}, {matrices.ndim}\n\n\n') + if diagonal == True: + k = 0 + else: + k = 1 num_node = matrices.shape[1] - upper_tri = np.triu_indices(num_node, k=1) + upper_tri = np.triu_indices(num_node, k=k) if matrices.ndim == 3: num_node = matrices.shape[1] - upper_tri = np.triu_indices(num_node, k=1) + upper_tri = np.triu_indices(num_node, k=k) num_matrices = matrices.shape[0] edge_vector = [] for matrix in range(0, num_matrices): @@ -234,7 +235,7 @@ def vectorize_corrmats(matrices): elif matrices.ndim == 1: if matrices[0].ndim == 2: num_node = matrices[0].shape[0] - upper_tri = np.triu_indices(num_node, k=1) + upper_tri = np.triu_indices(num_node, k=k) edge_vector = [] for matrix in matrices: vectorized = matrix[upper_tri] @@ -248,7 +249,6 @@ def vectorize_corrmats(matrices): edge_vector = np.asarray(edge_vector) return edge_vector - def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True, verbose=False): """Returns a node x node x (subject x session) matrix of correlation matrices from a BIDS derivative folder. Optionally returns a node^2 x (subject x session) @@ -419,8 +419,7 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True ppt_df.replace({"": np.nan}, inplace=True) return ppt_df - -def undo_vectorize(edges, num_node=None): +def undo_vectorize(edges, num_node=None, diagonal=False): """ Puts an edge vector back into an adjacency matrix. 
Parameters @@ -439,15 +438,25 @@ def undo_vectorize(edges, num_node=None): # num_node = (np.sqrt((8 * j) + 1) + 1) / 2 if num_node == None: j = len(edges) - num_node = int((np.sqrt((8 * j) + 1) + 1) / 2) + if diagonal == False: + num_node = int((np.sqrt((8 * j) + 1) + 1) / 2) + else: + num_node = int((np.sqrt((8 * j) + 1) - 1) / 2) else: num_node = int(num_node) X = np.zeros((num_node, num_node)) - X[np.triu_indices(X.shape[0], k=1)] = edges + if diagonal == False: + k=1 + if diagonal == True: + k=0 + X[np.triu_indices(num_node, k=k)] = edges + diag_X = X[np.diag_indices(num_node,2)] X = X + X.T + if diagonal == True: + X[np.diag_indices(num_node,2)] = diag_X + #print('did undo_vectorize work?', np.allclose(X, X.T)) return X - def plot_edges( adj, atlas_nii, @@ -499,7 +508,7 @@ def plot_edges( print("edge plotting threshold: ", threshold) if node_size == "strength": - node_strength = np.sum(adj, axis=0) + node_strength = np.abs(np.sum(adj, axis=0)) # node_strength /= np.max(node_strength) # node_strength **= 4 node_strength = node_strength / np.max(node_strength) * 60 @@ -535,7 +544,7 @@ def plot_edges( nimg = nib.load(atlas_nii) regn_sch_arr = nimg.get_fdata() for i in np.arange(0, num_node): - regn_sch_arr[np.where(regn_sch_arr == i + 1)] = np.sum(adj[i]) + regn_sch_arr[np.where(regn_sch_arr == i + 1)] = np.sum((adj[i])) strength_nimg = nib.Nifti1Image(regn_sch_arr, nimg.affine) # replace this filename with BIDSy output # nib.save(strength_nimg, f'/Users/katherine.b/Dropbox/{title}predictive-strength.nii') @@ -558,6 +567,7 @@ def plot_edges( i = plotting.plot_surf_stat_map( fsaverage.pial_left, texture_l, + bg_map=fsaverage.sulc_left, symmetric_cbar=False, threshold=0.5, cmap=cmap, @@ -568,6 +578,7 @@ def plot_edges( j = plotting.plot_surf_stat_map( fsaverage.pial_left, texture_l, + bg_map=fsaverage.sulc_left, symmetric_cbar=False, threshold=0.5, cmap=cmap, @@ -578,6 +589,7 @@ def plot_edges( k = plotting.plot_surf_stat_map( fsaverage.pial_right, texture_r, + 
bg_map=fsaverage.sulc_right, symmetric_cbar=False, threshold=0.5, cmap=cmap, @@ -588,6 +600,7 @@ def plot_edges( l = plotting.plot_surf_stat_map( fsaverage.pial_right, texture_r, + bg_map=fsaverage.sulc_right, symmetric_cbar=False, threshold=0.5, cmap=cmap, diff --git a/idconn/workflows/nbs_predict-e2.py b/idconn/workflows/nbs_predict-e2.py new file mode 100644 index 0000000..c92d274 --- /dev/null +++ b/idconn/workflows/nbs_predict-e2.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = 
np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +#print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=3, n_iterations=3 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... 
+# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +#print(np.sum(weighted_average)) +#nbs_vector = weighted_average[upper_tri] +#p75 = np.percentile(nbs_vector, 75) +#filter = np.where(nbs_vector >= p75, True, False) +#print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +#p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? 
+train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression( + penalty="l2", + solver="saga", + C=best.C_[0] + ) + train_metrics["alpha"] = best.C_[0] + #train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + + #train_metrics["l1_ratio"] = best.l1_ratio_ +#print(params) +#model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True + ) +train_metrics["in_sample_test"] = np.mean(scores['test_score']) +train_metrics["in_sample_train"] = np.mean(scores['train_score']) + +fitted = scores['estimator'][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f'{OUTCOME}_pred'] = y_pred +dat[f'{OUTCOME}_scaled'] = train_outcome + +Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] +Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') + +train_colors = ['#a08ad1', #light + '#685690', #medium + '#3f2d69' #dark + ] +light_cmap = sns.color_palette('dark:#a08ad1') +dark_cmap = sns.color_palette('dark:#685690') + +fig,ax = plt.subplots() +g = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_pred', + style='bc', + data=Ys, + ax=ax, + palette=dark_cmap) +h = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_scaled', + style='bc', + data=Ys, + ax=ax, + palette=light_cmap) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared 
error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +#print(fitted.coef_.shape) +#print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + #print(j) + #print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = 
np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +#cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f'{OUTCOME}_scaled'] = test_outcome +test_df[f'{OUTCOME}_pred'] = y_pred +Ys = test_df[[f'{OUTCOME}_scaled', + f'{OUTCOME}_pred', + 'cycle_day', + 'bc']] +Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') + +Ys['ppts'] = Ys.index.get_level_values(0) + + +light_colors = ['#33ACE3', #Bubbles + '#EA6964', #Blossom + '#4AB62C' #Buttercup + ] +dark_colors = ['#1278a6', + '#a11510', + '#228208'] +light = ListedColormap(light_colors, name='light_powderpuff') +dark = ListedColormap(dark_colors, name='dark_powderpuff') +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig,ax = plt.subplots() +g = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_pred', + style='bc', + data=Ys, + hue='ppts', + hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], + ax=ax, + palette='light_powderpuff' + ) +h = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_scaled', + style='bc', + data=Ys, + hue='ppts', + hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], + ax=ax, + palette='dark_powderpuff') +ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') +fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') + + + +#print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2xp4-bc.py b/idconn/workflows/nbs_predict-e2xp4-bc.py new file mode 100644 index 0000000..ad6a6d8 --- /dev/null +++ b/idconn/workflows/nbs_predict-e2xp4-bc.py @@ -0,0 +1,422 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol÷progesterone" +CONFOUNDS = ["framewise_displacement", "bc"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = 
bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone'] + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +#print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] + +# this uses the most 
predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +#print(np.sum(weighted_average)) +#nbs_vector = weighted_average[upper_tri] +#p75 = np.percentile(nbs_vector, 75) +#filter = np.where(nbs_vector >= p75, True, False) +#print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +#p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = 
y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression( + penalty="l2", + solver="saga", + C=best.C_[0] + ) + train_metrics["alpha"] = best.C_[0] + #train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + + #train_metrics["l1_ratio"] = best.l1_ratio_ +#print(params) +#model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True + ) +train_metrics["in_sample_test"] = np.mean(scores['test_score']) +train_metrics["in_sample_train"] = np.mean(scores['train_score']) + +fitted = scores['estimator'][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f'{OUTCOME}_pred'] = y_pred +dat[f'{OUTCOME}_scaled'] = train_outcome + +Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] +Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') + +train_colors = ['#a08ad1', #light + '#685690', #medium + '#3f2d69' #dark + ] +light_cmap = sns.color_palette('dark:#a08ad1') +dark_cmap = sns.color_palette('dark:#685690') + +fig,ax = plt.subplots() +g = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_pred', + style='bc', + data=Ys, + ax=ax, + palette=dark_cmap) +h = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_scaled', + style='bc', + data=Ys, + ax=ax, + palette=light_cmap) 
+ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +#print(fitted.coef_.shape) +#print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + #print(j) + #print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) +test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone'] + +keep = 
test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +#cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f'{OUTCOME}_scaled'] = test_outcome +test_df[f'{OUTCOME}_pred'] = y_pred +Ys = test_df[[f'{OUTCOME}_scaled', + f'{OUTCOME}_pred', + 'cycle_day', + 'bc']] +Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') + +Ys['ppts'] = Ys.index.get_level_values(0) + + +light_colors = ['#33ACE3', #Bubbles + '#EA6964', #Blossom + '#4AB62C' #Buttercup + ] +dark_colors = ['#1278a6', + '#a11510', + '#228208'] +light = ListedColormap(light_colors, name='light_powderpuff') +dark = ListedColormap(dark_colors, name='dark_powderpuff') +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig,ax = plt.subplots() +g = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_pred', + style='bc', + data=Ys, + hue='ppts', + hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], + ax=ax, + palette='light_powderpuff' + ) +h = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_scaled', + style='bc', + data=Ys, + hue='ppts', + hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], + ax=ax, + palette='dark_powderpuff') +ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') +fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') + + + +#print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2xp4.py b/idconn/workflows/nbs_predict-e2xp4.py new file mode 100644 index 0000000..022d8b9 --- /dev/null +++ b/idconn/workflows/nbs_predict-e2xp4.py @@ -0,0 +1,422 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol÷progesterone" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = 
bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone'] + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +#print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] + +# this uses the most 
predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +#print(np.sum(weighted_average)) +#nbs_vector = weighted_average[upper_tri] +#p75 = np.percentile(nbs_vector, 75) +#filter = np.where(nbs_vector >= p75, True, False) +#print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +#p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = 
y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression( + penalty="l2", + solver="saga", + C=best.C_[0] + ) + train_metrics["alpha"] = best.C_[0] + #train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + + #train_metrics["l1_ratio"] = best.l1_ratio_ +#print(params) +#model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True + ) +train_metrics["in_sample_test"] = np.mean(scores['test_score']) +train_metrics["in_sample_train"] = np.mean(scores['train_score']) + +fitted = scores['estimator'][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f'{OUTCOME}_pred'] = y_pred +dat[f'{OUTCOME}_scaled'] = train_outcome + +Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] +Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') + +train_colors = ['#a08ad1', #light + '#685690', #medium + '#3f2d69' #dark + ] +light_cmap = sns.color_palette('dark:#a08ad1') +dark_cmap = sns.color_palette('dark:#685690') + +fig,ax = plt.subplots() +g = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_pred', + style='bc', + data=Ys, + ax=ax, + palette=dark_cmap) +h = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_scaled', + style='bc', + data=Ys, + ax=ax, + palette=light_cmap) 
+ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +#print(fitted.coef_.shape) +#print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + #print(j) + #print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) +test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone'] + +keep = 
test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +#cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f'{OUTCOME}_scaled'] = test_outcome +test_df[f'{OUTCOME}_pred'] = y_pred +Ys = test_df[[f'{OUTCOME}_scaled', + f'{OUTCOME}_pred', + 'cycle_day', + 'bc']] +Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') + +Ys['ppts'] = Ys.index.get_level_values(0) + + +light_colors = ['#33ACE3', #Bubbles + '#EA6964', #Blossom + '#4AB62C' #Buttercup + ] +dark_colors = ['#1278a6', + '#a11510', + '#228208'] +light = ListedColormap(light_colors, name='light_powderpuff') +dark = ListedColormap(dark_colors, name='dark_powderpuff') +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig,ax = plt.subplots() +g = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_pred', + style='bc', + data=Ys, + hue='ppts', + hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], + ax=ax, + palette='light_powderpuff' + ) +h = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_scaled', + style='bc', + data=Ys, + hue='ppts', + hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], + ax=ax, + palette='dark_powderpuff') +ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') +fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') + + + +#print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-p4.py b/idconn/workflows/nbs_predict-p4.py new file mode 100644 index 0000000..559b4ff --- /dev/null +++ b/idconn/workflows/nbs_predict-p4.py @@ -0,0 +1,416 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "progesterone" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, 
derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +#print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or 
use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +#print(np.sum(weighted_average)) +#nbs_vector = weighted_average[upper_tri] +#p75 = np.percentile(nbs_vector, 75) +#filter = np.where(nbs_vector >= p75, True, False) +#print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +#p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + + +# run the model on the whole test dataset to get 
params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression( + penalty="l2", + solver="saga", + C=best.C_[0] + ) + train_metrics["alpha"] = best.C_[0] + #train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + + #train_metrics["l1_ratio"] = best.l1_ratio_ +#print(params) +#model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True + ) +train_metrics["in_sample_test"] = np.mean(scores['test_score']) +train_metrics["in_sample_train"] = np.mean(scores['train_score']) + +fitted = scores['estimator'][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f'{OUTCOME}_pred'] = y_pred +dat[f'{OUTCOME}_scaled'] = train_outcome + +Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] +Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') + +train_colors = ['#a08ad1', #light + '#685690', #medium + '#3f2d69' #dark + ] +light_cmap = sns.color_palette('dark:#a08ad1') +dark_cmap = sns.color_palette('dark:#685690') + +fig,ax = plt.subplots() +g = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_pred', + style='bc', + data=Ys, + ax=ax, + palette=dark_cmap) +h = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_scaled', + style='bc', + data=Ys, + ax=ax, + palette=light_cmap) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), 
dpi=400, bbox_inches='tight') + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +#print(fitted.coef_.shape) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + #print(j) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) + +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) 
+) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +#cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f'{OUTCOME}_scaled'] = test_outcome +test_df[f'{OUTCOME}_pred'] = y_pred +Ys = test_df[[f'{OUTCOME}_scaled', + f'{OUTCOME}_pred', + 'cycle_day', + 'bc']] +Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') + +Ys['ppts'] = Ys.index.get_level_values(0) + + +light_colors = ['#33ACE3', #Bubbles + '#EA6964', #Blossom + '#4AB62C' #Buttercup + ] +dark_colors = ['#1278a6', + '#a11510', + '#228208'] +light = ListedColormap(light_colors, name='light_powderpuff') +dark = ListedColormap(dark_colors, name='dark_powderpuff') +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig,ax = plt.subplots() +g = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_pred', + style='bc', + data=Ys, + hue='ppts', + hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], + ax=ax, + palette='light_powderpuff' + ) +h = sns.scatterplot(x='cycle_day', + y=f'{OUTCOME}_scaled', + style='bc', + data=Ys, + hue='ppts', + hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], + ax=ax, + palette='dark_powderpuff') +ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') +fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') + + + +#print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 50563e7..46e804c 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -29,21 +29,21 @@ today = datetime.today() today_str = strftime("%m_%d_%Y") -TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" -TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +TRAIN_DSET = "" +TEST_DSET = "" DERIV_NAME = "IDConn" -OUTCOME = "bc" +OUTCOME = "" CONFOUNDS = "framewise_displacement" TASK = "rest" ATLAS = "craddock2012" THRESH = 0.5 alpha = 0.05 -atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" +atlas_fname = "craddock2012_tcorr05_2level_270_2mm.nii.gz" -layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) +train_layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) -dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) +dat = io.read_corrmats(train_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) keep = dat["adj"].dropna().index dat = dat.loc[keep] @@ -65,6 +65,47 @@ base_name = f"nbs-predict_outcome-{OUTCOME}" # print(dat['bc']) +# load in test data +test_layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(test_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = 
test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + + + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + + weighted_average, cv_results = nbs.kfold_nbs( matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 ) @@ -269,43 +310,6 @@ ) -layout = bids.BIDSLayout(TEST_DSET, derivatives=True) - -test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) - -keep = test_df[[OUTCOME, "adj"]].dropna().index -# print(keep) - -test_df = test_df.loc[keep] - -outcome_test = test_df[OUTCOME].values -# print(test_df) - -# print(outcome_test) -matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( - (len(test_df["adj"].dropna().index), num_node, num_node) -) -edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] - -# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE -if CONFOUNDS is not None: - confounds_test = test_df[CONFOUNDS].values - - # regress out the confounds from each edge and the outcome variable, - # use the residuals for the rest of the algorithm - # print(confounds.shape, outcome.shape) - if 
len(np.unique(outcome_test)) <= 2: - resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) - test_outcome = outcome_test - elif len(np.unique(outcome_test)) > 3: - test_outcome, resid_edges = nbs.residualize( - X=edges_test, y=outcome_test, confounds=confounds_test - ) - test_features = resid_edges -else: - test_features = edges_test - test_outcome = outcome_test - # scale after residualizing omg test_features = x_scaler.transform(test_features) if len(np.unique(test_outcome)) <= 2: From b6efbbd8e904bc83f1982380793c6dbfa4ea7db3 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Fri, 14 Jun 2024 15:40:03 -0700 Subject: [PATCH 45/48] added some to-dos need to rerun checks --- idconn/connectivity.py | 2 ++ idconn/io.py | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/idconn/connectivity.py b/idconn/connectivity.py index 1e79998..5ac1ae2 100644 --- a/idconn/connectivity.py +++ b/idconn/connectivity.py @@ -254,6 +254,8 @@ def task_connectivity( def rest_connectivity( layout, subject, session, task, atlas, confounds=None, connectivity_metric="correlation" ): + ################################################################################### + ################# Needs an option to keep runs separate. ########################## """ Makes connectivity matrices per subject per session per task per condition. Parameters diff --git a/idconn/io.py b/idconn/io.py index 23b563c..3e1bca9 100644 --- a/idconn/io.py +++ b/idconn/io.py @@ -253,6 +253,8 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True """Returns a node x node x (subject x session) matrix of correlation matrices from a BIDS derivative folder. Optionally returns a node^2 x (subject x session) array of vectorized upper triangles of those correlation matrices. + + ME @ ME: NEEDS AN OPTION TO KEEP RUNS SEPARATE. 
CURRENTLY IT AVERAGES CONFOUNDS AND Parameters ---------- layout : BIDSLayout or str @@ -356,7 +358,7 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True ) # print(confound_means) else: - path = path = layout.get( + path = layout.get( return_type="filename", session=session, desc="confounds", @@ -397,7 +399,12 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True pass if type(path) == list: # print(len(path)) + ################################################################ + ############ EEEEEEEEEEEEEEEEEK ################################ + ############### DOES THIS ONLY GRAB ONE RUN?!?!?! ############## + ################################################################ path = path[0] + else: pass assert exists(path), f"Corrmat file not found at {path}" From f39aba7d13015a3f9c40c22eee68cedcba661ab5 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Fri, 14 Jun 2024 15:47:29 -0700 Subject: [PATCH 46/48] update nbsp workflows for hormones x fc paper --- idconn/io.py | 20 +- idconn/nbs.py | 144 +++--- idconn/workflows/nbs_predict-bc.py | 387 ++++++++++++++++ .../workflows/nbs_predict-bc_sensitivity.py | 412 ++++++++++++++++++ idconn/workflows/nbs_predict-e2.py | 210 +++++---- .../workflows/nbs_predict-e2_sensitivity.py | 412 ++++++++++++++++++ .../workflows/nbs_predict-e2bc_sensitivity.py | 412 ++++++++++++++++++ idconn/workflows/nbs_predict-e2xp4-bc.py | 214 +++++---- idconn/workflows/nbs_predict-e2xp4.py | 214 +++++---- idconn/workflows/nbs_predict-p4.py | 206 +++++---- .../workflows/nbs_predict-p4_sensitivity.py | 412 ++++++++++++++++++ .../workflows/nbs_predict-p4bc_sensitivity.py | 412 ++++++++++++++++++ idconn/workflows/nbs_predict.py | 167 ++++--- 13 files changed, 3027 insertions(+), 595 deletions(-) create mode 100644 idconn/workflows/nbs_predict-bc.py create mode 100644 idconn/workflows/nbs_predict-bc_sensitivity.py create mode 100644 idconn/workflows/nbs_predict-e2_sensitivity.py 
create mode 100644 idconn/workflows/nbs_predict-e2bc_sensitivity.py create mode 100644 idconn/workflows/nbs_predict-p4_sensitivity.py create mode 100644 idconn/workflows/nbs_predict-p4bc_sensitivity.py diff --git a/idconn/io.py b/idconn/io.py index 3e1bca9..55ddc81 100644 --- a/idconn/io.py +++ b/idconn/io.py @@ -35,6 +35,7 @@ def calc_fd(confounds): fd = np.sum([delta_x, delta_y, delta_z, delta_alpha, delta_beta, delta_gamma], axis=0) return fd + def build_statsmodel_json( name, task, @@ -131,6 +132,7 @@ def build_statsmodel_json( json.dump(statsmodel, outfile) return statsmodel_json + def atlas_picker(atlas, path, key=None): """Takes in atlas name and path to file, if local, returns nifti-like object (usually file path to downloaded atlas), @@ -190,6 +192,7 @@ def atlas_picker(atlas, path, key=None): return atlas, path + def vectorize_corrmats(matrices, diagonal=False): """Returns the vectorized upper triangles of a 3-dimensional array (i.e., node x node x matrix) of matrices. Output will be a 2-dimensional @@ -249,12 +252,13 @@ def vectorize_corrmats(matrices, diagonal=False): edge_vector = np.asarray(edge_vector) return edge_vector + def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True, verbose=False): """Returns a node x node x (subject x session) matrix of correlation matrices from a BIDS derivative folder. Optionally returns a node^2 x (subject x session) array of vectorized upper triangles of those correlation matrices. - ME @ ME: NEEDS AN OPTION TO KEEP RUNS SEPARATE. CURRENTLY IT AVERAGES CONFOUNDS AND + ME @ ME: NEEDS AN OPTION TO KEEP RUNS SEPARATE. CURRENTLY IT AVERAGES CONFOUNDS AND Parameters ---------- layout : BIDSLayout or str @@ -404,7 +408,7 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True ############### DOES THIS ONLY GRAB ONE RUN?!?!?! 
############## ################################################################ path = path[0] - + else: pass assert exists(path), f"Corrmat file not found at {path}" @@ -426,6 +430,7 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True ppt_df.replace({"": np.nan}, inplace=True) return ppt_df + def undo_vectorize(edges, num_node=None, diagonal=False): """ Puts an edge vector back into an adjacency matrix. @@ -453,17 +458,18 @@ def undo_vectorize(edges, num_node=None, diagonal=False): num_node = int(num_node) X = np.zeros((num_node, num_node)) if diagonal == False: - k=1 + k = 1 if diagonal == True: - k=0 + k = 0 X[np.triu_indices(num_node, k=k)] = edges - diag_X = X[np.diag_indices(num_node,2)] + diag_X = X[np.diag_indices(num_node, 2)] X = X + X.T if diagonal == True: - X[np.diag_indices(num_node,2)] = diag_X - #print('did undo_vectorize work?', np.allclose(X, X.T)) + X[np.diag_indices(num_node, 2)] = diag_X + # print('did undo_vectorize work?', np.allclose(X, X.T)) return X + def plot_edges( adj, atlas_nii, diff --git a/idconn/nbs.py b/idconn/nbs.py index 26ed551..52e9b37 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -8,11 +8,7 @@ # import bct from sklearn.experimental import enable_halving_search_cv -from sklearn.model_selection import ( - RepeatedStratifiedKFold, - RepeatedKFold, - HalvingGridSearchCV -) +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, HalvingGridSearchCV from sklearn.feature_selection import f_regression, f_classif from sklearn.linear_model import LogisticRegression, ElasticNet, LogisticRegressionCV, RidgeCV @@ -39,9 +35,9 @@ def calc_number_of_nodes(matrices): def residualize(X, y=None, confounds=None): - ''' + """ all inputs need to be arrays, not dataframes - ''' + """ # residualize the outcome if confounds is not None: if y is not None: @@ -74,7 +70,9 @@ def residualize(X, y=None, confounds=None): print("Confound matrix wasn't provided, so no confounding was done") -def 
pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=False, permutations=10000): +def pynbs( + matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=False, permutations=10000 +): """ Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided of shape ((subject x session)x node x node) @@ -129,7 +127,6 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict= edges = matrices.copy() # print(edges.shape) - # edges = edges.T # run an ols per edge @@ -145,13 +142,15 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict= # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - sig_matrix = undo_vectorize(sig_edges, num_node=num_node, diagonal=diagonal) # need to write this function + sig_matrix = undo_vectorize( + sig_edges, num_node=num_node, diagonal=diagonal + ) # need to write this function matrix = nx.from_numpy_array(sig_matrix) # use networkX to find connected components S = [matrix.subgraph(c).copy() for c in nx.connected_components(matrix)] S.sort(key=len, reverse=True) - #largest_cc = max(nx.connected_components(matrix), key=len) + # largest_cc = max(nx.connected_components(matrix), key=len) G0 = S[0] # print(G0) @@ -202,7 +201,9 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict= # print(np.sum(perm_edges)) # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - perm_matrix = undo_vectorize(perm_edges, num_node=num_node, diagonal=diagonal) # need to write this function + perm_matrix = undo_vectorize( + perm_edges, num_node=num_node, diagonal=diagonal + ) # need to write this function perm_nx = nx.from_numpy_array(perm_matrix) largest_cc = max(nx.connected_components(perm_nx), key=len) @@ -233,7 +234,17 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict= def kfold_nbs( - matrices, outcome, 
confounds=None, alpha=0.05, groups=None, num_node=None, diagonal=False, scale_x=False, scale_y=False, n_splits=10, n_iterations=10 + matrices, + outcome, + confounds=None, + alpha=0.05, + groups=None, + num_node=None, + diagonal=False, + scale_x=False, + scale_y=False, + n_splits=10, + n_iterations=10, ): """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided of shape ((subject x session)x node x node) @@ -326,38 +337,38 @@ def kfold_nbs( if diagonal == True: k = 0 if diagonal == False: - k=1 + k = 1 upper_tri = np.triu_indices(num_node, k=k) i = 0 manager = enlighten.get_manager() ticks = manager.counter(total=n_splits * n_iterations, desc="Progress", unit="folds") for train_idx, test_idx in cv.split(edges, split_y): - + cv_results.at[i, "split"] = (train_idx, test_idx) # assert len(train_a_idx) == len(train_b_idx) Cs = np.logspace(-4, 4, 10) - #print(len(np.unique(outcome))) + # print(len(np.unique(outcome))) if np.unique(outcome).shape[0] == 2: - #print('binary') + # print('binary') regressor = LogisticRegressionCV( - Cs=Cs, + Cs=Cs, cv=4, - #verbose=2, - max_iter=100000, - penalty="l2", - solver="saga", - n_jobs=4 + # verbose=2, + max_iter=100000, + penalty="l2", + solver="saga", + n_jobs=4, ) - + else: - #print('continuous') + # print('continuous') regressor = RidgeCV( - alphas=Cs, - cv=4, - #n_jobs=4 - ) + alphas=Cs, + cv=4, + # n_jobs=4 + ) train_y = outcome[train_idx] test_y = outcome[test_idx] @@ -392,16 +403,14 @@ def kfold_nbs( y_scaler = Normalizer() train_y = y_scaler.fit_transform(train_y.reshape(-1, 1)) test_y = y_scaler.transform(test_y.reshape(-1, 1)) - - - - # perform NBS wooooooooo # note: output is a dataframe :) # PYNBS SHOULD NOT DO CONFOUND REGRESSION? 
- adj = pynbs(train_edges, train_y, num_node=num_node, diagonal=diagonal, alpha=alpha, predict=True) - #print(adj.shape, adj.ndim, adj[0].shape, upper_tri) + adj = pynbs( + train_edges, train_y, num_node=num_node, diagonal=diagonal, alpha=alpha, predict=True + ) + # print(adj.shape, adj.ndim, adj[0].shape, upper_tri) # cv_results.at[i, 'pval'] = pval cv_results.at[i, "component"] = adj.values @@ -413,7 +422,7 @@ def kfold_nbs( # so you don't have repeated edges # returns (n_edges, ) nbs_vector = adj.values[upper_tri] - #print(nbs_vector.shape) + # print(nbs_vector.shape) # print(nbs_vector.shape) # use those to make a "significant edges" mask mask = nbs_vector == 1.0 @@ -425,31 +434,31 @@ def kfold_nbs( # returns (n_edges, samples) train_features = train_edges.T[mask] test_features = test_edges.T[mask] - #print(mask.shape, np.sum(mask), train_edges.shape, train_features.shape) + # print(mask.shape, np.sum(mask), train_edges.shape, train_features.shape) train_features = train_features.T test_features = test_features.T - - #train_features = scaler.fit_transform(train_features.T) - #test_features = scaler.fit_transform(test_features.T) - #print(train_features.shape, train_y.shape) - #print(f"train_edges:\t{train_edges[:10, 0]}\ntrain_features:\t{train_features[:10, 0]}") + # train_features = scaler.fit_transform(train_features.T) + # test_features = scaler.fit_transform(test_features.T) + # print(train_features.shape, train_y.shape) + + # print(f"train_edges:\t{train_edges[:10, 0]}\ntrain_features:\t{train_features[:10, 0]}") # print(np.ravel(train_y)) # train model predicting outcome from brain (note: no mas covariates) # use grid search bc I want to know how to tune alpha and l1_ratio - - #grid = HalvingGridSearchCV(estimator=regressor, - # param_grid=param_grid, - # n_jobs=8, - # cv=4, + + # grid = HalvingGridSearchCV(estimator=regressor, + # param_grid=param_grid, + # n_jobs=8, + # cv=4, # factor=2, # verbose=0, - # min_resources=20, - # refit=True, + # 
min_resources=20, + # refit=True, # aggressive_elimination=False) model = regressor.fit(X=train_features, y=np.ravel(train_y)) - + cv_results.at[i, "model"] = model # score that model on the testing data @@ -462,18 +471,20 @@ def kfold_nbs( # I go die now if np.unique(outcome).shape[0] == 2: score = model.score(X=test_features, y=np.ravel(test_y)) - + else: predicted_y = model.predict(X=test_features) - score,p = spearmanr(predicted_y, np.ravel(test_y)) - #spearman = spearmanr(predicted_y, np.ravel(test_y)) - + score, p = spearmanr(predicted_y, np.ravel(test_y)) + # spearman = spearmanr(predicted_y, np.ravel(test_y)) + cv_results.at[i, "score"] = score if i % (n_splits * n_iterations / 10) == 0: - mean = cv_results['score'].mean() - sdev = cv_results['score'].std() - print(f'Iteration {i} out of {n_splits * n_iterations}, average score:\t{mean:.2f} +/- {sdev:.2f}') - #print(score) + mean = cv_results["score"].mean() + sdev = cv_results["score"].std() + print( + f"Iteration {i} out of {n_splits * n_iterations}, average score:\t{mean:.2f} +/- {sdev:.2f}" + ) + # print(score) m = 0 param_vector = np.zeros_like(nbs_vector) @@ -489,21 +500,21 @@ def kfold_nbs( else: pass X = undo_vectorize(param_vector, num_node=num_node, diagonal=diagonal) - #cv_results.at[i, "coefficient_matrix"] = X - #cv_results.at[i, "coefficient_vector"] = param_vector + # cv_results.at[i, "coefficient_matrix"] = X + # cv_results.at[i, "coefficient_vector"] = param_vector i += 1 else: pass ticks.update() # calculate weighted average # print(cv_results['score']) - weighted_stack = np.zeros((num_node,num_node)) - fake = np.zeros((num_node,num_node)) + weighted_stack = np.zeros((num_node, num_node)) + fake = np.zeros((num_node, num_node)) # print(weighted_stack.shape) for j in index: # print(cv_results.at[j, 'score']) weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"] - + if np.sum(weighted) == 0 or np.isnan(np.sum(weighted)) == True: weighted_stack = np.dstack([weighted_stack, 
fake]) else: @@ -511,5 +522,8 @@ def kfold_nbs( # print(weighted_stack.shape, weighted.shape) weighted_average = np.mean(weighted_stack, axis=-1) - #model = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] - return weighted_average, cv_results, #model + # model = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + return ( + weighted_average, + cv_results, + ) # model diff --git a/idconn/workflows/nbs_predict-bc.py b/idconn/workflows/nbs_predict-bc.py new file mode 100644 index 0000000..ec3c559 --- /dev/null +++ b/idconn/workflows/nbs_predict-bc.py @@ -0,0 +1,387 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "bc" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +train_layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(train_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = 
dat["adj"].dropna().index +dat = dat.loc[keep] + +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) +groups = dat["bc"] + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +# load in test data +test_layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(test_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + 
weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... 
+# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(nbs_vector.shape, filter.shape) +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? 
+ +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge(solver="saga", alpha=best.alpha_) + train_metrics["alpha"] = best.alpha_ + # train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict + +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + ax=ax, + palette=dark_cmap, +) +# ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample prediction score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +train_metrics["in_sample_mse"] = mse +# print(np.mean(train_features)) +with 
open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + + +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + corr = spearmanr(test_outcome, y_pred) + print(f"\nSpearman correlation between 
predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-bc_sensitivity.py b/idconn/workflows/nbs_predict-bc_sensitivity.py new file mode 100644 index 0000000..813cf66 --- /dev/null +++ b/idconn/workflows/nbs_predict-bc_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "bc" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index +keep = 
dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... 
+# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for 
now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = 
mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + 
(len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2.py b/idconn/workflows/nbs_predict-e2.py index c92d274..a846b5a 100644 --- a/idconn/workflows/nbs_predict-e2.py +++ b/idconn/workflows/nbs_predict-e2.py @@ -58,7 +58,7 @@ outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) -#print(len(np.unique(outcome))) +# print(len(np.unique(outcome))) if CONFOUNDS is not None: confounds = dat[CONFOUNDS] @@ -107,7 +107,7 @@ join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" ) -best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] # this uses the most predictive subnetwork as features in the model # might replace with thresholded weighted_average @@ -119,20 +119,20 @@ # here is where we'd threshold the weighted average to use for elastic-net weighted_average = np.where(weighted_average > 0, weighted_average, 0) -#print(np.sum(weighted_average)) -#nbs_vector = weighted_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) -#filter = np.where(nbs_vector >= p75, True, False) -#print(np.sum(filter)) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) # print(nbs_vector.shape, filter.shape) thresh_average = threshold_proportional(weighted_average, THRESH) nbs_vector2 
= thresh_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) +# p75 = np.percentile(nbs_vector, 75) filter = np.where(nbs_vector2 > 0, True, False) # mask = io.vectorize_corrmats(filter) -edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter] +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: @@ -162,79 +162,71 @@ train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) - # run the model on the whole test dataset to get params # classification if the outcome is binary (for now) # could be extended to the multiclass case? train_metrics = {} if len(np.unique(outcome)) == 2: - model = LogisticRegression( - penalty="l2", - solver="saga", - C=best.C_[0] - ) + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) train_metrics["alpha"] = best.C_[0] - #train_metrics["l1_ratio"] = best.l1_ratio_ + # train_metrics["l1_ratio"] = best.l1_ratio_ else: model = Ridge( - solver="auto", + solver="auto", alpha=best.alpha_, fit_intercept=False, - ) + ) train_metrics["alpha"] = best.alpha_ cv = RepeatedKFold(n_splits=5, n_repeats=10) - #train_metrics["l1_ratio"] = best.l1_ratio_ -#print(params) -#model.set_params(**params) +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict -#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) scores = cross_validate( - model, - train_features, - train_outcome, - groups=groups, + model, + train_features, + train_outcome, + groups=groups, cv=cv, - return_estimator=True, - return_train_score=True - ) -train_metrics["in_sample_test"] = np.mean(scores['test_score']) -train_metrics["in_sample_train"] = np.mean(scores['train_score']) + return_estimator=True, + return_train_score=True, +) 
+train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) -fitted = scores['estimator'][0] +fitted = scores["estimator"][0] y_pred = fitted.predict(X=train_features) train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) -dat[f'{OUTCOME}_pred'] = y_pred -dat[f'{OUTCOME}_scaled'] = train_outcome - -Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] -Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -train_colors = ['#a08ad1', #light - '#685690', #medium - '#3f2d69' #dark - ] -light_cmap = sns.color_palette('dark:#a08ad1') -dark_cmap = sns.color_palette('dark:#685690') - -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - ax=ax, - palette=dark_cmap) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - ax=ax, - palette=light_cmap) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) ax.legend(bbox_to_anchor=(1.0, 0.5)) -fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) mse = 
mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse @@ -248,14 +240,14 @@ json.dump(train_metrics, fp) # yoink the coefficients? for a more parsimonious figure? -#print(fitted.coef_.shape) -#print(fitted.coef_) +# print(fitted.coef_.shape) +# print(fitted.coef_) coeff_vec = np.zeros_like(filter) j = 0 for i in range(0, filter.shape[0]): if filter[i] == True: - #print(j) - #print(fitted.coef_[0, j]) + # print(j) + # print(fitted.coef_[0, j]) coeff_vec[i] = fitted.coef_[0, j] j += 1 else: @@ -345,7 +337,7 @@ # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) test_metrics = {} -#cross_validate(model, ) +# cross_validate(model, ) y_pred = fitted.predict(X=test_features) score = fitted.score(X=test_features, y=np.ravel(test_outcome)) if len(np.unique(test_outcome)) == 2: @@ -360,56 +352,56 @@ print("Out-of-sample mean squared error:\t", mse) # print(np.mean(test_features)) # pred_outcome = fitted.predict(test_features) -test_df[f'{OUTCOME}_scaled'] = test_outcome -test_df[f'{OUTCOME}_pred'] = y_pred -Ys = test_df[[f'{OUTCOME}_scaled', - f'{OUTCOME}_pred', - 'cycle_day', - 'bc']] -Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -Ys['ppts'] = Ys.index.get_level_values(0) - - -light_colors = ['#33ACE3', #Bubbles - '#EA6964', #Blossom - '#4AB62C' #Buttercup - ] -dark_colors = ['#1278a6', - '#a11510', - '#228208'] -light = ListedColormap(light_colors, name='light_powderpuff') -dark = ListedColormap(dark_colors, name='dark_powderpuff') +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] 
+light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") mpl.colormaps.register(cmap=light) mpl.colormaps.register(cmap=dark) -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='light_powderpuff' - ) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='dark_powderpuff') -ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') -fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') - - - -#print(test_outcome, "\n", y_pred) +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) # print(pred_outcome) if len(np.unique(test_outcome)) > 2: - + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) test_metrics["spearman correlation"] = corr with open( diff --git a/idconn/workflows/nbs_predict-e2_sensitivity.py b/idconn/workflows/nbs_predict-e2_sensitivity.py new file mode 100644 index 0000000..13177c7 --- /dev/null +++ b/idconn/workflows/nbs_predict-e2_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import 
numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + 
+weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... 
+# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? 
+train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared 
error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = 
np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2bc_sensitivity.py b/idconn/workflows/nbs_predict-e2bc_sensitivity.py new file mode 100644 index 0000000..8052164 --- /dev/null +++ b/idconn/workflows/nbs_predict-e2bc_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol" +CONFOUNDS = ["framewise_displacement", "bc"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + 
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the 
most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + 
train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) 
+ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = 
test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2xp4-bc.py b/idconn/workflows/nbs_predict-e2xp4-bc.py index ad6a6d8..4b32a85 100644 --- a/idconn/workflows/nbs_predict-e2xp4-bc.py +++ b/idconn/workflows/nbs_predict-e2xp4-bc.py @@ -46,7 +46,7 @@ dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) -dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone'] +dat["estradiol÷progesterone"] = dat["estradiol"] / dat["progesterone"] keep = dat["adj"].dropna().index dat = dat.loc[keep] @@ -60,7 +60,7 @@ outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) -#print(len(np.unique(outcome))) +# print(len(np.unique(outcome))) if CONFOUNDS is not None: confounds = dat[CONFOUNDS] @@ -109,7 +109,7 @@ join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" ) -best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] # this uses the most predictive subnetwork as features in the model # might replace with thresholded weighted_average @@ -121,20 +121,20 @@ # here is where we'd threshold the weighted average to use for elastic-net weighted_average = np.where(weighted_average > 0, weighted_average, 0) -#print(np.sum(weighted_average)) -#nbs_vector = weighted_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) -#filter = np.where(nbs_vector >= p75, True, False) 
-#print(np.sum(filter)) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) # print(nbs_vector.shape, filter.shape) thresh_average = threshold_proportional(weighted_average, THRESH) nbs_vector2 = thresh_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) +# p75 = np.percentile(nbs_vector, 75) filter = np.where(nbs_vector2 > 0, True, False) # mask = io.vectorize_corrmats(filter) -edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter] +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: @@ -164,79 +164,71 @@ train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) - # run the model on the whole test dataset to get params # classification if the outcome is binary (for now) # could be extended to the multiclass case? train_metrics = {} if len(np.unique(outcome)) == 2: - model = LogisticRegression( - penalty="l2", - solver="saga", - C=best.C_[0] - ) + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) train_metrics["alpha"] = best.C_[0] - #train_metrics["l1_ratio"] = best.l1_ratio_ + # train_metrics["l1_ratio"] = best.l1_ratio_ else: model = Ridge( - solver="auto", + solver="auto", alpha=best.alpha_, fit_intercept=False, - ) + ) train_metrics["alpha"] = best.alpha_ cv = RepeatedKFold(n_splits=5, n_repeats=10) - #train_metrics["l1_ratio"] = best.l1_ratio_ -#print(params) -#model.set_params(**params) +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict -#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) scores = cross_validate( - model, - train_features, - train_outcome, - groups=groups, + 
model, + train_features, + train_outcome, + groups=groups, cv=cv, - return_estimator=True, - return_train_score=True - ) -train_metrics["in_sample_test"] = np.mean(scores['test_score']) -train_metrics["in_sample_train"] = np.mean(scores['train_score']) + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) -fitted = scores['estimator'][0] +fitted = scores["estimator"][0] y_pred = fitted.predict(X=train_features) train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) -dat[f'{OUTCOME}_pred'] = y_pred -dat[f'{OUTCOME}_scaled'] = train_outcome - -Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] -Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -train_colors = ['#a08ad1', #light - '#685690', #medium - '#3f2d69' #dark - ] -light_cmap = sns.color_palette('dark:#a08ad1') -dark_cmap = sns.color_palette('dark:#685690') - -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - ax=ax, - palette=dark_cmap) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - ax=ax, - palette=light_cmap) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) 
ax.legend(bbox_to_anchor=(1.0, 0.5)) -fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) mse = mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse @@ -250,14 +242,14 @@ json.dump(train_metrics, fp) # yoink the coefficients? for a more parsimonious figure? -#print(fitted.coef_.shape) -#print(fitted.coef_) +# print(fitted.coef_.shape) +# print(fitted.coef_) coeff_vec = np.zeros_like(filter) j = 0 for i in range(0, filter.shape[0]): if filter[i] == True: - #print(j) - #print(fitted.coef_[0, j]) + # print(j) + # print(fitted.coef_[0, j]) coeff_vec[i] = fitted.coef_[0, j] j += 1 else: @@ -295,7 +287,7 @@ layout = bids.BIDSLayout(TEST_DSET, derivatives=True) test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) -test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone'] +test_df["estradiol÷progesterone"] = test_df["estradiol"] / test_df["progesterone"] keep = test_df[[OUTCOME, "adj"]].dropna().index # print(keep) @@ -348,7 +340,7 @@ # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) test_metrics = {} -#cross_validate(model, ) +# cross_validate(model, ) y_pred = fitted.predict(X=test_features) score = fitted.score(X=test_features, y=np.ravel(test_outcome)) if len(np.unique(test_outcome)) == 2: @@ -363,56 +355,56 @@ print("Out-of-sample mean squared error:\t", mse) # print(np.mean(test_features)) # pred_outcome = fitted.predict(test_features) -test_df[f'{OUTCOME}_scaled'] = test_outcome -test_df[f'{OUTCOME}_pred'] = y_pred -Ys = test_df[[f'{OUTCOME}_scaled', - f'{OUTCOME}_pred', - 'cycle_day', - 'bc']] -Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -Ys['ppts'] = Ys.index.get_level_values(0) - - 
-light_colors = ['#33ACE3', #Bubbles - '#EA6964', #Blossom - '#4AB62C' #Buttercup - ] -dark_colors = ['#1278a6', - '#a11510', - '#228208'] -light = ListedColormap(light_colors, name='light_powderpuff') -dark = ListedColormap(dark_colors, name='dark_powderpuff') +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") mpl.colormaps.register(cmap=light) mpl.colormaps.register(cmap=dark) -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='light_powderpuff' - ) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='dark_powderpuff') -ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') -fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') - - - -#print(test_outcome, "\n", y_pred) +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) 
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) # print(pred_outcome) if len(np.unique(test_outcome)) > 2: - + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) test_metrics["spearman correlation"] = corr with open( diff --git a/idconn/workflows/nbs_predict-e2xp4.py b/idconn/workflows/nbs_predict-e2xp4.py index 022d8b9..fcd6f40 100644 --- a/idconn/workflows/nbs_predict-e2xp4.py +++ b/idconn/workflows/nbs_predict-e2xp4.py @@ -46,7 +46,7 @@ dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) -dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone'] +dat["estradiol÷progesterone"] = dat["estradiol"] / dat["progesterone"] keep = dat["adj"].dropna().index dat = dat.loc[keep] @@ -60,7 +60,7 @@ outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) -#print(len(np.unique(outcome))) +# print(len(np.unique(outcome))) if CONFOUNDS is not None: confounds = dat[CONFOUNDS] @@ -109,7 +109,7 @@ join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" ) -best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] # this uses the most predictive subnetwork as features in the model # might replace with thresholded weighted_average @@ -121,20 +121,20 @@ # here is where we'd threshold the weighted average to use for elastic-net weighted_average = np.where(weighted_average > 0, weighted_average, 0) -#print(np.sum(weighted_average)) -#nbs_vector = weighted_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) -#filter = np.where(nbs_vector >= p75, True, False) -#print(np.sum(filter)) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = 
np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) # print(nbs_vector.shape, filter.shape) thresh_average = threshold_proportional(weighted_average, THRESH) nbs_vector2 = thresh_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) +# p75 = np.percentile(nbs_vector, 75) filter = np.where(nbs_vector2 > 0, True, False) # mask = io.vectorize_corrmats(filter) -edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter] +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: @@ -164,79 +164,71 @@ train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) - # run the model on the whole test dataset to get params # classification if the outcome is binary (for now) # could be extended to the multiclass case? train_metrics = {} if len(np.unique(outcome)) == 2: - model = LogisticRegression( - penalty="l2", - solver="saga", - C=best.C_[0] - ) + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) train_metrics["alpha"] = best.C_[0] - #train_metrics["l1_ratio"] = best.l1_ratio_ + # train_metrics["l1_ratio"] = best.l1_ratio_ else: model = Ridge( - solver="auto", + solver="auto", alpha=best.alpha_, fit_intercept=False, - ) + ) train_metrics["alpha"] = best.alpha_ cv = RepeatedKFold(n_splits=5, n_repeats=10) - #train_metrics["l1_ratio"] = best.l1_ratio_ -#print(params) -#model.set_params(**params) +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict -#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) scores = cross_validate( - model, - train_features, - train_outcome, - groups=groups, + model, + train_features, + train_outcome, + groups=groups, cv=cv, - return_estimator=True, - 
return_train_score=True - ) -train_metrics["in_sample_test"] = np.mean(scores['test_score']) -train_metrics["in_sample_train"] = np.mean(scores['train_score']) + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) -fitted = scores['estimator'][0] +fitted = scores["estimator"][0] y_pred = fitted.predict(X=train_features) train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) -dat[f'{OUTCOME}_pred'] = y_pred -dat[f'{OUTCOME}_scaled'] = train_outcome - -Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] -Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -train_colors = ['#a08ad1', #light - '#685690', #medium - '#3f2d69' #dark - ] -light_cmap = sns.color_palette('dark:#a08ad1') -dark_cmap = sns.color_palette('dark:#685690') - -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - ax=ax, - palette=dark_cmap) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - ax=ax, - palette=light_cmap) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) ax.legend(bbox_to_anchor=(1.0, 0.5)) -fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) mse = mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse @@ -250,14 +242,14 @@ json.dump(train_metrics, fp) # yoink the coefficients? for a more parsimonious figure? -#print(fitted.coef_.shape) -#print(fitted.coef_) +# print(fitted.coef_.shape) +# print(fitted.coef_) coeff_vec = np.zeros_like(filter) j = 0 for i in range(0, filter.shape[0]): if filter[i] == True: - #print(j) - #print(fitted.coef_[0, j]) + # print(j) + # print(fitted.coef_[0, j]) coeff_vec[i] = fitted.coef_[0, j] j += 1 else: @@ -295,7 +287,7 @@ layout = bids.BIDSLayout(TEST_DSET, derivatives=True) test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) -test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone'] +test_df["estradiol÷progesterone"] = test_df["estradiol"] / test_df["progesterone"] keep = test_df[[OUTCOME, "adj"]].dropna().index # print(keep) @@ -348,7 +340,7 @@ # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) test_metrics = {} -#cross_validate(model, ) +# cross_validate(model, ) y_pred = fitted.predict(X=test_features) score = fitted.score(X=test_features, y=np.ravel(test_outcome)) if len(np.unique(test_outcome)) == 2: @@ -363,56 +355,56 @@ print("Out-of-sample mean squared error:\t", mse) # print(np.mean(test_features)) # pred_outcome = fitted.predict(test_features) -test_df[f'{OUTCOME}_scaled'] = test_outcome -test_df[f'{OUTCOME}_pred'] = y_pred -Ys = test_df[[f'{OUTCOME}_scaled', - f'{OUTCOME}_pred', - 'cycle_day', - 'bc']] -Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -Ys['ppts'] = Ys.index.get_level_values(0) - - -light_colors = ['#33ACE3', #Bubbles - '#EA6964', #Blossom - '#4AB62C' #Buttercup - ] 
-dark_colors = ['#1278a6', - '#a11510', - '#228208'] -light = ListedColormap(light_colors, name='light_powderpuff') -dark = ListedColormap(dark_colors, name='dark_powderpuff') +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") mpl.colormaps.register(cmap=light) mpl.colormaps.register(cmap=dark) -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='light_powderpuff' - ) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='dark_powderpuff') -ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') -fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') - - - -#print(test_outcome, "\n", y_pred) +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", 
DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) # print(pred_outcome) if len(np.unique(test_outcome)) > 2: - + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) test_metrics["spearman correlation"] = corr with open( diff --git a/idconn/workflows/nbs_predict-p4.py b/idconn/workflows/nbs_predict-p4.py index 559b4ff..2251179 100644 --- a/idconn/workflows/nbs_predict-p4.py +++ b/idconn/workflows/nbs_predict-p4.py @@ -58,7 +58,7 @@ outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) -#print(len(np.unique(outcome))) +# print(len(np.unique(outcome))) if CONFOUNDS is not None: confounds = dat[CONFOUNDS] @@ -107,7 +107,7 @@ join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" ) -best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] # this uses the most predictive subnetwork as features in the model # might replace with thresholded weighted_average @@ -119,20 +119,20 @@ # here is where we'd threshold the weighted average to use for elastic-net weighted_average = np.where(weighted_average > 0, weighted_average, 0) -#print(np.sum(weighted_average)) -#nbs_vector = weighted_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) -#filter = np.where(nbs_vector >= p75, True, False) -#print(np.sum(filter)) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) # print(nbs_vector.shape, filter.shape) thresh_average = threshold_proportional(weighted_average, THRESH) nbs_vector2 = thresh_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) +# p75 = np.percentile(nbs_vector, 75) filter = np.where(nbs_vector2 > 0, True, False) # mask = io.vectorize_corrmats(filter) -edges_train = 
np.vstack(dat["edge_vector"].dropna().values)[:,filter] +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: @@ -162,79 +162,71 @@ train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) - # run the model on the whole test dataset to get params # classification if the outcome is binary (for now) # could be extended to the multiclass case? train_metrics = {} if len(np.unique(outcome)) == 2: - model = LogisticRegression( - penalty="l2", - solver="saga", - C=best.C_[0] - ) + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) train_metrics["alpha"] = best.C_[0] - #train_metrics["l1_ratio"] = best.l1_ratio_ + # train_metrics["l1_ratio"] = best.l1_ratio_ else: model = Ridge( - solver="auto", + solver="auto", alpha=best.alpha_, fit_intercept=False, - ) + ) train_metrics["alpha"] = best.alpha_ cv = RepeatedKFold(n_splits=5, n_repeats=10) - #train_metrics["l1_ratio"] = best.l1_ratio_ -#print(params) -#model.set_params(**params) +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict -#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) scores = cross_validate( - model, - train_features, - train_outcome, - groups=groups, + model, + train_features, + train_outcome, + groups=groups, cv=cv, - return_estimator=True, - return_train_score=True - ) -train_metrics["in_sample_test"] = np.mean(scores['test_score']) -train_metrics["in_sample_train"] = np.mean(scores['train_score']) + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) -fitted = scores['estimator'][0] +fitted = scores["estimator"][0] y_pred = 
fitted.predict(X=train_features) train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) -dat[f'{OUTCOME}_pred'] = y_pred -dat[f'{OUTCOME}_scaled'] = train_outcome - -Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] -Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -train_colors = ['#a08ad1', #light - '#685690', #medium - '#3f2d69' #dark - ] -light_cmap = sns.color_palette('dark:#a08ad1') -dark_cmap = sns.color_palette('dark:#685690') - -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - ax=ax, - palette=dark_cmap) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - ax=ax, - palette=light_cmap) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) ax.legend(bbox_to_anchor=(1.0, 0.5)) -fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) mse = mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse @@ -248,12 +240,12 @@ json.dump(train_metrics, fp) # yoink the coefficients? for a more parsimonious figure? 
-#print(fitted.coef_.shape) +# print(fitted.coef_.shape) coeff_vec = np.zeros_like(filter) j = 0 for i in range(0, filter.shape[0]): if filter[i] == True: - #print(j) + # print(j) coeff_vec[i] = fitted.coef_[0, j] j += 1 else: @@ -342,7 +334,7 @@ # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) test_metrics = {} -#cross_validate(model, ) +# cross_validate(model, ) y_pred = fitted.predict(X=test_features) score = fitted.score(X=test_features, y=np.ravel(test_outcome)) if len(np.unique(test_outcome)) == 2: @@ -357,56 +349,56 @@ print("Out-of-sample mean squared error:\t", mse) # print(np.mean(test_features)) # pred_outcome = fitted.predict(test_features) -test_df[f'{OUTCOME}_scaled'] = test_outcome -test_df[f'{OUTCOME}_pred'] = y_pred -Ys = test_df[[f'{OUTCOME}_scaled', - f'{OUTCOME}_pred', - 'cycle_day', - 'bc']] -Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -Ys['ppts'] = Ys.index.get_level_values(0) - - -light_colors = ['#33ACE3', #Bubbles - '#EA6964', #Blossom - '#4AB62C' #Buttercup - ] -dark_colors = ['#1278a6', - '#a11510', - '#228208'] -light = ListedColormap(light_colors, name='light_powderpuff') -dark = ListedColormap(dark_colors, name='dark_powderpuff') +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") mpl.colormaps.register(cmap=light) mpl.colormaps.register(cmap=dark) -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - 
data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='light_powderpuff' - ) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='dark_powderpuff') -ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') -fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') - - - -#print(test_outcome, "\n", y_pred) +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) # print(pred_outcome) if len(np.unique(test_outcome)) > 2: - + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) test_metrics["spearman correlation"] = corr with open( diff --git a/idconn/workflows/nbs_predict-p4_sensitivity.py b/idconn/workflows/nbs_predict-p4_sensitivity.py new file mode 100644 index 0000000..449db27 --- /dev/null +++ b/idconn/workflows/nbs_predict-p4_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + 
+ +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "progesterone" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + 
cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... 
+# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? 
+train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared 
error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = 
np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-p4bc_sensitivity.py b/idconn/workflows/nbs_predict-p4bc_sensitivity.py new file mode 100644 index 0000000..8cf6026 --- /dev/null +++ b/idconn/workflows/nbs_predict-p4bc_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "progesterone" +CONFOUNDS = ["framewise_displacement", "bc"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + 
+ +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses 
the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + 
train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) 
+ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = 
test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py index 46e804c..169a5aa 100644 --- a/idconn/workflows/nbs_predict.py +++ b/idconn/workflows/nbs_predict.py @@ -55,7 +55,7 @@ upper_tri = np.triu_indices(num_node, k=1) outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) -groups = dat['bc'] +groups = dat["bc"] if CONFOUNDS is not None: confounds = dat[CONFOUNDS] @@ -85,7 +85,6 @@ edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] - # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: confounds_test = test_df[CONFOUNDS].values @@ -145,7 +144,7 @@ join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" ) -best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] # this uses the most predictive subnetwork as features in the model # might replace with thresholded weighted_average @@ -157,13 +156,13 @@ # here is where we'd threshold the weighted average to use for elastic-net weighted_average = np.where(weighted_average > 0, weighted_average, 0) -#nbs_vector = weighted_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) -#filter = np.where(nbs_vector >= p75, True, False) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) # 
print(nbs_vector.shape, filter.shape) thresh_average = threshold_proportional(weighted_average, THRESH) nbs_vector2 = thresh_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) +# p75 = np.percentile(nbs_vector, 75) filter = np.where(nbs_vector2 > 0, True, False) # mask = io.vectorize_corrmats(filter) @@ -205,61 +204,59 @@ train_metrics = {} if len(np.unique(outcome)) == 2: - model = LogisticRegression( - penalty="l2", - solver="saga", - C=best.C_[0] - ) + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) train_metrics["alpha"] = best.C_[0] - #train_metrics["l1_ratio"] = best.l1_ratio_ + # train_metrics["l1_ratio"] = best.l1_ratio_ else: - model = Ridge( - solver="saga", - alpha=best.alpha_ - ) + model = Ridge(solver="saga", alpha=best.alpha_) train_metrics["alpha"] = best.alpha_ - #train_metrics["l1_ratio"] = best.l1_ratio_ -#print(params) -#model.set_params(**params) + # train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict scores = cross_validate( - model, - train_features, - train_outcome, - groups=groups, + model, + train_features, + train_outcome, + groups=groups, cv=cv, - return_estimator=True, - return_train_score=True - ) -train_metrics["in_sample_test"] = np.mean(scores['test_score']) -train_metrics["in_sample_train"] = np.mean(scores['train_score']) + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) -fitted = scores['estimator'][0] +fitted = scores["estimator"][0] y_pred = fitted.predict(X=train_features) train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) -dat[f'{OUTCOME}_pred'] = y_pred -dat[f'{OUTCOME}_scaled'] = train_outcome - -Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled']] -Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.tsv"), sep='\t') - -train_colors = ['#a08ad1', #light - '#685690', #medium - '#3f2d69' #dark - ] -light_cmap = sns.color_palette('dark:#a08ad1') -dark_cmap = sns.color_palette('dark:#685690') - -fig,ax = plt.subplots() -g = sns.scatterplot(x=f'{OUTCOME}_scaled', - y=f'{OUTCOME}_pred', - #style='bc', - data=Ys, - ax=ax, - palette=dark_cmap) -#ax.legend(bbox_to_anchor=(1.0, 0.5)) -fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + ax=ax, + palette=dark_cmap, +) +# ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) mse = mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse @@ -277,7 +274,7 @@ j = 0 for i in range(0, filter.shape[0]): if filter[i] == True: - #print(j) + # print(j) coeff_vec[i] = fitted.coef_[0, j] j += 1 else: @@ -341,43 +338,43 @@ print("Out-of-sample mean squared error:\t", mse) # print(np.mean(test_features)) # pred_outcome = fitted.predict(test_features) -test_df[f'{OUTCOME}_scaled'] = test_outcome -test_df[f'{OUTCOME}_pred'] = y_pred -Ys = test_df[[f'{OUTCOME}_scaled', - f'{OUTCOME}_pred']] -Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -Ys['ppts'] = Ys.index.get_level_values(0) - - -light_colors = ['#33ACE3', 
#Bubbles - '#EA6964', #Blossom - '#4AB62C' #Buttercup - ] -dark_colors = ['#1278a6', - '#a11510', - '#228208'] -light = ListedColormap(light_colors, name='light_powderpuff') -dark = ListedColormap(dark_colors, name='dark_powderpuff') +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") mpl.colormaps.register(cmap=light) mpl.colormaps.register(cmap=dark) -fig,ax = plt.subplots() -g = sns.scatterplot(x=f'{OUTCOME}_scaled', - y=f'{OUTCOME}_pred', - #style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='light_powderpuff' - ) -ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') -fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') - +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) -#print(test_outcome, "\n", y_pred) +# print(test_outcome, "\n", y_pred) # print(pred_outcome) if len(np.unique(test_outcome)) > 2: corr = spearmanr(test_outcome, y_pred) From bdf7527eb0ac6e7d5b1fc1961f08b927bba32667 Mon Sep 17 00:00:00 2001 From: "Katherine L. 
Bottenhorn" Date: Fri, 14 Jun 2024 15:53:03 -0700 Subject: [PATCH 47/48] misspelled pingouin oops --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 4d7ed83..af040ab 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ "networkx", "matplotlib", # necessary until nilearn includes mpl as a dependency "enlighten", + 'pingouin' ], extras_require={ "doc": [ From 602fd72167e538f7820560c162f8cc6e761bd290 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Fri, 14 Jun 2024 16:01:20 -0700 Subject: [PATCH 48/48] add docstring to test function --- idconn/tests/test_pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/idconn/tests/test_pipeline.py b/idconn/tests/test_pipeline.py index 6d78fae..8322e7f 100644 --- a/idconn/tests/test_pipeline.py +++ b/idconn/tests/test_pipeline.py @@ -2,6 +2,9 @@ def test_idconn_workflow_smoke(): + ''' + this is a docstring bc my tests kept failing and it was annoying + ''' from idconn.pipeline import idconn_workflow # Check that it's a function ¯\_(ツ)_/¯