From f39aba7d13015a3f9c40c22eee68cedcba661ab5 Mon Sep 17 00:00:00 2001 From: "Katherine L. Bottenhorn" Date: Fri, 14 Jun 2024 15:47:29 -0700 Subject: [PATCH] update nbsp workflows for hormones x fc paper --- idconn/io.py | 20 +- idconn/nbs.py | 144 +++--- idconn/workflows/nbs_predict-bc.py | 387 ++++++++++++++++ .../workflows/nbs_predict-bc_sensitivity.py | 412 ++++++++++++++++++ idconn/workflows/nbs_predict-e2.py | 210 +++++---- .../workflows/nbs_predict-e2_sensitivity.py | 412 ++++++++++++++++++ .../workflows/nbs_predict-e2bc_sensitivity.py | 412 ++++++++++++++++++ idconn/workflows/nbs_predict-e2xp4-bc.py | 214 +++++---- idconn/workflows/nbs_predict-e2xp4.py | 214 +++++---- idconn/workflows/nbs_predict-p4.py | 206 +++++---- .../workflows/nbs_predict-p4_sensitivity.py | 412 ++++++++++++++++++ .../workflows/nbs_predict-p4bc_sensitivity.py | 412 ++++++++++++++++++ idconn/workflows/nbs_predict.py | 167 ++++--- 13 files changed, 3027 insertions(+), 595 deletions(-) create mode 100644 idconn/workflows/nbs_predict-bc.py create mode 100644 idconn/workflows/nbs_predict-bc_sensitivity.py create mode 100644 idconn/workflows/nbs_predict-e2_sensitivity.py create mode 100644 idconn/workflows/nbs_predict-e2bc_sensitivity.py create mode 100644 idconn/workflows/nbs_predict-p4_sensitivity.py create mode 100644 idconn/workflows/nbs_predict-p4bc_sensitivity.py diff --git a/idconn/io.py b/idconn/io.py index 3e1bca9..55ddc81 100644 --- a/idconn/io.py +++ b/idconn/io.py @@ -35,6 +35,7 @@ def calc_fd(confounds): fd = np.sum([delta_x, delta_y, delta_z, delta_alpha, delta_beta, delta_gamma], axis=0) return fd + def build_statsmodel_json( name, task, @@ -131,6 +132,7 @@ def build_statsmodel_json( json.dump(statsmodel, outfile) return statsmodel_json + def atlas_picker(atlas, path, key=None): """Takes in atlas name and path to file, if local, returns nifti-like object (usually file path to downloaded atlas), @@ -190,6 +192,7 @@ def atlas_picker(atlas, path, key=None): return atlas, path + def vectorize_corrmats(matrices, diagonal=False): """Returns the vectorized upper triangles of a 3-dimensional array (i.e., node x node x matrix) of matrices. Output will be a 2-dimensional @@ -249,12 +252,13 @@ def vectorize_corrmats(matrices, diagonal=False): edge_vector = np.asarray(edge_vector) return edge_vector + def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True, verbose=False): """Returns a node x node x (subject x session) matrix of correlation matrices from a BIDS derivative folder. Optionally returns a node^2 x (subject x session) array of vectorized upper triangles of those correlation matrices. - ME @ ME: NEEDS AN OPTION TO KEEP RUNS SEPARATE. CURRENTLY IT AVERAGES CONFOUNDS AND + ME @ ME: NEEDS AN OPTION TO KEEP RUNS SEPARATE. CURRENTLY IT AVERAGES CONFOUNDS AND Parameters ---------- layout : BIDSLayout or str @@ -404,7 +408,7 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True ############### DOES THIS ONLY GRAB ONE RUN?!?!?! ############## ################################################################ path = path[0] - + else: pass assert exists(path), f"Corrmat file not found at {path}" @@ -426,6 +430,7 @@ def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True ppt_df.replace({"": np.nan}, inplace=True) return ppt_df + def undo_vectorize(edges, num_node=None, diagonal=False): """ Puts an edge vector back into an adjacency matrix. 
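# A minimal sketch (not part of this patch) of the round trip that
# vectorize_corrmats and undo_vectorize are meant to satisfy, assuming a
# small symmetric matrix:
#
#   import numpy as np
#   rng = np.random.default_rng(0)
#   a = rng.random((4, 4))
#   sym = (a + a.T) / 2                        # toy 4-node connectivity matrix
#   edges = sym[np.triu_indices(4, k=1)]       # what vectorize_corrmats keeps
#   rebuilt = undo_vectorize(edges, num_node=4, diagonal=False)
#   assert np.allclose(rebuilt[np.triu_indices(4, k=1)], edges)
#   assert np.allclose(rebuilt, rebuilt.T)     # symmetrized on the way back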
@@ -453,17 +458,18 @@ def undo_vectorize(edges, num_node=None, diagonal=False): num_node = int(num_node) X = np.zeros((num_node, num_node)) if diagonal == False: - k=1 + k = 1 if diagonal == True: - k=0 + k = 0 X[np.triu_indices(num_node, k=k)] = edges - diag_X = X[np.diag_indices(num_node,2)] + diag_X = X[np.diag_indices(num_node, 2)] X = X + X.T if diagonal == True: - X[np.diag_indices(num_node,2)] = diag_X - #print('did undo_vectorize work?', np.allclose(X, X.T)) + X[np.diag_indices(num_node, 2)] = diag_X + # print('did undo_vectorize work?', np.allclose(X, X.T)) return X + def plot_edges( adj, atlas_nii, diff --git a/idconn/nbs.py b/idconn/nbs.py index 26ed551..52e9b37 100644 --- a/idconn/nbs.py +++ b/idconn/nbs.py @@ -8,11 +8,7 @@ # import bct from sklearn.experimental import enable_halving_search_cv -from sklearn.model_selection import ( - RepeatedStratifiedKFold, - RepeatedKFold, - HalvingGridSearchCV -) +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, HalvingGridSearchCV from sklearn.feature_selection import f_regression, f_classif from sklearn.linear_model import LogisticRegression, ElasticNet, LogisticRegressionCV, RidgeCV @@ -39,9 +35,9 @@ def calc_number_of_nodes(matrices): def residualize(X, y=None, confounds=None): - ''' + """ all inputs need to be arrays, not dataframes - ''' + """ # residualize the outcome if confounds is not None: if y is not None: @@ -74,7 +70,9 @@ def residualize(X, y=None, confounds=None): print("Confound matrix wasn't provided, so no confounding was done") -def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=False, permutations=10000): +def pynbs( + matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict=False, permutations=10000 +): """ Calculates the Network Based Statistic (Zalesky et al., 2011) on connectivity matrices provided of shape ((subject x session)x node x node) @@ -129,7 +127,6 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict= edges = matrices.copy() # print(edges.shape) - # edges = edges.T # run an ols per edge @@ -145,13 +142,15 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict= # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - sig_matrix = undo_vectorize(sig_edges, num_node=num_node, diagonal=diagonal) # need to write this function + sig_matrix = undo_vectorize( + sig_edges, num_node=num_node, diagonal=diagonal + ) # need to write this function matrix = nx.from_numpy_array(sig_matrix) # use networkX to find connected components S = [matrix.subgraph(c).copy() for c in nx.connected_components(matrix)] S.sort(key=len, reverse=True) - #largest_cc = max(nx.connected_components(matrix), key=len) + # largest_cc = max(nx.connected_components(matrix), key=len) G0 = S[0] # print(G0) @@ -202,7 +201,9 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict= # print(np.sum(perm_edges)) # find largest connected component of sig_edges # turn sig_edges into an nxn matrix first - perm_matrix = undo_vectorize(perm_edges, num_node=num_node, diagonal=diagonal) # need to write this function + perm_matrix = undo_vectorize( + perm_edges, num_node=num_node, diagonal=diagonal + ) # need to write this function perm_nx = nx.from_numpy_array(perm_matrix) largest_cc = max(nx.connected_components(perm_nx), key=len) @@ -233,7 +234,17 @@ def pynbs(matrices, outcome, num_node=None, diagonal=False, alpha=0.05, predict= def kfold_nbs( - matrices, outcome, 
confounds=None, alpha=0.05, groups=None, num_node=None, diagonal=False, scale_x=False, scale_y=False, n_splits=10, n_iterations=10 + matrices, + outcome, + confounds=None, + alpha=0.05, + groups=None, + num_node=None, + diagonal=False, + scale_x=False, + scale_y=False, + n_splits=10, + n_iterations=10, ): """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided of shape ((subject x session)x node x node) @@ -326,38 +337,38 @@ def kfold_nbs( if diagonal == True: k = 0 if diagonal == False: - k=1 + k = 1 upper_tri = np.triu_indices(num_node, k=k) i = 0 manager = enlighten.get_manager() ticks = manager.counter(total=n_splits * n_iterations, desc="Progress", unit="folds") for train_idx, test_idx in cv.split(edges, split_y): - + cv_results.at[i, "split"] = (train_idx, test_idx) # assert len(train_a_idx) == len(train_b_idx) Cs = np.logspace(-4, 4, 10) - #print(len(np.unique(outcome))) + # print(len(np.unique(outcome))) if np.unique(outcome).shape[0] == 2: - #print('binary') + # print('binary') regressor = LogisticRegressionCV( - Cs=Cs, + Cs=Cs, cv=4, - #verbose=2, - max_iter=100000, - penalty="l2", - solver="saga", - n_jobs=4 + # verbose=2, + max_iter=100000, + penalty="l2", + solver="saga", + n_jobs=4, ) - + else: - #print('continuous') + # print('continuous') regressor = RidgeCV( - alphas=Cs, - cv=4, - #n_jobs=4 - ) + alphas=Cs, + cv=4, + # n_jobs=4 + ) train_y = outcome[train_idx] test_y = outcome[test_idx] @@ -392,16 +403,14 @@ def kfold_nbs( y_scaler = Normalizer() train_y = y_scaler.fit_transform(train_y.reshape(-1, 1)) test_y = y_scaler.transform(test_y.reshape(-1, 1)) - - - - # perform NBS wooooooooo # note: output is a dataframe :) # PYNBS SHOULD NOT DO CONFOUND REGRESSION? - adj = pynbs(train_edges, train_y, num_node=num_node, diagonal=diagonal, alpha=alpha, predict=True) - #print(adj.shape, adj.ndim, adj[0].shape, upper_tri) + adj = pynbs( + train_edges, train_y, num_node=num_node, diagonal=diagonal, alpha=alpha, predict=True + ) + # print(adj.shape, adj.ndim, adj[0].shape, upper_tri) # cv_results.at[i, 'pval'] = pval cv_results.at[i, "component"] = adj.values @@ -413,7 +422,7 @@ def kfold_nbs( # so you don't have repeated edges # returns (n_edges, ) nbs_vector = adj.values[upper_tri] - #print(nbs_vector.shape) + # print(nbs_vector.shape) # print(nbs_vector.shape) # use those to make a "significant edges" mask mask = nbs_vector == 1.0 @@ -425,31 +434,31 @@ def kfold_nbs( # returns (n_edges, samples) train_features = train_edges.T[mask] test_features = test_edges.T[mask] - #print(mask.shape, np.sum(mask), train_edges.shape, train_features.shape) + # print(mask.shape, np.sum(mask), train_edges.shape, train_features.shape) train_features = train_features.T test_features = test_features.T - - #train_features = scaler.fit_transform(train_features.T) - #test_features = scaler.fit_transform(test_features.T) - #print(train_features.shape, train_y.shape) - #print(f"train_edges:\t{train_edges[:10, 0]}\ntrain_features:\t{train_features[:10, 0]}") + # train_features = scaler.fit_transform(train_features.T) + # test_features = scaler.fit_transform(test_features.T) + # print(train_features.shape, train_y.shape) + + # print(f"train_edges:\t{train_edges[:10, 0]}\ntrain_features:\t{train_features[:10, 0]}") # print(np.ravel(train_y)) # train model predicting outcome from brain (note: no mas covariates) # use grid search bc I want to know how to tune alpha and l1_ratio - - #grid = HalvingGridSearchCV(estimator=regressor, - # param_grid=param_grid, - # n_jobs=8, 
- # cv=4, + + # grid = HalvingGridSearchCV(estimator=regressor, + # param_grid=param_grid, + # n_jobs=8, + # cv=4, # factor=2, # verbose=0, - # min_resources=20, - # refit=True, + # min_resources=20, + # refit=True, # aggressive_elimination=False) model = regressor.fit(X=train_features, y=np.ravel(train_y)) - + cv_results.at[i, "model"] = model # score that model on the testing data @@ -462,18 +471,20 @@ def kfold_nbs( # I go die now if np.unique(outcome).shape[0] == 2: score = model.score(X=test_features, y=np.ravel(test_y)) - + else: predicted_y = model.predict(X=test_features) - score,p = spearmanr(predicted_y, np.ravel(test_y)) - #spearman = spearmanr(predicted_y, np.ravel(test_y)) - + score, p = spearmanr(predicted_y, np.ravel(test_y)) + # spearman = spearmanr(predicted_y, np.ravel(test_y)) + cv_results.at[i, "score"] = score if i % (n_splits * n_iterations / 10) == 0: - mean = cv_results['score'].mean() - sdev = cv_results['score'].std() - print(f'Iteration {i} out of {n_splits * n_iterations}, average score:\t{mean:.2f} +/- {sdev:.2f}') - #print(score) + mean = cv_results["score"].mean() + sdev = cv_results["score"].std() + print( + f"Iteration {i} out of {n_splits * n_iterations}, average score:\t{mean:.2f} +/- {sdev:.2f}" + ) + # print(score) m = 0 param_vector = np.zeros_like(nbs_vector) @@ -489,21 +500,21 @@ def kfold_nbs( else: pass X = undo_vectorize(param_vector, num_node=num_node, diagonal=diagonal) - #cv_results.at[i, "coefficient_matrix"] = X - #cv_results.at[i, "coefficient_vector"] = param_vector + # cv_results.at[i, "coefficient_matrix"] = X + # cv_results.at[i, "coefficient_vector"] = param_vector i += 1 else: pass ticks.update() # calculate weighted average # print(cv_results['score']) - weighted_stack = np.zeros((num_node,num_node)) - fake = np.zeros((num_node,num_node)) + weighted_stack = np.zeros((num_node, num_node)) + fake = np.zeros((num_node, num_node)) # print(weighted_stack.shape) for j in index: # print(cv_results.at[j, 'score']) weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"] - + if np.sum(weighted) == 0 or np.isnan(np.sum(weighted)) == True: weighted_stack = np.dstack([weighted_stack, fake]) else: @@ -511,5 +522,8 @@ def kfold_nbs( # print(weighted_stack.shape, weighted.shape) weighted_average = np.mean(weighted_stack, axis=-1) - #model = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] - return weighted_average, cv_results, #model + # model = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + return ( + weighted_average, + cv_results, + ) # model diff --git a/idconn/workflows/nbs_predict-bc.py b/idconn/workflows/nbs_predict-bc.py new file mode 100644 index 0000000..ec3c559 --- /dev/null +++ b/idconn/workflows/nbs_predict-bc.py @@ -0,0 +1,387 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = 
datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "bc" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +train_layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(train_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) +groups = dat["bc"] + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +# load in test data +test_layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(test_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", 
ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(nbs_vector.shape, filter.shape) +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? 
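# A possible multiclass extension of the branch below (a sketch, not in this
# patch; with solver="saga", scikit-learn fits a multinomial model by default):
#
#   model = LogisticRegression(penalty="l2", solver="saga",
#                              multi_class="multinomial", C=best.C_[0])
#
# scoring would then be multiclass accuracy rather than binary accuracy.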
+ +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge(solver="saga", alpha=best.alpha_) + train_metrics["alpha"] = best.alpha_ + # train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict + +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + ax=ax, + palette=dark_cmap, +) +# ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample prediction score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +train_metrics["in_sample_mse"] = mse +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + + +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. 
with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? +# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + corr = spearmanr(test_outcome, y_pred) + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-bc_sensitivity.py b/idconn/workflows/nbs_predict-bc_sensitivity.py new file mode 100644 index 0000000..813cf66 --- /dev/null +++ b/idconn/workflows/nbs_predict-bc_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + 
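# This sensitivity variant is meant to exclude mid-cycle sessions (cycle day
# strictly between 11 and 17) before modeling. A minimal sketch of that
# exclusion, assuming `cycle_day` is a column of the dataframe returned by
# io.read_corrmats:
#
#   drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index
#   dat = dat.drop(index=drop)
#
# note that the script computes `drop` below but never applies it; the second
# line is the piece that would actually remove those sessions.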
+warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "bc" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... 
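# A sketch of the ElasticNet option weighed in this comment block
# (hypothetical, not what the script ends up doing):
#
#   from sklearn.linear_model import ElasticNetCV
#   enet = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9], cv=5)
#   enet.fit(train_features, np.ravel(train_outcome))
#
# which would shrink correlated edges jointly instead of hard-filtering them.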
+# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? 
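# Aside (a sketch, not part of this patch): RepeatedKFold ignores the `groups`
# argument that cross_validate passes through below. If sessions from one
# group must never span train and test folds, a group-aware splitter would be
# needed, e.g.
#
#   from sklearn.model_selection import GroupKFold
#   cv = GroupKFold(n_splits=2)   # n_splits must not exceed the group count
#   scores = cross_validate(model, train_features, train_outcome,
#                           groups=groups, cv=cv, return_estimator=True)
#
# whether grouping on the binary `bc` indicator is the right design is left
# open here.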
+train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? 
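# Caveat for the loop below: `filter` is boolean, so np.zeros_like(filter)
# allocates a boolean vector, and the float coefficients assigned into it are
# coerced to True/False. A dtype-safe sketch of the same back-projection:
#
#   coeff_vec = np.zeros(filter.shape, dtype=float)
#   coeff_vec[filter] = fitted.coef_.ravel()   # same mapping, vectorized
#   coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)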
+# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
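# Spelling out the two score meanings noted above (a sketch; this is what
# fitted.score computes internally for each estimator type):
#
#   from sklearn.metrics import accuracy_score, r2_score
#   y_hat = fitted.predict(test_features)
#   if len(np.unique(test_outcome)) == 2:
#       score = accuracy_score(np.ravel(test_outcome), y_hat)   # classifier
#   else:
#       score = r2_score(np.ravel(test_outcome), y_hat)         # regressor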
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2.py b/idconn/workflows/nbs_predict-e2.py index c92d274..a846b5a 100644 --- a/idconn/workflows/nbs_predict-e2.py +++ b/idconn/workflows/nbs_predict-e2.py @@ -58,7 +58,7 @@ outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) -#print(len(np.unique(outcome))) +# print(len(np.unique(outcome))) if CONFOUNDS is not None: confounds = dat[CONFOUNDS] @@ -107,7 +107,7 @@ join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" ) -best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] # this uses the most predictive subnetwork as features in the model # might replace with thresholded weighted_average @@ -119,20 +119,20 @@ # here is where we'd threshold the weighted average to use for elastic-net weighted_average = np.where(weighted_average > 0, weighted_average, 0) -#print(np.sum(weighted_average)) -#nbs_vector = weighted_average[upper_tri] 
-#p75 = np.percentile(nbs_vector, 75) -#filter = np.where(nbs_vector >= p75, True, False) -#print(np.sum(filter)) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) # print(nbs_vector.shape, filter.shape) thresh_average = threshold_proportional(weighted_average, THRESH) nbs_vector2 = thresh_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) +# p75 = np.percentile(nbs_vector, 75) filter = np.where(nbs_vector2 > 0, True, False) # mask = io.vectorize_corrmats(filter) -edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter] +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: @@ -162,79 +162,71 @@ train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) - # run the model on the whole test dataset to get params # classification if the outcome is binary (for now) # could be extended to the multiclass case? train_metrics = {} if len(np.unique(outcome)) == 2: - model = LogisticRegression( - penalty="l2", - solver="saga", - C=best.C_[0] - ) + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) train_metrics["alpha"] = best.C_[0] - #train_metrics["l1_ratio"] = best.l1_ratio_ + # train_metrics["l1_ratio"] = best.l1_ratio_ else: model = Ridge( - solver="auto", + solver="auto", alpha=best.alpha_, fit_intercept=False, - ) + ) train_metrics["alpha"] = best.alpha_ cv = RepeatedKFold(n_splits=5, n_repeats=10) - #train_metrics["l1_ratio"] = best.l1_ratio_ -#print(params) -#model.set_params(**params) +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict -#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) scores = cross_validate( - model, - train_features, - train_outcome, - groups=groups, + model, + train_features, + train_outcome, + groups=groups, cv=cv, - return_estimator=True, - return_train_score=True - ) -train_metrics["in_sample_test"] = np.mean(scores['test_score']) -train_metrics["in_sample_train"] = np.mean(scores['train_score']) + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) -fitted = scores['estimator'][0] +fitted = scores["estimator"][0] y_pred = fitted.predict(X=train_features) train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) -dat[f'{OUTCOME}_pred'] = y_pred -dat[f'{OUTCOME}_scaled'] = train_outcome - -Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] -Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -train_colors = ['#a08ad1', #light - '#685690', #medium - '#3f2d69' #dark - ] -light_cmap = sns.color_palette('dark:#a08ad1') -dark_cmap = sns.color_palette('dark:#685690') - -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - ax=ax, - palette=dark_cmap) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - ax=ax, - palette=light_cmap) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + 
join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) ax.legend(bbox_to_anchor=(1.0, 0.5)) -fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) mse = mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse @@ -248,14 +240,14 @@ json.dump(train_metrics, fp) # yoink the coefficients? for a more parsimonious figure? -#print(fitted.coef_.shape) -#print(fitted.coef_) +# print(fitted.coef_.shape) +# print(fitted.coef_) coeff_vec = np.zeros_like(filter) j = 0 for i in range(0, filter.shape[0]): if filter[i] == True: - #print(j) - #print(fitted.coef_[0, j]) + # print(j) + # print(fitted.coef_[0, j]) coeff_vec[i] = fitted.coef_[0, j] j += 1 else: @@ -345,7 +337,7 @@ # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) test_metrics = {} -#cross_validate(model, ) +# cross_validate(model, ) y_pred = fitted.predict(X=test_features) score = fitted.score(X=test_features, y=np.ravel(test_outcome)) if len(np.unique(test_outcome)) == 2: @@ -360,56 +352,56 @@ print("Out-of-sample mean squared error:\t", mse) # print(np.mean(test_features)) # pred_outcome = fitted.predict(test_features) -test_df[f'{OUTCOME}_scaled'] = test_outcome -test_df[f'{OUTCOME}_pred'] = y_pred -Ys = test_df[[f'{OUTCOME}_scaled', - f'{OUTCOME}_pred', - 'cycle_day', - 'bc']] -Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -Ys['ppts'] = Ys.index.get_level_values(0) - - -light_colors = ['#33ACE3', #Bubbles - '#EA6964', #Blossom - '#4AB62C' #Buttercup - ] -dark_colors = ['#1278a6', - '#a11510', - '#228208'] -light = ListedColormap(light_colors, name='light_powderpuff') -dark = ListedColormap(dark_colors, name='dark_powderpuff') +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") mpl.colormaps.register(cmap=light) mpl.colormaps.register(cmap=dark) -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='light_powderpuff' - ) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='dark_powderpuff') -ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') -fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') - - - -#print(test_outcome, "\n", y_pred) +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) # print(pred_outcome) if len(np.unique(test_outcome)) > 2: - + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) test_metrics["spearman correlation"] = corr with open( diff --git a/idconn/workflows/nbs_predict-e2_sensitivity.py b/idconn/workflows/nbs_predict-e2_sensitivity.py new file mode 100644 index 0000000..13177c7 --- /dev/null +++ b/idconn/workflows/nbs_predict-e2_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + 
cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? 
+train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? 
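# One way to get the "more parsimonious figure" mused about here (a sketch;
# `top_nodes` is hypothetical, not used below): collapse the signed
# coefficient matrix to per-node strength and keep only the strongest nodes,
#
#   strength = np.abs(coef_mat).sum(axis=0)       # per-node contribution
#   top_nodes = np.argsort(strength)[::-1][:10]   # ten strongest nodes
#
# assuming coef_mat is the num_node x num_node matrix rebuilt below.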
+# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
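# Note on the commented-out refit just below: calling fitted.fit on the test
# set would re-estimate coefficients on held-out data and stop being an
# out-of-sample test. The leak-free version, which the script does use, keeps
# the estimator frozen (a sketch):
#
#   y_pred = fitted.predict(X=test_features)                    # no refitting
#   score = fitted.score(X=test_features, y=np.ravel(test_outcome))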
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2bc_sensitivity.py b/idconn/workflows/nbs_predict-e2bc_sensitivity.py new file mode 100644 index 0000000..8052164 --- /dev/null +++ b/idconn/workflows/nbs_predict-e2bc_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str 
= strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "estradiol"
+CONFOUNDS = ["framewise_displacement", "bc"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+# sensitivity analysis: exclude sessions with cycle_day strictly between 11 and
+# 17 (the peri-ovulatory window), then keep only sessions with usable matrices
+drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index
+dat = dat.drop(index=drop)
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+# print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+# nibabel infers the image format from the filename, so include an extension
+nib.save(
+    nimg,
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.nii.gz"),
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+# keep only the strongest THRESH proportion of edges, then turn the surviving
+# upper triangle into a boolean feature mask
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# residualize the training set if confounds were specified
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:
+        # residualize any non-binary outcome, including ordinal outcomes with
+        # only a few unique values
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+# rerun the selected model on the full training dataset to get final parameters
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
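+# On the multiclass question above, a sketch (not part of this workflow):
+# scikit-learn's LogisticRegression handles more than two classes natively via
+# a multinomial loss, so the binary branch below would mostly carry over, e.g.
+#     model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
+# with model.score() still returning accuracy; only the binary check on
+# np.unique(outcome) would need to become a general categorical test.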
+train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? 
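+# A sketch (assumption, not the original logic): scores["estimator"] above
+# holds one fitted model per CV fold, so indexing [0] is an arbitrary choice;
+# selecting the fold with the best held-out score is one alternative:
+#     fitted = scores["estimator"][int(np.argmax(scores["test_score"]))]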
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
+# allocate a float vector explicitly; np.zeros_like(filter) would inherit
+# filter's boolean dtype and truncate every coefficient to 0 or 1
+coeff_vec = np.zeros(filter.shape, dtype=float)
+j = 0
+for i in range(0, filter.shape[0]):
+    if filter[i]:
+        coeff_vec[i] = fitted.coef_[0, j]
+        j += 1
+
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+# nibabel infers the image format from the filename, so include an extension
+nib.save(
+    nimg,
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.nii.gz"),
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# residualize the test set if confounds were specified
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:
+        # residualize any non-binary outcome, including ordinal outcomes with
+        # only a few unique values
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale only after residualizing
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e., with a binary outcome,
+# then score is prediction accuracy;
+# if the model is a linear regression, i.e., with a continuous outcome,
+# then the score is R^2 (coefficient of determination)
+
+# evaluate the model fitted on the training set against the held-out dataset
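+# Note: the transform-only calls above deliberately reuse the scalers fit on
+# the training set, so no test-set statistics leak into the features or the
+# outcome scaling.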
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2xp4-bc.py b/idconn/workflows/nbs_predict-e2xp4-bc.py index ad6a6d8..4b32a85 100644 --- a/idconn/workflows/nbs_predict-e2xp4-bc.py +++ b/idconn/workflows/nbs_predict-e2xp4-bc.py @@ -46,7 +46,7 @@ dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) -dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone'] +dat["estradiol÷progesterone"] = dat["estradiol"] / dat["progesterone"] keep = dat["adj"].dropna().index dat = dat.loc[keep] @@ -60,7 +60,7 @@ outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) -#print(len(np.unique(outcome))) +# print(len(np.unique(outcome))) if CONFOUNDS is not None: confounds = dat[CONFOUNDS] @@ -109,7 +109,7 @@ join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" ) -best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] # this uses the most predictive 
subnetwork as features in the model # might replace with thresholded weighted_average @@ -121,20 +121,20 @@ # here is where we'd threshold the weighted average to use for elastic-net weighted_average = np.where(weighted_average > 0, weighted_average, 0) -#print(np.sum(weighted_average)) -#nbs_vector = weighted_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) -#filter = np.where(nbs_vector >= p75, True, False) -#print(np.sum(filter)) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) # print(nbs_vector.shape, filter.shape) thresh_average = threshold_proportional(weighted_average, THRESH) nbs_vector2 = thresh_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) +# p75 = np.percentile(nbs_vector, 75) filter = np.where(nbs_vector2 > 0, True, False) # mask = io.vectorize_corrmats(filter) -edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter] +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: @@ -164,79 +164,71 @@ train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) - # run the model on the whole test dataset to get params # classification if the outcome is binary (for now) # could be extended to the multiclass case? train_metrics = {} if len(np.unique(outcome)) == 2: - model = LogisticRegression( - penalty="l2", - solver="saga", - C=best.C_[0] - ) + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) train_metrics["alpha"] = best.C_[0] - #train_metrics["l1_ratio"] = best.l1_ratio_ + # train_metrics["l1_ratio"] = best.l1_ratio_ else: model = Ridge( - solver="auto", + solver="auto", alpha=best.alpha_, fit_intercept=False, - ) + ) train_metrics["alpha"] = best.alpha_ cv = RepeatedKFold(n_splits=5, n_repeats=10) - #train_metrics["l1_ratio"] = best.l1_ratio_ -#print(params) -#model.set_params(**params) +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict -#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) scores = cross_validate( - model, - train_features, - train_outcome, - groups=groups, + model, + train_features, + train_outcome, + groups=groups, cv=cv, - return_estimator=True, - return_train_score=True - ) -train_metrics["in_sample_test"] = np.mean(scores['test_score']) -train_metrics["in_sample_train"] = np.mean(scores['train_score']) + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) -fitted = scores['estimator'][0] +fitted = scores["estimator"][0] y_pred = fitted.predict(X=train_features) train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) -dat[f'{OUTCOME}_pred'] = y_pred -dat[f'{OUTCOME}_scaled'] = train_outcome - -Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] -Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -train_colors = ['#a08ad1', #light - '#685690', #medium - '#3f2d69' #dark - ] -light_cmap = sns.color_palette('dark:#a08ad1') -dark_cmap = sns.color_palette('dark:#685690') - -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - 
style='bc', - data=Ys, - ax=ax, - palette=dark_cmap) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - ax=ax, - palette=light_cmap) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) ax.legend(bbox_to_anchor=(1.0, 0.5)) -fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) mse = mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse @@ -250,14 +242,14 @@ json.dump(train_metrics, fp) # yoink the coefficients? for a more parsimonious figure? -#print(fitted.coef_.shape) -#print(fitted.coef_) +# print(fitted.coef_.shape) +# print(fitted.coef_) coeff_vec = np.zeros_like(filter) j = 0 for i in range(0, filter.shape[0]): if filter[i] == True: - #print(j) - #print(fitted.coef_[0, j]) + # print(j) + # print(fitted.coef_[0, j]) coeff_vec[i] = fitted.coef_[0, j] j += 1 else: @@ -295,7 +287,7 @@ layout = bids.BIDSLayout(TEST_DSET, derivatives=True) test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) -test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone'] +test_df["estradiol÷progesterone"] = test_df["estradiol"] / test_df["progesterone"] keep = test_df[[OUTCOME, "adj"]].dropna().index # print(keep) @@ -348,7 +340,7 @@ # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) test_metrics = {} -#cross_validate(model, ) +# cross_validate(model, ) y_pred = fitted.predict(X=test_features) score = fitted.score(X=test_features, y=np.ravel(test_outcome)) if len(np.unique(test_outcome)) == 2: @@ -363,56 +355,56 @@ print("Out-of-sample mean squared error:\t", mse) # print(np.mean(test_features)) # pred_outcome = fitted.predict(test_features) -test_df[f'{OUTCOME}_scaled'] = test_outcome -test_df[f'{OUTCOME}_pred'] = y_pred -Ys = test_df[[f'{OUTCOME}_scaled', - f'{OUTCOME}_pred', - 'cycle_day', - 'bc']] -Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -Ys['ppts'] = Ys.index.get_level_values(0) - - -light_colors = ['#33ACE3', #Bubbles - '#EA6964', #Blossom - '#4AB62C' #Buttercup - ] -dark_colors = ['#1278a6', - '#a11510', - '#228208'] -light = ListedColormap(light_colors, name='light_powderpuff') -dark = ListedColormap(dark_colors, name='dark_powderpuff') +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light 
= ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") mpl.colormaps.register(cmap=light) mpl.colormaps.register(cmap=dark) -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='light_powderpuff' - ) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='dark_powderpuff') -ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') -fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') - - - -#print(test_outcome, "\n", y_pred) +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) # print(pred_outcome) if len(np.unique(test_outcome)) > 2: - + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) test_metrics["spearman correlation"] = corr with open( diff --git a/idconn/workflows/nbs_predict-e2xp4.py b/idconn/workflows/nbs_predict-e2xp4.py index 022d8b9..fcd6f40 100644 --- a/idconn/workflows/nbs_predict-e2xp4.py +++ b/idconn/workflows/nbs_predict-e2xp4.py @@ -46,7 +46,7 @@ dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) -dat['estradiol÷progesterone'] = dat['estradiol'] / dat['progesterone'] +dat["estradiol÷progesterone"] = dat["estradiol"] / dat["progesterone"] keep = dat["adj"].dropna().index dat = dat.loc[keep] @@ -60,7 +60,7 @@ outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) -#print(len(np.unique(outcome))) +# print(len(np.unique(outcome))) if CONFOUNDS is not None: confounds = dat[CONFOUNDS] @@ -109,7 +109,7 @@ join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" ) -best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] # this uses the most predictive subnetwork as features in the model # might replace with thresholded weighted_average @@ -121,20 +121,20 @@ # here is where we'd threshold the weighted average to use for elastic-net weighted_average = np.where(weighted_average > 0, weighted_average, 0) -#print(np.sum(weighted_average)) -#nbs_vector = weighted_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) -#filter = np.where(nbs_vector >= p75, True, False) -#print(np.sum(filter)) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) # print(nbs_vector.shape, filter.shape) thresh_average = threshold_proportional(weighted_average, THRESH) nbs_vector2 = thresh_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) +# p75 = 
np.percentile(nbs_vector, 75) filter = np.where(nbs_vector2 > 0, True, False) # mask = io.vectorize_corrmats(filter) -edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter] +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: @@ -164,79 +164,71 @@ train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) - # run the model on the whole test dataset to get params # classification if the outcome is binary (for now) # could be extended to the multiclass case? train_metrics = {} if len(np.unique(outcome)) == 2: - model = LogisticRegression( - penalty="l2", - solver="saga", - C=best.C_[0] - ) + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) train_metrics["alpha"] = best.C_[0] - #train_metrics["l1_ratio"] = best.l1_ratio_ + # train_metrics["l1_ratio"] = best.l1_ratio_ else: model = Ridge( - solver="auto", + solver="auto", alpha=best.alpha_, fit_intercept=False, - ) + ) train_metrics["alpha"] = best.alpha_ cv = RepeatedKFold(n_splits=5, n_repeats=10) - #train_metrics["l1_ratio"] = best.l1_ratio_ -#print(params) -#model.set_params(**params) +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict -#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) scores = cross_validate( - model, - train_features, - train_outcome, - groups=groups, + model, + train_features, + train_outcome, + groups=groups, cv=cv, - return_estimator=True, - return_train_score=True - ) -train_metrics["in_sample_test"] = np.mean(scores['test_score']) -train_metrics["in_sample_train"] = np.mean(scores['train_score']) + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) -fitted = scores['estimator'][0] +fitted = scores["estimator"][0] y_pred = fitted.predict(X=train_features) train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) -dat[f'{OUTCOME}_pred'] = y_pred -dat[f'{OUTCOME}_scaled'] = train_outcome - -Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] -Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -train_colors = ['#a08ad1', #light - '#685690', #medium - '#3f2d69' #dark - ] -light_cmap = sns.color_palette('dark:#a08ad1') -dark_cmap = sns.color_palette('dark:#685690') - -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - ax=ax, - palette=dark_cmap) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - ax=ax, - palette=light_cmap) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, 
palette=light_cmap +) ax.legend(bbox_to_anchor=(1.0, 0.5)) -fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) mse = mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse @@ -250,14 +242,14 @@ json.dump(train_metrics, fp) # yoink the coefficients? for a more parsimonious figure? -#print(fitted.coef_.shape) -#print(fitted.coef_) +# print(fitted.coef_.shape) +# print(fitted.coef_) coeff_vec = np.zeros_like(filter) j = 0 for i in range(0, filter.shape[0]): if filter[i] == True: - #print(j) - #print(fitted.coef_[0, j]) + # print(j) + # print(fitted.coef_[0, j]) coeff_vec[i] = fitted.coef_[0, j] j += 1 else: @@ -295,7 +287,7 @@ layout = bids.BIDSLayout(TEST_DSET, derivatives=True) test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) -test_df['estradiol÷progesterone'] = test_df['estradiol'] / test_df['progesterone'] +test_df["estradiol÷progesterone"] = test_df["estradiol"] / test_df["progesterone"] keep = test_df[[OUTCOME, "adj"]].dropna().index # print(keep) @@ -348,7 +340,7 @@ # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) test_metrics = {} -#cross_validate(model, ) +# cross_validate(model, ) y_pred = fitted.predict(X=test_features) score = fitted.score(X=test_features, y=np.ravel(test_outcome)) if len(np.unique(test_outcome)) == 2: @@ -363,56 +355,56 @@ print("Out-of-sample mean squared error:\t", mse) # print(np.mean(test_features)) # pred_outcome = fitted.predict(test_features) -test_df[f'{OUTCOME}_scaled'] = test_outcome -test_df[f'{OUTCOME}_pred'] = y_pred -Ys = test_df[[f'{OUTCOME}_scaled', - f'{OUTCOME}_pred', - 'cycle_day', - 'bc']] -Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -Ys['ppts'] = Ys.index.get_level_values(0) - - -light_colors = ['#33ACE3', #Bubbles - '#EA6964', #Blossom - '#4AB62C' #Buttercup - ] -dark_colors = ['#1278a6', - '#a11510', - '#228208'] -light = ListedColormap(light_colors, name='light_powderpuff') -dark = ListedColormap(dark_colors, name='dark_powderpuff') +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") mpl.colormaps.register(cmap=light) mpl.colormaps.register(cmap=dark) -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='light_powderpuff' - ) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='dark_powderpuff') -ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') -fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') - - - -#print(test_outcome, 
"\n", y_pred) +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) # print(pred_outcome) if len(np.unique(test_outcome)) > 2: - + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) test_metrics["spearman correlation"] = corr with open( diff --git a/idconn/workflows/nbs_predict-p4.py b/idconn/workflows/nbs_predict-p4.py index 559b4ff..2251179 100644 --- a/idconn/workflows/nbs_predict-p4.py +++ b/idconn/workflows/nbs_predict-p4.py @@ -58,7 +58,7 @@ outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) -#print(len(np.unique(outcome))) +# print(len(np.unique(outcome))) if CONFOUNDS is not None: confounds = dat[CONFOUNDS] @@ -107,7 +107,7 @@ join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" ) -best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model'] +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] # this uses the most predictive subnetwork as features in the model # might replace with thresholded weighted_average @@ -119,20 +119,20 @@ # here is where we'd threshold the weighted average to use for elastic-net weighted_average = np.where(weighted_average > 0, weighted_average, 0) -#print(np.sum(weighted_average)) -#nbs_vector = weighted_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) -#filter = np.where(nbs_vector >= p75, True, False) -#print(np.sum(filter)) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) # print(nbs_vector.shape, filter.shape) thresh_average = threshold_proportional(weighted_average, THRESH) nbs_vector2 = thresh_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) +# p75 = np.percentile(nbs_vector, 75) filter = np.where(nbs_vector2 > 0, True, False) # mask = io.vectorize_corrmats(filter) -edges_train = np.vstack(dat["edge_vector"].dropna().values)[:,filter] +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE if CONFOUNDS is not None: @@ -162,79 +162,71 @@ train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) - # run the model on the whole test dataset to get params # classification if the outcome is binary (for now) # could be extended to the multiclass case? 
train_metrics = {} if len(np.unique(outcome)) == 2: - model = LogisticRegression( - penalty="l2", - solver="saga", - C=best.C_[0] - ) + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) train_metrics["alpha"] = best.C_[0] - #train_metrics["l1_ratio"] = best.l1_ratio_ + # train_metrics["l1_ratio"] = best.l1_ratio_ else: model = Ridge( - solver="auto", + solver="auto", alpha=best.alpha_, fit_intercept=False, - ) + ) train_metrics["alpha"] = best.alpha_ cv = RepeatedKFold(n_splits=5, n_repeats=10) - #train_metrics["l1_ratio"] = best.l1_ratio_ -#print(params) -#model.set_params(**params) +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict -#fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) scores = cross_validate( - model, - train_features, - train_outcome, - groups=groups, + model, + train_features, + train_outcome, + groups=groups, cv=cv, - return_estimator=True, - return_train_score=True - ) -train_metrics["in_sample_test"] = np.mean(scores['test_score']) -train_metrics["in_sample_train"] = np.mean(scores['train_score']) + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) -fitted = scores['estimator'][0] +fitted = scores["estimator"][0] y_pred = fitted.predict(X=train_features) train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) -dat[f'{OUTCOME}_pred'] = y_pred -dat[f'{OUTCOME}_scaled'] = train_outcome - -Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled', 'bc', 'cycle_day']] -Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -train_colors = ['#a08ad1', #light - '#685690', #medium - '#3f2d69' #dark - ] -light_cmap = sns.color_palette('dark:#a08ad1') -dark_cmap = sns.color_palette('dark:#685690') - -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - ax=ax, - palette=dark_cmap) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - ax=ax, - palette=light_cmap) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) ax.legend(bbox_to_anchor=(1.0, 0.5)) -fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) mse = mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse @@ -248,12 +240,12 @@ json.dump(train_metrics, fp) # yoink the coefficients? for a more parsimonious figure? 
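+# NOTE: indexing fitted.coef_[0, j] below assumes coef_ has shape
+# (1, n_selected_features); this holds for LogisticRegression and for Ridge
+# fit on a column-vector y, which is how train_outcome is shaped above.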
-#print(fitted.coef_.shape) +# print(fitted.coef_.shape) coeff_vec = np.zeros_like(filter) j = 0 for i in range(0, filter.shape[0]): if filter[i] == True: - #print(j) + # print(j) coeff_vec[i] = fitted.coef_[0, j] j += 1 else: @@ -342,7 +334,7 @@ # score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) test_metrics = {} -#cross_validate(model, ) +# cross_validate(model, ) y_pred = fitted.predict(X=test_features) score = fitted.score(X=test_features, y=np.ravel(test_outcome)) if len(np.unique(test_outcome)) == 2: @@ -357,56 +349,56 @@ print("Out-of-sample mean squared error:\t", mse) # print(np.mean(test_features)) # pred_outcome = fitted.predict(test_features) -test_df[f'{OUTCOME}_scaled'] = test_outcome -test_df[f'{OUTCOME}_pred'] = y_pred -Ys = test_df[[f'{OUTCOME}_scaled', - f'{OUTCOME}_pred', - 'cycle_day', - 'bc']] -Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -Ys['ppts'] = Ys.index.get_level_values(0) - - -light_colors = ['#33ACE3', #Bubbles - '#EA6964', #Blossom - '#4AB62C' #Buttercup - ] -dark_colors = ['#1278a6', - '#a11510', - '#228208'] -light = ListedColormap(light_colors, name='light_powderpuff') -dark = ListedColormap(dark_colors, name='dark_powderpuff') +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") mpl.colormaps.register(cmap=light) mpl.colormaps.register(cmap=dark) -fig,ax = plt.subplots() -g = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_pred', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='light_powderpuff' - ) -h = sns.scatterplot(x='cycle_day', - y=f'{OUTCOME}_scaled', - style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='dark_powderpuff') -ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') -fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') - - - -#print(test_outcome, "\n", y_pred) +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) # print(pred_outcome) if len(np.unique(test_outcome)) > 2: - + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) test_metrics["spearman correlation"] = corr with open( diff --git a/idconn/workflows/nbs_predict-p4_sensitivity.py b/idconn/workflows/nbs_predict-p4_sensitivity.py new file mode 100644 index 
0000000..449db27
--- /dev/null
+++ b/idconn/workflows/nbs_predict-p4_sensitivity.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+import pandas as pd
+import numpy as np
+import nibabel as nib
+import seaborn as sns
+import bids
+import matplotlib.pyplot as plt
+from os.path import join
+from datetime import datetime
+from time import strftime
+from scipy.stats import spearmanr
+from idconn import nbs, io
+
+from bct import threshold_proportional
+
+
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate
+from sklearn.preprocessing import Normalizer, StandardScaler
+from sklearn.metrics import mean_squared_error
+from matplotlib.colors import ListedColormap
+import matplotlib as mpl
+
+
+import warnings
+import json
+
+warnings.simplefilter("ignore")
+
+today = datetime.today()
+today_str = strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "progesterone"
+CONFOUNDS = ["framewise_displacement"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+# sensitivity analysis: exclude sessions with cycle_day strictly between 11 and
+# 17 (the peri-ovulatory window), then keep only sessions with usable matrices
+drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index
+dat = dat.drop(index=drop)
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+# print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+# nibabel infers the image format from the filename, so include an extension
+nib.save(
+    nimg,
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.nii.gz"),
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity...
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+# keep only the strongest THRESH proportion of edges, then turn the surviving
+# upper triangle into a boolean feature mask
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# residualize the training set if confounds were specified
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:
+        # residualize any non-binary outcome, including ordinal outcomes with
+        # only a few unique values
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+# rerun the selected model on the full training dataset to get final parameters
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case?
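+# Note, with a sketch (assumption, not the original design): RepeatedKFold
+# below ignores the `groups` argument passed to cross_validate, so folds need
+# not respect the bc grouping; if that matters,
+#     from sklearn.model_selection import GroupKFold
+#     cv = GroupKFold(n_splits=5)
+# would enforce group-disjoint folds (at the cost of losing the repeats).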
+train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? 
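+# A sketch (hypothetical addition, not in the original workflow): the refit
+# estimator could be persisted alongside the metrics for later reuse, e.g.
+#     from joblib import dump
+#     dump(fitted, join(TRAIN_DSET, "derivatives", DERIV_NAME,
+#                       f"{base_name}_model-{today_str}.joblib"))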
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
+# allocate a float vector explicitly; np.zeros_like(filter) would inherit
+# filter's boolean dtype and truncate every coefficient to 0 or 1
+coeff_vec = np.zeros(filter.shape, dtype=float)
+j = 0
+for i in range(0, filter.shape[0]):
+    if filter[i]:
+        coeff_vec[i] = fitted.coef_[0, j]
+        j += 1
+
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+# nibabel infers the image format from the filename, so include an extension
+nib.save(
+    nimg,
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.nii.gz"),
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# residualize the test set if confounds were specified
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:
+        # residualize any non-binary outcome, including ordinal outcomes with
+        # only a few unique values
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale only after residualizing
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e., with a binary outcome,
+# then score is prediction accuracy;
+# if the model is a linear regression, i.e., with a continuous outcome,
+# then the score is R^2 (coefficient of determination)
+
+# evaluate the model fitted on the training set against the held-out dataset
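+# A sketch (hypothetical addition): once y_pred exists below, a simple pairs
+# bootstrap could attach a confidence interval to the out-of-sample Spearman
+# rho (assumes scipy >= 1.9 for the .statistic attribute):
+#     rng = np.random.default_rng(0)
+#     idx = rng.integers(0, len(y_pred), size=(1000, len(y_pred)))
+#     rhos = [spearmanr(np.ravel(test_outcome)[i], y_pred[i]).statistic for i in idx]
+#     ci = np.percentile(rhos, [2.5, 97.5])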
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-p4bc_sensitivity.py b/idconn/workflows/nbs_predict-p4bc_sensitivity.py new file mode 100644 index 0000000..8cf6026 --- /dev/null +++ b/idconn/workflows/nbs_predict-p4bc_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str 
= strftime("%m_%d_%Y")
+
+TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674"
+TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset"
+DERIV_NAME = "IDConn"
+OUTCOME = "progesterone"
+CONFOUNDS = ["framewise_displacement", "bc"]
+TASK = "rest"
+ATLAS = "craddock2012"
+THRESH = 0.5
+alpha = 0.01
+atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz"
+
+
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+# sensitivity analysis: exclude scans acquired on cycle days 12-16
+drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index
+dat = dat.drop(drop)
+keep = dat["adj"].dropna().index
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+# print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg,
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.nii.gz"),  # nibabel needs a valid extension
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+
+# this uses the most predictive subnetwork as features in the model
+# might replace with thresholded weighted_average
+# or use _all_ the edges in weighted_average with KRR or ElasticNet...
+# ORRR use thresholded weighted average edges with ElasticNet...
+# - stays true to NBS-Predict
+# - increases parsimony while handling multicollinearity... 
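+# A hedged sketch of that last option (not executed here): keep the edges that
+# survive proportional thresholding and let ElasticNet shrink the rest.
+# ElasticNetCV is sklearn's cross-validated ElasticNet; every other name is
+# already defined above in this script.
+# from sklearn.linear_model import ElasticNetCV
+# enet_filter = threshold_proportional(weighted_average, THRESH)[upper_tri] > 0
+# enet_features = np.vstack(dat["edge_vector"].dropna().values)[:, enet_filter]
+# enet = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9], cv=5)
+# enet.fit(enet_features, np.ravel(outcome))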
+# either way, I don't think cv_results is necessary
+
+# here is where we'd threshold the weighted average to use for elastic-net
+weighted_average = np.where(weighted_average > 0, weighted_average, 0)
+# print(np.sum(weighted_average))
+# nbs_vector = weighted_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+# filter = np.where(nbs_vector >= p75, True, False)
+# print(np.sum(filter))
+# print(nbs_vector.shape, filter.shape)
+
+thresh_average = threshold_proportional(weighted_average, THRESH)
+nbs_vector2 = thresh_average[upper_tri]
+# p75 = np.percentile(nbs_vector, 75)
+filter = np.where(nbs_vector2 > 0, True, False)
+
+# mask = io.vectorize_corrmats(filter)
+edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_train = dat[CONFOUNDS].values
+    outcome_train = np.reshape(outcome, (outcome.shape[0],))
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_train)) <= 2:
+        resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train)
+        train_outcome = outcome
+    else:  # any non-binary outcome is treated as continuous
+        train_outcome, resid_edges = nbs.residualize(
+            X=edges_train, y=outcome_train, confounds=confounds_train
+        )
+    train_features = resid_edges
+else:
+    train_features = edges_train
+    train_outcome = outcome
+
+x_scaler = StandardScaler()
+y_scaler = StandardScaler()
+train_features = x_scaler.fit_transform(train_features)
+if len(np.unique(train_outcome)) <= 2:
+    pass
+else:
+    train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1))
+
+
+# fit the model on the whole training dataset to get its parameters
+
+# classification if the outcome is binary (for now)
+# could be extended to the multiclass case? 
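+# Hedged sketch of that multiclass extension (not used here): a multinomial
+# logistic regression would be the natural drop-in, but deciding that an
+# outcome with >2 unique values is categorical rather than continuous would
+# need an explicit flag (`outcome_is_categorical` below is hypothetical):
+# if outcome_is_categorical and len(np.unique(train_outcome)) > 2:
+#     model = LogisticRegression(penalty="l2", solver="saga", multi_class="multinomial")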
+train_metrics = {}
+if len(np.unique(outcome)) == 2:
+    model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0])
+    train_metrics["alpha"] = best.C_[0]  # note: C is the inverse of the regularization strength
+    # train_metrics["l1_ratio"] = best.l1_ratio_
+else:
+    model = Ridge(
+        solver="auto",
+        alpha=best.alpha_,
+        fit_intercept=False,
+    )
+    train_metrics["alpha"] = best.alpha_
+
+# NOTE: RepeatedKFold ignores `groups`; a group-aware splitter would be needed
+# to keep the bc groups intact across folds
+cv = RepeatedKFold(n_splits=5, n_repeats=10)
+
+# train_metrics["l1_ratio"] = best.l1_ratio_
+# print(params)
+# model.set_params(**params)
+# train the final model on the full training set, using NBS-Predict feature selection
+# fitted = model.fit(X=train_features, y=np.ravel(train_outcome))
+scores = cross_validate(
+    model,
+    train_features,
+    train_outcome,
+    groups=groups,
+    cv=cv,
+    return_estimator=True,
+    return_train_score=True,
+)
+train_metrics["in_sample_test"] = np.mean(scores["test_score"])
+train_metrics["in_sample_train"] = np.mean(scores["train_score"])
+
+fitted = scores["estimator"][0]
+y_pred = fitted.predict(X=train_features)
+train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome)[0]  # plain float for JSON
+
+dat[f"{OUTCOME}_pred"] = y_pred
+dat[f"{OUTCOME}_scaled"] = train_outcome
+
+Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]]
+Ys.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+train_colors = ["#a08ad1", "#685690", "#3f2d69"]  # light # medium # dark
+light_cmap = sns.color_palette("dark:#a08ad1")
+dark_cmap = sns.color_palette("dark:#685690")
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap
+)
+h = sns.scatterplot(
+    x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure? 
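+# The loop below reads coefficients from `fitted`, which is only the estimator
+# from the first CV split; a hedged alternative for a more stable coefficient
+# figure would average across all of the fitted estimators (not done here):
+# mean_coef = np.mean([est.coef_ for est in scores["estimator"]], axis=0)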
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
+coeff_vec = np.zeros_like(filter, dtype=float)  # filter is boolean; without dtype the betas would be cast to True/False
+j = 0
+for i in range(0, filter.shape[0]):
+    if filter[i] == True:
+        # print(j)
+        # print(fitted.coef_[0, j])
+        coeff_vec[i] = fitted.coef_[0, j]
+        j += 1
+    else:
+        pass
+
+# print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg,
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.nii.gz"),  # nibabel needs a valid extension
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = test_df.loc[keep]
+
+outcome_test = test_df[OUTCOME].values
+# print(test_df)
+
+# print(outcome_test)
+matrices_test = np.vstack(test_df["adj"].dropna().values).reshape(
+    (len(test_df["adj"].dropna().index), num_node, num_node)
+)
+edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
+
+# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
+if CONFOUNDS is not None:
+    confounds_test = test_df[CONFOUNDS].values
+
+    # regress out the confounds from each edge and the outcome variable,
+    # use the residuals for the rest of the algorithm
+    # print(confounds.shape, outcome.shape)
+    if len(np.unique(outcome_test)) <= 2:
+        resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test)
+        test_outcome = outcome_test
+    else:  # any non-binary outcome is treated as continuous
+        test_outcome, resid_edges = nbs.residualize(
+            X=edges_test, y=outcome_test, confounds=confounds_test
+        )
+    test_features = resid_edges
+else:
+    test_features = edges_test
+    test_outcome = outcome_test
+
+# scale after residualizing omg
+test_features = x_scaler.transform(test_features)
+if len(np.unique(test_outcome)) <= 2:
+    pass
+else:
+    test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1))
+# print(test_features.shape)
+# if the model is a logistic regression, i.e. with a binary outcome
+# then score is prediction accuracy
+# if the model is a linear regression, i.e., with a continuous outcome
+# then the score is R^2 (coefficient of determination)
+
+# fit trained ElasticNet, initialized via warm_start
+# prob in CV? 
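+# The block below scores the already-trained estimator on the held-out dataset
+# as-is. If a warm-start refit were wanted instead, a hedged sketch might be
+# (LogisticRegression supports `warm_start`; Ridge does not):
+# from sklearn.base import clone
+# refit = clone(fitted).set_params(warm_start=True)
+# refit.fit(test_features, np.ravel(test_outcome))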
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome))
+# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome))
+test_metrics = {}
+
+# cross_validate(model, )
+y_pred = fitted.predict(X=test_features)
+score = fitted.score(X=test_features, y=np.ravel(test_outcome))
+if len(np.unique(test_outcome)) == 2:
+    test_metrics["accuracy"] = score
+else:
+    test_metrics["coefficient of determination"] = score
+corr = spearmanr(test_outcome, y_pred)
+test_metrics["pred_v_actual_corr"] = corr[0]  # store a plain float so json.dump below can serialize it
+mse = mean_squared_error(test_outcome, y_pred)
+test_metrics["mean squared error"] = mse
+print("Out-of-sample prediction score:\t", score)
+print("Out-of-sample mean squared error:\t", mse)
+# print(np.mean(test_features))
+# pred_outcome = fitted.predict(test_features)
+test_df[f"{OUTCOME}_scaled"] = test_outcome
+test_df[f"{OUTCOME}_pred"] = y_pred
+Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]]
+Ys.to_csv(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t"
+)
+
+Ys["ppts"] = Ys.index.get_level_values(0)
+
+
+light_colors = ["#33ACE3", "#EA6964", "#4AB62C"]  # Bubbles # Blossom # Buttercup
+dark_colors = ["#1278a6", "#a11510", "#228208"]
+light = ListedColormap(light_colors, name="light_powderpuff")
+dark = ListedColormap(dark_colors, name="dark_powderpuff")
+mpl.colormaps.register(cmap=light)
+mpl.colormaps.register(cmap=dark)
+
+fig, ax = plt.subplots()
+g = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_pred",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="light_powderpuff",
+)
+h = sns.scatterplot(
+    x="cycle_day",
+    y=f"{OUTCOME}_scaled",
+    style="bc",
+    data=Ys,
+    hue="ppts",
+    hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"],
+    ax=ax,
+    palette="dark_powderpuff",
+)
+ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
+fig.savefig(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+
+# print(test_outcome, "\n", y_pred)
+# print(pred_outcome)
+if len(np.unique(test_outcome)) > 2:
+
+    print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr)
+    test_metrics["spearman correlation"] = corr[0]
+with open(
+    join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(test_metrics, fp)
+np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred)
diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py
index 46e804c..169a5aa 100644
--- a/idconn/workflows/nbs_predict.py
+++ b/idconn/workflows/nbs_predict.py
@@ -55,7 +55,7 @@
 upper_tri = np.triu_indices(num_node, k=1)
 
 outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
 
-groups = dat['bc']
+groups = dat["bc"]
 
 if CONFOUNDS is not None:
     confounds = dat[CONFOUNDS]
@@ -85,7 +85,6 @@
 edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter]
 
-
 # NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE
 if CONFOUNDS is not None:
     confounds_test = test_df[CONFOUNDS].values
@@ -145,7 +144,7 @@
     join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
 )
 
-best = cv_results.sort_values(by='score', ascending=False).iloc[0]['model']
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
 
 # this uses the most predictive subnetwork as features in the model
 # might replace with thresholded weighted_average
@@ -157,13 +156,13 @@
 # here is 
where we'd threshold the weighted average to use for elastic-net weighted_average = np.where(weighted_average > 0, weighted_average, 0) -#nbs_vector = weighted_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) -#filter = np.where(nbs_vector >= p75, True, False) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) # print(nbs_vector.shape, filter.shape) thresh_average = threshold_proportional(weighted_average, THRESH) nbs_vector2 = thresh_average[upper_tri] -#p75 = np.percentile(nbs_vector, 75) +# p75 = np.percentile(nbs_vector, 75) filter = np.where(nbs_vector2 > 0, True, False) # mask = io.vectorize_corrmats(filter) @@ -205,61 +204,59 @@ train_metrics = {} if len(np.unique(outcome)) == 2: - model = LogisticRegression( - penalty="l2", - solver="saga", - C=best.C_[0] - ) + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) train_metrics["alpha"] = best.C_[0] - #train_metrics["l1_ratio"] = best.l1_ratio_ + # train_metrics["l1_ratio"] = best.l1_ratio_ else: - model = Ridge( - solver="saga", - alpha=best.alpha_ - ) + model = Ridge(solver="saga", alpha=best.alpha_) train_metrics["alpha"] = best.alpha_ - #train_metrics["l1_ratio"] = best.l1_ratio_ -#print(params) -#model.set_params(**params) + # train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) # train ElasticNet on full train dataset, using feature extraction from NBS-Predict scores = cross_validate( - model, - train_features, - train_outcome, - groups=groups, + model, + train_features, + train_outcome, + groups=groups, cv=cv, - return_estimator=True, - return_train_score=True - ) -train_metrics["in_sample_test"] = np.mean(scores['test_score']) -train_metrics["in_sample_train"] = np.mean(scores['train_score']) + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) -fitted = scores['estimator'][0] +fitted = scores["estimator"][0] y_pred = fitted.predict(X=train_features) train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) -dat[f'{OUTCOME}_pred'] = y_pred -dat[f'{OUTCOME}_scaled'] = train_outcome - -Ys = dat[[f'{OUTCOME}_pred', f'{OUTCOME}_scaled']] -Ys.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -train_colors = ['#a08ad1', #light - '#685690', #medium - '#3f2d69' #dark - ] -light_cmap = sns.color_palette('dark:#a08ad1') -dark_cmap = sns.color_palette('dark:#685690') - -fig,ax = plt.subplots() -g = sns.scatterplot(x=f'{OUTCOME}_scaled', - y=f'{OUTCOME}_pred', - #style='bc', - data=Ys, - ax=ax, - palette=dark_cmap) -#ax.legend(bbox_to_anchor=(1.0, 0.5)) -fig.savefig(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + ax=ax, + palette=dark_cmap, +) +# ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + 
join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) mse = mean_squared_error(train_outcome, y_pred) train_metrics["mean squared error"] = mse @@ -277,7 +274,7 @@ j = 0 for i in range(0, filter.shape[0]): if filter[i] == True: - #print(j) + # print(j) coeff_vec[i] = fitted.coef_[0, j] j += 1 else: @@ -341,43 +338,43 @@ print("Out-of-sample mean squared error:\t", mse) # print(np.mean(test_features)) # pred_outcome = fitted.predict(test_features) -test_df[f'{OUTCOME}_scaled'] = test_outcome -test_df[f'{OUTCOME}_pred'] = y_pred -Ys = test_df[[f'{OUTCOME}_scaled', - f'{OUTCOME}_pred']] -Ys.to_csv(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep='\t') - -Ys['ppts'] = Ys.index.get_level_values(0) - - -light_colors = ['#33ACE3', #Bubbles - '#EA6964', #Blossom - '#4AB62C' #Buttercup - ] -dark_colors = ['#1278a6', - '#a11510', - '#228208'] -light = ListedColormap(light_colors, name='light_powderpuff') -dark = ListedColormap(dark_colors, name='dark_powderpuff') +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") mpl.colormaps.register(cmap=light) mpl.colormaps.register(cmap=dark) -fig,ax = plt.subplots() -g = sns.scatterplot(x=f'{OUTCOME}_scaled', - y=f'{OUTCOME}_pred', - #style='bc', - data=Ys, - hue='ppts', - hue_order=['sub-Bubbles', 'sub-Blossom', 'sub-Buttercup'], - ax=ax, - palette='light_powderpuff' - ) -ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left') -fig.savefig(join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), dpi=400, bbox_inches='tight') - +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) -#print(test_outcome, "\n", y_pred) +# print(test_outcome, "\n", y_pred) # print(pred_outcome) if len(np.unique(test_outcome)) > 2: corr = spearmanr(test_outcome, y_pred)