diff --git a/.gitignore b/.gitignore index 33bdf27..ce30a02 100644 --- a/.gitignore +++ b/.gitignore @@ -32,7 +32,6 @@ idconn/networking/task-graph-theory-fci.py idconn/networking/task-graph-theory-local-nodal.py idconn/networking/task-graph-theory-local.py idconn/networking/task-graph-theory-nodal.py -idconn/io.py docs/_build/ docs/generated/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..11ce204 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,125 @@ +# Contributing to IDConn + +Welcome to the ``IDConn`` repository! +We're excited you're here and want to contribute. + +These guidelines are designed to make it as easy as possible to get involved. +If you have any questions that aren't discussed below, please let us know by opening an [issue][link_issues]! + +Before you start you'll need to set up a free [GitHub][link_github] account and sign in. +Here are some [instructions][link_signupinstructions]. + +## Governance + +Governance is a hugely important part of any project. +It is especially important to have clear process and communication channels for open source projects that rely on a distributed network of volunteers, such as ``IDConn``. + +``IDConn`` is currently supported by a small group of core developers. +Even with only a couple of individuals involved in decision making processes, we've found that setting expectations and communicating a shared vision has great value. + +By starting the governance structure early in our development, we hope to welcome more people into the contributing team. +We are committed to continuing to update the governance structures as necessary. +Every member of the ``IDConn`` community is encouraged to comment on these processes and suggest improvements. + +As the first project leader, Katie Bottenhorn is ultimately responsible for any major decisions pertaining to ``IDConn`` development. 
+However, all potential changes are explicitly and openly discussed in the described channels of communication, and we strive for consensus amongst all community members. + +## Code of conduct + +All ``IDConn`` community members are expected to follow our [code of conduct](https://github.com/62442katieb/IDConn/blob/main/CODE_OF_CONDUCT.md) during any interaction with the project. +That includes- but is not limited to- online conversations, in-person workshops or development sprints, and when giving talks about the software. + +As stated in the code, severe or repeated violations by community members may result in exclusion from collective decision-making and rejection of future contributions to the ``IDConn`` project. + +## Asking questions about using IDConn + +Please direct usage-related questions to [NeuroStars][link_neurostars], with [the "Software Support" category and the "IDConn" tag][link_neurostars_IDConn]. +The ``IDConn`` developers follow NeuroStars, and will be able to answer your question there. + +## Labels + +The current list of labels are [here][link_labels] and include: + +* [![Good First Issue](https://img.shields.io/badge/-good%20first%20issue-7057ff.svg)](https://github.com/62442katieb/IDConn/labels/good%20first%20issue) +*These issues contain a task that a member of the team has determined should require minimal knowledge of the existing codebase, and should be good for people new to the project.* +If you are interested in contributing to IDConn, but aren't sure where to start, we encourage you to take a look at these issues in particular. + +* [![Help Wanted](https://img.shields.io/badge/-help%20wanted-33aa3f.svg)](https://github.com/62442katieb/IDConn/labels/help%20wanted) +*These issues contain a task that a member of the team has determined we need additional help with.* +If you feel that you can contribute to one of these issues, we especially encourage you to do so! 
+ +* [![Bug](https://img.shields.io/badge/-bug-ee0701.svg)](https://github.com/62442katieb/IDConn/labels/bug) +*These issues point to problems in the project.* +If you find new a bug, please give as much detail as possible in your issue, including steps to recreate the error. +If you experience the same bug as one already listed, please add any additional information that you have as a comment. + +* [![Enhancement](https://img.shields.io/badge/-enhancement-84b6eb.svg)](https://github.com/62442katieb/IDConn/labels/enhancement) +*These issues are asking for new features to be added to the project.* +Please try to make sure that your requested feature is distinct from any others that have already been requested or implemented. +If you find one that's similar but there are subtle differences please reference the other request in your issue. + +## Making a change + +We appreciate all contributions to IDConn, but those accepted fastest will follow a workflow similar to the following: + +**1. Comment on an existing issue or open a new issue referencing your addition.** + +This allows other members of the IDConn development team to confirm that you aren't overlapping with work that's currently underway and that everyone is on the same page with the goal of the work you're going to carry out. + +[This blog][link_pushpullblog] is a nice explanation of why putting this work in up front is so useful to everyone involved. + +**2. Fork IDConn.** + +[Fork][link_fork] the [IDConn repository][link_idconn] to your profile. + +This is now your own unique copy of IDConn. +Changes here won't effect anyone else's work, so it's a safe space to explore edits to the code! + +Make sure to [keep your fork up to date][link_updateupstreamwiki] with the main repository. + +**3. Make the changes you've discussed.** + +Try to keep the changes focused. We've found that working on a [new branch][link_branches] makes it easier to keep your changes targeted. 
+ +When you're creating your pull request, please do your best to follow IDConn's preferred style conventions. +Namely, documentation should follow the [numpydoc](https://numpydoc.readthedocs.io/en/latest/) convention and code should adhere to [PEP8](https://www.python.org/dev/peps/pep-0008/) as much as possible. + +**4. Submit a pull request.** + +Submit a [pull request][link_pullrequest]. + +A member of the development team will review your changes to confirm that they can be merged into the main codebase. + +Please use a sentence-case title for the pull request, and do not include any prefixes (e.g., ``[ENH]``), as we now use labels to distinguish pull request types. +The title should summarize the changes proposed in the pull request, with an emphasis on readability, as pull request titles are used directly in our release notes. + +## Recognizing contributions + +We welcome and recognize all contributions from documentation to testing to code development. +You can see a list of current contributors in our [zenodo][link_zenodo] file. +If you are new to the project, don't forget to add your name and affiliation there! + +## Thank you! + +You're awesome. + +.. note:: + These guidelines are based on contributing guidelines from the [STEMMRoleModels][link_stemmrolemodels] project. 
+ +[link_github]: https://github.com/ +[link_idconn]: https://github.com/62442katieb/IDConn +[link_signupinstructions]: https://help.github.com/articles/signing-up-for-a-new-github-account +[link_react]: https://github.com/blog/2119-add-reactions-to-pull-requests-issues-and-comments +[link_issues]: https://github.com/62442katieb/IDConn/issues +[link_labels]: https://github.com/62442katieb/IDConn/labels +[link_discussingissues]: https://help.github.com/articles/discussing-projects-in-issues-and-pull-requests +[link_neurostars]: https://neurostars.org + + +[link_pullrequest]: https://help.github.com/articles/creating-a-pull-request/ +[link_fork]: https://help.github.com/articles/fork-a-repo/ +[link_pushpullblog]: https://www.igvita.com/2011/12/19/dont-push-your-pull-requests/ +[link_branches]: https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/ +[link_updateupstreamwiki]: https://help.github.com/articles/syncing-a-fork/ +[link_stemmrolemodels]: https://github.com/KirstieJane/STEMMRoleModels +[link_zenodo]: https://github.com/62442katieb/IDConn/blob/main/.zenodo.json diff --git a/idconn/__init__.py b/idconn/__init__.py index 83ad6d8..79ab307 100644 --- a/idconn/__init__.py +++ b/idconn/__init__.py @@ -12,13 +12,13 @@ warnings.simplefilter("ignore") from . import connectivity from . import data - #from . import figures + from . import nbs from . import networking # from . import preprocessing # from . import statistics # from . import utils - # from . import io + from . 
import io __version__ = get_versions()["version"] @@ -26,12 +26,13 @@ "idconn", "connectivity", "data", - #"figures", + # "figures", "networking", # "preprocessing", - #"statistics", + # "statistics", # "utils", - # "io", + "io", + "nbs", "__version__", ] diff --git a/idconn/connectivity.py b/idconn/connectivity.py new file mode 100644 index 0000000..5ac1ae2 --- /dev/null +++ b/idconn/connectivity.py @@ -0,0 +1,484 @@ +from posixpath import sep +import numpy as np +import pandas as pd + +# import idconn.connectivity.build_networks +from os import makedirs +from os.path import join, exists, basename +from nilearn import input_data, datasets, connectome, image, plotting +from ._version import get_versions + +# from .utils import contrast + + +def _check_dims(matrix): + """Raise a ValueError if the input matrix has more than two square. + Parameters + ---------- + matrix : numpy.ndarray + Input array. + """ + if matrix.ndim != 2: + raise ValueError( + "Expected a square matrix, got array of shape" " {0}.".format(matrix.shape) + ) + + +def task_connectivity( + layout, subject, session, task, atlas, confounds, connectivity_metric="correlation" +): + """ + Makes connectivity matrices per subject per session per task per condition. + Parameters + ---------- + layout : BIDSLayout object + BIDSLayout (i.e., pybids layout object) for directory containing data for analysis (with `derivative=True`, as we're using fmriprep output). + subject : str + Subject ID for which the networks will be calculated. + session : str, optional + Session of data collection for which networks will be calculated. If there's only one session, we'll find it. + task : str + Name of task fMRI scan (can be "rest") from which networks will be calculated. + connectivity_metric : {"correlation", "partial correlation", "tangent",\ + "covariance", "precision"}, optional + The matrix kind. Passed to Nilearn's `ConnectivityMeasure`. Default is product-moment correlation, "correlation". 
+ space : str + 'native' if analyses will be performed in subjects' functional native space (atlas(es) should be transformed into this space already). + 'mni152-2mm' if analyses will be performed in MNI125 2mm isotropic space (fMRI data should already be transformed into MNI space). + atlas : str + If you want to grab an atlas using Nilearn, this is the name of the atlas and + must match the corresponding function `fetch_atlas_[name]` in `nilearn.datasets`. + If you have your own atlas, this is the path to that nifti file. Currently: only works with paths. + confounds : list-like + Columns from fMRIPrep confounds output to be regressed out of fMRI data before correlation matrices are made. + Returns + ------- + avg_corrmats: numpy array + Average corrmat (per condition, if applicable). + files : list + Filenames of computed correlation matrices. + """ + # version = '0.1.1' + try: + version = get_versions()["version"] + except: + version = "test" + if ".nii" in atlas: + assert exists(atlas), f"Mask file does not exist at {atlas}" + + deriv_dir = join(layout.root, "derivatives", f"idconn-{version}") + + space = "MNI152NLin2009cAsym" + atlas_name = basename(atlas).rsplit(".", 2)[0] + # use pybids here to grab # of runs and preproc bold filenames + connectivity_measure = connectome.ConnectivityMeasure(kind=connectivity_metric) + bold_files = layout.get( + scope="derivatives", + return_type="file", + suffix="bold", + task=task, + space=space, + subject=subject, + session=session, + extension="nii.gz", + ) # should be preprocessed BOLD file from fmriprep, grabbed with pybids + print(f"BOLD files found at {bold_files}") + + runs = [] + if len(bold_files) > 1: + for i in range(0, len(bold_files)): + assert exists(bold_files[i]), "Preprocessed bold file(s) does not exist at {0}".format( + bold_files + ) + runs.append(layout.parse_file_entities(bold_files[i])["run"]) + else: + runs = None + print(f"Found runs: {runs}") + + out = join(deriv_dir, f"sub-{subject}", 
f"ses-{session}", "func") + if not exists(out): + makedirs(out) + + event_files = layout.get(return_type="filename", suffix="events", task=task, subject=subject) + timing = pd.read_csv(event_files[0], header=0, index_col=0, sep="\t") + conditions = timing["trial_type"].unique() + + run_cond = {} + corrmats = {} + for run in runs: + bold_file = layout.get( + scope="derivatives", + return_type="file", + suffix="bold", + task=task, + space="MNI152NLin2009cAsym", + subject=subject, + session=session, + extension="nii.gz", + run=run, + ) + assert ( + len(bold_file) == 1 + ), f"BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}" + tr = layout.get_tr(bold_file) + + # load timing file + # update to use pyBIDS + layout + event_file = layout.get( + return_type="filename", + suffix="events", + task=task, + subject=subject, + run=run, + session=session, + ) + print("# of event files =", len(event_file), "\nfilename = ", event_file[0]) + the_file = str(event_file[0]) + assert exists(the_file), "file really does not exist" + timing = pd.read_csv(the_file, header=0, index_col=0, sep="\t") + timing.sort_values("onset") + + confounds_file = layout.get( + scope="derivatives", + return_type="file", + desc="confounds", + subject=subject, + session=session, + task=task, + run=run, + extension="tsv", + ) + print(f"Confounds file located at: {confounds_file}") + confounds_df = pd.read_csv(confounds_file[0], header=0, sep="\t") + confounds_df = confounds_df[confounds].fillna(0) + confounds_fname = join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv", + ) + confounds_df.to_csv(confounds_fname, sep="\t") + + masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2) + ex_bold = image.index_img(bold_file[0], 2) + display = plotting.plot_epi(ex_bold) + display.add_contours(atlas) + display.savefig( + join( + 
deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_run-{run}_space-MNI152NLin2009cAsym_space-{atlas_name}_overlay.png", + ) + ) + + print(f"BOLD file located at {bold_file}\nTR = {tr}s") + + masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=1) + timeseries = masker.fit_transform(bold_file[0], confounds=confounds_fname) + # load timing file + # update to use pyBIDS + layout + try: + # and now we slice into conditions + for condition in conditions: + run_cond[condition] = {} + corrmats[condition] = {} + blocks = [] + cond_timing = timing[timing["trial_type"] == condition] + for i in cond_timing.index: + blocks.append( + ( + cond_timing.loc[i]["onset"] / tr, + ((cond_timing.loc[i]["onset"] + cond_timing.loc[i]["duration"]) / tr) + + 1, + ) + ) + if len(blocks) > 1: + run_cond[condition][run] = np.vstack( + ( + timeseries[int(blocks[0][0]) : int(blocks[0][1]), :], + timeseries[int(blocks[1][0]) : int(blocks[1][1]), :], + ) + ) + if len(blocks) > 2: + for i in np.arange(2, len(blocks)): + run_cond[condition][run] = np.vstack( + ( + timeseries[int(blocks[0][0]) : int(blocks[0][1]), :], + timeseries[int(blocks[1][0]) : int(blocks[1][1]), :], + ) + ) + # print('extracted signals for {0}, {1}, {2}'.format(task, run, condition), run_cond['{0}-{1}'.format(run, condition)].shape) + else: + pass + print(f"Making correlation matrix for {run}, {condition}.") + corrmats[condition][run] = connectivity_measure.fit_transform( + [run_cond[condition][run]] + )[0] + print("And that correlation matrix is", corrmats[condition][run].shape) + except Exception as e: + print("trying to slice and dice, but", e) + # and paste together the timeseries from each run together per condition + files = [] + avg_corrmats = {} + print("Corrmats per run per condition have been made!") + for condition in conditions: + print(f"Merging corrmats for {task}-{condition}...") + data = list(corrmats[condition].values()) + 
stacked_corrmats = np.array(data) + print("Stacked corrmats have dimensions", stacked_corrmats.shape) + avg_corrmat = np.mean(stacked_corrmats, axis=0) + corrmat_df = pd.DataFrame( + index=np.arange(1, avg_corrmat.shape[0] + 1), + columns=np.arange(1, avg_corrmat.shape[0] + 1), + data=avg_corrmat, + ) + avg_corrmats[condition] = corrmat_df + corrmat_file = join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_desc-{condition}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_corrmat.tsv", + ) + try: + corrmat_df.to_csv(corrmat_file, sep="\t") + files.append(corrmat_file) + except Exception as e: + print("saving corrmat...", e) + return files, avg_corrmats + + +def rest_connectivity( + layout, subject, session, task, atlas, confounds=None, connectivity_metric="correlation" +): + ################################################################################### + ################# Needs an option to keep runs separate. ########################## + """ + Makes connectivity matrices per subject per session per task per condition. + Parameters + ---------- + layout : str + BIDS layout with fMRIPrep derivatives indexed from pyBIDS + subject : str + Subject ID for which the networks will be calculated. + session : str, optional + Session of data collection. If there's only one session, we'll find it. + connectivity_metric : {"correlation", "partial correlation", "tangent",\ + "covariance", "precision"}, optional + The matrix kind. Passed to Nilearn's `ConnectivityMeasure`. + atlas : str + Name of atlas for parcellating voxels into nodes, must be in the same `space` as preprocessed rsfMRI data from fMRIPrep. + confounds : list-like + Names of confounds (should be columns in fmriprep output confounds.tsv). + Returns + ------- + corrmat_df : Pandas dataframe + Functional connectivity matrix with labeled nodes (i.e., rows, columns) and weighted edges (i.e., elements) based on + the connectivity metric selected. 
If multiple runs, represents average across runs. + corrmat_file : str + Path to saved correlation matrix. + """ + try: + version = get_versions()["version"] + except: + version = "test" + if ".nii" in atlas: + assert exists(atlas), f"Mask file does not exist at {atlas}" + + deriv_dir = join(layout.root, "derivatives", f"idconn-{version}") + atlas_name = basename(atlas).rsplit(".", 2)[0] + # use pybids here to grab # of runs and preproc bold filenames + connectivity_measure = connectome.ConnectivityMeasure(kind=connectivity_metric) + bold_files = layout.get( + scope="derivatives", + return_type="file", + suffix="bold", + task=task, + space="MNI152NLin2009cAsym", + subject=subject, + session=session, + extension="nii.gz", + ) # should be preprocessed BOLD file from fmriprep, grabbed with pybids + print(f"BOLD files found at {bold_files}") + # confounds_files = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task) + + runs = [] + if len(bold_files) > 1: + for i in range(0, len(bold_files)): + assert exists(bold_files[i]), "Preprocessed bold file(s) does not exist at {0}".format( + bold_files + ) + runs.append(layout.parse_file_entities(bold_files[i])["run"]) + else: + runs = None + print(f"Found runs: {runs}") + + out = join(deriv_dir, f"sub-{subject}", f"ses-{session}", "func") + if not exists(out): + makedirs(out) + + # event_files = layout.get(return_type='filename', suffix='events', task=task, subject=subject) + # timing = pd.read_csv(event_files[0], header=0, index_col=0, sep='\t') + # conditions = timing['trial_type'].unique() + + if runs: + corrmats = {} + for run in runs: + print("run = ", run) + # read in events file for this subject, task, and run + + confounds_file = layout.get( + scope="derivatives", + return_type="file", + desc="confounds", + subject=subject, + session=session, + task=task, + run=run, + extension="tsv", + ) + print(f"Confounds file located at: {confounds_file}") + confounds_df = 
pd.read_csv(confounds_file[0], header=0, sep="\t") + confounds_df = confounds_df[confounds].fillna(0) + confounds_fname = join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv", + ) + confounds_df.to_csv(confounds_fname, sep="\t") + + bold_file = layout.get( + scope="derivatives", + return_type="file", + suffix="bold", + task=task, + space="MNI152NLin2009cAsym", + subject=subject, + session=session, + extension="nii.gz", + run=run, + ) + assert ( + len(bold_file) == 1 + ), f"BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}" + tr = layout.get_tr(bold_file) + masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2) + + ex_bold = image.index_img(bold_file[0], 2) + display = plotting.plot_epi(ex_bold) + display.add_contours(atlas) + display.savefig( + join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-atlas_overlay.png", + ) + ) + + print(f"BOLD file located at {bold_file}\nTR = {tr}s") + try: + # for each parcellation, extract BOLD timeseries + print(f"Extracting bold signal for sub-{subject}, ses-{session}, run-{run}...") + timeseries = masker.fit_transform(bold_file[0], confounds_fname) + except Exception as e: + print("ERROR: Trying to extract BOLD signals, but", e) + try: + print( + f"Making correlation matrix for for sub-{subject}, ses-{session}, task-{task}, run-{run}..." 
+ ) + corrmats[run] = connectivity_measure.fit_transform([timeseries])[0] + except Exception as e: + print("ERROR: Trying to make corrmat, but", e) + data = list(corrmats.values()) + stacked_corrmats = np.array(data) + print("Stacked corrmats have dimensions", stacked_corrmats.shape) + avg_corrmat = np.mean(stacked_corrmats, axis=0) + else: + confounds_file = layout.get( + scope="derivatives", + return_type="file", + desc="confounds", + subject=subject, + session=session, + task=task, + extension="tsv", + ) + print(f"Confounds file located at: {confounds_file}") + confounds_df = pd.read_csv(confounds_file[0], header=0, sep="\t") + confounds_df = confounds_df[confounds].fillna(0) + confounds_fname = join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_desc-confounds_timeseries.tsv", + ) + confounds_df.to_csv(confounds_fname, sep="\t") + + bold_file = layout.get( + scope="derivatives", + return_type="file", + suffix="bold", + task=task, + space="MNI152NLin2009cAsym", + subject=subject, + session=session, + extension="nii.gz", + ) + assert ( + len(bold_file) == 1 + ), f"BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}: {bold_file}" + tr = layout.get_tr(bold_file) + masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2) + + ex_bold = image.index_img(bold_file[0], 2) + display = plotting.plot_epi(ex_bold) + display.add_contours(atlas) + display.savefig( + join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_desc-atlas_overlay.png", + ) + ) + + print(f"BOLD file located at {bold_file}\nTR = {tr}s") + try: + # for each parcellation, extract BOLD timeseries + print(f"Extracting bold signal for sub-{subject}, ses-{session}...") + timeseries = masker.fit_transform(bold_file[0], confounds_fname) + except Exception as e: + print("ERROR: Trying to extract BOLD signals, but", e) + try: + 
print(f"Making correlation matrix for for sub-{subject}, ses-{session}...") + avg_corrmat = connectivity_measure.fit_transform([timeseries])[0] + except Exception as e: + print("ERROR: Trying to make corrmat, but", e) + + print("Correlation matrix created, dimensions:", avg_corrmat.shape) + try: + corrmat_df = pd.DataFrame( + index=np.arange(1, avg_corrmat.shape[0] + 1), + columns=np.arange(1, avg_corrmat.shape[0] + 1), + data=avg_corrmat, + ) + corrmat_file = join( + deriv_dir, + f"sub-{subject}", + f"ses-{session}", + "func", + f"sub-{subject}_ses-{session}_task-{task}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_desc-corrmat_bold.tsv", + ) + corrmat_df.to_csv(corrmat_file, sep="\t") + except Exception as e: + print("ERROR saving corrmat...", e) + return corrmat_df, corrmat_file diff --git a/idconn/connectivity/__init__.py b/idconn/connectivity/__init__.py deleted file mode 100644 index afe46b9..0000000 --- a/idconn/connectivity/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -Tools for computing connectivity matrices/graphs -""" - -from . import build_networks -from . 
import estimate_thresh - -__all__ = ["build_networks", "estimate_thresh"] diff --git a/idconn/connectivity/__pycache__/__init__.cpython-37.pyc b/idconn/connectivity/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index 587b05d..0000000 Binary files a/idconn/connectivity/__pycache__/__init__.cpython-37.pyc and /dev/null differ diff --git a/idconn/connectivity/build_networks.py b/idconn/connectivity/build_networks.py deleted file mode 100644 index 15e3017..0000000 --- a/idconn/connectivity/build_networks.py +++ /dev/null @@ -1,311 +0,0 @@ -from posixpath import sep -import numpy as np -import pandas as pd -import idconn.connectivity.build_networks -from os import makedirs -from os.path import join, exists, basename -from nilearn import input_data, datasets, connectome, image, plotting - -#from .utils import contrast - -def _check_dims(matrix): - """Raise a ValueError if the input matrix has more than two square. - Parameters - ---------- - matrix : numpy.ndarray - Input array. - """ - if matrix.ndim != 2: - raise ValueError('Expected a square matrix, got array of shape' - ' {0}.'.format(matrix.shape)) - - -def task_connectivity(layout, subject, session, task, atlas, confounds, connectivity_metric='correlation', out_dir=None): - """ - Makes connectivity matrices per subject per session per task per condition. - Parameters - ---------- - dset_dir : str - BIDS-formatted dataset path (top-level, in which a 'derivatives/' directory will be made if one does not exist) - subject : str - Subject ID for which the networks will be calculated. - session : str, optional - Session of data collection. If there's only one session, we'll find it. - task : str - Name of task fMRI scan from which networks will be calculated. - connectivity_metric : {"correlation", "partial correlation", "tangent",\ - "covariance", "precision"}, optional - The matrix kind. Passed to Nilearn's `ConnectivityMeasure`. 
- space : str - 'native' if analyses will be performed in subjects' functional native space (atlas(es) should be transformed) - 'mni152-2mm' if analyses will be performed in MNI125 2mm isotropic space (fMRI data should already be transformed) - atlas : str - If you want to grab an atlas using Nilearn, this is the name of the atlas and - must match the corresponding function `fetch_atlas_[name]` in `nilearn.datasets`. - If you have your own atlas, this is the path to that nifti file.` - confounds : list-like - Filenames of confounds files. - Returns - ------- - confounds_file : str - Filename of merged confounds .tsv file - """ - #version = '0.1.1' - try: - version = idconn.__version__ - except: - version = 'test' - if '.nii' in atlas: - assert exists(atlas), f'Mask file does not exist at {atlas}' - - if not out_dir: - deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}') - else: - deriv_dir = out_dir - space = 'MNI152NLin2009cAsym' - atlas_name = basename(atlas).rsplit('.', 2)[0] - # use pybids here to grab # of runs and preproc bold filenames - connectivity_measure = connectome.ConnectivityMeasure(kind=connectivity_metric) - bold_files = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space=space,subject=subject, session=session, extension='nii.gz') # should be preprocessed BOLD file from fmriprep, grabbed with pybids - print(f'BOLD files found at {bold_files}') - - runs = [] - if len(bold_files) > 1: - for i in range(0, len(bold_files)): - assert exists(bold_files[i]), "Preprocessed bold file(s) does not exist at {0}".format(bold_files) - runs.append(layout.parse_file_entities(bold_files[i])['run']) - else: - runs = None - print(f'Found runs: {runs}') - - out = join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func') - if not exists(out): - makedirs(out) - - event_files = layout.get(return_type='filename', suffix='events', task=task, subject=subject) - timing = pd.read_csv(event_files[0], header=0, index_col=0, 
sep='\t') - conditions = timing['trial_type'].unique() - - run_cond = {} - corrmats = {} - for run in runs: - bold_file = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz', run=run) - assert len(bold_file) == 1, f'BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}' - tr = layout.get_tr(bold_file) - - #load timing file - #update to use pyBIDS + layout - event_file = layout.get(return_type='filename', suffix='events', task=task, subject=subject, run=run, session=session) - print('# of event files =', len(event_file), '\nfilename = ', event_file[0]) - the_file = str(event_file[0]) - assert exists(the_file), 'file really does not exist' - timing = pd.read_csv(the_file, header=0, index_col=0, sep='\t') - timing.sort_values('onset') - - confounds_file = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task, run=run, extension='tsv') - print(f'Confounds file located at: {confounds_file}') - confounds_df = pd.read_csv(confounds_file[0], header=0, sep='\t') - confounds_df = confounds_df[confounds].fillna(0) - confounds_fname = join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv') - confounds_df.to_csv(confounds_fname, sep='\t') - - masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2) - ex_bold = image.index_img(bold_file[0], 2) - display = plotting.plot_epi(ex_bold) - display.add_contours(atlas) - display.savefig(join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_space-MNI152NLin2009cAsym_space-{atlas_name}_overlay.png')) - - print(f'BOLD file located at {bold_file}\nTR = {tr}s') - - masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=1) - 
timeseries = masker.fit_transform(bold_file[0], confounds=confounds_fname) - #load timing file - #update to use pyBIDS + layout - try: - #and now we slice into conditions - for condition in conditions: - run_cond[condition] = {} - corrmats[condition] = {} - blocks = [] - cond_timing = timing[timing['trial_type'] == condition] - for i in cond_timing.index: - blocks.append((cond_timing.loc[i]['onset'] / tr, ((cond_timing.loc[i]['onset'] + cond_timing.loc[i]['duration']) / tr) + 1)) - if len(blocks) > 1: - run_cond[condition][run] = np.vstack((timeseries[int(blocks[0][0]):int(blocks[0][1]), :], timeseries[int(blocks[1][0]):int(blocks[1][1]), :])) - if len(blocks) > 2: - for i in np.arange(2,len(blocks)): - run_cond[condition][run] = np.vstack((timeseries[int(blocks[0][0]):int(blocks[0][1]), :], timeseries[int(blocks[1][0]):int(blocks[1][1]), :])) - #print('extracted signals for {0}, {1}, {2}'.format(task, run, condition), run_cond['{0}-{1}'.format(run, condition)].shape) - else: - pass - print(f'Making correlation matrix for {run}, {condition}.') - corrmats[condition][run] = connectivity_measure.fit_transform([run_cond[condition][run]])[0] - print('And that correlation matrix is', corrmats[condition][run].shape) - except Exception as e: - print('trying to slice and dice, but', e) - #and paste together the timeseries from each run together per condition - files = [] - avg_corrmats = {} - print('Corrmats per run per condition have been made!') - for condition in conditions: - print(f'Merging corrmats for {task}-{condition}...') - data = list(corrmats[condition].values()) - stacked_corrmats = np.array(data) - print('Stacked corrmats have dimensions', stacked_corrmats.shape) - avg_corrmat = np.mean(stacked_corrmats, axis=0) - corrmat_df = pd.DataFrame(index=np.arange(1, avg_corrmat.shape[0]+1), columns=np.arange(1, avg_corrmat.shape[0]+1),data=avg_corrmat) - avg_corrmats[condition] = corrmat_df - corrmat_file = join(deriv_dir, - f'sub-{subject}', f'ses-{session}', 'func', 
f'sub-{subject}_ses-{session}_task-{task}_desc-{condition}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_corrmat.tsv') - try: - corrmat_df.to_csv(corrmat_file, sep='\t') - files.append(corrmat_file) - except Exception as e: - print('saving corrmat...', e) - return files, avg_corrmats - -def connectivity(layout, subject, session, task, atlas, connectivity_metric='correlation', confounds=None, out_dir=None): - - """ - Makes connectivity matrices per subject per session per task per condition. - Parameters - ---------- - layout : str - BIDS layout with derivatives indexed from pyBIDS - subject : str - Subject ID for which the networks will be calculated. - session : str, optional - Session of data collection. If there's only one session, we'll find it. - connectivity_metric : {"correlation", "partial correlation", "tangent",\ - "covariance", "precision"}, optional - The matrix kind. Passed to Nilearn's `ConnectivityMeasure`. - space : str - 'native' if analyses will be performed in subjects' functional native space (atlas(es) should be transformed) - 'mni152-2mm' if analyses will be performed in MNI125 2mm isotropic space (fMRI data should already be transformed) - atlas : str - Name of atlas for parcellating voxels into nodes, must be in the same `space` given above. - confounds : list-like - Names of confounds (should be columns in fmriprep output confounds.tsv). 
- Returns - ------- - adjacency_matrix - """ - try: - version = idconn.__version__ - except: - version = 'test' - if '.nii' in atlas: - assert exists(atlas), f'Mask file does not exist at {atlas}' - - if not out_dir: - deriv_dir = join(layout.root, 'derivatives', f'idconn-{version}') - else: - deriv_dir = out_dir - atlas_name = basename(atlas).rsplit('.', 2)[0] - # use pybids here to grab # of runs and preproc bold filenames - connectivity_measure = connectome.ConnectivityMeasure(kind=connectivity_metric) - bold_files = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz') # should be preprocessed BOLD file from fmriprep, grabbed with pybids - print(f'BOLD files found at {bold_files}') - confounds_files = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task) - - runs = [] - if len(bold_files) > 1: - for i in range(0, len(bold_files)): - assert exists(bold_files[i]), "Preprocessed bold file(s) does not exist at {0}".format(bold_files) - runs.append(layout.parse_file_entities(bold_files[i])['run']) - else: - runs = None - print(f'Found runs: {runs}') - - out = join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func') - if not exists(out): - makedirs(out) - - - #event_files = layout.get(return_type='filename', suffix='events', task=task, subject=subject) - #timing = pd.read_csv(event_files[0], header=0, index_col=0, sep='\t') - #conditions = timing['trial_type'].unique() - - if runs: - corrmats = {} - for run in runs: - print('run = ', run) - # read in events file for this subject, task, and run - - - confounds_file = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task, run=run, extension='tsv') - print(f'Confounds file located at: {confounds_file}') - confounds_df = pd.read_csv(confounds_file[0], header=0, sep='\t') - confounds_df = 
confounds_df[confounds].fillna(0) - confounds_fname = join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-confounds_timeseries.tsv') - confounds_df.to_csv(confounds_fname, sep='\t') - - bold_file = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz', run=run) - assert len(bold_file) == 1, f'BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}, {run}: {bold_file}' - tr = layout.get_tr(bold_file) - masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2) - - ex_bold = image.index_img(bold_file[0], 2) - display = plotting.plot_epi(ex_bold) - display.add_contours(atlas) - display.savefig(join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_run-{run}_desc-atlas_overlay.png')) - - print(f'BOLD file located at {bold_file}\nTR = {tr}s') - try: - #for each parcellation, extract BOLD timeseries - print(f'Extracting bold signal for sub-{subject}, ses-{session}, run-{run}...') - timeseries = masker.fit_transform(bold_file[0], confounds_fname) - except Exception as e: - print('ERROR: Trying to extract BOLD signals, but', e) - try: - print(f'Making correlation matrix for for sub-{subject}, ses-{session}, task-{task}, run-{run}...') - corrmats[run] = connectivity_measure.fit_transform([timeseries])[0] - except Exception as e: - print('ERROR: Trying to make corrmat, but', e) - data = list(corrmats.values()) - stacked_corrmats = np.array(data) - print('Stacked corrmats have dimensions', stacked_corrmats.shape) - avg_corrmat = np.mean(stacked_corrmats, axis=0) - else: - confounds_file = layout.get(scope='derivatives', return_type='file', desc='confounds',subject=subject,session=session, task=task, extension='tsv') - print(f'Confounds file located at: {confounds_file}') - confounds_df = 
pd.read_csv(confounds_file[0], header=0, sep='\t') - confounds_df = confounds_df[confounds].fillna(0) - confounds_fname = join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_desc-confounds_timeseries.tsv') - confounds_df.to_csv(confounds_fname, sep='\t') - - bold_file = layout.get(scope='derivatives', return_type='file', suffix='bold', task=task, space='MNI152NLin2009cAsym',subject=subject, session=session, extension='nii.gz') - assert len(bold_file) == 1, f'BOLD file improperly specified, more than one .nii.gz file with {subject}, {session}, {task}: {bold_file}' - tr = layout.get_tr(bold_file) - masker = input_data.NiftiLabelsMasker(atlas, standardize=True, t_r=tr, verbose=2) - - ex_bold = image.index_img(bold_file[0], 2) - display = plotting.plot_epi(ex_bold) - display.add_contours(atlas) - display.savefig(join(deriv_dir, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-{task}_desc-atlas_overlay.png')) - - print(f'BOLD file located at {bold_file}\nTR = {tr}s') - try: - #for each parcellation, extract BOLD timeseries - print(f'Extracting bold signal for sub-{subject}, ses-{session}...') - timeseries = masker.fit_transform(bold_file[0], confounds_fname) - except Exception as e: - print('ERROR: Trying to extract BOLD signals, but', e) - try: - print(f'Making correlation matrix for for sub-{subject}, ses-{session}...') - avg_corrmat = connectivity_measure.fit_transform([timeseries])[0] - except Exception as e: - print('ERROR: Trying to make corrmat, but', e) - - print('Correlation matrix created, dimensions:', avg_corrmat.shape) - try: - corrmat_df = pd.DataFrame(index=np.arange(1, avg_corrmat.shape[0]+1), columns=np.arange(1, avg_corrmat.shape[0]+1),data=avg_corrmat) - corrmat_file = join(deriv_dir, - f'sub-{subject}', - f'ses-{session}', - 'func', - f'sub-{subject}_ses-{session}_task-{task}_space-MNI152NLin2009cAsym_atlas-{atlas_name}_desc-corrmat_bold.tsv') - 
corrmat_df.to_csv(corrmat_file, sep='\t') - except Exception as e: - print('ERROR saving corrmat...', e) - return corrmat_df, corrmat_file diff --git a/idconn/connectivity/estimate_thresh.py b/idconn/connectivity/estimate_thresh.py deleted file mode 100644 index 7dd99a4..0000000 --- a/idconn/connectivity/estimate_thresh.py +++ /dev/null @@ -1,60 +0,0 @@ -import numpy as np -import networkx as nx -import pandas as pd -import bct - - -def scale_free_tau(corrmat, skew_thresh, proportional=True): - '''' - Calculates threshold at which network becomes scale-free, estimated from the skewness of the networks degree distribution. - Parameters - ---------- - corrmat : numpy.array - Correlation or other connectivity matrix from which tau_connected will be estimated. - Should be values between 0 and 1. - proportional : bool - Determines whether connectivity matrix is thresholded proportionally or absolutely. - Default is proportional as maintaining network density across participants is a priority - Returns - ------- - tau : float - Lowest vaue of tau (threshold) at which network is scale-free. - ''' - tau = 0.01 - skewness = 1 - while abs(skewness) > 0.3: - if proportional: - w = bct.threshold_proportional(corrmat, tau) - else: - w = bct.threshold_absolute(corrmat, tau) - skewness = skew(bct.degrees_und(w)) - tau += 0.01 - return tau - -def connected_tau(corrmat, proportional=True): - ''' - Calculates threshold at network becomes node connected, using NetworkX's `is_connected` function. - Parameters - ---------- - corrmat : numpy.array - Correlation or other connectivity matrix from which tau_connected will be estimated. - Should be values between 0 and 1. - proportional : bool - Determines whether connectivity matrix is thresholded proportionally or absolutely. - Default is proportional as maintaining network density across participants is a priority - Returns - ------- - tau : float - Highest vaue of tau (threshold) at which network becomes node-connected. 
def impute(data, max_iter=10000):
    """
    Fill in missing values with scikit-learn's iterative imputation algorithm.

    Non-numeric columns are one-hot encoded (prefix "dummy") so the imputer can
    use them as predictors for the numeric columns.
    NOTE: Will not impute connectivity data.

    Parameters
    ----------
    data : pandas.DataFrame
        Data with missing values; non-numeric columns are dummy-coded.
    max_iter : int
        Maximum number of imputation rounds, passed to IterativeImputer.

    Returns
    -------
    imp_df : pandas.DataFrame
        Imputed data, one column per input column (numeric + dummy-coded),
        indexed like `data`.
    """
    non_numeric = data.select_dtypes(exclude=["number"]).columns
    dummies = pd.get_dummies(data[non_numeric], prefix="dummy")
    # Concatenate column-wise: without axis=1 the two frames were stacked
    # row-wise, misaligning every observation and doubling the index.
    df = pd.concat([data.drop(non_numeric, axis=1), dummies], axis=1)
    imputer = IterativeImputer(
        max_iter=max_iter, skip_complete=True, verbose=1, tol=5e-3, n_nearest_features=1000
    )
    imputed = imputer.fit_transform(df)
    # The imputer returns one column per *input* column (numeric + dummies);
    # labeling with only the numeric columns raised a shape mismatch.
    imp_df = pd.DataFrame(imputed, columns=df.columns, index=data.index)
    return imp_df
import missingness - -__all__ = ["iterative_imputation", "missingness"] diff --git a/idconn/data/iterative_imputation.py b/idconn/data/iterative_imputation.py deleted file mode 100644 index 73505fe..0000000 --- a/idconn/data/iterative_imputation.py +++ /dev/null @@ -1,31 +0,0 @@ -import pandas as pd -from os.path import join, exists - -from sklearn.experimental import enable_iterative_imputer -from sklearn.impute import IterativeImputer - - -#sink_dir = "/Users/kbottenh/Dropbox/Projects/physics-retrieval/data/rescored" -# sink_dir = '/home/kbott006/physics-retrieval' -# fig_dir = '/Users/kbottenh/Dropbox/Projects/physics-retrieval/figures/' -#data_dir = "/Users/kbottenh/Dropbox/Projects/physics-retrieval/data/rescored" -# roi_dir = '/Users/kbottenh/Dropbox/Data/templates/shen2015/' -# data_dir = '/home/kbott006/physics-retrieval' - -# big_df = pd.read_csv(join(data_dir, 'physics_learning-nonbrain_OLS-missing+fd+local_efficiency.csv'), -# index_col=0, header=0) - -# impute first? -def impute(data, max_iter): - non_numeric = data.select_dtypes(exclude=['number']).columns - dumb = pd.get_dummies(data[non_numeric], prefix='dummy') - df = pd.concat([data.drop(non_numeric, axis=1), dumb]) - impute_pls = IterativeImputer( - max_iter=10000, skip_complete=True, verbose=1, tol=5e-3, n_nearest_features=1000 - ) - imputed = impute_pls.fit_transform(df) - imp_df = pd.DataFrame(imputed,columns=data.drop(non_numeric, axis=1).columns, index=data.index, - ) - return imp_df - - diff --git a/idconn/data/missingness.py b/idconn/data/missingness.py deleted file mode 100644 index e69de29..0000000 diff --git a/idconn/io.py b/idconn/io.py new file mode 100644 index 0000000..55ddc81 --- /dev/null +++ b/idconn/io.py @@ -0,0 +1,626 @@ +import bids +import json +from nilearn import datasets +import nibabel as nib +from os.path import exists, join, basename + + +import nibabel as nib +import numpy as np +import pandas as pd +import seaborn as sns + +# from matplotlib import projections +from 
def calc_fd(confounds):
    """
    Compute framewise displacement (FD) from fmriprep-style motion regressors.

    FD at each frame transition is the sum of absolute frame-to-frame
    differences of the six rigid-body motion parameters (3 translations +
    3 rotations), so the output has one fewer entry than `confounds` has rows.
    NOTE(review): rotations are summed in their native units (radians), not
    converted to mm on a sphere as in Power et al. (2012) — confirm intended.

    Parameters
    ----------
    confounds : pandas.DataFrame
        Must contain columns 'trans_x', 'trans_y', 'trans_z',
        'rot_x', 'rot_y', 'rot_z'.

    Returns
    -------
    fd : numpy.ndarray of shape (n_frames - 1,)
        Framewise displacement per frame transition.
    """
    motion = confounds[["trans_x", "trans_y", "trans_z", "rot_x", "rot_y", "rot_z"]].values
    # Vectorized equivalent of six per-parameter abs(diff) lists summed
    # element-wise — same values, one C-level pass instead of Python loops.
    fd = np.sum(np.abs(np.diff(motion, axis=0)), axis=1)
    return fd
+ + """ + mask_builtins = ["shen270", "craddock270", "schaefer400", "yeo7", "yeo17"] + if ".nii" in mask: + assert exists(mask), "Mask file does not exist at {mask}".format(mask=mask) + if ".gz" in mask: + mask_name = basename(mask).rsplit(".", 2)[0] + else: + mask_name = basename(mask).rsplit(".", 1)[0] + else: + assert ( + mask in mask_builtins + ), "Mask {mask} not in built-in mask options. Please provide file path or one of {mask_builtins}".format( + mask=mask, mask_builtins=mask_builtins + ) + variables = confounds + ["{mask_name}*".format(mask_name=mask_name)] + statsmodel = { + "name": name, + "description": "A functional connectivity analysis of {task}, comparing {contrast}".format( + task=task, contrast=contrast + ), + "input": {"task": task}, + "blocks": [ + { + "level": "run", + "transformations": { + "name": "load_image_data", + "input": ["bold"], + "aggregate": ["mean"], + "mask": [mask_name], + "output": ["{mask_name}*".format(mask_name=mask_name)], + }, + }, + { + "level": "session", + "model": { + "variables": variables, + "options": {"confounds": confounds, "high_pass_filter_cutoff_secs": highpass}, + "variances": {"name": "session_level", "groupBy": "session"}, + "software": { + "IDConn": { + "ConnectivityMeasure": [conn_meas], + "GraphMetrics": [graph_meas], + } + }, + }, + }, + ], + } + statsmodel_json = json.dumps(statsmodel, indent=2) + + outfile = "{name}-statsmodel.json".format(name=name) + with open(outfile, "w") as outfile: + json.dump(statsmodel, outfile) + return statsmodel_json + + +def atlas_picker(atlas, path, key=None): + """Takes in atlas name and path to file, if local, returns + nifti-like object (usually file path to downloaded atlas), + and atlas name (for tagging output files). If atlas is from + Nilearn, will download atlas, **and space must be == 'MNI'. + If atlas is provided by user (path must be specified), then + space of atlas must match space of fMRI data, but that is up + to the user to determine. 
def atlas_picker(atlas, path, key=None):
    """Takes in atlas name and path to file, if local, returns
    nifti-like object (usually file path to downloaded atlas),
    and atlas name (for tagging output files). If atlas is from
    Nilearn, will download atlas, **and space must be == 'MNI'.
    If atlas is provided by user (path must be specified), then
    space of atlas must match space of fMRI data, but that is up
    to the user to determine.

    Parameters
    ----------
    atlas : str
        Name of the atlas/parcellation used to define nodes from
        voxels. If using an atlas fetchable by Nilearn, atlas name
        must match the function `fetch_atlas_[name]`.
    path : str
        Path to the atlas specified, if not using a dataset from Nilearn.
        If using `nilearn.datasets` to fetch an atlas, will revert to
        `derivatives/idconn` path.
    key : str
        Atlas-specific key for denoting which of multiple versions
        will be used. Default behavior is described in the "atlases"
        section of the docs. NOT IMPLEMENTED

    Returns
    -------
    atlas : str
        Name of the atlas chosen (returned unchanged).
    path : str
        The `path` argument, returned unchanged.
        NOTE(review): unlike the original docstring claimed, no `shape` is
        returned, and only 'craddock_2012' triggers any download/processing —
        every other atlas name (built-in or not) falls through untouched.
    """
    nilearn_3d = [
        "craddock_2012",
        "destrieux_2009",
        "harvard_oxford",
        "smith_2009",
        "yeo_2011",
        "aal",
        "pauli_2017",
        "msdl",
    ]
    # nilearn_coord = ['power_2011', 'dosenbach_2010', 'seitzman_2018']
    # nilearn_4d = ['allen_2011', '']
    if atlas in nilearn_3d:
        if atlas == "craddock_2012":
            atlas_dict = datasets.fetch_atlas_craddock_2012(data_dir=path)
            atlas_path = atlas_dict["tcorr_2level"]
            nifti = nib.load(atlas_path)
            nifti_arr = nifti.get_fdata()
            # selecting one volume of the nifti, each represent different granularity of parcellation
            # selecting N = 270, the 27th volume per http://ccraddock.github.io/cluster_roi/atlases.html
            nifti = nib.Nifti1Image(nifti_arr[:, :, :, 26], nifti.affine)
            # NOTE(review): to_filename() requires a filename argument — this
            # call raises TypeError at runtime. Intended target path unclear;
            # needs a fix once the output location convention is decided.
            nifti.to_filename()

    return atlas, path
def vectorize_corrmats(matrices, diagonal=False):
    """Return the vectorized upper triangles of symmetric matrices.

    Accepts a (p, n, n) stack, a single symmetric (n, n) matrix, or a 1-D
    object array of (n, n) matrices, and returns a 2-D (p, n*(n-1)/2) array
    (or a single vector for 2-D input).

    Parameters
    ----------
    matrices : numpy array
        Link strengths of the graphs; assumed symmetric in the node axes.
    diagonal : bool
        If True, include the main diagonal in the vectorized output.

    Returns
    -------
    edge_vector : numpy array
        Vectorized upper triangle(s) of the input matrices.

    Raises
    ------
    ValueError
        If the input is not 3-D, a symmetric 2-D matrix, or a 1-D object
        array of 2-D matrices. (These paths previously printed a message and
        then crashed with NameError, or hit IndexError before the check.)
    """
    # k=1 skips the main diagonal; k=0 keeps it
    k = 0 if diagonal else 1
    shape_msg = (
        "Matrices of incompatible shape: {shape}"
        "\nNumber of dimensions needs to be 3 (node x node x participant) or 2 (node x node)."
    )

    # num_node/upper_tri are computed inside each branch; the original computed
    # matrices.shape[1] up front, which raised IndexError for 1-D object arrays
    # and made that branch unreachable.
    if matrices.ndim == 3:
        num_node = matrices.shape[1]
        upper_tri = np.triu_indices(num_node, k=k)
        edge_vector = [matrices[i, :, :][upper_tri] for i in range(matrices.shape[0])]
    elif matrices.ndim == 2:
        # Fixed: the original compared a single row to itself (always equal),
        # so asymmetric matrices slipped through. Compare matrix to transpose.
        if np.array_equal(matrices, matrices.T):
            upper_tri = np.triu_indices(matrices.shape[0], k=k)
            edge_vector = matrices[upper_tri]
        else:
            raise ValueError(shape_msg.format(shape=matrices.shape))
    elif matrices.ndim == 1 and matrices[0].ndim == 2:
        # 1-D object array holding 2-D matrices (e.g., from a DataFrame column)
        num_node = matrices[0].shape[0]
        upper_tri = np.triu_indices(num_node, k=k)
        edge_vector = [matrix[upper_tri] for matrix in matrices]
    else:
        raise ValueError(shape_msg.format(shape=matrices.shape))
    return np.asarray(edge_vector)
def read_corrmats(layout, task, deriv_name, atlas, z_score=True, vectorized=True, verbose=False):
    """Read per-(subject, session) correlation matrices from a BIDS derivatives tree.

    Loads participants.tsv, appends per-session average confound values and the
    session's adjacency matrix (optionally Fisher r-to-z transformed and/or
    vectorized) as object columns, and returns the resulting DataFrame.

    ME @ ME: NEEDS AN OPTION TO KEEP RUNS SEPARATE. CURRENTLY IT AVERAGES CONFOUNDS AND

    Parameters
    ----------
    layout : BIDSLayout
        A valid BIDSLayout generated with derivatives=True, so the derivatives
        folder containing the relevant correlation matrices can be found.
    task : str
        The task used to collect fMRI data from which correlation matrices were computed.
    deriv_name : str
        The name of the derivatives subdirectory in which confound timeseries can be found.
    atlas : str
        The name of the atlas used to make the correlation matrix. Must match
        the `atlas` entity in the corrmat filename.
    z_score : bool
        If True, applies Fisher's r-to-z (np.arctanh) to each matrix; assumes
        edges are product-moment correlations.
    vectorized : bool
        If True, also stores the vectorized upper triangle of each matrix in
        an 'edge_vector' column.
    verbose : bool
        If True, prints subjects/sessions/paths as matrices are read.

    Returns
    -------
    ppt_df : pandas.DataFrame
        participants.tsv (indexed by the first two columns, i.e., subject and
        session rows) with added columns: one per averaged confound, 'adj'
        (node x node array), and — if `vectorized` — 'edge_vector'.
    """
    subjects = layout.get(return_type="id", target="subject", suffix="bold", scope=deriv_name)

    ppts_fname = layout.get_file("participants.tsv").path
    # index_col=[0, 1]: assumes participants.tsv rows are keyed by
    # (sub-<id>, ses-<id>) — TODO confirm against the dataset layout
    ppt_df = pd.read_csv(ppts_fname, sep="\t", index_col=[0, 1])
    # placeholder columns, filled per (subject, session) below; leftover ""
    # entries are converted to NaN before returning
    ppt_df["adj"] = ""
    if vectorized:
        ppt_df["edge_vector"] = ""

    for subject in subjects:
        if verbose:
            print(subject)
        else:
            pass
        sessions = layout.get(
            return_type="id",
            target="session",
            task=task,
            suffix="bold",
            subject=subject,
            scope=deriv_name,
        )

        for session in sessions:
            runs = layout.get(
                return_type="id",
                session=session,
                target="run",
                task=task,
                suffix="timeseries",
                subject=subject,
                scope=deriv_name,
            )
            if len(runs) > 0:
                # multi-run session: average confounds across runs
                path = layout.get(
                    return_type="filename",
                    session=session,
                    run=runs[0],
                    task=task,
                    suffix="timeseries",
                    subject=subject,
                    scope=deriv_name,
                )
                confounds = pd.read_table(path[0], header=0, index_col=0)
                if not "framewise_displacement" in confounds.columns:
                    # compute FD from motion params; pad with a trailing 0 so
                    # the column matches the number of frames
                    fd = calc_fd(confounds)
                    # fd.append(0)
                    fd = np.append(fd, [0])
                    confounds["framewise_displacement"] = fd
                confound_means = confounds.mean(axis=0)
                if len(runs) > 1:
                    for run in runs[1:]:
                        path = layout.get(
                            return_type="filename",
                            session=session,
                            run=run,
                            task=task,
                            suffix="timeseries",
                            subject=subject,
                            scope=deriv_name,
                        )
                        confounds = pd.read_table(path[0], header=0, index_col=0)
                        if not "framewise_displacement" in confounds.columns:
                            fd = calc_fd(confounds)
                            # fd.append(0)
                            fd = np.append(fd, [0])
                            confounds["framewise_displacement"] = fd
                        confound_means_temp = confounds.mean(axis=0)
                        # NOTE(review): running pairwise mean of means weights
                        # earlier runs more heavily than later ones — confirm
                        # whether an unweighted across-run mean was intended
                        confound_means = np.mean(
                            pd.concat([confound_means, confound_means_temp], axis=1), axis=1
                        )
                    # print(confound_means)
            else:
                # single-run session: one confounds file, no run entity
                path = layout.get(
                    return_type="filename",
                    session=session,
                    desc="confounds",
                    task=task,
                    suffix="timeseries",
                    subject=subject,
                    scope=deriv_name,
                )

                confounds = pd.read_table(path[0], header=0, index_col=0)
                if not "framewise_displacement" in confounds.columns:
                    fd = calc_fd(confounds)
                    fd = np.append(fd, [0])
                    confounds["framewise_displacement"] = fd
                confound_means = confounds.mean(axis=0)
                # print(confound_means)
            for confound in confound_means.index:
                ppt_df.at[(f"sub-{subject}", f"ses-{session}"), confound] = confound_means[
                    confound
                ]

            if verbose:
                print(session)
            else:
                pass
            path = layout.get(
                return_type="filename",
                task=task,
                subject=subject,
                session=session,
                atlas=atlas,
                suffix="bold",
                scope="IDConn",
            )
            if verbose:
                print(f"Corrmat path for sub-{subject}, ses-{session}: \t{path}")
            else:
                pass
            if type(path) == list:
                # print(len(path))
                ################################################################
                ############ EEEEEEEEEEEEEEEEEK ################################
                ############### DOES THIS ONLY GRAB ONE RUN?!?!?! ##############
                ################################################################
                path = path[0]

            else:
                pass
            assert exists(path), f"Corrmat file not found at {path}"
            adj_matrix = pd.read_csv(path, sep="\t", header=0, index_col=0)

            if z_score == True:
                z_adj = np.arctanh(adj_matrix.values)
                # NOTE(review): only +inf (r == 1, e.g. the diagonal) is zeroed;
                # -inf from r == -1 would pass through — confirm acceptable
                z_adj = np.where(z_adj == np.inf, 0, z_adj)
                # print(z_adj.shape)
                ppt_df.at[(f"sub-{subject}", f"ses-{session}"), "adj"] = z_adj
            else:
                # print(adj_matrix.values.shape)
                ppt_df.at[(f"sub-{subject}", f"ses-{session}"), "adj"] = adj_matrix.values

            if vectorized == True:
                edge_vector = vectorize_corrmats(adj_matrix.values)
                # NOTE(review): edge_vector is computed from the raw (not
                # z-scored) matrix even when z_score=True — confirm intended
                # print(edge_vector.shape)
                ppt_df.at[(f"sub-{subject}", f"ses-{session}"), "edge_vector"] = edge_vector
    ppt_df.replace({"": np.nan}, inplace=True)
    return ppt_df
def undo_vectorize(edges, num_node=None, diagonal=False):
    """
    Reassemble a vectorized upper triangle into a symmetric adjacency matrix.

    Parameters
    ----------
    edges : list-like
        Vectorized upper triangle of an adjacency matrix: length n*(n-1)/2
        when `diagonal=False`, n*(n+1)/2 when `diagonal=True`.
    num_node : int, optional
        The number of nodes n. If None, it is inferred from len(edges).
    diagonal : bool
        Whether `edges` includes the main diagonal.

    Returns
    -------
    X : numpy array of shape (n, n)
        Symmetric array of connectivity values.
    """
    if num_node is None:
        # invert j = n*(n-1)/2 (or n*(n+1)/2 with the diagonal) to recover n
        j = len(edges)
        if diagonal:
            num_node = int((np.sqrt((8 * j) + 1) - 1) / 2)
        else:
            num_node = int((np.sqrt((8 * j) + 1) + 1) / 2)
    else:
        num_node = int(num_node)

    k = 0 if diagonal else 1
    X = np.zeros((num_node, num_node))
    X[np.triu_indices(num_node, k=k)] = edges
    diag_X = X[np.diag_indices(num_node, 2)]
    # mirror the upper triangle into the lower triangle
    X = X + X.T
    if diagonal:
        # X + X.T doubled the diagonal; restore the original values
        X[np.diag_indices(num_node, 2)] = diag_X
    return X
def plot_edges(
    adj,
    atlas_nii,
    threshold=None,
    title=None,
    strength=False,
    cmap="seismic",
    node_size="strength",
):
    """
    Plots the edges of a connectivity/adjacency matrix both in a heatmap and in brain space,
    with the option to include a surface plot of node strength.

    Parameters
    ----------
    adj : array-like of shape (n, n)
        Adjacency matrix to be plotted.
        NOTE(review): the strength branch indexes `adj[i]` — for a DataFrame
        that is a *column* lookup, not a row; confirm numpy array input.
    atlas_nii : str
        Path to the atlas used to define nodes in the adjacency matrix.
        Should be one value per node, with the same number of values as rows and
        columns in adj (i.e., n). Background should be 0, should be in MNI space.
    threshold : int, float, or 'computed'
        Percentile of edges to plot, passed to nilearn as 'N%'. If 'computed',
        a density-based percentile is derived from the matrix size. If not
        specified, default is '99.99%' (plots only the strongest edges).
    title : str
        Title for plots.
    strength : bool
        If True, also plots surface maps of node strength (the sum of each
        node's edge weights) on fsaverage.
    cmap : str
        One of the matplotlib colormaps.
    node_size : int or 'strength'
        Size to plot nodes in brain space. If 'strength', node size is scaled
        to each node's summed absolute edge weights (max scaled to 60).

    Returns
    -------
    fig : Matplotlib figure object
        Connectivity figure (connectome + heatmap).
    fig2 : Matplotlib figure object
        Only returned if `strength=True`: the surface node-strength plot.
    strength_nimg : nibabel.Nifti1Image
        Only returned if `strength=True`: volumetric node-strength map.
    """
    coords = plotting.find_parcellation_cut_coords(atlas_nii)
    num_node = adj.shape[0]
    # only plot the top t% of edges
    if threshold == "computed":
        threshold = f"{(1 - (100 / num_node ** 2)) * 100}%"
    elif type(threshold) == float or type(threshold) == int:
        threshold = f"{threshold}%"
    else:
        threshold = "99.99%"
    print("edge plotting threshold: ", threshold)

    if node_size == "strength":
        node_strength = np.abs(np.sum(adj, axis=0))
        # node_strength /= np.max(node_strength)
        # node_strength **= 4
        node_strength = node_strength / np.max(node_strength) * 60
        node_size = node_strength

    fig = plt.figure(figsize=(12, 4))
    if title is not None:
        fig.suptitle(title)
    # connectome view (wide) next to the heatmap (square)
    gs = GridSpec(1, 2, width_ratios=[3, 1])
    ax0 = fig.add_subplot(gs[0])
    ax1 = fig.add_subplot(gs[1])

    plt.tight_layout(w_pad=5)
    g = plotting.plot_connectome(
        adj,
        coords,
        node_size=node_size,
        edge_threshold=threshold,
        edge_cmap=cmap,
        edge_kwargs={"alpha": 0.4},
        display_mode="lyrz",
        figure=fig,
        axes=ax0,
        colorbar=False,
        annotate=True,
    )
    h = sns.heatmap(adj, square=True, linewidths=0, cmap=cmap, ax=ax1, center=0)
    if strength:
        fig2 = plt.figure(figsize=(12, 4))
        if title is not None:
            fig2.suptitle(title)
        fsaverage = datasets.fetch_surf_fsaverage()
        nimg = nib.load(atlas_nii)
        regn_sch_arr = nimg.get_fdata()
        # replace each atlas label (1-based) with that node's summed edge weight
        for i in np.arange(0, num_node):
            regn_sch_arr[np.where(regn_sch_arr == i + 1)] = np.sum((adj[i]))
        strength_nimg = nib.Nifti1Image(regn_sch_arr, nimg.affine)
        # replace this filename with BIDSy output
        # nib.save(strength_nimg, f'/Users/katherine.b/Dropbox/{title}predictive-strength.nii')

        gs = GridSpec(1, 4)
        # plot edge weights on surfaces: left/right hemis, lateral/medial views
        ax2 = fig2.add_subplot(gs[0], projection="3d")
        ax3 = fig2.add_subplot(gs[1], projection="3d")
        ax4 = fig2.add_subplot(gs[2], projection="3d")
        ax5 = fig2.add_subplot(gs[3], projection="3d")

        texture_l = surface.vol_to_surf(
            strength_nimg, fsaverage.pial_left, interpolation="nearest"
        )
        texture_r = surface.vol_to_surf(
            strength_nimg, fsaverage.pial_right, interpolation="nearest"
        )

        plt.tight_layout(w_pad=-1)
        # NOTE(review): i/j/k/l return values are unused; kept for parity
        i = plotting.plot_surf_stat_map(
            fsaverage.pial_left,
            texture_l,
            bg_map=fsaverage.sulc_left,
            symmetric_cbar=False,
            threshold=0.5,
            cmap=cmap,
            view="lateral",
            colorbar=False,
            axes=ax2,
        )
        j = plotting.plot_surf_stat_map(
            fsaverage.pial_left,
            texture_l,
            bg_map=fsaverage.sulc_left,
            symmetric_cbar=False,
            threshold=0.5,
            cmap=cmap,
            view="medial",
            colorbar=False,
            axes=ax3,
        )
        k = plotting.plot_surf_stat_map(
            fsaverage.pial_right,
            texture_r,
            bg_map=fsaverage.sulc_right,
            symmetric_cbar=False,
            threshold=0.5,
            cmap=cmap,
            view="lateral",
            colorbar=False,
            axes=ax4,
        )
        l = plotting.plot_surf_stat_map(
            fsaverage.pial_right,
            texture_r,
            bg_map=fsaverage.sulc_right,
            symmetric_cbar=False,
            threshold=0.5,
            cmap=cmap,
            view="medial",
            colorbar=False,
            axes=ax5,
        )
        return fig, fig2, strength_nimg
    else:
        return fig
def calc_number_of_nodes(matrices):
    """
    Infer the number of nodes from an array of stacked adjacency matrices.

    Accepts (p, n, n) or (n, n, p)-shaped input: if the first two axes differ
    in length, the matching last two axes are taken as the node dimensions.

    Parameters
    ----------
    matrices : numpy array
        Stack of square adjacency matrices (or a single square matrix).

    Returns
    -------
    num_node : int
        Number of nodes n.

    Raises
    ------
    ValueError
        If no square node x node plane can be identified.
    """
    if matrices.shape[0] != matrices.shape[1]:
        if matrices.shape[1] == matrices.shape[2]:
            num_node = matrices.shape[1]
            # reorder to node x node x stack; only affects the local variable
            matrices = np.moveaxis(matrices, 0, -1)
        else:
            raise ValueError(
                f"Matrices of shape {matrices.shape}",
                "requires matrices of shape (subject x session) x node x node",
                "or node x node x (subject x session).",
            )
    else:
        num_node = matrices.shape[0]
    return num_node


def _residualize_columns(X, confounds):
    """Regress `confounds` out of each column of `X`; return the residuals."""
    resid_X = np.zeros_like(X)
    for i in range(X.shape[1]):
        fit = pg.linear_regression(confounds, X[:, i])
        resid_X[:, i] = fit.residuals_.flatten()
    return resid_X


def residualize(X, y=None, confounds=None):
    """
    Residualize features `X` (and optionally outcome `y`) on `confounds`.

    All inputs need to be arrays, not dataframes.

    Parameters
    ----------
    X : numpy array of shape (p, m)
        Feature matrix; each column is residualized independently.
    y : numpy array of shape (p,) or (p, 1), optional
        Outcome vector to residualize as well.
    confounds : numpy array of shape (p, c), optional
        Covariates regressed out of X (and y). If None, nothing is done.

    Returns
    -------
    (resid_y, resid_X) if `y` is provided, otherwise resid_X.
    Returns None (after printing a warning) when `confounds` is None,
    preserving the original behavior.
    """
    if confounds is None:
        print("Confound matrix wasn't provided, so no confounding was done")
        return None

    if y is not None:
        # flatten y to 1-D for pingouin, then keep its residuals
        flat_y = np.reshape(y, (y.shape[0],))
        fit = pg.linear_regression(confounds, flat_y)
        # the duplicated per-column loop from both branches now lives in
        # _residualize_columns
        return fit.residuals_, _residualize_columns(X, confounds)
    return _residualize_columns(X, confounds)
+ Returns a dataframe containing the results of kfolds cross-validation, + including the indices of train and test samples, the resulting p-value and largest connected component, + the accuracy of the network in predicting group belonging in the test samples (using logistic regression), + the parameter estimates from each regression, and the model object from each regression. + from a BIDS derivative folder. Optionally returns a subject x session dataframe + of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) + array of vectorized upper triangles of those correlation mat + Parameters + ---------- + matrices : numpy array of shape (p, n, n) + Represents the link strengths of the graphs (i.e., functional connectivity). + Assumed to be an array of symmetric matrices. + outcome : list-like of shape (p,) + Y-value to be predicted with connectivity + confounds : list-like of shape (p,m) + Covariates, included as predictors in model. + alpha : float + Type-I error (i.e., false positive) rate, for outcome-related edge detection. Default = 0.05 + predict : bool + If True, bypasses `permutations` parameter and only runs edge detection + component identification. + Used for NBS-Predict. + permutations : int + If `predict=False`, specifies the number of permutations run to create a null distribution + for estimating the significance of the connected component size. Recommended 10,000. + + Returns + ------- + S1 : Pandas dataframe + A binary matrix denoting the largest connected component. + pval : float + If `predict=False`, denotes the significance of the largest connected component. + perms : numpy array of shape (permutations,) + If `predict=False`, largest connected component size per permutation. 
+ """ + # need to do a mass-univariate test at every edge + # and retain significant edges + # then find the largest connected component + # and, if not predict, build a null distribution + # n = matrices.shape[:-1] + ndims = len(matrices.shape) + + # vectorize_corrmats returns p x n^2 + + # turn matrices into vectorized upper triangles + if ndims > 2: + edges = vectorize_corrmats(matrices, diagonal=diagonal) + else: + edges = matrices.copy() + # print(edges.shape) + + # edges = edges.T + + # run an ols per edge + # create significancs matrix for predictor of interest (outcome) + # 1 if edge is significantly predicted by outcome + # 0 if it's not + + if len(np.unique(outcome)) < 5: + (f, p) = f_classif(X=edges, y=outcome) + else: + (f, p) = f_regression(X=edges, y=outcome, center=False) + sig_edges = np.where(p < alpha, 1, 0) + + # find largest connected component of sig_edges + # turn sig_edges into an nxn matrix first + sig_matrix = undo_vectorize( + sig_edges, num_node=num_node, diagonal=diagonal + ) # need to write this function + matrix = nx.from_numpy_array(sig_matrix) + + # use networkX to find connected components + S = [matrix.subgraph(c).copy() for c in nx.connected_components(matrix)] + S.sort(key=len, reverse=True) + # largest_cc = max(nx.connected_components(matrix), key=len) + G0 = S[0] + # print(G0) + + # retain size of largest connected component + # for NBS permutation-based significance testing + max_comp = G0.number_of_edges() + # print(f'Connected component has {max_comp} edges.') + + # pull the subgraph with largest number of nodes + # i.e., the largest connected component + + # grab list of nodes in largest connected component + nodes = list(G0.nodes) + + unused_nodes = list(set(matrix.nodes) - set(nodes)) + S1 = nx.to_pandas_adjacency(G0, nodelist=nodes) + + # add empty edges for unused nodes + # bc NBS-Predict needs all nodes for + # the eventual weighted average + # and NBS might need all nodes for easier + # plotting in brain space + for i 
in unused_nodes: + S1.loc[i] = 0 + S1[i] = 0 + + S1.sort_index(axis=0, inplace=True) + S1.sort_index(axis=1, inplace=True) + + # permutation testing to create a null distribution of max component size + # only for regular NBS, -Predict doesn't need this + if predict == False: + perms = np.zeros((permutations,)) + rng = np.random.default_rng() + outcome_copy = outcome.copy() + for i in range(0, permutations): + # shuffle outcome order + rng.shuffle(outcome_copy, axis=0) + # print(outcome_copy) + + if len(np.unique(outcome)) < 5: + (f1, p1) = f_classif(edges, outcome_copy) + else: + (f1, p1) = f_regression(edges, outcome_copy, center=False) + + perm_edges = np.where(p1 < alpha, 1, 0) + + # print(np.sum(perm_edges)) + # find largest connected component of sig_edges + # turn sig_edges into an nxn matrix first + perm_matrix = undo_vectorize( + perm_edges, num_node=num_node, diagonal=diagonal + ) # need to write this function + perm_nx = nx.from_numpy_array(perm_matrix) + + largest_cc = max(nx.connected_components(perm_nx), key=len) + S = perm_nx.subgraph(largest_cc) + + perm_comp_size = S.number_of_edges() + + # retain for null distribution + perms[i] = perm_comp_size + if i == 0: + pass + elif i % 100 == 0: + print( + f"p-value is {np.round(np.sum(np.where(perms >= max_comp, 1, 0)) / i, 3)} as of permutation {i}" + ) + + # bctpy nbs code uses hit to mark progress across permutations + # prob not necessary? + + # bctpy calcs pval for all components, not just largest? + # but I don't think that's relevant for the og implimentation of nbs? 
+ pval = np.size(np.where(perms >= max_comp)) / permutations + print(max_comp, permutations, pval) + + return pval, S1, perms + else: + return S1 + + +def kfold_nbs( + matrices, + outcome, + confounds=None, + alpha=0.05, + groups=None, + num_node=None, + diagonal=False, + scale_x=False, + scale_y=False, + n_splits=10, + n_iterations=10, +): + """Calculates the Network Based Statistic (Zalesky et al., 20##) on connectivity matrices provided + of shape ((subject x session)x node x node) + in the network. + Returns a dataframe containing the results of kfolds cross-validation, + including the indices of train and test samples, the resulting p-value and largest connected component, + the accuracy of the network in predicting group belonging in the test samples (using logistic regression), + the parameter estimates from each regression, and the model object from each regression. + from a BIDS derivative folder. Optionally returns a subject x session dataframe + of confound measures (e.g., motion averages) and/or a node^2 x (subject x session) + array of vectorized upper triangles of those correlation mat + Parameters + ---------- + matrices : numpy array of shape (p, n, n) or (p, (n^2 / 2)- n) + Represents the link strengths of the graphs. Assumed to be + an array of symmetric matrices or a vectorized triangle thereof. + outcome : list-like of shape (p,) + Y-value to be predicted with connectivity + confounds : list-like + Columns in `participants.tsv` to be regressed out of connectivity and outcome + data in each CV fold (per recommendation from Snoek et al., 2019). + alpha : float + Proportion of type II errors (i.e., false positives) we're willing to put up with. + This is the upper limit for pvalues in the edge detection process. + groups : list-like of shape (p,) + Grouping variable - currently only works for 2 groups. Will enforce stratified k-fold CV. + Currently intended for use where grouping variable is the outcome of interest, assumed by StratifiedKFold. 
+ NEED TO FIX THIS: ALLOW THE CASE WHERE GROUPING VAR != OUTCOME VAR + n_splits : int + Value of K for K-fold cross-validation. Will split data into K chunks, train on K-1 chunks and test on the Kth. + n_iterations : int + Number of times to run K-fold cross-validation. More times = more stable results. + + Returns + ------- + weighted_average : Pandas dataframe + Includes the average of all largest components across folds and iterations, weighted by + their prediction performance (i.e., accuracy for binary outcome, correlation for continuous). + Could be used for out-of-sample prediction, once thresholded and binarized. + cv_results : Pandas dataframe + Includes the results of each cross-validation loop + (e.g., predictive performance, data split, largest connected component per fold per iteration). + """ + ndims = len(matrices.shape) + + # vectorize_corrmats returns p x n^2 + + # turn matrices into vectorized upper triangles + if ndims > 2: + edges = vectorize_corrmats(matrices) + else: + edges = matrices.copy() + # print(edges.shape) + # print(edges.shape) + index = list(range(0, n_splits * n_iterations)) + + cv_results = pd.DataFrame( + index=index, + columns=[ + "split", + #'pval', + "score", + "component", + "model", + ], + ) + if groups is not None: + cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_iterations) + split_y = groups + + else: + cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_iterations) + split_y = outcome + + if num_node is None: + num_node = calc_number_of_nodes(matrices) + else: + pass + # print(num_node) + # if matrices.shape[0] != matrices.shape[1]: + # if matrices.shape[1] == matrices.shape[2]: + # num_node = matrices.shape[1] + # matrices = np.moveaxis(matrices, 0, -1) + # else: + # raise ValueError(f'Matrices of shape {matrices.shape}', + #'requires matrices of shape (subject x session) x node x node', + #'or node x node x (subject x session).') + # else: + # num_node = matrices.shape[0] + if diagonal == True: + k = 0 + if 
diagonal == False: + k = 1 + upper_tri = np.triu_indices(num_node, k=k) + + i = 0 + manager = enlighten.get_manager() + ticks = manager.counter(total=n_splits * n_iterations, desc="Progress", unit="folds") + for train_idx, test_idx in cv.split(edges, split_y): + + cv_results.at[i, "split"] = (train_idx, test_idx) + + # assert len(train_a_idx) == len(train_b_idx) + Cs = np.logspace(-4, 4, 10) + # print(len(np.unique(outcome))) + if np.unique(outcome).shape[0] == 2: + # print('binary') + regressor = LogisticRegressionCV( + Cs=Cs, + cv=4, + # verbose=2, + max_iter=100000, + penalty="l2", + solver="saga", + n_jobs=4, + ) + + else: + # print('continuous') + regressor = RidgeCV( + alphas=Cs, + cv=4, + # n_jobs=4 + ) + + train_y = outcome[train_idx] + test_y = outcome[test_idx] + + train_edges = edges[train_idx, :] + test_edges = edges[test_idx, :] + + if confounds is not None: + train_confounds = confounds.values[train_idx] + test_confounds = confounds.values[test_idx] + # print(train_edges.shape, train_confounds.shape, train_y.shape) + + # residualize the edges and outcome + if np.unique(outcome).shape[0] == 2: + train_edges = residualize(X=train_edges, confounds=train_confounds) + test_edges = residualize(X=test_edges, confounds=test_confounds) + elif np.unique(outcome).shape[0] > 3: + train_y, train_edges = residualize( + X=train_edges, y=train_y, confounds=train_confounds + ) + test_y, test_edges = residualize(X=test_edges, y=test_y, confounds=test_confounds) + else: + pass + if scale_x: + x_scaler = Normalizer() + train_edges = x_scaler.fit_transform(train_edges) + test_edges = x_scaler.transform(test_edges) + if scale_y: + if np.unique(outcome).shape[0] == 2: + pass + else: + y_scaler = Normalizer() + train_y = y_scaler.fit_transform(train_y.reshape(-1, 1)) + test_y = y_scaler.transform(test_y.reshape(-1, 1)) + + # perform NBS wooooooooo + # note: output is a dataframe :) + # PYNBS SHOULD NOT DO CONFOUND REGRESSION? 
+ adj = pynbs( + train_edges, train_y, num_node=num_node, diagonal=diagonal, alpha=alpha, predict=True + ) + # print(adj.shape, adj.ndim, adj[0].shape, upper_tri) + + # cv_results.at[i, 'pval'] = pval + cv_results.at[i, "component"] = adj.values + + # in the event of no edges significantly related to + # print(sum(sum(adj.values)), '\n', adj.values.shape) + if sum(sum(adj.values)) > 0: + # grab the values of the adjacency matrix that are just in the upper triangle + # so you don't have repeated edges + # returns (n_edges, ) + nbs_vector = adj.values[upper_tri] + # print(nbs_vector.shape) + # print(nbs_vector.shape) + # use those to make a "significant edges" mask + mask = nbs_vector == 1.0 + + # grab only the significant edges from testing and training sets of edges + # for use as features in the predictive models + # these are already residualized + # print(train_edges.shape) + # returns (n_edges, samples) + train_features = train_edges.T[mask] + test_features = test_edges.T[mask] + # print(mask.shape, np.sum(mask), train_edges.shape, train_features.shape) + + train_features = train_features.T + test_features = test_features.T + + # train_features = scaler.fit_transform(train_features.T) + # test_features = scaler.fit_transform(test_features.T) + # print(train_features.shape, train_y.shape) + + # print(f"train_edges:\t{train_edges[:10, 0]}\ntrain_features:\t{train_features[:10, 0]}") + # print(np.ravel(train_y)) + # train model predicting outcome from brain (note: no mas covariates) + # use grid search bc I want to know how to tune alpha and l1_ratio + + # grid = HalvingGridSearchCV(estimator=regressor, + # param_grid=param_grid, + # n_jobs=8, + # cv=4, + # factor=2, + # verbose=0, + # min_resources=20, + # refit=True, + # aggressive_elimination=False) + model = regressor.fit(X=train_features, y=np.ravel(train_y)) + + cv_results.at[i, "model"] = model + + # score that model on the testing data + # if logistic regression: score = mean accuracy + # if linear 
regression: score = coefficient of determination (R^2) + # both from 0 (low) to 1 (high) + + # can't use MSE, which is the default score for ridge + # because larger values = worse performance + # I go die now + if np.unique(outcome).shape[0] == 2: + score = model.score(X=test_features, y=np.ravel(test_y)) + + else: + predicted_y = model.predict(X=test_features) + score, p = spearmanr(predicted_y, np.ravel(test_y)) + # spearman = spearmanr(predicted_y, np.ravel(test_y)) + + cv_results.at[i, "score"] = score + if i % (n_splits * n_iterations / 10) == 0: + mean = cv_results["score"].mean() + sdev = cv_results["score"].std() + print( + f"Iteration {i} out of {n_splits * n_iterations}, average score:\t{mean:.2f} +/- {sdev:.2f}" + ) + # print(score) + + m = 0 + param_vector = np.zeros_like(nbs_vector) + for l in range(0, nbs_vector.shape[0]): + if nbs_vector[l] == 1.0: + ### + # NEEDS IF STATEMENT BC LOGISTIC AND LINEAR HAVE DIFFERENT COEF_ SHAPES + if np.unique(outcome).shape[0] == 2: + param_vector[l] = model.coef_[0, m] + else: + param_vector[l] = model.coef_[m] + m += 1 + else: + pass + X = undo_vectorize(param_vector, num_node=num_node, diagonal=diagonal) + # cv_results.at[i, "coefficient_matrix"] = X + # cv_results.at[i, "coefficient_vector"] = param_vector + i += 1 + else: + pass + ticks.update() + # calculate weighted average + # print(cv_results['score']) + weighted_stack = np.zeros((num_node, num_node)) + fake = np.zeros((num_node, num_node)) + # print(weighted_stack.shape) + for j in index: + # print(cv_results.at[j, 'score']) + weighted = cv_results.at[j, "component"] * cv_results.at[j, "score"] + + if np.sum(weighted) == 0 or np.isnan(np.sum(weighted)) == True: + weighted_stack = np.dstack([weighted_stack, fake]) + else: + weighted_stack = np.dstack([weighted_stack, weighted]) + + # print(weighted_stack.shape, weighted.shape) + weighted_average = np.mean(weighted_stack, axis=-1) + # model = cv_results.sort_values(by="score", 
ascending=False).iloc[0]["model"] + return ( + weighted_average, + cv_results, + ) # model diff --git a/idconn/networking.py b/idconn/networking.py new file mode 100644 index 0000000..c2ddf39 --- /dev/null +++ b/idconn/networking.py @@ -0,0 +1,327 @@ +import numpy as np +import pandas as pd +import seaborn as sns +import networkx as nx +import matplotlib.pyplot as plt +from os.path import join + +# from nilearn.connectome import ConnectivityMeasure +from scipy.sparse.csgraph import minimum_spanning_tree +from scipy.stats import skew +import bct + +# import datetime + + +def avg_corrmat(ppt_df): + """ + Reads in adjacency matrices from the pandas df with ppt info and adj, then computes an average. + """ + stacked_corrmats = np.array(ppt_df["adj"]) + print("Stacked corrmats have dimensions", stacked_corrmats.shape) + avg_corrmat = np.mean(stacked_corrmats, axis=0) + return avg_corrmat + + +def null_model(W, bin_swaps=5, wei_freq=0.1, seed=None): + def get_rng(seed): + if seed is None or seed == np.random: + return np.random.mtrand._rand + elif isinstance(seed, np.random.RandomState): + return seed + try: + rstate = np.random.RandomState(seed) + except ValueError: + rstate = np.random.RandomState(np.random.Random(seed).randint(0, 2**32 - 1)) + return rstate + + def randmio_und_signed(R, itr, seed=None): + rng = get_rng(seed) + R = R.copy() + n = len(R) + + itr *= int(n * (n - 1) / 2) + + max_attempts = int(np.round(n / 2)) + eff = 0 + + for it in range(int(itr)): + att = 0 + while att <= max_attempts: + a, b, c, d = pick_four_unique_nodes_quickly(n, rng) + + r0_ab = R[a, b] + r0_cd = R[c, d] + r0_ad = R[a, d] + r0_cb = R[c, b] + + # rewiring condition + if ( + np.sign(r0_ab) == np.sign(r0_cd) + and np.sign(r0_ad) == np.sign(r0_cb) + and np.sign(r0_ab) != np.sign(r0_ad) + ): + R[a, d] = R[d, a] = r0_ab + R[a, b] = R[b, a] = r0_ad + + R[c, b] = R[b, c] = r0_cd + R[c, d] = R[d, c] = r0_cb + + eff += 1 + break + + att += 1 + + return R, eff + + def 
pick_four_unique_nodes_quickly(n, seed=None): + """ + This is equivalent to np.random.choice(n, 4, replace=False) + Another fellow suggested np.random.random_sample(n).argpartition(4) which is + clever but still substantially slower. + """ + rng = get_rng(seed) + k = rng.randint(n**4) + a = k % n + b = k // n % n + c = k // n**2 % n + d = k // n**3 % n + if a != b and a != c and a != d and b != c and b != d and c != d: + return (a, b, c, d) + else: + # the probability of finding a wrong configuration is extremely low + # unless for extremely small n. if n is extremely small the + # computational demand is not a problem. + + # In my profiling it only took 0.4 seconds to include the uniqueness + # check in 1 million runs of this function so I think it is OK. + return pick_four_unique_nodes_quickly(n, rng) + + rng = get_rng(seed) + if not np.allclose(W, W.T): + print("Input must be undirected") + W = W.copy() + n = len(W) + np.fill_diagonal(W, 0) # clear diagonal + Ap = W > 0 # positive adjmat + An = W < 0 # negative adjmat + + if np.size(np.where(Ap.flat)) < (n * (n - 1)): + W_r, eff = randmio_und_signed(W, bin_swaps, seed=rng) + Ap_r = W_r > 0 + An_r = W_r < 0 + else: + Ap_r = Ap + An_r = An + + W0 = np.zeros((n, n)) + for s in (1, -1): + if s == 1: + Acur = Ap + A_rcur = Ap_r + else: + Acur = An + A_rcur = An_r + + S = np.sum(W * Acur, axis=0) # strengths + Wv = np.sort(W[np.where(np.triu(Acur))]) # sorted weights vector + i, j = np.where(np.triu(A_rcur)) + (Lij,) = np.where(np.triu(A_rcur).flat) # weights indices + + P = np.outer(S, S) + + if wei_freq == 0: # get indices of Lij that sort P + Oind = np.argsort(P.flat[Lij]) # assign corresponding sorted + W0.flat[Lij[Oind]] = s * Wv # weight at this index + else: + wsize = np.size(Wv) + wei_period = np.round(1 / wei_freq).astype(int) # convert frequency to period + lq = np.arange(wsize, 0, -wei_period, dtype=int) + for m in lq: # iteratively explore at this period + # get indices of Lij that sort P + Oind = 
np.argsort(P.flat[Lij]) + R = rng.permutation(m)[: np.min((m, wei_period))] + for q, r in enumerate(R): + # choose random index of sorted expected weight + o = Oind[r] + W0.flat[Lij[o]] = s * Wv[r] # assign corresponding weight + + # readjust expected weighted probability for i[o],j[o] + f = 1 - Wv[r] / S[i[o]] + P[i[o], :] *= f + P[:, i[o]] *= f + f = 1 - Wv[r] / S[j[o]] + P[j[o], :] *= f + P[:, j[o]] *= f + + # readjust strength of i[o] + S[i[o]] -= Wv[r] + # readjust strength of j[o] + S[j[o]] -= Wv[r] + + O = Oind[R] + # remove current indices from further consideration + Lij = np.delete(Lij, O) + i = np.delete(i, O) + j = np.delete(j, O) + Wv = np.delete(Wv, R) + + W0 = W0 + W0.T + return W0 + + +def generate_null(ppt_df, thresh_arr, measure, permutations=1000): + """ + Generate a distribution of graph measure values based on a null connectivity matrix + that is like the average connectivity matrix across participants. + + """ + null_dist = pd.DataFrame(index=range(0, permutations), columns=["mean", "sdev"]) + avg_corr = avg_corrmat(ppt_df) + eff_perm = [] + j = 0 + while j < permutations: + effs = [] + W = null_model(avg_corr.values) + for thresh in thresh_arr: + thresh_corr = bct.threshold_proportional(W, thresh) + leff = measure(thresh_corr) + effs.append(leff) + effs_arr = np.asarray(effs) + leff_auc = np.trapz(effs_arr, dx=0.03, axis=0) + eff_perm.append(leff_auc) + j += 1 + + return null_dist + + +def omst(matrix, density=True, plot=False): + """ + WARNING: THIS IS SLOW AF, REPLACING WITH NETWORKX VERSION IN NEAR FUTURE + """ + dims = matrix.shape + if matrix.ndim > 2: + raise ValueError( + "'matrix' should be a 2D array. 
" + "An array with %d dimension%s was passed" + % (matrix.ndim, "s" if matrix.ndim > 1 else "") + ) + else: + mst = minimum_spanning_tree(matrix) + mst_arr = mst.toarray().astype(float) + matrix_2 = np.where(mst_arr != 0, 0, matrix) + cost = np.sum(matrix_2) / np.sum(matrix) + Eg = bct.efficiency_wei(matrix_2) + trees = [mst_arr] + GCE = [Eg - cost] + Cost = [cost] + + while np.sum(matrix_2) > 1000: + # print(np.sum(matrix_2)) + mst = minimum_spanning_tree(matrix_2) + mst_arr = mst.toarray().astype(float) + matrix_2 = np.where(mst_arr != 0, 0, matrix_2) + cost = np.sum(matrix_2) / np.sum(matrix) + Eg = bct.efficiency_wei(matrix_2) + trees.append(mst_arr) + GCE.append(Eg - cost) + Cost.append(cost) + trees = np.asarray(trees) + max_value = max(GCE) + max_GCE = GCE.index(max_value) + thresholded = np.sum(trees[:max_GCE, :, :], axis=0) + if plot == True: + fig, ax = plt.subplots() + sns.lineplot(Cost, GCE, ax=ax, palette="husl") + plt.scatter(Cost[max_GCE], GCE[max_GCE], marker="x", edgecolors=None, c="magenta") + ax.set_ylabel("Global Cost Efficiency") + ax.set_xlabel("Cost") + + if density == True: + den = np.sum(thresholded != 0) / (dims[0] * dims[1]) + return thresholded, den + return thresholded, fig + + +def graph_auc(matrix, thresholds, measure, args): + """ + matrix : array + measure : function from bctpy + """ + from bct import measure, threshold_proportional + + metrics = [] + for p in np.arange(thresholds[0], thresholds[1], 0.01): + thresh = threshold_proportional(matrix, p, copy=True) + metric = measure(thresh, args) + metrics.append(metric) + auc = np.trapz(metrics, dx=0.01) + return auc + + +def graph_omst(matrix, measure, args): + from bct import measure + + # threshold using orthogonal minimum spanning tree + thresh_mat = omst(matrix) + + # calculate graph measure on thresholded matrix + metric = measure(thresh_mat, args) + return metric + + +def scale_free_tau(corrmat, skew_thresh, proportional=True): + """' + Calculates threshold at which network 
becomes scale-free, estimated from the skewness of the network's degree distribution. + Parameters + ---------- + corrmat : numpy.array + Correlation or other connectivity matrix from which tau_connected will be estimated. + Should be values between 0 and 1. + proportional : bool + Determines whether connectivity matrix is thresholded proportionally or absolutely. + Default is proportional as maintaining network density across participants is a priority + Returns + ------- + tau : float + Lowest value of tau (threshold) at which network is scale-free. + """ + tau = 0.01 + skewness = 1 + while abs(skewness) > 0.3: + if proportional: + w = bct.threshold_proportional(corrmat, tau) + else: + w = bct.threshold_absolute(corrmat, tau) + skewness = skew(bct.degrees_und(w)) + tau += 0.01 + return tau + + +def connected_tau(corrmat, proportional=True): + """ + Calculates threshold at which network becomes node-connected, using NetworkX's `is_connected` function. + Parameters + ---------- + corrmat : numpy.array + Correlation or other connectivity matrix from which tau_connected will be estimated. + Should be values between 0 and 1. + proportional : bool + Determines whether connectivity matrix is thresholded proportionally or absolutely. + Default is proportional as maintaining network density across participants is a priority + Returns + ------- + tau : float + Highest value of tau (threshold) at which network becomes node-connected. 
+ """ + tau = 0.01 + connected = False + while connected == False: + if proportional: + w = bct.threshold_proportional(corrmat, tau) + else: + w = bct.threshold_absolute(corrmat, tau) + w_nx = nx.convert_matrix.from_numpy_array(w) + connected = nx.algorithms.components.is_connected(w_nx) + tau += 0.01 + return tau diff --git a/idconn/networking/__init__.py b/idconn/networking/__init__.py deleted file mode 100644 index a4564bf..0000000 --- a/idconn/networking/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -Tools for computing network topology / graph theoretic measures -""" - -from . import null_distribution -from . import graph_theory - -__all__ = ["null_distribution", "graph_theory"] diff --git a/idconn/networking/graph_theory.py b/idconn/networking/graph_theory.py deleted file mode 100644 index 2713929..0000000 --- a/idconn/networking/graph_theory.py +++ /dev/null @@ -1,83 +0,0 @@ -import numpy as np -#import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt -from os.path import join -#from nilearn.connectome import ConnectivityMeasure -from scipy.sparse.csgraph import minimum_spanning_tree -import bct -#import datetime - -def omst(matrix, density=True, plot=False): - ''' - WARNING: THIS IS SLOW AF, REPLACING WITH NETWORKX VERSION IN NEAR FUTURE - ''' - dims = matrix.shape - if matrix.ndim > 2: - raise ValueError("'matrix' should be a 2D array. 
" - "An array with %d dimension%s was passed" - % (matrix.ndim, - "s" if matrix.ndim > 1 else "")) - else: - mst = minimum_spanning_tree(matrix) - mst_arr = mst.toarray().astype(float) - matrix_2 = np.where(mst_arr != 0, 0, matrix) - cost = np.sum(matrix_2) / np.sum(matrix) - Eg = bct.efficiency_wei(matrix_2) - trees = [mst_arr] - GCE = [Eg - cost] - Cost = [cost] - - while np.sum(matrix_2) > 1000: - #print(np.sum(matrix_2)) - mst = minimum_spanning_tree(matrix_2) - mst_arr = mst.toarray().astype(float) - matrix_2 = np.where(mst_arr != 0, 0, matrix_2) - cost = np.sum(matrix_2) / np.sum(matrix) - Eg = bct.efficiency_wei(matrix_2) - trees.append(mst_arr) - GCE.append(Eg - cost) - Cost.append(cost) - trees = np.asarray(trees) - max_value = max(GCE) - max_GCE = GCE.index(max_value) - thresholded = np.sum(trees[:max_GCE, :, :], axis=0) - if plot == True: - fig,ax = plt.subplots() - sns.lineplot(Cost, GCE, ax=ax, palette='husl') - plt.scatter(Cost[max_GCE], - GCE[max_GCE], - marker='x', - edgecolors=None, - c='magenta') - ax.set_ylabel('Global Cost Efficiency') - ax.set_xlabel('Cost') - - if density == True: - den = np.sum(thresholded != 0) / (dims[0] * dims[1]) - return thresholded, den - return thresholded, fig - -def graph_auc(matrix, thresholds, measure, args): - ''' - matrix : array - measure : function from bctpy - ''' - from bct import measure, threshold_proportional - - metrics = [] - for p in np.arange(thresholds[0], thresholds[1], 0.01): - thresh = threshold_proportional(matrix, p, copy=True) - metric = measure(thresh, args) - metrics.append(metric) - auc= np.trapz(metrics, dx=0.01) - return auc - -def graph_omst(matrix, measure, args): - from bct import measure - # threshold using orthogonal minimum spanning tree - thresh_mat = omst(matrix) - - # calculate graph measure on thresholded matrix - metric = measure(thresh_mat, args) - return metric \ No newline at end of file diff --git a/idconn/networking/null_distribution.py 
b/idconn/networking/null_distribution.py deleted file mode 100644 index 623f64f..0000000 --- a/idconn/networking/null_distribution.py +++ /dev/null @@ -1,221 +0,0 @@ -import numpy as np -import pandas as pd -from os.path import join, exists -import bct -import datetime - -def avg_corrmat(layout, task, session): - subjects = layout.get_subjects(task=task,session=session) - corrmats = {} - for subject in subjects: - try: - if task == "rest": - corrmat = np.genfromtxt( - join( - data_dir, - sesh[session], - subject, - "{0}-session-{1}-{2}_network_corrmat_{3}.csv".format( - subject, session, task, atlas - ), - ), - delimiter=",", - ) - else: - corrmat = np.genfromtxt( - join( - data_dir, - sesh[session], - subject, - "{0}-session-{1}_{2}-{3}_{4}-corrmat.csv".format( - subject, session, task, condition, atlas - ), - ), - delimiter=" ", - ) - # corrmat = np.genfromtxt(join(data_dir, '{0}-session-{1}_{2}-{3}_{4}-corrmat.csv'.format(subject, session, task, condition, atlas)), delimiter=' ') - corrmats[subject] = corrmat - except Exception as e: - print(subject, e) - data = list(corrmats.values()) - stacked_corrmats = np.array(data) - print('Stacked corrmats have dimensions', stacked_corrmats.shape) - avg_corrmat = np.mean(stacked_corrmats, axis=0) - return avg_corrmat - - -def null_model_und_sign(W, bin_swaps=5, wei_freq=0.1, seed=None): - def get_rng(seed): - if seed is None or seed == np.random: - return np.random.mtrand._rand - elif isinstance(seed, np.random.RandomState): - return seed - try: - rstate = np.random.RandomState(seed) - except ValueError: - rstate = np.random.RandomState(random.Random(seed).randint(0, 2 ** 32 - 1)) - return rstate - - def randmio_und_signed(R, itr, seed=None): - rng = get_rng(seed) - R = R.copy() - n = len(R) - - itr *= int(n * (n - 1) / 2) - - max_attempts = int(np.round(n / 2)) - eff = 0 - - for it in range(int(itr)): - att = 0 - while att <= max_attempts: - - a, b, c, d = pick_four_unique_nodes_quickly(n, rng) - - r0_ab = R[a, b] - 
r0_cd = R[c, d] - r0_ad = R[a, d] - r0_cb = R[c, b] - - # rewiring condition - if ( - np.sign(r0_ab) == np.sign(r0_cd) - and np.sign(r0_ad) == np.sign(r0_cb) - and np.sign(r0_ab) != np.sign(r0_ad) - ): - - R[a, d] = R[d, a] = r0_ab - R[a, b] = R[b, a] = r0_ad - - R[c, b] = R[b, c] = r0_cd - R[c, d] = R[d, c] = r0_cb - - eff += 1 - break - - att += 1 - - return R, eff - - def pick_four_unique_nodes_quickly(n, seed=None): - """ - This is equivalent to np.random.choice(n, 4, replace=False) - Another fellow suggested np.random.random_sample(n).argpartition(4) which is - clever but still substantially slower. - """ - rng = get_rng(seed) - k = rng.randint(n ** 4) - a = k % n - b = k // n % n - c = k // n ** 2 % n - d = k // n ** 3 % n - if a != b and a != c and a != d and b != c and b != d and c != d: - return (a, b, c, d) - else: - # the probability of finding a wrong configuration is extremely low - # unless for extremely small n. if n is extremely small the - # computational demand is not a problem. - - # In my profiling it only took 0.4 seconds to include the uniqueness - # check in 1 million runs of this function so I think it is OK. 
- return pick_four_unique_nodes_quickly(n, rng) - - rng = get_rng(seed) - if not np.allclose(W, W.T): - print("Input must be undirected") - W = W.copy() - n = len(W) - np.fill_diagonal(W, 0) # clear diagonal - Ap = W > 0 # positive adjmat - An = W < 0 # negative adjmat - - if np.size(np.where(Ap.flat)) < (n * (n - 1)): - W_r, eff = randmio_und_signed(W, bin_swaps, seed=rng) - Ap_r = W_r > 0 - An_r = W_r < 0 - else: - Ap_r = Ap - An_r = An - - W0 = np.zeros((n, n)) - for s in (1, -1): - if s == 1: - Acur = Ap - A_rcur = Ap_r - else: - Acur = An - A_rcur = An_r - - S = np.sum(W * Acur, axis=0) # strengths - Wv = np.sort(W[np.where(np.triu(Acur))]) # sorted weights vector - i, j = np.where(np.triu(A_rcur)) - (Lij,) = np.where(np.triu(A_rcur).flat) # weights indices - - P = np.outer(S, S) - - if wei_freq == 0: # get indices of Lij that sort P - Oind = np.argsort(P.flat[Lij]) # assign corresponding sorted - W0.flat[Lij[Oind]] = s * Wv # weight at this index - else: - wsize = np.size(Wv) - wei_period = np.round(1 / wei_freq).astype( - int - ) # convert frequency to period - lq = np.arange(wsize, 0, -wei_period, dtype=int) - for m in lq: # iteratively explore at this period - # get indices of Lij that sort P - Oind = np.argsort(P.flat[Lij]) - R = rng.permutation(m)[: np.min((m, wei_period))] - for q, r in enumerate(R): - # choose random index of sorted expected weight - o = Oind[r] - W0.flat[Lij[o]] = s * Wv[r] # assign corresponding weight - - # readjust expected weighted probability for i[o],j[o] - f = 1 - Wv[r] / S[i[o]] - P[i[o], :] *= f - P[:, i[o]] *= f - f = 1 - Wv[r] / S[j[o]] - P[j[o], :] *= f - P[:, j[o]] *= f - - # readjust strength of i[o] - S[i[o]] -= Wv[r] - # readjust strength of j[o] - S[j[o]] -= Wv[r] - - O = Oind[R] - # remove current indices from further consideration - Lij = np.delete(Lij, O) - i = np.delete(i, O) - j = np.delete(j, O) - Wv = np.delete(Wv, R) - - W0 = W0 + W0.T - return W0 - -def generate_null(layout, task, session, mask): - null_dist 
= pd.DataFrame(index=subjects, columns=["mean", "sdev"]) - avg_corr = avg_corrmat( - layout, task, session, mask - ) - eff_perm = [] - j = 1 - while j < 3: - effs = [] - W = null_model_und_sign(avg_corr.values) - for thresh in np.arange(0.21, 0.31, 0.03): - thresh_corr = bct.threshold_proportional(W, thresh) - leff = bct.efficiency_wei(thresh_corr) - effs.append(leff) - effs_arr = np.asarray(effs) - leff_auc = np.trapz(effs_arr, dx=0.03, axis=0) - eff_perm.append(leff_auc) - j += 1 - null_dist.at[(sesh[session], task, conds[i], mask), "mean"] = np.mean( - eff_perm - ) - null_dist.at[(sesh[session], task, conds[i], mask), "sdev"] = np.std( - eff_perm - ) - return null_dist \ No newline at end of file diff --git a/idconn/parser_utils.py b/idconn/parser_utils.py index 792123e..5872ec8 100644 --- a/idconn/parser_utils.py +++ b/idconn/parser_utils.py @@ -5,7 +5,7 @@ def is_valid_file(parser, arg): """Check if argument is existing folder.""" if not op.isfile(arg) and arg is not None: - parser.error(f'The file {arg} does not exist!') + parser.error(f"The file {arg} does not exist!") return arg @@ -13,6 +13,6 @@ def is_valid_file(parser, arg): def is_valid_path(parser, arg): """Check if argument is existing folder.""" if not op.isdir(arg) and arg is not None: - parser.error(f'The folder {arg} does not exist!') + parser.error(f"The folder {arg} does not exist!") return arg diff --git a/idconn/pipeline.py b/idconn/pipeline.py index 667870f..08b00bb 100644 --- a/idconn/pipeline.py +++ b/idconn/pipeline.py @@ -13,122 +13,191 @@ Please scroll to bottom to read full license. 
""" import warnings -warnings.filterwarnings('ignore') -#import numpy as np + +warnings.filterwarnings("ignore") +# import numpy as np import pandas as pd import bids import argparse -#import logging -#from os import makedirs + +# import logging +# from os import makedirs from os.path import exists -#from glob import glob -#from nilearn import input_data, connectome, plotting, image -from idconn.connectivity import build_networks + +# from glob import glob +# from nilearn import input_data, connectome, plotting, image +from idconn.connectivity import rest_connectivity, task_connectivity from idconn.parser_utils import is_valid_file, is_valid_path -#from idconn.networking import graph_theory, null_distribution +# from idconn.networking import graph_theory, null_distribution -#LGR = logging.getLogger(__name__) -#LGR.setLevel(logging.INFO) +# LGR = logging.getLogger(__name__) +# LGR.setLevel(logging.INFO) def _get_parser(): - parser = argparse.ArgumentParser(description='Make correlation matrices from BOLD data + mask.') + parser = argparse.ArgumentParser( + description="Make correlation matrices from BOLD data + mask." 
+ ) parser.add_argument( - 'dset_dir', + "dset_dir", type=lambda x: is_valid_path(parser, x), - help='Path to BIDS dataset containing fmriprep derivatives folder.', + help="Path to BIDS dataset containing fmriprep derivatives folder.", ) parser.add_argument( - 'atlas', + "atlas", type=lambda x: is_valid_file(parser, x), - help='Path to atlas file in space specified by `space`.', + help="Path to atlas file in space specified by `space`.", ) - parser.add_argument('task', type=str, - help='Task to be analyzed.') + parser.add_argument("task", type=str, help="Task to be analyzed.") parser.add_argument( - '--space', + "--space", type=str, - help='Space in which to run analyses (must be the space `atlas` is in).', + help="Space in which to run analyses (must be the space `atlas` is in).", default="MNI152NLin2009cAsym", ) parser.add_argument( - '--conn', - action='store', - choices=['covariance', 'correlation', 'partial correlation', 'tangent', 'precision'], - help='Metric used to calculate connectivity.', - default='correlation', + "--conn", + action="store", + choices=["covariance", "correlation", "partial correlation", "tangent", "precision"], + help="Metric used to calculate connectivity.", + default="correlation", ) parser.add_argument( - '--bids_db', + "--bids_db", metavar="PATH", type=lambda x: is_valid_path(parser, x), - help='Path to saved BIDS dataset layout file.', + help="Path to saved BIDS dataset layout file.", ) parser.add_argument( - '--confounds', + "--confounds", nargs="+", type=str, - help='Names of confound regressors from ', + help="Names of confound regressors from ", default=None, ) return parser -def idconn_workflow(dset_dir, atlas, task, out_dir, space="MNI152NLin2009cAsym", conn=None, bids_db=None, confounds=None): - print('Getting started!') +def idconn_workflow( + dset_dir, + atlas, + task, + out_dir, + space="MNI152NLin2009cAsym", + conn=None, + bids_db=None, + confounds=None, +): + print("Getting started!") if not confounds: confounds = [ - 
"cosine00", "cosine01", "cosine02", - "trans_x", "trans_x_derivative1", "trans_x_power2", "trans_x_derivative1_power2", - "trans_y", "trans_y_derivative1", "trans_y_derivative1_power2", "trans_y_power2", - "trans_z", "trans_z_derivative1", "trans_z_power2", "trans_z_derivative1_power2", - "rot_x", "rot_x_derivative1", "rot_x_power2", "rot_x_derivative1_power2", - "rot_y", "rot_y_derivative1", "rot_y_power2", "rot_y_derivative1_power2", - "rot_z", "rot_z_derivative1", "rot_z_derivative1_power2", "rot_z_power2", - "a_comp_cor_00", "a_comp_cor_01", "a_comp_cor_02", "a_comp_cor_03", "a_comp_cor_04", "a_comp_cor_05", "a_comp_cor_06" + "cosine00", + "cosine01", + "cosine02", + "trans_x", + "trans_x_derivative1", + "trans_x_power2", + "trans_x_derivative1_power2", + "trans_y", + "trans_y_derivative1", + "trans_y_derivative1_power2", + "trans_y_power2", + "trans_z", + "trans_z_derivative1", + "trans_z_power2", + "trans_z_derivative1_power2", + "rot_x", + "rot_x_derivative1", + "rot_x_power2", + "rot_x_derivative1_power2", + "rot_y", + "rot_y_derivative1", + "rot_y_power2", + "rot_y_derivative1_power2", + "rot_z", + "rot_z_derivative1", + "rot_z_derivative1_power2", + "rot_z_power2", + "a_comp_cor_00", + "a_comp_cor_01", + "a_comp_cor_02", + "a_comp_cor_03", + "a_comp_cor_04", + "a_comp_cor_05", + "a_comp_cor_06", ] print(f"Atlas: {atlas}\nConnectivity measure: {conn}") - assert exists(dset_dir), f"Specified dataset doesn't exist:\n{dset_dir} not found.\n\nPlease check the filepath." + assert exists( + dset_dir + ), f"Specified dataset doesn't exist:\n{dset_dir} not found.\n\nPlease check the filepath." 
 layout = bids.BIDSLayout(dset_dir, derivatives=True, database_path=bids_db) - subjects = layout.get(return_type='id', target='subject', suffix='bold') + subjects = layout.get(return_type="id", target="subject", suffix="bold") print(f"Subjects: {subjects}") - #runs = layout.get(return_type='id', target='session', suffix='bold') - preproc_subjects = layout.get(return_type='id', target='subject', task=task, space=space, desc='preproc', suffix='bold') + # runs = layout.get(return_type='id', target='session', suffix='bold') + preproc_subjects = layout.get( + return_type="id", target="subject", task=task, space=space, desc="preproc", suffix="bold" + ) if len(subjects) != len(preproc_subjects): - print(f'{len(subjects)} subjects found in dset, only {len(preproc_subjects)} have preprocessed BOLD data. Pipeline is contniuing anyway, please double check preprocessed data if this doesn\'t seem right.') + print( + f"{len(subjects)} subjects found in dset, only {len(preproc_subjects)} have preprocessed BOLD data. Pipeline is continuing anyway, please double check preprocessed data if this doesn't seem right."
+ ) - example_events = layout.get(return_type='filename', suffix='events', task=task, subject=preproc_subjects[0]) - events_df = pd.read_csv(example_events[0], header=0, index_col=0, sep='\t') - conditions = events_df['trial_type'].unique() + example_events = layout.get( + return_type="filename", suffix="events", task=task, subject=preproc_subjects[0] + ) + events_df = pd.read_csv(example_events[0], header=0, index_col=0, sep="\t") + conditions = events_df["trial_type"].unique() print(f"Computing connectivity matrices using {atlas}") for subject in preproc_subjects: print(f"Subject {subject}") - sessions = layout.get(return_type='id', target='session', task=task, subject=subject, suffix='bold') + sessions = layout.get( + return_type="id", target="session", task=task, subject=subject, suffix="bold" + ) print(f"Sessions with task-{task} found for {subject}: {sessions}") for session in sessions: print(f"Session {session}") - print(f"here are the inputs: {layout, subject, session, task, atlas, conn, space, confounds}") - if 'rest' in task: + print( + f"here are the inputs: {layout, subject, session, task, atlas, conn, space, confounds}" + ) + if "rest" in task: try: - adj_matrix = build_networks.connectivity(layout, subject, session, task, atlas, conn, space, confounds) + adj_matrix = rest_connectivity( + layout, subject, session, task, atlas, conn, space, confounds + ) except Exception as e: - print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}') + print( + f"Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}" + ) if len(conditions) < 1: try: - adj_matrix = build_networks.connectivity(layout, subject, session, task, atlas, conn, space, confounds) + adj_matrix = rest_connectivity( + layout, subject, session, task, atlas, conn, space, confounds + ) except Exception as e: - print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}') + print( + f"Error building corrmat for sub-{subject}, 
ses-{session}, task-{task}: {e}" + ) else: try: - adj_matrix = build_networks.task_connectivity(layout=layout, subject=subject, session=session, task=task, atlas=atlas, confounds=confounds, connectivity_metric=conn) + adj_matrix = task_connectivity( + layout=layout, + subject=subject, + session=session, + task=task, + atlas=atlas, + confounds=confounds, + connectivity_metric=conn, + ) except Exception as e: - print(f'Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}') + print( + f"Error building corrmat for sub-{subject}, ses-{session}, task-{task}: {e}" + ) def _main(argv=None): @@ -138,7 +207,7 @@ def _main(argv=None): idconn_workflow(**vars(options)) -if __name__ == '__main__': +if __name__ == "__main__": _main() """ diff --git a/idconn/tests/test_pipeline.py b/idconn/tests/test_pipeline.py index 6d78fae..8322e7f 100644 --- a/idconn/tests/test_pipeline.py +++ b/idconn/tests/test_pipeline.py @@ -2,6 +2,9 @@ def test_idconn_workflow_smoke(): + ''' + this is a docstring bc my tests kept failing and it was annoying + ''' from idconn.pipeline import idconn_workflow # Check that it's a function ¯\_(ツ)_/¯ diff --git a/idconn/workflows/nbs_predict-bc.py b/idconn/workflows/nbs_predict-bc.py new file mode 100644 index 0000000..ec3c559 --- /dev/null +++ b/idconn/workflows/nbs_predict-bc.py @@ -0,0 +1,387 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import 
ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "bc" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +train_layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(train_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) +groups = dat["bc"] + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +# load in test data +test_layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(test_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # 
regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... 
+# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(nbs_vector.shape, filter.shape) +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? 
+ +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge(solver="saga", alpha=best.alpha_) + train_metrics["alpha"] = best.alpha_ + # train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict + +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + ax=ax, + palette=dark_cmap, +) +# ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample prediction score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +train_metrics["in_sample_mse"] = mse +# print(np.mean(train_features)) +with 
open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + + +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + corr = spearmanr(test_outcome, y_pred) + print(f"\nSpearman correlation between 
predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-bc_sensitivity.py b/idconn/workflows/nbs_predict-bc_sensitivity.py new file mode 100644 index 0000000..813cf66 --- /dev/null +++ b/idconn/workflows/nbs_predict-bc_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "bc" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index +keep = 
dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... 
+# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for 
now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = 
mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + 
(len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2.py b/idconn/workflows/nbs_predict-e2.py new file mode 100644 index 0000000..a846b5a --- /dev/null +++ b/idconn/workflows/nbs_predict-e2.py @@ -0,0 +1,411 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, 
derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=3, n_iterations=3 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use 
_all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to 
get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + 
dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = 
np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2_sensitivity.py b/idconn/workflows/nbs_predict-e2_sensitivity.py new file mode 100644 index 0000000..13177c7 --- /dev/null +++ b/idconn/workflows/nbs_predict-e2_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = 
bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most 
predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome 
= y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) 
+fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = 
test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2bc_sensitivity.py b/idconn/workflows/nbs_predict-e2bc_sensitivity.py new file mode 100644 index 0000000..8052164 --- /dev/null +++ b/idconn/workflows/nbs_predict-e2bc_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol" +CONFOUNDS = ["framewise_displacement", "bc"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + 
+layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True)
+
+dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index
+keep = dat.drop(index=drop)["adj"].dropna().index  # sensitivity analysis: exclude cycle days 12-16 (`drop` was previously computed but never applied)
+dat = dat.loc[keep]
+
+groups = dat["bc"]
+# print(dat['adj'].values.shape)
+num_node = dat.iloc[0]["adj"].shape[0]
+
+matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node))
+upper_tri = np.triu_indices(num_node, k=1)
+
+outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1))
+
+# print(len(np.unique(outcome)))
+
+if CONFOUNDS is not None:
+    confounds = dat[CONFOUNDS]
+    base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}"
+else:
+    confounds = None
+    base_name = f"nbs-predict_outcome-{OUTCOME}"
+# print(dat['bc'])
+
+weighted_average, cv_results = nbs.kfold_nbs(
+    matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500
+)
+
+fig, fig2, nimg = io.plot_edges(
+    weighted_average,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Precision-Weighted Average",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}")
+)
+
+
+avg_df = pd.DataFrame(
+    weighted_average,
+    index=range(0, weighted_average.shape[0]),
+    columns=range(0, weighted_average.shape[1]),
+)
+
+cv_results.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t"
+)
+avg_df.to_csv(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t"
+)
+
+best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"]
+
+# this uses the 
most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + 
train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) 
+ax.legend(bbox_to_anchor=(1.0, 0.5))
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure?
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
+coeff_vec = np.zeros_like(filter, dtype=float)  # dtype=float: zeros_like(filter) inherits bool and would truncate coefficients to 0/1
+j = 0
+for i in range(0, filter.shape[0]):
+    if filter[i] == True:
+        # print(j)
+        # print(fitted.coef_[0, j])
+        coeff_vec[i] = fitted.coef_[0, j]
+        j += 1
+    else:
+        pass
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+
+keep = test_df[[OUTCOME, "adj"]].dropna().index
+# print(keep)
+
+test_df = 
test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2xp4-bc.py b/idconn/workflows/nbs_predict-e2xp4-bc.py new file mode 100644 index 0000000..4b32a85 --- /dev/null +++ b/idconn/workflows/nbs_predict-e2xp4-bc.py @@ -0,0 +1,414 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol÷progesterone" +CONFOUNDS = ["framewise_displacement", "bc"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = 
bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +dat["estradiol÷progesterone"] = dat["estradiol"] / dat["progesterone"] + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most 
predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome 
= y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) 
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"),
+    dpi=400,
+    bbox_inches="tight",
+)
+
+mse = mean_squared_error(train_outcome, y_pred)
+train_metrics["mean squared error"] = mse
+print("In-sample train score: ", train_metrics["in_sample_train"])
+print("In-sample test score: ", train_metrics["in_sample_test"])
+print("In-sample mean squared error: ", mse)
+# print(np.mean(train_features))
+with open(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w"
+) as fp:
+    json.dump(train_metrics, fp)
+
+# yoink the coefficients? for a more parsimonious figure?
+# print(fitted.coef_.shape)
+# print(fitted.coef_)
+coeff_vec = np.zeros_like(filter, dtype=float)  # dtype=float: zeros_like(filter) inherits bool and would truncate coefficients to 0/1
+j = 0
+for i in range(0, filter.shape[0]):
+    if filter[i] == True:
+        # print(j)
+        # print(fitted.coef_[0, j])
+        coeff_vec[i] = fitted.coef_[0, j]
+        j += 1
+    else:
+        pass
+
+# print(coeff_vec)
+print(coeff_vec)
+coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node)
+
+coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index)
+coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv"))
+
+fig, fig2, nimg = io.plot_edges(
+    coef_mat,
+    atlas_fname,
+    threshold="computed",
+    title=f"{OUTCOME} Coefficients",
+    strength=True,
+    cmap="seismic",
+    node_size="strength",
+)
+
+fig.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400
+)
+fig2.savefig(
+    join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"),
+    dpi=400,
+)
+nib.save(
+    nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}")
+)
+
+
+layout = bids.BIDSLayout(TEST_DSET, derivatives=True)
+
+test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False)
+test_df["estradiol÷progesterone"] = test_df["estradiol"] / test_df["progesterone"]
+
+keep = test_df[[OUTCOME, 
"adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-e2xp4.py b/idconn/workflows/nbs_predict-e2xp4.py new file mode 100644 index 0000000..fcd6f40 --- /dev/null +++ b/idconn/workflows/nbs_predict-e2xp4.py @@ -0,0 +1,414 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "estradiol÷progesterone" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = 
bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +dat["estradiol÷progesterone"] = dat["estradiol"] / dat["progesterone"] + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most 
predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome 
= y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) 
+fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) +test_df["estradiol÷progesterone"] = test_df["estradiol"] / test_df["progesterone"] + +keep = test_df[[OUTCOME, 
"adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-p4.py b/idconn/workflows/nbs_predict-p4.py new file mode 100644 index 0000000..2251179 --- /dev/null +++ b/idconn/workflows/nbs_predict-p4.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "progesterone" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = bids.BIDSLayout(TRAIN_DSET, 
derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or 
use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset 
to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + 
dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) + +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, 
num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-p4_sensitivity.py b/idconn/workflows/nbs_predict-p4_sensitivity.py new file mode 100644 index 0000000..449db27 --- /dev/null +++ b/idconn/workflows/nbs_predict-p4_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "progesterone" +CONFOUNDS = ["framewise_displacement"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +layout = 
bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most 
predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome 
= y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) +ax.legend(bbox_to_anchor=(1.0, 0.5)) 
+fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = 
test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict-p4bc_sensitivity.py b/idconn/workflows/nbs_predict-p4bc_sensitivity.py new file mode 100644 index 0000000..8cf6026 --- /dev/null +++ b/idconn/workflows/nbs_predict-p4bc_sensitivity.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io + +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "/Users/katherine.b/Dropbox/Data/ds002674" +TEST_DSET = "/Users/katherine.b/Dropbox/Data/diva-dset" +DERIV_NAME = "IDConn" +OUTCOME = "progesterone" +CONFOUNDS = ["framewise_displacement", "bc"] +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.01 +atlas_fname = "/Users/katherine.b/Dropbox/HPC-Backup-083019/physics-retrieval/craddock2012_tcorr05_2level_270_2mm.nii.gz" + 
+ +layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +drop = dat[dat["cycle_day"].between(11, 17, inclusive="neither")].index +keep = dat["adj"].dropna().index +dat = dat.loc[keep] + +groups = dat["bc"] +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) + +# print(len(np.unique(outcome))) + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=500 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses 
the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... +# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# print(np.sum(weighted_average)) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(np.sum(filter)) +# print(nbs_vector.shape, filter.shape) + +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + 
train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge( + solver="auto", + alpha=best.alpha_, + fit_intercept=False, + ) + train_metrics["alpha"] = best.alpha_ + +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +# train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict +# fitted = model.fit(X=train_features, y=np.ravel(train_outcome)) +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) + +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled", "bc", "cycle_day"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_pred", style="bc", data=Ys, ax=ax, palette=dark_cmap +) +h = sns.scatterplot( + x="cycle_day", y=f"{OUTCOME}_scaled", style="bc", data=Ys, ax=ax, palette=light_cmap +) 
+ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample train score: ", train_metrics["in_sample_train"]) +print("In-sample test score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +# print(np.mean(train_features)) +with open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +# print(fitted.coef_.shape) +# print(fitted.coef_) +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + # print(fitted.coef_[0, j]) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + +# print(coeff_vec) +print(coeff_vec) +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) + +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = 
test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} + +# cross_validate(model, ) +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred", "cycle_day", "bc"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_pred", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +h = sns.scatterplot( + x="cycle_day", + y=f"{OUTCOME}_scaled", + style="bc", + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="dark_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, 
f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + + print(f"\nSpearman correlation between predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/idconn/workflows/nbs_predict.py b/idconn/workflows/nbs_predict.py new file mode 100644 index 0000000..169a5aa --- /dev/null +++ b/idconn/workflows/nbs_predict.py @@ -0,0 +1,387 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np +import nibabel as nib +import seaborn as sns +import bids +import matplotlib.pyplot as plt +from os.path import join +from datetime import datetime +from time import strftime +from scipy.stats import spearmanr +from idconn import nbs, io +from bct import threshold_proportional + + +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold, cross_validate +from sklearn.preprocessing import Normalizer, StandardScaler +from sklearn.metrics import mean_squared_error +from matplotlib.colors import ListedColormap +import matplotlib as mpl + + +import warnings +import json + +warnings.simplefilter("ignore") + +today = datetime.today() +today_str = strftime("%m_%d_%Y") + +TRAIN_DSET = "" +TEST_DSET = "" +DERIV_NAME = "IDConn" +OUTCOME = "" +CONFOUNDS = "framewise_displacement" +TASK = "rest" +ATLAS = "craddock2012" +THRESH = 0.5 +alpha = 0.05 +atlas_fname = "craddock2012_tcorr05_2level_270_2mm.nii.gz" + + +train_layout = bids.BIDSLayout(TRAIN_DSET, derivatives=True) + +dat = io.read_corrmats(train_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = dat["adj"].dropna().index +dat = 
dat.loc[keep] + +# print(dat['adj'].values.shape) +num_node = dat.iloc[0]["adj"].shape[0] + +matrices = np.vstack(dat["adj"].values).reshape((len(keep), num_node, num_node)) +upper_tri = np.triu_indices(num_node, k=1) + +outcome = np.reshape(dat[OUTCOME].values, (len(dat[OUTCOME]), 1)) +groups = dat["bc"] + +if CONFOUNDS is not None: + confounds = dat[CONFOUNDS] + base_name = f"nbs-predict_outcome-{OUTCOME}_confounds-{CONFOUNDS}" +else: + confounds = None + base_name = f"nbs-predict_outcome-{OUTCOME}" +# print(dat['bc']) + +# load in test data +test_layout = bids.BIDSLayout(TEST_DSET, derivatives=True) + +test_df = io.read_corrmats(test_layout, task=TASK, deriv_name="IDConn", atlas=ATLAS, z_score=False) + +keep = test_df[[OUTCOME, "adj"]].dropna().index +# print(keep) + +test_df = test_df.loc[keep] + +outcome_test = test_df[OUTCOME].values +# print(test_df) + +# print(outcome_test) +matrices_test = np.vstack(test_df["adj"].dropna().values).reshape( + (len(test_df["adj"].dropna().index), num_node, num_node) +) +edges_test = np.vstack(test_df["edge_vector"].dropna().values)[:, filter] + + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_test = test_df[CONFOUNDS].values + + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_test)) <= 2: + resid_edges = nbs.residualize(X=edges_test, confounds=confounds_test) + test_outcome = outcome_test + elif len(np.unique(outcome_test)) > 3: + test_outcome, resid_edges = nbs.residualize( + X=edges_test, y=outcome_test, confounds=confounds_test + ) + test_features = resid_edges +else: + test_features = edges_test + test_outcome = outcome_test + + +weighted_average, cv_results = nbs.kfold_nbs( + matrices, outcome, confounds, alpha, groups=groups, n_splits=5, n_iterations=1000 +) + +fig, fig2, nimg = io.plot_edges( + weighted_average, + atlas_fname, + 
threshold="computed", + title=f"{OUTCOME} Precision-Weighted Average", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-strength-{today_str}") +) + + +avg_df = pd.DataFrame( + weighted_average, + index=range(0, weighted_average.shape[0]), + columns=range(0, weighted_average.shape[1]), +) + +cv_results.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_models-{today_str}.tsv"), sep="\t" +) +avg_df.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_weighted-{today_str}.tsv"), sep="\t" +) + +best = cv_results.sort_values(by="score", ascending=False).iloc[0]["model"] + +# this uses the most predictive subnetwork as features in the model +# might replace with thresholded weighted_average +# or use _all_ the edges in weighted_average with KRR or ElasticNet... +# ORRR use thresholded weighted average edges with ElasticNet... +# - stays true to NBS-Predict +# - increases parsimony while handling multicollinearity... 
+# either way, I don't think cv_results is necessary + +# here is where we'd threshold the weighted average to use for elastic-net +weighted_average = np.where(weighted_average > 0, weighted_average, 0) +# nbs_vector = weighted_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +# filter = np.where(nbs_vector >= p75, True, False) +# print(nbs_vector.shape, filter.shape) +thresh_average = threshold_proportional(weighted_average, THRESH) +nbs_vector2 = thresh_average[upper_tri] +# p75 = np.percentile(nbs_vector, 75) +filter = np.where(nbs_vector2 > 0, True, False) + +# mask = io.vectorize_corrmats(filter) +edges_train = np.vstack(dat["edge_vector"].dropna().values)[:, filter] + +# NEED TO RESIDUALIZE IF CONFOUNDS IS NOT NONE +if CONFOUNDS is not None: + confounds_train = dat[CONFOUNDS].values + outcome_train = np.reshape(outcome, (outcome.shape[0],)) + # regress out the confounds from each edge and the outcome variable, + # use the residuals for the rest of the algorithm + # print(confounds.shape, outcome.shape) + if len(np.unique(outcome_train)) <= 2: + resid_edges = nbs.residualize(X=edges_train, confounds=confounds_train) + train_outcome = outcome + elif len(np.unique(outcome_train)) > 3: + train_outcome, resid_edges = nbs.residualize( + X=edges_train, y=outcome_train, confounds=confounds_train + ) + train_features = resid_edges +else: + train_features = edges_train + train_outcome = outcome + +x_scaler = StandardScaler() +y_scaler = StandardScaler() +train_features = x_scaler.fit_transform(train_features) +if len(np.unique(train_outcome)) <= 2: + pass +else: + train_outcome = y_scaler.fit_transform(train_outcome.reshape(-1, 1)) + +# run the model on the whole test dataset to get params + +# classification if the outcome is binary (for now) +# could be extended to the multiclass case? 
+ +cv = RepeatedKFold(n_splits=5, n_repeats=10) + +train_metrics = {} +if len(np.unique(outcome)) == 2: + model = LogisticRegression(penalty="l2", solver="saga", C=best.C_[0]) + train_metrics["alpha"] = best.C_[0] + # train_metrics["l1_ratio"] = best.l1_ratio_ +else: + model = Ridge(solver="saga", alpha=best.alpha_) + train_metrics["alpha"] = best.alpha_ + # train_metrics["l1_ratio"] = best.l1_ratio_ +# print(params) +# model.set_params(**params) +# train ElasticNet on full train dataset, using feature extraction from NBS-Predict + +scores = cross_validate( + model, + train_features, + train_outcome, + groups=groups, + cv=cv, + return_estimator=True, + return_train_score=True, +) +train_metrics["in_sample_test"] = np.mean(scores["test_score"]) +train_metrics["in_sample_train"] = np.mean(scores["train_score"]) + +fitted = scores["estimator"][0] +y_pred = fitted.predict(X=train_features) +train_metrics["true_v_pred_corr"] = spearmanr(y_pred, train_outcome) +dat[f"{OUTCOME}_pred"] = y_pred +dat[f"{OUTCOME}_scaled"] = train_outcome + +Ys = dat[[f"{OUTCOME}_pred", f"{OUTCOME}_scaled"]] +Ys.to_csv( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +train_colors = ["#a08ad1", "#685690", "#3f2d69"] # light # medium # dark +light_cmap = sns.color_palette("dark:#a08ad1") +dark_cmap = sns.color_palette("dark:#685690") + +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + ax=ax, + palette=dark_cmap, +) +# ax.legend(bbox_to_anchor=(1.0, 0.5)) +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + +mse = mean_squared_error(train_outcome, y_pred) +train_metrics["mean squared error"] = mse +print("In-sample prediction score: ", train_metrics["in_sample_test"]) +print("In-sample mean squared error: ", mse) +train_metrics["in_sample_mse"] = mse +# print(np.mean(train_features)) +with 
open( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(train_metrics, fp) + +# yoink the coefficients? for a more parsimonious figure? +coeff_vec = np.zeros_like(filter) +j = 0 +for i in range(0, filter.shape[0]): + if filter[i] == True: + # print(j) + coeff_vec[i] = fitted.coef_[0, j] + j += 1 + else: + pass + + +coef_mat = io.undo_vectorize(coeff_vec, num_node=num_node) +coef_df = pd.DataFrame(coef_mat, columns=avg_df.columns, index=avg_df.index) +coef_df.to_csv(join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.csv")) + +fig, fig2, nimg = io.plot_edges( + coef_mat, + atlas_fname, + threshold="computed", + title=f"{OUTCOME} Coefficients", + strength=True, + cmap="seismic", + node_size="strength", +) + +fig.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-{today_str}.png"), dpi=400 +) +fig2.savefig( + join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}.png"), + dpi=400, +) +nib.save( + nimg, join(TRAIN_DSET, "derivatives", DERIV_NAME, f"{base_name}_betas-strength-{today_str}") +) + + +# scale after residualizing omg +test_features = x_scaler.transform(test_features) +if len(np.unique(test_outcome)) <= 2: + pass +else: + test_outcome = y_scaler.transform(test_outcome.reshape(-1, 1)) +# print(test_features.shape) +# if the model is a logistic regression, i.e. with a binary outcome +# then score is prediction accuracy +# if the model is a linear regression, i.e., with a continuous outcome +# then the score is R^2 (coefficient of determination) + +# fit trained ElasticNet, initialized via warm_start +# prob in CV? 
+# fitted_test = fitted.fit(X=test_features, y=np.ravel(test_outcome)) +# score = fitted_test.score(X=test_features, y=np.ravel(test_outcome)) +test_metrics = {} +y_pred = fitted.predict(X=test_features) +score = fitted.score(X=test_features, y=np.ravel(test_outcome)) +if len(np.unique(test_outcome)) == 2: + test_metrics["accuracy"] = score +else: + test_metrics["coefficient of determination"] = score +corr = spearmanr(test_outcome, y_pred) +test_metrics["pred_v_actual_corr"] = corr +mse = mean_squared_error(test_outcome, y_pred) +test_metrics["mean squared error"] = mse +print("Out-of-sample prediction score:\t", score) +print("Out-of-sample mean squared error:\t", mse) +# print(np.mean(test_features)) +# pred_outcome = fitted.predict(test_features) +test_df[f"{OUTCOME}_scaled"] = test_outcome +test_df[f"{OUTCOME}_pred"] = y_pred +Ys = test_df[[f"{OUTCOME}_scaled", f"{OUTCOME}_pred"]] +Ys.to_csv( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.tsv"), sep="\t" +) + +Ys["ppts"] = Ys.index.get_level_values(0) + + +light_colors = ["#33ACE3", "#EA6964", "#4AB62C"] # Bubbles # Blossom # Buttercup +dark_colors = ["#1278a6", "#a11510", "#228208"] +light = ListedColormap(light_colors, name="light_powderpuff") +dark = ListedColormap(dark_colors, name="dark_powderpuff") +mpl.colormaps.register(cmap=light) +mpl.colormaps.register(cmap=dark) + +fig, ax = plt.subplots() +g = sns.scatterplot( + x=f"{OUTCOME}_scaled", + y=f"{OUTCOME}_pred", + # style='bc', + data=Ys, + hue="ppts", + hue_order=["sub-Bubbles", "sub-Blossom", "sub-Buttercup"], + ax=ax, + palette="light_powderpuff", +) +ax.legend(bbox_to_anchor=(1.0, 0.5), loc="center left") +fig.savefig( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_actual-predicted.png"), + dpi=400, + bbox_inches="tight", +) + + +# print(test_outcome, "\n", y_pred) +# print(pred_outcome) +if len(np.unique(test_outcome)) > 2: + corr = spearmanr(test_outcome, y_pred) + print(f"\nSpearman correlation between 
predicted and actual {OUTCOME}:\t", corr) + test_metrics["spearman correlation"] = corr +with open( + join(TEST_DSET, "derivatives", DERIV_NAME, f"{base_name}_fit-{today_str}.json"), "w" +) as fp: + json.dump(test_metrics, fp) +np.savetxt(join(TEST_DSET, f"{base_name}_predicted-values_fit-{today_str}.txt"), y_pred) diff --git a/setup.py b/setup.py index 6d42c12..af040ab 100644 --- a/setup.py +++ b/setup.py @@ -27,13 +27,15 @@ "numpy", "scipy", "nilearn", - "sklearn", + "scikit-learn", "pandas", "nibabel", "bctpy", "pybids", "networkx", "matplotlib", # necessary until nilearn includes mpl as a dependency + "enlighten", + 'pingouin' ], extras_require={ "doc": [ @@ -45,7 +47,7 @@ "sphinx-copybutton", "sphinx_gallery==0.10.1", "sphinxcontrib-bibtex", - ], + ], "tests": [ "codecov", "coverage", diff --git a/versioneer.py b/versioneer.py index 2b54540..b9421e4 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1136,9 +1136,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ - 0 - ].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -1238,13 +1236,9 @@ def versions_from_file(filename): contents = f.read() except EnvironmentError: raise NotThisMethod("unable to read _version.py") - mo = re.search( - r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S) if not mo: - mo = re.search( - r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S) if not mo: raise NotThisMethod("no 
version_json in _version.py") return json.loads(mo.group(1)) @@ -1454,9 +1448,7 @@ def get_versions(verbose=False): handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose - assert ( - cfg.versionfile_source is not None - ), "please set versioneer.versionfile_source" + assert cfg.versionfile_source is not None, "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1697,9 +1689,7 @@ def make_release_tree(self, base_dir, files): # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file( - target_versionfile, self._versioneer_generated_versions - ) + write_to_version_file(target_versionfile, self._versioneer_generated_versions) cmds["sdist"] = cmd_sdist @@ -1823,10 +1813,7 @@ def do_setup(): else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print( - " appending versionfile_source ('%s') to MANIFEST.in" - % cfg.versionfile_source - ) + print(" appending versionfile_source ('%s') to MANIFEST.in" % cfg.versionfile_source) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: