
Move cluster_single_parameters from cubids to utils.
tsalo committed Feb 4, 2025
1 parent 6a5c09b commit 929bfab
Showing 2 changed files with 110 additions and 109 deletions.
109 changes: 0 additions & 109 deletions cubids/cubids.py
@@ -1519,112 +1519,3 @@ def get_fieldmap_lookup(self):
    def get_layout(self):
        """Get layout."""
        return self.layout


def cluster_single_parameters(param_group_df, config, modality):
    """Run agglomerative clustering on individual parameters and add cluster columns to dataframe.

    Parameters
    ----------
    param_group_df : :obj:`pandas.DataFrame`
        A data frame with one row per file, where the ParamGroup column
        indicates which group each scan is a part of.
    config : :obj:`dict`
        Configuration for defining parameter groups.
        This dictionary has two relevant keys: ``'sidecar_params'`` and ``'derived_params'``.
    modality : :obj:`str`
        Modality of the scan.
        This is used to select the correct configuration from the config dict.

    Returns
    -------
    param_group_df : :obj:`pandas.DataFrame`
        An updated version of the input data frame,
        with a new column added for each element in the modality's
        ``'sidecar_params'`` and ``'derived_params'`` dictionaries.
        The new columns are named ``'Cluster_' + column_name``
        and contain the cluster labels for each parameter group.

    Notes
    -----
    ``'sidecar_params'`` is a dictionary of dictionaries, where keys are modalities.
    The modality-wise dictionary's keys are names of BIDS fields to directly include
    in the Parameter Groupings, and the values describe how those BIDS fields are
    compared. For example,

        {"RepetitionTime": {"tolerance": 0.000001, "precision": 6, "suggest_variant_rename": True}}

    means that the RepetitionTime field should be compared across files and flagged as a
    variant if it differs from others by 0.000001 or more.

    ``'derived_params'`` is a dictionary of dictionaries, where keys are modalities.
    The modality-wise dictionary's keys are names of BIDS fields to derive from the
    NIfTI header and include in the Parameter Groupings.
    """
    to_format = config["sidecar_params"][modality]
    to_format.update(config["derived_params"][modality])

    for column_name, column_fmt in to_format.items():
        if column_name not in param_group_df:
            continue

        if "tolerance" in column_fmt and len(param_group_df) > 1:
            column_data = param_group_df[column_name].to_numpy()

            if any(isinstance(x, list) for x in column_data):
                # For array/list data, we should first define "clusters" based on the number of
                # elements, then apply the clustering within each set of lengths.
                # For example, if there are four runs with five elements and 10 runs with three
                # elements, we should cluster the five-element runs separately from the
                # three-element runs, and account for that in the clustering labels.
                lengths = ["x".join(str(i) for i in np.array(x).shape) for x in column_data]
                unique_lengths = np.unique(lengths)
                cluster_idx = 0
                for unique_length in unique_lengths:
                    sel_rows = [i for i, x in enumerate(lengths) if x == unique_length]
                    array = np.array([np.array(x) for x in column_data[sel_rows]])

                    if array.shape[0] > 1:
                        # clustering requires at least two samples
                        array[np.isnan(array)] = -999

                        tolerance = to_format[column_name]["tolerance"]
                        clustering = AgglomerativeClustering(
                            n_clusters=None, distance_threshold=tolerance, linkage="complete"
                        ).fit(array)

                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = (
                            clustering.labels_ + cluster_idx
                        )
                        cluster_idx += max(clustering.labels_) + 1
                    else:
                        # single-file cluster
                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
                        cluster_idx += 1
            else:
                array = column_data.reshape(-1, 1)
                array[np.isnan(array)] = -999

                tolerance = to_format[column_name]["tolerance"]
                clustering = AgglomerativeClustering(
                    n_clusters=None, distance_threshold=tolerance, linkage="complete"
                ).fit(array)

                # now add clustering_labels as a column
                param_group_df[f"Cluster_{column_name}"] = clustering.labels_

        else:
            # We can rely on string matching (done separately) for string-type fields,
            # but arrays of strings need to be handled differently.
            column_data = param_group_df[column_name].tolist()

            if any(isinstance(x, list) for x in column_data):
                cluster_idx = 0

                column_data = ["|&|".join(str(val) for val in cell) for cell in column_data]
                unique_vals = np.unique(column_data)
                for val in unique_vals:
                    sel_rows = [i for i, x in enumerate(column_data) if x == val]
                    param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
                    cluster_idx += 1

    return param_group_df
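
For orientation, here is a minimal sketch of the config structure the docstring's Notes describe. The RepetitionTime entry mirrors the docstring's example; the EchoTime and derived-field entries are invented for illustration and are not taken from CuBIDS' packaged configuration:

# Hypothetical config fragment; the real CuBIDS config is loaded from a
# packaged YAML file and covers many more fields and modalities.
config = {
    "sidecar_params": {
        "func": {
            "RepetitionTime": {
                "tolerance": 0.000001,
                "precision": 6,
                "suggest_variant_rename": True,
            },
            # Illustrative second field; the tolerance value is an assumption.
            "EchoTime": {"tolerance": 0.0001, "precision": 4, "suggest_variant_rename": True},
        },
    },
    "derived_params": {
        "func": {
            # Fields read from the NIfTI header rather than the JSON sidecar;
            # this field name is illustrative.
            "VoxelSizeDim1": {"tolerance": 0.001, "precision": 3, "suggest_variant_rename": True},
        },
    },
}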
110 changes: 110 additions & 0 deletions cubids/utils.py
@@ -10,6 +10,7 @@
import numpy as np
import pandas as pd
from bids.layout import parse_file_entities
from sklearn.cluster import AgglomerativeClustering

from cubids.constants import ID_VARS, NON_KEY_ENTITIES

@@ -468,6 +469,115 @@ def get_sidecar_metadata(json_file):
        return "Erroneous sidecar"


def cluster_single_parameters(param_group_df, config, modality):
    """Run agglomerative clustering on individual parameters and add cluster columns to dataframe.

    Parameters
    ----------
    param_group_df : :obj:`pandas.DataFrame`
        A data frame with one row per file, where the ParamGroup column
        indicates which group each scan is a part of.
    config : :obj:`dict`
        Configuration for defining parameter groups.
        This dictionary has two relevant keys: ``'sidecar_params'`` and ``'derived_params'``.
    modality : :obj:`str`
        Modality of the scan.
        This is used to select the correct configuration from the config dict.

    Returns
    -------
    param_group_df : :obj:`pandas.DataFrame`
        An updated version of the input data frame,
        with a new column added for each element in the modality's
        ``'sidecar_params'`` and ``'derived_params'`` dictionaries.
        The new columns are named ``'Cluster_' + column_name``
        and contain the cluster labels for each parameter group.

    Notes
    -----
    ``'sidecar_params'`` is a dictionary of dictionaries, where keys are modalities.
    The modality-wise dictionary's keys are names of BIDS fields to directly include
    in the Parameter Groupings, and the values describe how those BIDS fields are
    compared. For example,

        {"RepetitionTime": {"tolerance": 0.000001, "precision": 6, "suggest_variant_rename": True}}

    means that the RepetitionTime field should be compared across files and flagged as a
    variant if it differs from others by 0.000001 or more.

    ``'derived_params'`` is a dictionary of dictionaries, where keys are modalities.
    The modality-wise dictionary's keys are names of BIDS fields to derive from the
    NIfTI header and include in the Parameter Groupings.
    """
    to_format = config["sidecar_params"][modality]
    to_format.update(config["derived_params"][modality])

    for column_name, column_fmt in to_format.items():
        if column_name not in param_group_df:
            continue

        if "tolerance" in column_fmt and len(param_group_df) > 1:
            column_data = param_group_df[column_name].to_numpy()

            if any(isinstance(x, list) for x in column_data):
                # For array/list data, we should first define "clusters" based on the number of
                # elements, then apply the clustering within each set of lengths.
                # For example, if there are four runs with five elements and 10 runs with three
                # elements, we should cluster the five-element runs separately from the
                # three-element runs, and account for that in the clustering labels.
                lengths = ["x".join(str(i) for i in np.array(x).shape) for x in column_data]
                unique_lengths = np.unique(lengths)
                cluster_idx = 0
                for unique_length in unique_lengths:
                    sel_rows = [i for i, x in enumerate(lengths) if x == unique_length]
                    array = np.array([np.array(x) for x in column_data[sel_rows]])

                    if array.shape[0] > 1:
                        # clustering requires at least two samples
                        array[np.isnan(array)] = -999

                        tolerance = to_format[column_name]["tolerance"]
                        clustering = AgglomerativeClustering(
                            n_clusters=None, distance_threshold=tolerance, linkage="complete"
                        ).fit(array)

                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = (
                            clustering.labels_ + cluster_idx
                        )
                        cluster_idx += max(clustering.labels_) + 1
                    else:
                        # single-file cluster
                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
                        cluster_idx += 1
            else:
                array = column_data.reshape(-1, 1)
                array[np.isnan(array)] = -999

                tolerance = to_format[column_name]["tolerance"]
                clustering = AgglomerativeClustering(
                    n_clusters=None, distance_threshold=tolerance, linkage="complete"
                ).fit(array)

                # now add clustering_labels as a column
                param_group_df[f"Cluster_{column_name}"] = clustering.labels_

        else:
            # We can rely on string matching (done separately) for string-type fields,
            # but arrays of strings need to be handled differently.
            column_data = param_group_df[column_name].tolist()

            if any(isinstance(x, list) for x in column_data):
                cluster_idx = 0

                column_data = ["|&|".join(str(val) for val in cell) for cell in column_data]
                unique_vals = np.unique(column_data)
                for val in unique_vals:
                    sel_rows = [i for i, x in enumerate(column_data) if x == val]
                    param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
                    cluster_idx += 1

    return param_group_df
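
A minimal usage sketch on synthetic data (values and tolerances invented for illustration), assuming the function is imported from cubids.utils as this commit arranges. It exercises both numeric branches: a scalar field clustered directly, and a list-valued field that is first grouped by array shape:

import numpy as np
import pandas as pd

from cubids.utils import cluster_single_parameters

param_group_df = pd.DataFrame(
    {
        "ParamGroup": [1, 1, 1, 1],
        # Rows 0 and 1 differ by less than the tolerance; rows 2 and 3 are far away.
        "RepetitionTime": [2.0, 2.0000001, 3.0, 3.0],
        # Three 3-element lists and one 2-element list: the 2-element run is
        # clustered separately because it has a different shape.
        "SliceTiming": [[0.0, 0.5, 1.0], [0.0, 0.5, 1.0], [0.0, 0.6, 1.2], [0.0, 1.0]],
    }
)
config = {
    "sidecar_params": {
        "func": {
            "RepetitionTime": {"tolerance": 0.000001},
            "SliceTiming": {"tolerance": 0.05},
        },
    },
    "derived_params": {"func": {}},
}

out = cluster_single_parameters(param_group_df, config, "func")
# Rows 0 and 1 share a Cluster_RepetitionTime label; rows 2 and 3 share another.
# For SliceTiming, the 2-element run gets its own label, and the 3-element runs
# are clustered by Euclidean distance with complete linkage.
print(out[["Cluster_RepetitionTime", "Cluster_SliceTiming"]])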


def _order_columns(df):
"""Organize columns of the summary and files DataFrames.
Expand Down
