
Move cluster_single_parameters from cubids to utils.
tsalo committed Feb 4, 2025
1 parent 6a5c09b commit 929bfab
Showing 2 changed files with 110 additions and 109 deletions.
109 changes: 0 additions & 109 deletions cubids/cubids.py
@@ -1519,112 +1519,3 @@ def get_fieldmap_lookup(self):
    def get_layout(self):
        """Get layout."""
        return self.layout


def cluster_single_parameters(param_group_df, config, modality):
    """Run agglomerative clustering on individual parameters and add cluster columns to dataframe.

    Parameters
    ----------
    param_group_df : :obj:`pandas.DataFrame`
        A data frame with one row per file, where the ParamGroup column
        indicates which group each scan is a part of.
    config : :obj:`dict`
        Configuration for defining parameter groups.
        This dictionary has two relevant keys: ``'sidecar_params'`` and ``'derived_params'``.
    modality : :obj:`str`
        Modality of the scan.
        This is used to select the correct configuration from the config dict.

    Returns
    -------
    param_group_df : :obj:`pandas.DataFrame`
        An updated version of the input data frame,
        with a new column added for each element in the modality's
        ``'sidecar_params'`` and ``'derived_params'`` dictionaries.
        The new columns are named ``'Cluster_' + column_name``
        and contain the cluster labels for each parameter group.

    Notes
    -----
    ``'sidecar_params'`` is a dictionary of dictionaries, where keys are modalities.
    The modality-wise dictionary's keys are names of BIDS fields to directly include
    in the Parameter Groupings, and the values describe how those BIDS fields are
    compared. For example,

        {"RepetitionTime": {"tolerance": 0.000001, "precision": 6, "suggest_variant_rename": True}}

    means that the RepetitionTime field should be compared across files and flagged as a
    variant if it differs from others by 0.000001 or more.

    ``'derived_params'`` is a dictionary of dictionaries, where keys are modalities.
    The modality-wise dictionary's keys are names of BIDS fields to derive from the
    NIfTI header and include in the Parameter Groupings.
    """
    to_format = config["sidecar_params"][modality]
    to_format.update(config["derived_params"][modality])

    for column_name, column_fmt in to_format.items():
        if column_name not in param_group_df:
            continue

        if "tolerance" in column_fmt and len(param_group_df) > 1:
            column_data = param_group_df[column_name].to_numpy()

            if any(isinstance(x, list) for x in column_data):
                # For array/list data, we should first define "clusters" based on the number of
                # elements, then apply the clustering within each set of lengths.
                # For example, if there are four runs with five elements and 10 runs with three
                # elements, we should cluster the five-element runs separately from the
                # three-element runs, and account for that in the clustering labels.
                lengths = ["x".join(str(i) for i in np.array(x).shape) for x in column_data]
                unique_lengths = np.unique(lengths)
                cluster_idx = 0
                for unique_length in unique_lengths:
                    sel_rows = [i for i, x in enumerate(lengths) if x == unique_length]
                    array = np.array([np.array(x) for x in column_data[sel_rows]])

                    if array.shape[0] > 1:
                        # clustering requires at least two samples
                        array[np.isnan(array)] = -999

                        tolerance = to_format[column_name]["tolerance"]
                        clustering = AgglomerativeClustering(
                            n_clusters=None, distance_threshold=tolerance, linkage="complete"
                        ).fit(array)

                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = (
                            clustering.labels_ + cluster_idx
                        )
                        cluster_idx += max(clustering.labels_) + 1
                    else:
                        # single-file cluster
                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
                        cluster_idx += 1
            else:
                array = column_data.reshape(-1, 1)
                array[np.isnan(array)] = -999

                tolerance = to_format[column_name]["tolerance"]
                clustering = AgglomerativeClustering(
                    n_clusters=None, distance_threshold=tolerance, linkage="complete"
                ).fit(array)

                # now add clustering_labels as a column
                param_group_df[f"Cluster_{column_name}"] = clustering.labels_

        else:
            # We can rely on string matching (done separately) for string-type fields,
            # but arrays of strings need to be handled differently.
            column_data = param_group_df[column_name].tolist()

            if any(isinstance(x, list) for x in column_data):
                cluster_idx = 0

                column_data = ["|&|".join(str(val) for val in cell) for cell in column_data]
                unique_vals = np.unique(column_data)
                for val in unique_vals:
                    sel_rows = [i for i, x in enumerate(column_data) if x == val]
                    param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
                    cluster_idx += 1

    return param_group_df
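
For orientation, here is a minimal sketch of the config structure the docstring's Notes describe. The RepetitionTime entry mirrors the docstring's example; the EchoTime and derived-field entries are invented for illustration and are not taken from CuBIDS' packaged configuration:

# Hypothetical config fragment; the real CuBIDS config is loaded from a
# packaged YAML file and covers many more fields and modalities.
config = {
    "sidecar_params": {
        "func": {
            "RepetitionTime": {
                "tolerance": 0.000001,
                "precision": 6,
                "suggest_variant_rename": True,
            },
            # Illustrative second field; the tolerance value is an assumption.
            "EchoTime": {"tolerance": 0.0001, "precision": 4, "suggest_variant_rename": True},
        },
    },
    "derived_params": {
        "func": {
            # Fields read from the NIfTI header rather than the JSON sidecar;
            # this field name is illustrative.
            "VoxelSizeDim1": {"tolerance": 0.001, "precision": 3, "suggest_variant_rename": True},
        },
    },
}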
110 changes: 110 additions & 0 deletions cubids/utils.py
@@ -10,6 +10,7 @@
import numpy as np
import pandas as pd
from bids.layout import parse_file_entities
from sklearn.cluster import AgglomerativeClustering

from cubids.constants import ID_VARS, NON_KEY_ENTITIES

@@ -468,6 +469,115 @@ def get_sidecar_metadata(json_file):
        return "Erroneous sidecar"


def cluster_single_parameters(param_group_df, config, modality):
    """Run agglomerative clustering on individual parameters and add cluster columns to dataframe.

    Parameters
    ----------
    param_group_df : :obj:`pandas.DataFrame`
        A data frame with one row per file, where the ParamGroup column
        indicates which group each scan is a part of.
    config : :obj:`dict`
        Configuration for defining parameter groups.
        This dictionary has two relevant keys: ``'sidecar_params'`` and ``'derived_params'``.
    modality : :obj:`str`
        Modality of the scan.
        This is used to select the correct configuration from the config dict.

    Returns
    -------
    param_group_df : :obj:`pandas.DataFrame`
        An updated version of the input data frame,
        with a new column added for each element in the modality's
        ``'sidecar_params'`` and ``'derived_params'`` dictionaries.
        The new columns are named ``'Cluster_' + column_name``
        and contain the cluster labels for each parameter group.

    Notes
    -----
    ``'sidecar_params'`` is a dictionary of dictionaries, where keys are modalities.
    The modality-wise dictionary's keys are names of BIDS fields to directly include
    in the Parameter Groupings, and the values describe how those BIDS fields are
    compared. For example,

        {"RepetitionTime": {"tolerance": 0.000001, "precision": 6, "suggest_variant_rename": True}}

    means that the RepetitionTime field should be compared across files and flagged as a
    variant if it differs from others by 0.000001 or more.

    ``'derived_params'`` is a dictionary of dictionaries, where keys are modalities.
    The modality-wise dictionary's keys are names of BIDS fields to derive from the
    NIfTI header and include in the Parameter Groupings.
    """
    to_format = config["sidecar_params"][modality]
    to_format.update(config["derived_params"][modality])

    for column_name, column_fmt in to_format.items():
        if column_name not in param_group_df:
            continue

        if "tolerance" in column_fmt and len(param_group_df) > 1:
            column_data = param_group_df[column_name].to_numpy()

            if any(isinstance(x, list) for x in column_data):
                # For array/list data, we should first define "clusters" based on the number of
                # elements, then apply the clustering within each set of lengths.
                # For example, if there are four runs with five elements and 10 runs with three
                # elements, we should cluster the five-element runs separately from the
                # three-element runs, and account for that in the clustering labels.
                lengths = ["x".join(str(i) for i in np.array(x).shape) for x in column_data]
                unique_lengths = np.unique(lengths)
                cluster_idx = 0
                for unique_length in unique_lengths:
                    sel_rows = [i for i, x in enumerate(lengths) if x == unique_length]
                    array = np.array([np.array(x) for x in column_data[sel_rows]])

                    if array.shape[0] > 1:
                        # clustering requires at least two samples
                        array[np.isnan(array)] = -999

                        tolerance = to_format[column_name]["tolerance"]
                        clustering = AgglomerativeClustering(
                            n_clusters=None, distance_threshold=tolerance, linkage="complete"
                        ).fit(array)

                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = (
                            clustering.labels_ + cluster_idx
                        )
                        cluster_idx += max(clustering.labels_) + 1
                    else:
                        # single-file cluster
                        param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
                        cluster_idx += 1
            else:
                array = column_data.reshape(-1, 1)
                array[np.isnan(array)] = -999

                tolerance = to_format[column_name]["tolerance"]
                clustering = AgglomerativeClustering(
                    n_clusters=None, distance_threshold=tolerance, linkage="complete"
                ).fit(array)

                # now add clustering_labels as a column
                param_group_df[f"Cluster_{column_name}"] = clustering.labels_

        else:
            # We can rely on string matching (done separately) for string-type fields,
            # but arrays of strings need to be handled differently.
            column_data = param_group_df[column_name].tolist()

            if any(isinstance(x, list) for x in column_data):
                cluster_idx = 0

                column_data = ["|&|".join(str(val) for val in cell) for cell in column_data]
                unique_vals = np.unique(column_data)
                for val in unique_vals:
                    sel_rows = [i for i, x in enumerate(column_data) if x == val]
                    param_group_df.loc[sel_rows, f"Cluster_{column_name}"] = cluster_idx
                    cluster_idx += 1

    return param_group_df
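
A minimal usage sketch on synthetic data (values and tolerances invented for illustration), assuming the function is imported from cubids.utils as this commit arranges. It exercises both numeric branches: a scalar field clustered directly, and a list-valued field that is first grouped by array shape:

import numpy as np
import pandas as pd

from cubids.utils import cluster_single_parameters

param_group_df = pd.DataFrame(
    {
        "ParamGroup": [1, 1, 1, 1],
        # Rows 0 and 1 differ by less than the tolerance; rows 2 and 3 are far away.
        "RepetitionTime": [2.0, 2.0000001, 3.0, 3.0],
        # Three 3-element lists and one 2-element list: the 2-element run is
        # clustered separately because it has a different shape.
        "SliceTiming": [[0.0, 0.5, 1.0], [0.0, 0.5, 1.0], [0.0, 0.6, 1.2], [0.0, 1.0]],
    }
)
config = {
    "sidecar_params": {
        "func": {
            "RepetitionTime": {"tolerance": 0.000001},
            "SliceTiming": {"tolerance": 0.05},
        },
    },
    "derived_params": {"func": {}},
}

out = cluster_single_parameters(param_group_df, config, "func")
# Rows 0 and 1 share a Cluster_RepetitionTime label; rows 2 and 3 share another.
# For SliceTiming, the 2-element run gets its own label, and the 3-element runs
# are clustered by Euclidean distance with complete linkage.
print(out[["Cluster_RepetitionTime", "Cluster_SliceTiming"]])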


def _order_columns(df):
"""Organize columns of the summary and files DataFrames.
Expand Down
