Merge branch 'main' into add_tests_for_all_os

SpikeInterface · Jun 18, 2024 · 844c65d · 844c65d
2 parents 2e3cb1c + a3527ea
commit 844c65d
Show file tree

Hide file tree

Showing 17 changed files with 476 additions and 36 deletions.
diff --git a/doc/modules/curation.rst b/doc/modules/curation.rst
@@ -41,6 +41,111 @@ The merging and splitting operations are handled by the :py:class:`~spikeinterfa
     # here is the final clean sorting
     clean_sorting = cs.sorting
 
+Manual curation format
+----------------------
+
+SpikeInterface internally supports a JSON-based manual curation format.
+When manual curation is necessary, modifying a dataset in place is a bad practice.
+Instead, to ensure the reproducibility of the spike sorting pipelines, we have introduced a simple and JSON-based manual curation format.
+At the moment, this format defines three kinds of manual action: merges, deletions and manual tags.
+This simple file can be kept alongside the output of a sorter and applied to that output to obtain a "clean" result.
+
+This format has two parts:
+
+  * **definition** with the following keys:
+
+    * "format_version" : format specification
+    * "unit_ids" : the list of unit_ids
+    * "label_definitions" : list of label categories and possible labels per category.
+                            Every category can be *exclusive=True* (only one label) or *exclusive=False* (several labels possible)
+
+  * **manual output** curation with the following keys:
+
+    * "manual_labels"
+    * "merged_unit_groups"
+    * "removed_units"
+
+Here is the description of the format with a simple example:
+
+.. code-block:: json
+
+    {
+        # the first part of the format is the definition
+        "format_version": "1",
+        "unit_ids": [
+            "u1",
+            "u2",
+            "u3",
+            "u6",
+            "u10",
+            "u14",
+            "u20",
+            "u31",
+            "u42"
+        ],
+        "label_definitions": {
+            "quality": {
+                "label_options": [
+                    "good",
+                    "noise",
+                    "MUA",
+                    "artifact"
+                ],
+                "exclusive": true
+            },
+            "putative_type": {
+                "label_options": [
+                    "excitatory",
+                    "inhibitory",
+                    "pyramidal",
+                    "mitral"
+                ],
+                "exclusive": false
+            }
+        },
+        # the second part of the format is manual action
+        "manual_labels": [
+            {
+                "unit_id": "u1",
+                "quality": [
+                    "good"
+                ]
+            },
+            {
+                "unit_id": "u2",
+                "quality": [
+                    "noise"
+                ],
+                "putative_type": [
+                    "excitatory",
+                    "pyramidal"
+                ]
+            },
+            {
+                "unit_id": "u3",
+                "putative_type": [
+                    "inhibitory"
+                ]
+            }
+        ],
+        "merged_unit_groups": [
+            [
+                "u3",
+                "u6"
+            ],
+            [
+                "u10",
+                "u14",
+                "u20"
+            ]
+        ],
+        "removed_units": [
+            "u31",
+            "u42"
+        ]
+    }
+
+
 
 Automatic curation tools
 ------------------------

diff --git a/pyproject.toml b/pyproject.toml
@@ -20,7 +20,7 @@ classifiers = [
 
 
 dependencies = [
-    "numpy",
+    "numpy>=1.26, <2.0",  # 1.20 np.ptp, 1.26 for avoiding pickling errors when numpy >2.0
     "threadpoolctl>=3.0.0",
     "tqdm",
     "zarr>=2.16,<2.18",
@@ -65,18 +65,16 @@ extractors = [
     "pyedflib>=0.1.30",
     "sonpy;python_version<'3.10'",
     "lxml", # lxml for neuroscope
-    "scipy<1.13",
+    "scipy",
     "ONE-api>=2.7.0", # alf sorter and streaming IBL
-    "ibllib>=2.32.5", # streaming IBL
+    "ibllib>=2.36.0", # streaming IBL
     "pymatreader>=0.0.32", # For cell explorer matlab files
     "zugbruecke>=0.2; sys_platform!='win32'", # For plexon2
 ]
 
 streaming_extractors = [
     "ONE-api>=2.7.0", # alf sorter and streaming IBL
-    "ibllib>=2.32.5", # streaming IBL
-    "scipy<1.13", # ibl has a dependency on scipy but it does not have an upper bound
-    # Remove this once https://github.com/int-brain-lab/ibllib/issues/753
+    "ibllib>=2.36.0", # streaming IBL
     # Following dependencies are for streaming with nwb files
     "pynwb>=2.6.0",
     "fsspec",

diff --git a/src/spikeinterface/__init__.py b/src/spikeinterface/__init__.py
@@ -30,5 +30,5 @@
 # This flag must be set to False for release
 # This avoids using versioning that contains ".dev0" (and this is a better choice)
 # This is mainly useful when using run_sorter in a container and spikeinterface install
-# DEV_MODE = True
-DEV_MODE = False
+DEV_MODE = True
+# DEV_MODE = False
diff --git a/src/spikeinterface/core/core_tools.py b/src/spikeinterface/core/core_tools.py
@@ -83,7 +83,12 @@ def default(self, obj):
         if isinstance(obj, np.generic):
             return obj.item()
 
-        if np.issctype(obj):  # Cast numpy datatypes to their names
+        # Standard numpy dtypes like np.dtype('int32") are transformed this way
+        if isinstance(obj, np.dtype):
+            return np.dtype(obj).name
+
+        # This will transform to a string canonical representation of the dtype (e.g. np.int32 -> 'int32')
+        if isinstance(obj, type) and issubclass(obj, np.generic):
             return np.dtype(obj).name
 
         if isinstance(obj, np.ndarray):

diff --git a/src/spikeinterface/core/tests/test_jsonification.py b/src/spikeinterface/core/tests/test_jsonification.py
@@ -122,7 +122,6 @@ def test_numpy_dtype_alises_encoding():
     # People tend to use this a dtype instead of the proper classes
     json.dumps(np.int32, cls=SIJsonEncoder)
     json.dumps(np.float32, cls=SIJsonEncoder)
-    json.dumps(np.bool_, cls=SIJsonEncoder)  # Note that np.bool was deperecated in numpy 1.20.0
 
 
 def test_recording_encoding(numpy_generated_recording):

diff --git a/src/spikeinterface/curation/__init__.py b/src/spikeinterface/curation/__init__.py
@@ -11,4 +11,7 @@
 from .mergeunitssorting import MergeUnitsSorting, merge_units_sorting
 from .splitunitsorting import SplitUnitSorting, split_unit_sorting
 
+# curation format
+from .curation_format import validate_curation_dict, curation_label_to_dataframe
+
 from .sortingview_curation import apply_sortingview_curation
diff --git a/src/spikeinterface/curation/curation_format.py b/src/spikeinterface/curation/curation_format.py
@@ -0,0 +1,163 @@
+from itertools import combinations
+
+
+supported_curation_format_versions = {"1"}
+
+
+def validate_curation_dict(curation_dict):
+    """
+    Validate that the curation dictionary given as parameter complies with the format
+
+    The function do not return anything. This raise an error if something is wring in the format.
+
+    Parameters
+    ----------
+    curation_dict : dict
+
+    """
+
+    # format
+    if "format_version" not in curation_dict:
+        raise ValueError("No version_format")
+
+    if curation_dict["format_version"] not in supported_curation_format_versions:
+        raise ValueError(
+            f"Format version ({curation_dict['format_version']}) not supported. "
+            f"Only {supported_curation_format_versions} are valid"
+        )
+
+    # unit_ids
+    labeled_unit_set = set([lbl["unit_id"] for lbl in curation_dict["manual_labels"]])
+    merged_units_set = set(sum(curation_dict["merged_unit_groups"], []))
+    removed_units_set = set(curation_dict["removed_units"])
+
+    if curation_dict["unit_ids"] is not None:
+        # old format v0 did not contain unit_ids so this can contains None
+        unit_set = set(curation_dict["unit_ids"])
+        if not labeled_unit_set.issubset(unit_set):
+            raise ValueError("Curation format: some labeled units are not in the unit list")
+        if not merged_units_set.issubset(unit_set):
+            raise ValueError("Curation format: some merged units are not in the unit list")
+        if not removed_units_set.issubset(unit_set):
+            raise ValueError("Curation format: some removed units are not in the unit list")
+
+    all_merging_groups = [set(group) for group in curation_dict["merged_unit_groups"]]
+    for gp_1, gp_2 in combinations(all_merging_groups, 2):
+        if len(gp_1.intersection(gp_2)) != 0:
+            raise ValueError("Some units belong to multiple merge groups")
+    if len(removed_units_set.intersection(merged_units_set)) != 0:
+        raise ValueError("Some units were merged and deleted")
+
+    # Check the labels exclusivity
+    for lbl in curation_dict["manual_labels"]:
+        for label_key in curation_dict["label_definitions"].keys():
+            if label_key in lbl:
+                unit_id = lbl["unit_id"]
+                label_value = lbl[label_key]
+                if not isinstance(label_value, list):
+                    raise ValueError(f"Curation format: manual_labels {unit_id} is invalid shoudl be a list")
+
+                is_exclusive = curation_dict["label_definitions"][label_key]["exclusive"]
+
+                if is_exclusive and not len(label_value) <= 1:
+                    raise ValueError(
+                        f"Curation format: manual_labels {unit_id} {label_key} are exclusive labels. {label_value} is invalid"
+                    )
+
+
+def convert_from_sortingview_curation_format_v0(sortingview_dict, destination_format="1"):
+    """
+    Converts the old sortingview curation format (v0) into a curation dictionary new format (v1)
+    Couple of caveats:
+        * The list of units is not available in the original sortingview dictionary. We set it to None
+        * Labels can not be mutually exclusive.
+        * Labels have no category, so we regroup them under the "all_labels" category
+
+    Parameters
+    ----------
+    sortingview_dict : dict
+        Dictionary containing the curation information from sortingview
+    destination_format : str
+        Version of the format to use.
+        Default to "1"
+
+    Returns
+    -------
+    curation_dict: dict
+        A curation dictionary
+    """
+
+    assert destination_format == "1"
+
+    merge_groups = sortingview_dict["mergeGroups"]
+    merged_units = sum(merge_groups, [])
+    if len(merged_units) > 0:
+        unit_id_type = int if isinstance(merged_units[0], int) else str
+    else:
+        unit_id_type = str
+    all_units = []
+    all_labels = []
+    manual_labels = []
+    general_cat = "all_labels"
+    for unit_id_, l_labels in sortingview_dict["labelsByUnit"].items():
+        all_labels.extend(l_labels)
+        # recorver the correct type for unit_id
+        unit_id = unit_id_type(unit_id_)
+        all_units.append(unit_id)
+        manual_labels.append({"unit_id": unit_id, general_cat: l_labels})
+    labels_def = {"all_labels": {"name": "all_labels", "label_options": list(set(all_labels)), "exclusive": False}}
+
+    curation_dict = {
+        "format_version": destination_format,
+        "unit_ids": None,
+        "label_definitions": labels_def,
+        "manual_labels": manual_labels,
+        "merged_unit_groups": merge_groups,
+        "removed_units": [],
+    }
+
+    return curation_dict
+
+
+def curation_label_to_dataframe(curation_dict):
+    """
+    Transform the curation dict into a pandas dataframe.
+    For label category with exclusive=True : a column is created and values are the unique label.
+    For label category with exclusive=False : one column per possible is created and values are boolean.
+
+    If exclusive=False and the same label appear several times then it raises an error.
+
+    Parameters
+    ----------
+    curation_dict : dict
+        A curation dictionary
+
+    Returns
+    -------
+    labels : pd.DataFrame
+        dataframe with labels.
+    """
+    import pandas as pd
+
+    labels = pd.DataFrame(index=curation_dict["unit_ids"])
+
+    for label_key, label_def in curation_dict["label_definitions"].items():
+        if label_def["exclusive"]:
+            assert label_key not in labels.columns, f"{label_key} is already a column"
+            labels[label_key] = pd.Series(dtype=str)
+            labels[label_key][:] = ""
+            for lbl in curation_dict["manual_labels"]:
+                value = lbl.get(label_key, [])
+                if len(value) == 1:
+                    labels.at[lbl["unit_id"], label_key] = value[0]
+        else:
+            for label_opt in label_def["label_options"]:
+                assert label_opt not in labels.columns, f"{label_opt} is already a column"
+                labels[label_opt] = pd.Series(dtype=bool)
+                labels[label_opt][:] = False
+            for lbl in curation_dict["manual_labels"]:
+                values = lbl.get(label_key, [])
+                for value in values:
+                    labels.at[lbl["unit_id"], value] = True
+
+    return labels
diff --git a/src/spikeinterface/curation/sortingview_curation.py b/src/spikeinterface/curation/sortingview_curation.py
@@ -6,6 +6,8 @@
 from .curationsorting import CurationSorting
 
 
+# @alessio
+# TODO later : this should be reimplemented using the new curation format
 def apply_sortingview_curation(
     sorting, uri_or_json, exclude_labels=None, include_labels=None, skip_merge=False, verbose=False
 ):