Add Thielen2015 c-VEP dataset and mutualize common cVEP functions (#557)
* Fix cvep searchable in dataset_search (#554)

* Fix docs and _add_stim_channel_trial Thielen2021 (#555)

* Update whats_new

* Add Thielen2015 dataset

* [pre-commit.ci] auto fixes from pre-commit.com hooks

* Update docs/source/dataset_summary.rst

* Update docs/source/whats_new.rst

* Update docs/source/whats_new.rst

* Update docs/source/whats_new.rst

* Update docs/source/whats_new.rst

* Apply suggestions from code review

* Move cVEP common functions to datasets.utils - Closes #564

* Update whats_new.rst

---------

Co-authored-by: Jordy Thielen <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Bru <[email protected]>
Co-authored-by: PierreGtch <[email protected]>
Co-authored-by: Pierre Guetschel <[email protected]>
6 people authored Apr 11, 2024
1 parent a753fa4 commit 097b057
Showing 8 changed files with 305 additions and 185 deletions.
5 changes: 4 additions & 1 deletion docs/source/dataset_summary.rst
@@ -100,7 +100,10 @@ DOI: https://doi.org/10.1088/1741-2552/ac38cf
:header: Dataset, #Subj, #Chan, #Classes, #Trials / class, Trials length, #Epochs / class, Sampling rate, #Sessions, Codes, Presentation rate
:class: sortable

:class:`Thielen2021`,30,8,20,5,31.5s,18900 NT / 18900 T,512Hz,1,Gold codes,60Hz
:class:`Thielen2015`,12,64,36,3,4.2s,27216 NT / 27216 T,2048Hz,1,Gold codes,120Hz
:class:`Thielen2021`,30,8,20,5,31.5s,94500 NT / 94500 T,512Hz,1,Gold codes,60Hz




Resting States
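For context, a minimal sketch (not part of the diff) of how the c-VEP rows above become discoverable once `dataset_search` knows about the paradigm (the #554 fix); the exact contents and type of the returned list are assumptions.

```python
# Minimal sketch, assuming the dataset_search fix from #554: list every
# dataset registered under the "cvep" paradigm.
from moabb.datasets.utils import dataset_search

cvep_datasets = dataset_search(paradigm="cvep")
for dataset in cvep_datasets:
    # After this commit, Thielen2015 and Thielen2021 should both be listed.
    print(dataset)
```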
1 change: 1 addition & 0 deletions docs/source/datasets.rst
@@ -83,6 +83,7 @@ c-VEP Datasets
:toctree: generated/
:template: class.rst

Thielen2015
Thielen2021


6 changes: 5 additions & 1 deletion docs/source/whats_new.rst
@@ -23,6 +23,7 @@ Enhancements
- Adding leave k-Subjects out evaluations (:gh:`470` by `Bruno Aristimunha`_)
- Update Braindecode dependency to 0.8 (:gh:`542` by `Pierre Guetschel`_)
- Improve transform function of AugmentedDataset (:gh:`541` by `Quentin Barthelemy`_)
- Move cVEP common functions to :mod:`moabb.datasets.utils` (:gh:`564` :gh:`557` by `Pierre Guetschel`_)

Bugs
~~~~
@@ -80,7 +81,8 @@ Enhancements
- Add match_all method in paradigm to support CompoundDataset evaluation with MNE epochs (:gh:`473` by `Gregoire Cattan`_)
- Automate setting of event_id in compound dataset and add `data_origin` information to the data (:gh:`475` by `Gregoire Cattan`_)
- Add possibility of not saving the model (:gh:`489` by `Igor Carrara`_)
- Add CVEP and BurstVEP dataset from Castillos from Toulouse lab (by `Seabstien Velut`_)
- Add CVEP and BurstVEP dataset from Castillos from Toulouse lab (:gh:`531` by `Sebastien Velut`_)
- Add c-VEP dataset from Thielen et al. 2015 (:gh:`557` by `Jordy Thielen`_)

Bugs
~~~~
@@ -110,6 +112,8 @@ Bugs
- Fix case when events specified via ``raw.annotations`` but no events (:gh:`491` by `Pierre Guetschel`_)
- Fix bug in downloading Shin2017A dataset (:gh:`493` by `Igor Carrara`_)
- Fix the cropped option in the dataset preprocessing (:gh:`502` by `Bruno Aristimunha`_)
- Fix bug in :func:`moabb.datasets.utils.dataset_search` with missing cvep paradigm (:gh:`557` by `Jordy Thielen`_)
- Fix mistakes in :func:`moabb.datasets.thielen2021` considering wrong docs and hardcoded trial stim channel (:gh:`557` by `Jordy Thielen`_)

API changes
~~~~~~~~~~~
1 change: 1 addition & 0 deletions moabb/datasets/__init__.py
@@ -66,6 +66,7 @@
from .ssvep_mamem import MAMEM1, MAMEM2, MAMEM3
from .ssvep_nakanishi import Nakanishi2015
from .ssvep_wang import Wang2016
from .thielen2015 import Thielen2015
from .thielen2021 import Thielen2021
from .upper_limb import Ofner2017
from .utils import _init_dataset_list
89 changes: 3 additions & 86 deletions moabb/datasets/castillos2023.py
@@ -4,11 +4,10 @@

import mne
import numpy as np
from mne import create_info
from mne.io import RawArray

from moabb.datasets import download as dl
from moabb.datasets.base import BaseDataset
from moabb.datasets.utils import add_stim_channel_epoch, add_stim_channel_trial


Castillos2023_URL = "https://zenodo.org/records/8255618"
@@ -41,88 +40,6 @@ def __init__(
self.n_channels = 32
self.window_size = window_size

def _add_stim_channel_trial(
self, raw, onsets, labels, offset=200, ch_name="stim_trial"
):
"""
Add a stimulus channel with trial onsets and their labels.
Parameters
----------
raw: mne.Raw
The raw object to add the stimulus channel to.
onsets: List | np.ndarray
The onsets of the trials in sample numbers.
labels: List | np.ndarray
The labels of the trials.
offset: int (default: 200)
The integer value to start markers with. For instance, if 200, then label 0 will be marker 200, label 1
will be marker 201, etc.
ch_name: str (default: "stim_trial")
The name of the added stimulus channel.
Returns
-------
mne.Raw
The raw object with the added stimulus channel.
"""
stim_chan = np.zeros((1, len(raw)))
for onset, label in zip(onsets, labels):
stim_chan[0, onset] = offset + label
info = create_info(
ch_names=[ch_name],
ch_types=["stim"],
sfreq=raw.info["sfreq"],
verbose=False,
)
raw = raw.add_channels([RawArray(data=stim_chan, info=info, verbose=False)])
return raw

def _add_stim_channel_epoch(
self,
raw,
onsets,
labels,
offset=100,
ch_name="stim_epoch",
):
"""
Add a stimulus channel with epoch onsets and their labels, which are the values of the presented code for each
of the trials.
Parameters
----------
raw: mne.Raw
The raw object to add the stimulus channel to.
onsets: List | np.ndarray
The onsets of the trials in sample numbers.
labels: List | np.ndarray
The labels of the trials.
codes: np.ndarray
The codebook containing each presented code of shape (nb_bits, nb_codes), sampled at the presentation rate.
offset: int (default: 100)
The integer value to start markers with. For instance, if 100, then label 0 will be marker 100, label 1
will be marker 101, etc.
ch_name: str (default: "stim_epoch")
The name of the added stimulus channel.
Returns
-------
mne.Raw
The raw object with the added stimulus channel.
"""
stim_chan = np.zeros((1, len(raw)))
for onset, label in zip(onsets, labels):
stim_chan[0, int(onset * self.sfreq)] = offset + label
info = create_info(
ch_names=[ch_name],
ch_types=["stim"],
sfreq=raw.info["sfreq"],
verbose=False,
)
raw = raw.add_channels([RawArray(data=stim_chan, info=info, verbose=False)])
return raw

def _get_single_subject_data(self, subject):
"""Return the data of a single subject."""
file_path_list = self.data_path(subject, self.paradigm_type)
@@ -183,10 +100,10 @@ def _get_single_subject_data(self, subject):

# Create stim channel with trial information (i.e., symbols)
# Specifically: 200 = symbol-0, 201 = symbol-1, 202 = symbol-2, etc.
raw = self._add_stim_channel_trial(raw, onset_code, labels, offset=200)
raw = add_stim_channel_trial(raw, onset_code, labels, offset=200)
# Create stim channel with epoch information (i.e., 1 / 0, or on / off)
# Specifically: 100 = "0", 101 = "1"
raw = self._add_stim_channel_epoch(
raw = add_stim_channel_epoch(
raw,
np.concatenate([onset, onset_0]),
np.concatenate([np.ones(onset.shape), np.zeros(onset_0.shape)]),
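For reference, a minimal sketch of the trial-level helper these calls now rely on, reconstructed from the method body deleted above; names and defaults mirror the removed code, but the actual `moabb.datasets.utils` implementation (and the epoch-level variant's signature) may differ.

```python
# Reconstruction sketch of the shared trial-level helper, based on the
# Castillos2023 method removed above; the real utils implementation may differ.
import numpy as np
from mne import create_info
from mne.io import RawArray


def add_stim_channel_trial(raw, onsets, labels, offset=200, ch_name="stim_trial"):
    """Add a stim channel marking each trial onset with marker offset + label."""
    stim_chan = np.zeros((1, len(raw)))
    for onset, label in zip(onsets, labels):
        stim_chan[0, onset] = offset + label
    info = create_info(
        ch_names=[ch_name], ch_types=["stim"], sfreq=raw.info["sfreq"], verbose=False
    )
    return raw.add_channels([RawArray(data=stim_chan, info=info, verbose=False)])
```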
184 changes: 184 additions & 0 deletions moabb/datasets/thielen2015.py
@@ -0,0 +1,184 @@
import mne
import numpy as np
from scipy.io import loadmat

from moabb.datasets import download as dl
from moabb.datasets.base import BaseDataset
from moabb.datasets.utils import add_stim_channel_epoch, add_stim_channel_trial


Thielen2015_URL = "https://public.data.ru.nl/dcc/DSC_2018.00047_553_v3"

# Each session consisted of 3 runs of fixed-length trials
NR_RUNS = 3

# Each trial contained 4 cycles of a 1.05 second code
NR_CYCLES_PER_TRIAL = 4

# Codes were presented at a 120 Hz monitor refresh rate
PRESENTATION_RATE = 120


class Thielen2015(BaseDataset):
"""c-VEP dataset from Thielen et al. (2015)
Dataset [1]_ from the study on reconvolution for c-VEP [2]_.
.. admonition:: Dataset summary
============= ======= ===== ======== =============== ================== ============= ============= =========
Name #Subj #Chan #Classes #Trials / class #Epochs / class Trials length Sampling rate #Sessions
============= ======= ===== ======== =============== ================== ============= ============= =========
Thielen2015 12 64 36 3 27216 NT / 27216 T 4.2s 2048Hz 1
============= ======= ===== ======== =============== ================== ============= ============= =========
**Dataset description**
EEG recordings were obtained with a sampling rate of 2048 Hz, using a setup comprising 64 Ag/AgCl electrodes, and
amplified by a Biosemi ActiveTwo EEG amplifier. Electrode placement followed the international 10-10 system.
During the experimental sessions, participants actively operated a 6 x 6 visual speller brain-computer interface
(BCI) with real-time feedback, encompassing 36 distinct classes. Each cell within the symbol grid underwent
luminance modulation at full contrast, achieved through the application of pseudo-random noise-codes derived from a
set of modulated Gold codes. These binary codes have a balanced distribution of ones and zeros while adhering to a
limited run-length pattern, with a maximum run-length of 2 bits. Codes were presented at a rate of 120 Hz. Given
that one cycle of these modulated Gold codes comprises 126 bits, the duration of a complete cycle spans 1.05
seconds.
Throughout the experiment, participants underwent four distinct blocks: an initial practice block consisting of two
runs, followed by a training block of one run. Subsequently, they engaged in a copy-spelling block comprising six
runs, and finally, a free-spelling block consisting of one run. Between the training and copy-spelling block, a
classifier was calibrated using data from the training block. This calibrated classifier was then applied during
both the copy-spelling and free-spelling runs. Additionally, during calibration, the stimulation codes were
tailored and optimized specifically for each individual participant.
Among the six copy-spelling runs, there were three fixed-length runs. Trials in these runs started with a cueing
phase, where the target symbol was highlighted in a green hue for 1 second. Participants maintained their gaze
fixated on the target symbol as all symbols flashed in sync with their corresponding pseudo-random noise-codes for a
duration of 4.2 seconds (equivalent to 4 code cycles). Immediately following this stimulation, the output of the
classifier was shown by coloring the cell blue for 1 second. Each run consisted of 36 trials, presented in a
randomized order.
Here, our focus is solely on the three copy-spelling runs characterized by fixed-length trials lasting 4.2 seconds
(equivalent to four code cycles). The other three runs utilized a dynamic stopping procedure, resulting in trials of
varying durations, rendering them unsuitable for benchmarking purposes. Similarly, the practice and free-spelling
runs included dynamic stopping and are ignored in this dataset. The training dataset, comprising 36 trials, used a
different noise-code set, and is therefore also ignored in this dataset. In total, this dataset should contain 108
trials of 4.2 seconds each, with 3 repetitions for each of the 36 codes.
References
----------
.. [1] Thielen, J. (Jordy), Jason Farquhar, Desain, P.W.M. (Peter) (2023): Broad-Band Visually Evoked Potentials:
Re(con)volution in Brain-Computer Interfacing. Version 2. Radboud University. (dataset).
DOI: https://doi.org/10.34973/1ecz-1232
.. [2] Thielen, J., Van Den Broek, P., Farquhar, J., & Desain, P. (2015). Broad-Band visually evoked potentials:
re(con)volution in brain-computer interfacing. PLOS ONE, 10(7), e0133797.
DOI: https://doi.org/10.1371/journal.pone.0133797
Notes
-----
.. versionadded:: 1.0.0
"""

def __init__(self):
super().__init__(
subjects=list(range(1, 12 + 1)),
sessions_per_subject=1,
events={"1.0": 101, "0.0": 100},
code="Thielen2015",
interval=(0, 0.3),
paradigm="cvep",
doi="10.34973/1ecz-1232",
)

def _get_single_subject_data(self, subject):
"""Return the data of a single subject."""
file_path_list = self.data_path(subject)

# Channels
montage = mne.channels.read_custom_montage(file_path_list[-1])

# There is only one session, each of 3 runs
sessions = {"0": {}}
for i_b in range(NR_RUNS):
# EEG
raw = mne.io.read_raw_gdf(
file_path_list[2 * i_b],
stim_channel="status",
preload=True,
verbose=False,
)

# Drop redundant ANA and EXG channels
ana = [f"ANA{1 + i}" for i in range(32)]
exg = [f"EXG{1 + i}" for i in range(8)]
raw.drop_channels(ana + exg)

# Set electrode positions
raw.set_montage(montage)

# Read info file
tmp = loadmat(file_path_list[2 * i_b + 1])

# Labels at trial level (i.e., symbols)
trial_labels = tmp["labels"].astype("uint8").flatten() - 1

# Codes (select optimized subset and layout, and repeat to trial length)
subset = (
tmp["subset"].astype("uint8").flatten() - 1
) # the optimized subset of 36 codes from a set of 65
layout = (
tmp["layout"].astype("uint8").flatten() - 1
) # the optimized position of the 36 codes in the grid
codes = tmp["codes"][:, subset[layout]]
codes = np.tile(codes, (NR_CYCLES_PER_TRIAL, 1))

# Find onsets of trials
events = mne.find_events(raw, verbose=False)
trial_onsets = events[:, 0]

# Create stim channel with trial information (i.e., symbols)
# Specifically: 200 = symbol-0, 201 = symbol-1, 202 = symbol-2, etc.
raw = add_stim_channel_trial(raw, trial_onsets, trial_labels, offset=200)

# Create stim channel with epoch information (i.e., 1 / 0, or on / off)
# Specifically: 100 = "0", 101 = "1"
raw = add_stim_channel_epoch(
raw, trial_onsets, trial_labels, codes, PRESENTATION_RATE, offset=100
)

# Add data as a new run
run_name = str(i_b)
sessions["0"][run_name] = raw

return sessions

def data_path(
self, subject, path=None, force_update=False, update_path=None, verbose=None
):
"""Return the data paths of a single subject."""
if subject not in self.subject_list:
raise (ValueError("Invalid subject number"))

sub = f"sub-{subject:02d}"
subject_paths = []
for i_b in range(NR_RUNS):
blk = f"test_sync_{1 + i_b:d}"

# EEG
url = f"{Thielen2015_URL:s}/sourcedata/{sub}/{blk}/{sub}_{blk}.gdf"
subject_paths.append(dl.data_dl(url, self.code, path, force_update, verbose))

# Labels at trial level (i.e., symbols)
url = f"{Thielen2015_URL:s}/sourcedata/{sub}/{blk}/{sub}_{blk}.mat"
subject_paths.append(dl.data_dl(url, self.code, path, force_update, verbose))

# Channel locations
url = f"{Thielen2015_URL:s}/resources/biosemi64.loc"
subject_paths.append(dl.data_dl(url, self.code, path, force_update, verbose))

return subject_paths
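
A hypothetical usage sketch (not part of this commit) showing how the new dataset could be loaded through MOABB's c-VEP paradigm; the `CVEP` paradigm class and the `get_data` return values are assumptions based on the rest of the MOABB API.

```python
# Hypothetical usage sketch: load Thielen2015 through the c-VEP paradigm.
# Epoch labels follow the events declared above ("1.0" -> 101, "0.0" -> 100).
from moabb.datasets import Thielen2015
from moabb.paradigms import CVEP  # assumed to be available in this MOABB version

dataset = Thielen2015()
paradigm = CVEP()
X, labels, meta = paradigm.get_data(dataset=dataset, subjects=[1])
print(X.shape, sorted(set(labels)))
```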
