pasqal-io · Yoric · Feb 3, 2025 · Jan 20, 2025
diff --git a/examples/tutorial.ipynb b/examples/tutorial.ipynb
@@ -54,7 +54,7 @@
    "source": [
     "To extract machine-learning features from our dataset, we will need to configure a feature extractor. This library provides several feature extractors to either make use of a physical quantum device (QPU), or a variety of emulators.\n",
     "\n",
-    "To configure a feature extractor, we will need to give it a _compiler_, whose task is to take a list of graphs, extract embeddings and compile these embeddings to _sequences of pulses_, the format that can be executed  by either a QPU or an emulator. For this tutorial, our dataset is composed of molecule graphs, so we will use the `MoleculeGraphCompiler`:"
+    "To configure a feature extractor, we will need to give it a _compiler_, whose task is to take a list of graphs, extract embeddings and compile these embeddings to _sequences of pulses_, the format that can be executed by either a QPU or an emulator. For this tutorial, our dataset is composed of molecule graphs encoded with the PTC-FM conventions, so we will use the `PTCFMCompiler`:"
    ]
   },
   {
@@ -65,14 +65,14 @@
    "source": [
     "import qek.data.graphs as qek_graphs\n",
     "\n",
-    "compiler = qek_graphs.MoleculeGraphCompiler()"
+    "compiler = qek_graphs.PTCFMCompiler()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This library provides other compilers from other formats of graphs."
+    "This library provides other compilers from other formats of graphs, including the `MoleculeGraphCompiler` and general-purpose graph compilers for pytorch_geometric or networkx graphs."
    ]
   },
   {
@@ -126,8 +126,8 @@
     "# You can increase this value to higher number of qubits, but this\n",
     "# notebook will take longer to execute and may run out of memory.\n",
     "max_qubits = 5\n",
-    "processed_dataset = await extractor.run(max_qubits=max_qubits) # Don't forget to `await`!\n",
-    "display(\"Extracted features from %s samples\"% (len(processed_dataset), ))"
+    "processed_dataset = extractor.run(max_qubits=max_qubits)\n",
+    "display(\"Extracted features from %s samples\"% (len(processed_dataset.states), ))"
    ]
   },
   {
@@ -159,8 +159,6 @@
     "HAVE_PASQAL_ACCOUNT = False # If you have a PASQAL Cloud account, fill in the details and set this to `True`.\n",
     "\n",
     "if HAVE_PASQAL_ACCOUNT:\n",
-    "    processed_dataset = []\n",
-    "\n",
     "    # Use the QPU Extractor.\n",
     "    extractor = qek_extractors.QPUExtractor(\n",
     "        # Once computing is complete, data will be saved in this file.\n",
@@ -179,12 +177,12 @@
     "    display(\"Compiled %s sequences\" % (len(compiled), ))\n",
     "\n",
     "    # Launch the execution.\n",
-    "    execution = extractor.run()\n",
-    "    display(\"Work enqueued with ids %s\" % (extractor.batch_ids, ))\n",
+    "    processed_dataset = extractor.run()\n",
+    "    display(\"Work enqueued with ids %s\" % (processed_dataset.batch_ids, ))\n",
     "\n",
     "    # ...and wait for the results.\n",
-    "    processed_dataset = await execution\n",
-    "    display(\"Extracted features from %s samples\"% (len(processed_dataset), ))"
+    "    await processed_dataset\n",
+    "    display(\"Extracted states from %s samples\"% (len(processed_dataset.states), ))"
    ]
   },
   {
@@ -221,7 +219,8 @@
    "outputs": [],
    "source": [
     "import qek.data.dataset as qek_dataset\n",
-    "processed_dataset = qek_dataset.load_dataset(file_path=\"ptcfm_processed_dataset.json\")\n",
+    "from qek.data.dataset import ProcessedData\n",
+    "processed_dataset: list[ProcessedData] = qek_dataset.load_dataset(file_path=\"ptcfm_processed_dataset.json\")\n",
     "print(f\"Size of the quantum compatible dataset = {len(processed_dataset)}\")"
    ]
   },

diff --git a/pyproject.toml b/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
   "torch",
   "torch_geometric",
   "matplotlib",
-  "emu-mps",
+  "emu-mps~=1.2.0",
 ]
 
 [tool.hatch.metadata]

diff --git a/qek/data/dataset.py b/qek/data/dataset.py
@@ -1,6 +1,6 @@
 import collections
 import json
-from typing import cast
+from typing import Final, cast
 import matplotlib
 
 import logging
@@ -21,8 +21,8 @@ class ProcessedData:
             executed on the device.
         state_dict: A dictionary {bitstring: number of instances}
             for this graph.
-        target: The machine-learning target (in this case, a value
-            in {0, 1}, as specified by the original graph).
+        target: If specified, the machine-learning target, as a
+            value `0` or `1`.
 
     The state dictionary represents an approximation of the quantum
     state of the device for this graph after completion of the
@@ -40,14 +40,24 @@ class ProcessedData:
     specific graph).
     """
 
-    sequence: pl.Sequence
-    state_dict: dict[str, int]
-    _dist_excitation: np.ndarray
-    target: int
+    sequence: Final[pl.Sequence]
+    state_dict: Final[dict[str, int]]
+    _dist_excitation: Final[np.ndarray]
+    target: Final[int | None]
 
-    def __init__(self, sequence: pl.Sequence, state_dict: dict[str, np.int64], target: int):
+    def __init__(
+        self, sequence: pl.Sequence, state_dict: dict[str, int | np.int64], target: int | None
+    ):
         self.sequence = sequence
-        self.state_dict = _convert_np_int64_to_int(data=state_dict)
+        # Some emulators actually return `dict[str, int64]` instead of `dict[str, int]`, and
+        # numpy's `int64` is not JSON-serializable, so we convert every count to a plain `int`.
+        #
+        # Python's `json` module rejects `int64`, and many JSON consumers only represent ints
+        # exactly up to 2^53-1. In practice, this should not be a problem, since the
+        # `int`/`int64` in our dict is limited to the number of runs, and we don't expect to be
+        # launching 2^53 consecutive runs for a single sequence on a device in any foreseeable
+        # future (even at one run per microsecond, this would take ~285 years).
+        self.state_dict = {k: int(value) for k, value in state_dict.items()}
         self._dist_excitation = dist_excitation(self.state_dict)
         self.target = target
 
@@ -156,16 +166,6 @@ def dist_excitation(state_dict: dict[str, int], size: int | None = None) -> np.n
     return result
 
 
-def _convert_np_int64_to_int(data: dict[str, np.int64]) -> dict[str, int]:
-    """
-    Utility function: convert the values of a dict from `np.int64` to `int`,
-    for serialization purposes.
-    """
-    return {
-        key: (int(value) if isinstance(value, np.integer) else value) for key, value in data.items()
-    }
-
-
 def save_dataset(dataset: list[ProcessedData], file_path: str) -> None:
     """Saves a dataset to a JSON file.