skrub-data · LilianBoulard · Jul 20, 2023 · Jul 20, 2023 · Jul 20, 2023 · Jul 20, 2023
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -60,6 +60,12 @@ Major changes
   compliance with the scikit-learn API.
   :pr:`647` by :user:`Guillaume Lemaitre <glemaitre>`
 
+* Fetching functions now have a unified and simpler API: a :class:`datasets.Dataset`
+  object is returned by all functions. Lazy loading (parameter `load_dataframe`)
+  has been removed. The parameter `download_if_missing` has been added to the
+  World Bank and figshare fetchers.
+  :pr:`669` by :user:`Lilian Boulard <LilianBoulard>`.
+
 * Fixes a bug in :class:`TableVectorizer` with `remainder`: it is now cloned if it's
   a transformer so that the same instance is not shared between different
   transformers.

diff --git a/benchmarks/utils/_various.py b/benchmarks/utils/_various.py
@@ -11,7 +11,7 @@
     fetch_road_safety,
     fetch_traffic_violations,
 )
-from skrub.datasets import DatasetAll
+from skrub.datasets import Dataset
 
 
 def find_result(bench_name: str) -> Path:
@@ -66,7 +66,7 @@ def choose_file(results: list[Path]) -> Path:
         return results[int(choice) - 1]
 
 
-def get_classification_datasets() -> dict[str, DatasetAll]:
+def get_classification_datasets() -> dict[str, Dataset]:
     return {
         "open_payments": fetch_open_payments(),
         "drug_directory": fetch_drug_directory(),
@@ -76,7 +76,7 @@ def get_classification_datasets() -> dict[str, DatasetAll]:
     }
 
 
-def get_regression_datasets() -> dict[str, DatasetAll]:
+def get_regression_datasets() -> dict[str, Dataset]:
     return {
         "medical_charge": fetch_medical_charge(),
         "employee_salaries": fetch_employee_salaries(),

diff --git a/doc/conf.py b/doc/conf.py
@@ -502,8 +502,7 @@ def notebook_modification_function(notebook_content, notebook_filename):
     "DatetimeEncoder": "skrub.DatetimeEncoder",
     "deduplicate": "skrub.deduplicate",
     "TableVectorizer": "skrub.TableVectorizer",
-    "DatasetInfoOnly": "skrub.datasets._fetching.DatasetInfoOnly",
-    "DatasetAll": "skrub.datasets._fetching.DatasetAll",
+    "Dataset": "skrub.datasets.Dataset",
     "_replace_false_missing": "skrub._table_vectorizer._replace_false_missing",
 }
 

diff --git a/skrub/_utils.py b/skrub/_utils.py
@@ -84,15 +84,13 @@ def import_optional_dependency(name: str, extra: str = ""):
     maybe_module : Optional[ModuleType]
         The imported module when found.
     """
-
-    msg = (
-        f"Missing optional dependency '{name}'. {extra} "
-        f"Use pip or conda to install {name}."
-    )
     try:
         module = importlib.import_module(name)
     except ImportError as exc:
-        raise ImportError(msg) from exc
+        raise ImportError(
+            f"Missing optional dependency '{name}'. {extra} "
+            f"Use pip or conda to install {name}. "
+        ) from exc
 
     return module
 

diff --git a/skrub/datasets/__init__.py b/skrub/datasets/__init__.py
@@ -1,6 +1,5 @@
-from ._fetching import (
-    DatasetAll,
-    DatasetInfoOnly,
+from ._fetching_functions import (
+    Dataset,
     fetch_drug_directory,
     fetch_employee_salaries,
     fetch_figshare,
@@ -10,18 +9,17 @@
     fetch_road_safety,
     fetch_traffic_violations,
     fetch_world_bank_indicator,
-    get_data_dir,
 )
 from ._generating import make_deduplication_data
 from ._ken_embeddings import (
     fetch_ken_embeddings,
     fetch_ken_table_aliases,
     fetch_ken_types,
 )
+from ._utils import get_data_dir
 
 __all__ = [
-    "DatasetAll",
-    "DatasetInfoOnly",
+    "Dataset",
     "fetch_drug_directory",
     "fetch_employee_salaries",
     "fetch_medical_charge",