Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python-package] Separately check whether pyarrow and cffi are installed #6785

Merged
merged 12 commits into from
Feb 5, 2025
Merged
9 changes: 5 additions & 4 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import scipy.sparse

from .compat import (
CFFI_INSTALLED,
PANDAS_INSTALLED,
PYARROW_INSTALLED,
arrow_cffi,
Expand Down Expand Up @@ -1706,8 +1707,8 @@ def __pred_for_pyarrow_table(
predict_type: int,
) -> Tuple[np.ndarray, int]:
"""Predict for a PyArrow table."""
if not PYARROW_INSTALLED:
raise LightGBMError("Cannot predict from Arrow without `pyarrow` installed.")
if not (PYARROW_INSTALLED and CFFI_INSTALLED):
raise LightGBMError("Cannot predict from Arrow without 'pyarrow' and 'cffi' installed.")

# Check that the input is valid: we only handle numbers (for now)
if not all(arrow_is_integer(t) or arrow_is_floating(t) or arrow_is_boolean(t) for t in table.schema.types):
Expand Down Expand Up @@ -2458,8 +2459,8 @@ def __init_from_pyarrow_table(
ref_dataset: Optional[_DatasetHandle],
) -> "Dataset":
"""Initialize data from a PyArrow table."""
if not PYARROW_INSTALLED:
raise LightGBMError("Cannot init dataframe from Arrow without `pyarrow` installed.")
if not (PYARROW_INSTALLED and CFFI_INSTALLED):
raise LightGBMError("Cannot init Dataset from Arrow without 'pyarrow' and 'cffi' installed.")

# Check that the input is valid: we only handle numbers (for now)
if not all(arrow_is_integer(t) or arrow_is_floating(t) or arrow_is_boolean(t) for t in table.schema.types):
Expand Down
32 changes: 19 additions & 13 deletions python-package/lightgbm/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@ def __init__(self, *args: Any, **kwargs: Any):
from pyarrow import ChunkedArray as pa_ChunkedArray
from pyarrow import Table as pa_Table
from pyarrow import chunked_array as pa_chunked_array
from pyarrow.cffi import ffi as arrow_cffi
from pyarrow.types import is_boolean as arrow_is_boolean
from pyarrow.types import is_floating as arrow_is_floating
from pyarrow.types import is_integer as arrow_is_integer
Expand All @@ -316,19 +315,8 @@ class pa_Table: # type: ignore
def __init__(self, *args: Any, **kwargs: Any):
pass

class arrow_cffi: # type: ignore
"""Dummy class for pyarrow.cffi.ffi."""

CData = None
addressof = None
cast = None
new = None

def __init__(self, *args: Any, **kwargs: Any):
pass

class pa_compute: # type: ignore
"""Dummy class for pyarrow.compute."""
"""Dummy class for pyarrow.compute module."""

all = None
equal = None
Expand All @@ -338,6 +326,24 @@ class pa_compute: # type: ignore
arrow_is_integer = None
arrow_is_floating = None


"""cffi"""
try:
from pyarrow.cffi import ffi as arrow_cffi

CFFI_INSTALLED = True
except ImportError:
CFFI_INSTALLED = False

class arrow_cffi: # type: ignore
"""Dummy class for pyarrow.cffi.ffi."""
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need

        CData = None
        addressof = None
        cast = None
        new = None

class members?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CData is needed for type hinting:

chunks: arrow_cffi.CData

But I think the others could be safely removed. It's only showing up in the diff in this PR because this code is being moved around... so this was missed in earlier PRs (I guess #6034).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed all but CData in 7396613

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Guess that was my editor complaining when I originally added this... no good reason to keep it around though, thanks for removing :)


CData = None

def __init__(self, *args: Any, **kwargs: Any):
pass


"""cpu_count()"""
try:
from joblib import cpu_count
Expand Down
9 changes: 9 additions & 0 deletions tests/python_package_test/conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
import numpy as np
import pytest

import lightgbm


@pytest.fixture(scope="function")
def missing_module_cffi(monkeypatch):
"""Mock 'cffi' not being importable"""
monkeypatch.setattr(lightgbm.compat, "CFFI_INSTALLED", False)
monkeypatch.setattr(lightgbm.basic, "CFFI_INSTALLED", False)


@pytest.fixture(scope="function")
def rng():
Expand Down
29 changes: 29 additions & 0 deletions tests/python_package_test/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,3 +454,32 @@ def test_arrow_feature_name_manual():
)
booster = lgb.train({"num_leaves": 7}, dataset, num_boost_round=5)
assert booster.feature_name() == ["c", "d"]


def test_dataset_construction_from_pa_table_without_cffi_raises_informative_error(missing_module_cffi):
with pytest.raises(
lgb.basic.LightGBMError, match="Cannot init Dataset from Arrow without 'pyarrow' and 'cffi' installed."
):
lgb.Dataset(
generate_dummy_arrow_table(),
label=pa.array([0, 1, 0, 0, 1]),
params=dummy_dataset_params(),
).construct()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.construct() is necessary here... __init_from_pyarrow_table() is not run as part of lgb.Dataset().



def test_predicting_from_pa_table_without_cffi_raises_informative_error(missing_module_cffi):
data = generate_random_arrow_table(num_columns=3, num_datapoints=1_000, seed=42)
labels = generate_random_arrow_array(num_datapoints=data.shape[0], seed=42)
bst = lgb.train(
params={"num_leaves": 7, "verbose": -1},
train_set=lgb.Dataset(
data.to_pandas(),
label=labels.to_pandas(),
),
num_boost_round=2,
)

with pytest.raises(
lgb.basic.LightGBMError, match="Cannot predict from Arrow without 'pyarrow' and 'cffi' installed."
):
bst.predict(data)
Loading