Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python-package] Separately check whether pyarrow and cffi are installed #6785

Merged
merged 12 commits into from
Feb 5, 2025
Merged
9 changes: 5 additions & 4 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import scipy.sparse

from .compat import (
CFFI_INSTALLED,
PANDAS_INSTALLED,
PYARROW_INSTALLED,
arrow_cffi,
Expand Down Expand Up @@ -1706,8 +1707,8 @@ def __pred_for_pyarrow_table(
predict_type: int,
) -> Tuple[np.ndarray, int]:
"""Predict for a PyArrow table."""
if not PYARROW_INSTALLED:
raise LightGBMError("Cannot predict from Arrow without `pyarrow` installed.")
if not (PYARROW_INSTALLED and CFFI_INSTALLED):
raise LightGBMError("Cannot predict from Arrow without `pyarrow` and `cffi` installed.")

# Check that the input is valid: we only handle numbers (for now)
if not all(arrow_is_integer(t) or arrow_is_floating(t) or arrow_is_boolean(t) for t in table.schema.types):
Expand Down Expand Up @@ -2459,8 +2460,8 @@ def __init_from_pyarrow_table(
ref_dataset: Optional[_DatasetHandle],
) -> "Dataset":
"""Initialize data from a PyArrow table."""
if not PYARROW_INSTALLED:
raise LightGBMError("Cannot init dataframe from Arrow without `pyarrow` installed.")
if not (PYARROW_INSTALLED and CFFI_INSTALLED):
raise LightGBMError("Cannot init dataframe from Arrow without `pyarrow` and `cffi` installed.")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
raise LightGBMError("Cannot init dataframe from Arrow without `pyarrow` and `cffi` installed.")
raise LightGBMError("Cannot init Dataset from Arrow without `pyarrow` and `cffi` installed.")

This really should be Dataset, not dataframe... I'll make that change when I push testing changes.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in e72d5e2.

In that commit, I also removed backticks from these log messages, in favor of single quotes. Special characters in log messages can occasionally be problematic.

I know these things were already there before this PR, but might as well fix them right here while we're touching these lines.


# Check that the input is valid: we only handle numbers (for now)
if not all(arrow_is_integer(t) or arrow_is_floating(t) or arrow_is_boolean(t) for t in table.schema.types):
Expand Down
33 changes: 21 additions & 12 deletions python-package/lightgbm/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@ def __init__(self, *args: Any, **kwargs: Any):
from pyarrow import ChunkedArray as pa_ChunkedArray
from pyarrow import Table as pa_Table
from pyarrow import chunked_array as pa_chunked_array
from pyarrow.cffi import ffi as arrow_cffi
from pyarrow.types import is_boolean as arrow_is_boolean
from pyarrow.types import is_floating as arrow_is_floating
from pyarrow.types import is_integer as arrow_is_integer
Expand All @@ -316,17 +315,6 @@ class pa_Table: # type: ignore
def __init__(self, *args: Any, **kwargs: Any):
pass

class arrow_cffi: # type: ignore
"""Dummy class for pyarrow.cffi.ffi."""

CData = None
addressof = None
cast = None
new = None

def __init__(self, *args: Any, **kwargs: Any):
pass

class pa_compute: # type: ignore
"""Dummy class for pyarrow.compute."""
jameslamb marked this conversation as resolved.
Show resolved Hide resolved

Expand All @@ -338,6 +326,27 @@ class pa_compute: # type: ignore
arrow_is_integer = None
arrow_is_floating = None


"""cffi"""
try:
from pyarrow.cffi import ffi as arrow_cffi

CFFI_INSTALLED = True
except ImportError:
CFFI_INSTALLED = False

class arrow_cffi: # type: ignore
"""Dummy class for pyarrow.cffi.ffi."""
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need

        CData = None
        addressof = None
        cast = None
        new = None

class members?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CData is needed for type hinting:

chunks: arrow_cffi.CData

But I think the others could be safely removed. It's only showing up in the diff in this PR because this code is being moved around... so this was missed in earlier PRs (I guess #6034).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed all but CData in 7396613

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Guess that was my editor complaining when I originally added this... no good reason to keep it around though, thanks for removing :)


CData = None
addressof = None
cast = None
new = None

def __init__(self, *args: Any, **kwargs: Any):
pass
jameslamb marked this conversation as resolved.
Show resolved Hide resolved


"""cpu_count()"""
try:
from joblib import cpu_count
Expand Down
25 changes: 25 additions & 0 deletions tests/python_package_test/test_arrow.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# coding: utf-8
import filecmp
import os
import sys
from pathlib import Path
from typing import Any, Dict, Optional
from unittest import mock

import numpy as np
import pytest
Expand Down Expand Up @@ -454,3 +456,26 @@ def test_arrow_feature_name_manual():
)
booster = lgb.train({"num_leaves": 7}, dataset, num_boost_round=5)
assert booster.feature_name() == ["c", "d"]


def test_training_and_predicting_from_pa_table_without_cffi_raises():
data = generate_dummy_arrow_table()
with mock.patch.dict(sys.modules, {"pyarrow.cffi": None}):
assert lgb.compat.PYARROW_INSTALLED is True
assert lgb.compat.CFFI_INSTALLED is False

with pytest.raises(
lgb.basic.LightGBMError, match="Cannot init dataframe from Arrow without `pyarrow` and `cffi` installed."
):
dataset = lgb.Dataset(
data,
label=pa.array([0, 1, 0, 0, 1]),
params=dummy_dataset_params(),
feature_name=["c", "d"],
categorical_feature=["c"],
)

with pytest.raises(
lgb.basic.LightGBMError, match="Cannot predict from Arrow without `pyarrow` and `cffi` installed."
):
_ = lgb.train({"num_leaves": 7}, dataset, num_boost_round=5)
Loading