-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: introduce eager loading functions #147
Changes from all commits
0a97e8d
5cc31be
4db77ee
b9ec9a2
5026389
15b52a1
230a832
6cd77ab
1cde690
d6548a4
d64dc03
a7e8175
6adca86
bb51f94
4a8532c
3eb6ca3
6bf5fb1
c1c0990
eb51afc
bfb92fe
ad5f326
4bcc947
50d518d
9089378
6b67cfc
a7b8665
cc9588b
6ed47ce
01dfb76
371b2ca
9e92efd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -131,7 +131,7 @@ def load_sheet( | |
use_columns: list[str] | list[int] | str | None = None, | ||
dtypes: DTypeMap | None = None, | ||
) -> ExcelSheet: | ||
"""Loads a sheet by index or name. | ||
"""Loads a sheet lazily by index or name. | ||
|
||
:param idx_or_name: The index (starting at 0) or the name of the sheet to load. | ||
:param header_row: The index of the row containing the column labels, default index is 0. | ||
|
@@ -165,9 +165,41 @@ def load_sheet( | |
schema_sample_rows=schema_sample_rows, | ||
use_columns=use_columns, | ||
dtypes=dtypes, | ||
eager=False, | ||
) | ||
) | ||
|
||
def load_sheet_eager(
    self,
    idx_or_name: int | str,
    *,
    header_row: int | None = 0,
    column_names: list[str] | None = None,
    skip_rows: int = 0,
    n_rows: int | None = None,
    schema_sample_rows: int | None = 1_000,
    use_columns: list[str] | list[int] | str | None = None,
    dtypes: DTypeMap | None = None,
) -> pa.RecordBatch:
    """Loads a sheet eagerly by index or name.

    Unlike `load_sheet`, which returns an `ExcelSheet`, this returns the loaded
    data directly as a `pyarrow.RecordBatch`.

    For xlsx files, this will be faster and more memory-efficient, as it will use
    `worksheet_range_ref` under the hood, which returns borrowed types.
    Eager loading only has an impact for xlsx files, since the other formats
    don't support lazy iteration.

    Refer to `load_sheet` for parameter documentation
    """
    # Same reader entry point as `load_sheet`; only the `eager` flag differs.
    return self._reader.load_sheet(
        idx_or_name=idx_or_name,
        header_row=header_row,
        column_names=column_names,
        skip_rows=skip_rows,
        n_rows=n_rows,
        schema_sample_rows=schema_sample_rows,
        use_columns=use_columns,
        dtypes=dtypes,
        eager=True,
    )
|
||
def load_sheet_by_name( | ||
self, | ||
name: str, | ||
|
@@ -184,17 +216,15 @@ def load_sheet_by_name( | |
|
||
Refer to `load_sheet` for parameter documentation | ||
""" | ||
return ExcelSheet( | ||
self._reader.load_sheet( | ||
name, | ||
header_row=header_row, | ||
column_names=column_names, | ||
skip_rows=skip_rows, | ||
n_rows=n_rows, | ||
schema_sample_rows=schema_sample_rows, | ||
use_columns=use_columns, | ||
dtypes=dtypes, | ||
) | ||
return self.load_sheet( | ||
name, | ||
header_row=header_row, | ||
column_names=column_names, | ||
skip_rows=skip_rows, | ||
n_rows=n_rows, | ||
schema_sample_rows=schema_sample_rows, | ||
use_columns=use_columns, | ||
dtypes=dtypes, | ||
) | ||
|
||
def load_sheet_by_idx( | ||
|
@@ -213,17 +243,15 @@ def load_sheet_by_idx( | |
|
||
Refer to `load_sheet` for parameter documentation | ||
""" | ||
return ExcelSheet( | ||
self._reader.load_sheet( | ||
idx, | ||
header_row=header_row, | ||
column_names=column_names, | ||
skip_rows=skip_rows, | ||
n_rows=n_rows, | ||
schema_sample_rows=schema_sample_rows, | ||
use_columns=use_columns, | ||
dtypes=dtypes, | ||
) | ||
return self.load_sheet( | ||
idx, | ||
header_row=header_row, | ||
column_names=column_names, | ||
skip_rows=skip_rows, | ||
n_rows=n_rows, | ||
schema_sample_rows=schema_sample_rows, | ||
use_columns=use_columns, | ||
dtypes=dtypes, | ||
) | ||
|
||
def __repr__(self) -> str: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
from datetime import date, datetime, timedelta | ||
|
||
import fastexcel | ||
import polars as pl | ||
from pandas.testing import assert_frame_equal as pd_assert_frame_equal | ||
from polars.testing import assert_frame_equal as pl_assert_frame_equal | ||
from pyarrow import RecordBatch | ||
from utils import path_for_fixture | ||
|
||
|
||
def test_load_sheet_eager_single_sheet() -> None:
    """Eager and lazy loading of a single-sheet xlsx must produce equal frames."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))

    # pandas: the eagerly loaded record batch converts to the same frame as the lazy sheet.
    pd_assert_frame_equal(
        reader.load_sheet_eager(0).to_pandas(),
        reader.load_sheet(0).to_pandas(),
    )

    # polars: `from_arrow` may return a DataFrame or a Series, so narrow the type first.
    polars_from_eager = pl.from_arrow(data=reader.load_sheet_eager(0))
    assert isinstance(polars_from_eager, pl.DataFrame)
    pl_assert_frame_equal(polars_from_eager, reader.load_sheet(0).to_polars())
|
||
|
||
def test_multiple_sheets_with_unnamed_columns() -> None:
    """Eager and lazy loading agree for a named sheet of a multi-sheet workbook.

    The sheet is looked up by name ("With unnamed columns") rather than by index.
    """
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))
    # Bind the sheet name once so the four loads below cannot drift apart.
    sheet_name = "With unnamed columns"

    eager_pandas = excel_reader.load_sheet_eager(sheet_name).to_pandas()
    lazy_pandas = excel_reader.load_sheet(sheet_name).to_pandas()
    pd_assert_frame_equal(eager_pandas, lazy_pandas)

    eager_polars = pl.from_arrow(data=excel_reader.load_sheet_eager(sheet_name))
    # `from_arrow` may return a DataFrame or a Series; a record batch must give a DataFrame.
    assert isinstance(eager_polars, pl.DataFrame)
    lazy_polars = excel_reader.load_sheet(sheet_name).to_polars()
    pl_assert_frame_equal(eager_polars, lazy_polars)
|
||
|
||
def test_eager_with_an_ods_file_should_return_a_recordbatch() -> None:
    """Eager loading of an ods file returns a `RecordBatch` with the expected content."""
    # Expected frame; the temporal columns are cast to millisecond precision
    # before the comparison.
    millisecond_casts = [
        pl.col(col).dt.cast_time_unit("ms") for col in ("datetime", "time")
    ]
    expected = pl.DataFrame(
        {
            "date": [date(2023, 6, 1)],
            "datestr": ["2023-06-01T02:03:04+02:00"],
            "time": [timedelta(hours=1, minutes=2, seconds=3)],
            "datetime": [datetime(2023, 6, 1, 2, 3, 4)],
        }
    ).with_columns(*millisecond_casts)

    ods_reader = fastexcel.read_excel(path_for_fixture("dates.ods"))
    batch = ods_reader.load_sheet_eager(0)
    assert isinstance(batch, RecordBatch)

    loaded = pl.from_arrow(batch)
    assert isinstance(loaded, pl.DataFrame)
    pl_assert_frame_equal(loaded, expected)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
mod error; | ||
mod types; | ||
mod utils; | ||
|
||
use error::{py_errors, ErrorContext}; | ||
use pyo3::prelude::*; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should probably improve the docstring here to mention that this is a "lazy load".
And a question: don't we want the eager version to be the default?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd rather not have the eager version by default as I don't want to introduce a breaking change. But you're right, I'll improve the docstring 👍
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
371b2ca