Skip to content

Commit

Permalink
feat: introduce eager loading functions
Browse files Browse the repository at this point in the history
Signed-off-by: Luka Peschke <[email protected]>
  • Loading branch information
lukapeschke committed Dec 22, 2023
1 parent 0a97e8d commit 5cc31be
Show file tree
Hide file tree
Showing 8 changed files with 307 additions and 115 deletions.
5 changes: 2 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ crate-type = ["cdylib"]

[dependencies]
anyhow = "1.0.76"
calamine = { version = "0.23.0", features = ["dates"] }
calamine = { git = "https://github.com/lukapeschke/calamine", branch = "public-data-type-ref", features = ["dates"] }
# calamine = { version = "0.23.1", features = ["dates"] }
chrono = { version = "0.4.31", default-features = false }
pyo3 = { version = "0.18.3", features = ["extension-module", "anyhow"] }

Expand Down
31 changes: 31 additions & 0 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,37 @@ def load_sheet(
)
)

def load_sheet_eager(
    self,
    idx_or_name: int | str,
    *,
    header_row: int | None = 0,
    column_names: list[str] | None = None,
    skip_rows: int = 0,
    n_rows: int | None = None,
) -> pa.RecordBatch:
    """Eagerly loads a sheet, addressed by index (integer) or by name (string).

    An integer is forwarded to the reader's `load_sheet_by_idx_eager`, anything
    else to `load_sheet_by_name_eager`; every keyword argument is passed through
    unchanged.
    See `load_sheet_by_idx` and `load_sheet_by_name` for parameter documentation.
    """
    # Pick the bound loader first so the argument list is only spelled out once.
    if isinstance(idx_or_name, int):
        eager_loader = self._reader.load_sheet_by_idx_eager
    else:
        eager_loader = self._reader.load_sheet_by_name_eager

    return eager_loader(
        idx_or_name,
        header_row=header_row,
        column_names=column_names,
        skip_rows=skip_rows,
        n_rows=n_rows,
    )

# Returns the string produced by the wrapped `_reader`'s own `__repr__`,
# so this object's representation is whatever the underlying reader reports.
def __repr__(self) -> str:
return self._reader.__repr__()

Expand Down
18 changes: 18 additions & 0 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ class _ExcelReader:
skip_rows: int = 0,
n_rows: int | None = None,
) -> _ExcelSheet: ...
# Type stub for the native eager loader addressed by sheet name.
# Takes the sheet `name` positionally; `header_row`, `column_names`, `skip_rows`
# and `n_rows` are keyword-only. Typed as returning a `pa.RecordBatch`.
def load_sheet_by_name_eager(
self,
name: str,
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
) -> pa.RecordBatch: ...
def load_sheet_by_idx(
self,
idx: int,
Expand All @@ -40,6 +49,15 @@ class _ExcelReader:
skip_rows: int = 0,
n_rows: int | None = None,
) -> _ExcelSheet: ...
# Type stub for the native eager loader addressed by sheet index.
# Takes the sheet `idx` positionally; `header_row`, `column_names`, `skip_rows`
# and `n_rows` are keyword-only. Typed as returning a `pa.RecordBatch`.
def load_sheet_by_idx_eager(
self,
idx: int,
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
) -> pa.RecordBatch: ...
def load_sheet(
self,
idx_or_name: int | str,
Expand Down
32 changes: 32 additions & 0 deletions python/tests/test_eagerness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import fastexcel

from utils import path_for_fixture
import polars as pl
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal


def test_load_sheet_eager_single_sheet() -> None:
    """Eager and lazy loading of sheet 0 must produce equal pandas and polars frames."""
    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))

    # pandas: convert both results and compare.
    pandas_from_eager = reader.load_sheet_eager(0).to_pandas()
    pandas_from_lazy = reader.load_sheet(0).to_pandas()
    pd_assert_frame_equal(pandas_from_eager, pandas_from_lazy)

    # polars: the eager record batch goes through `pl.from_arrow`, which must
    # yield a DataFrame before it can be compared with the lazy sheet's frame.
    polars_from_eager = pl.from_arrow(data=reader.load_sheet_eager(0))
    assert isinstance(polars_from_eager, pl.DataFrame)
    polars_from_lazy = reader.load_sheet(0).to_polars()
    pl_assert_frame_equal(polars_from_eager, polars_from_lazy)


def test_multiple_sheets_with_unnamed_columns() -> None:
    """Eager and lazy loading of the "With unnamed columns" sheet must agree.

    The sheet is addressed by name, and the results are compared both as pandas
    and as polars frames.
    """
    # Single source of truth for the sheet under test (was repeated four times).
    sheet_name = "With unnamed columns"
    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

    eager_pandas = excel_reader.load_sheet_eager(sheet_name).to_pandas()
    lazy_pandas = excel_reader.load_sheet(sheet_name).to_pandas()
    pd_assert_frame_equal(eager_pandas, lazy_pandas)

    eager_polars = pl.from_arrow(data=excel_reader.load_sheet_eager(sheet_name))
    # `pl.from_arrow` must have produced a DataFrame for the comparison below.
    assert isinstance(eager_polars, pl.DataFrame)
    lazy_polars = excel_reader.load_sheet(sheet_name).to_polars()
    pl_assert_frame_equal(eager_polars, lazy_polars)
99 changes: 96 additions & 3 deletions src/types/excelreader.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
use std::fmt::Debug;
use std::{fs::File, io::BufReader};

use anyhow::{Context, Result};
use calamine::{open_workbook_auto, Reader, Sheets};
use pyo3::{pyclass, pymethods};
use arrow::pyarrow::PyArrowConvert;
use arrow::record_batch::RecordBatch;
use calamine::{open_workbook_auto, CellType, DataTypeTrait, Range, Reader, Sheets};
use pyo3::prelude::PyObject;
use pyo3::{pyclass, pymethods, PyResult, Python};

use crate::types::excelsheet::sheet_column_names_from_header_and_range;
use crate::utils::arrow::arrow_schema_from_column_names_and_range;

use super::excelsheet::record_batch_from_data_and_schema;
use super::{
excelsheet::{Header, Pagination},
ExcelSheet,
Expand All @@ -30,6 +38,33 @@ impl ExcelReader {
path: path.to_owned(),
})
}

/// Builds an arrow [`RecordBatch`] straight from a worksheet range.
///
/// Column names are derived from `header` and `data`. The first row handed to
/// the batch builder is `header.offset() + pagination.offset()`; the row bound
/// is that offset plus `pagination.n_rows()` when a row count was requested,
/// capped at `data.height()`, and `data.height()` otherwise.
///
/// # Errors
///
/// Returns an error if the arrow schema cannot be built from the range, or if
/// `record_batch_from_data_and_schema` fails.
fn load_sheet_eager<DT: CellType + DataTypeTrait + Debug>(
    data: Range<DT>,
    pagination: Pagination,
    header: Header,
) -> Result<RecordBatch> {
    let column_names = sheet_column_names_from_header_and_range(&header, &data);

    let offset = header.offset() + pagination.offset();
    let upper_bound = data.height();
    // `n_rows` comes from the caller and may be arbitrarily large: a plain
    // `offset + n_rows` could overflow (panic in debug, wrap to a bogus small
    // limit in release). Saturate, then cap at the sheet height.
    let limit = pagination.n_rows().map_or(upper_bound, |n_rows| {
        offset.saturating_add(n_rows).min(upper_bound)
    });

    let schema = arrow_schema_from_column_names_and_range(&data, &column_names, offset)
        .with_context(|| "could not build arrow schema")?;

    record_batch_from_data_and_schema(schema, &data, offset, limit)
}
}

#[pymethods]
Expand Down Expand Up @@ -57,14 +92,43 @@ impl ExcelReader {
let range = self
.sheets
.worksheet_range(&name)
.with_context(|| format!("Sheet {name} not found"))?
.with_context(|| format!("Error while loading sheet {name}"))?;

let header = Header::new(header_row, column_names);
let pagination = Pagination::new(skip_rows, n_rows, &range)?;

Ok(ExcelSheet::new(name, range, header, pagination))
}

#[pyo3(signature = (
name,
*,
header_row = 0,
column_names = None,
skip_rows = 0,
n_rows = None
))]
pub fn load_sheet_by_name_eager(
&mut self,
name: String,
header_row: Option<usize>,
column_names: Option<Vec<String>>,
skip_rows: usize,
n_rows: Option<usize>,
py: Python<'_>,
) -> PyResult<PyObject> {
let range = self
.sheets
.worksheet_range(&name)
.with_context(|| format!("Error while loading sheet {name}"))?;

let header = Header::new(header_row, column_names);
let pagination = Pagination::new(skip_rows, n_rows, &range)?;
let rb = ExcelReader::load_sheet_eager(range, pagination, header)
.with_context(|| "could not load sheet eagerly")?;
rb.to_pyarrow(py)
}

#[pyo3(signature = (
idx,
*,
Expand Down Expand Up @@ -101,4 +165,33 @@ impl ExcelReader {
let pagination = Pagination::new(skip_rows, n_rows, &range)?;
Ok(ExcelSheet::new(name, range, header, pagination))
}

#[pyo3(signature = (
idx,
*,
header_row = 0,
column_names = None,
skip_rows = 0,
n_rows = None)
)]
pub fn load_sheet_by_idx_eager(
&mut self,
idx: usize,
header_row: Option<usize>,
column_names: Option<Vec<String>>,
skip_rows: usize,
n_rows: Option<usize>,
py: Python<'_>,
) -> PyResult<PyObject> {
let range = self
.sheets
.worksheet_range_at(idx)
.with_context(|| format!("Sheet at idx {idx} not found"))?
.with_context(|| format!("Error while loading sheet at idx {idx}"))?;
let header = Header::new(header_row, column_names);
let pagination = Pagination::new(skip_rows, n_rows, &range)?;
let rb = ExcelReader::load_sheet_eager(range, pagination, header)
.with_context(|| "could not load sheet eagerly")?;
rb.to_pyarrow(py)
}
}
Loading

0 comments on commit 5cc31be

Please sign in to comment.