feat: introduce eager loading functions

Signed-off-by: Luka Peschke <[email protected]>
ToucanToco · Dec 22, 2023 · 5cc31be · 5cc31be
1 parent 0a97e8d
commit 5cc31be
Show file tree

Hide file tree

Showing 8 changed files with 307 additions and 115 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -10,7 +10,8 @@ crate-type = ["cdylib"]
 
 [dependencies]
 anyhow = "1.0.76"
-calamine = { version = "0.23.0", features = ["dates"] }
+calamine = { git = "https://github.com/lukapeschke/calamine", branch = "public-data-type-ref", features = ["dates"] }
+# calamine = { version = "0.23.1", features = ["dates"] }
 chrono = { version = "0.4.31", default-features = false }
 pyo3 = { version = "0.18.3", features = ["extension-module", "anyhow"] }
 

diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
@@ -167,6 +167,37 @@ def load_sheet(
             )
         )
 
+    def load_sheet_eager(
+        self,
+        idx_or_name: int | str,
+        *,
+        header_row: int | None = 0,
+        column_names: list[str] | None = None,
+        skip_rows: int = 0,
+        n_rows: int | None = None,
+    ) -> pa.RecordBatch:
+        """Eagerly loads a sheet and returns its content as a `pyarrow.RecordBatch`.
+
+        The sheet is looked up by index if `idx_or_name` is an integer, and by name otherwise.
+
+        See `load_sheet_by_idx` and `load_sheet_by_name` for parameter documentation.
+        """
+        if isinstance(idx_or_name, int):
+            return self._reader.load_sheet_by_idx_eager(
+                idx_or_name,
+                header_row=header_row,
+                column_names=column_names,
+                skip_rows=skip_rows,
+                n_rows=n_rows,
+            )
+        return self._reader.load_sheet_by_name_eager(
+            idx_or_name,
+            header_row=header_row,
+            column_names=column_names,
+            skip_rows=skip_rows,
+            n_rows=n_rows,
+        )
+
     def __repr__(self) -> str:
         return self._reader.__repr__()
 

diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
@@ -31,6 +31,15 @@ class _ExcelReader:
         skip_rows: int = 0,
         n_rows: int | None = None,
     ) -> _ExcelSheet: ...
+    def load_sheet_by_name_eager(
+        self,
+        name: str,
+        *,
+        header_row: int | None = 0,
+        column_names: list[str] | None = None,
+        skip_rows: int = 0,
+        n_rows: int | None = None,
+    ) -> pa.RecordBatch: ...  # returns an arrow RecordBatch directly instead of an `_ExcelSheet`
     def load_sheet_by_idx(
         self,
         idx: int,
@@ -40,6 +49,15 @@ class _ExcelReader:
         skip_rows: int = 0,
         n_rows: int | None = None,
     ) -> _ExcelSheet: ...
+    def load_sheet_by_idx_eager(
+        self,
+        idx: int,
+        *,
+        header_row: int | None = 0,
+        column_names: list[str] | None = None,
+        skip_rows: int = 0,
+        n_rows: int | None = None,
+    ) -> pa.RecordBatch: ...  # returns an arrow RecordBatch directly instead of an `_ExcelSheet`
     def load_sheet(
         self,
         idx_or_name: int | str,

diff --git a/python/tests/test_eagerness.py b/python/tests/test_eagerness.py
@@ -0,0 +1,32 @@
+import fastexcel
+
+from utils import path_for_fixture
+import polars as pl
+from pandas.testing import assert_frame_equal as pd_assert_frame_equal
+from polars.testing import assert_frame_equal as pl_assert_frame_equal
+
+
+def test_load_sheet_eager_single_sheet() -> None:
+    """Eager and lazy loading of a single-sheet workbook must produce identical frames."""
+    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
+
+    pandas_from_eager = reader.load_sheet_eager(0).to_pandas()
+    pd_assert_frame_equal(pandas_from_eager, reader.load_sheet(0).to_pandas())
+
+    polars_from_eager = pl.from_arrow(data=reader.load_sheet_eager(0))
+    # `pl.from_arrow` is typed as returning a DataFrame or a Series: narrow before comparing
+    assert isinstance(polars_from_eager, pl.DataFrame)
+    pl_assert_frame_equal(polars_from_eager, reader.load_sheet(0).to_polars())
+
+
+def test_multiple_sheets_with_unnamed_columns() -> None:
+    """Eager and lazy loading agree for a sheet selected by name in a multi-sheet workbook."""
+    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))
+    sheet_name = "With unnamed columns"
+
+    eager_pandas = excel_reader.load_sheet_eager(sheet_name).to_pandas()
+    pd_assert_frame_equal(eager_pandas, excel_reader.load_sheet(sheet_name).to_pandas())
+
+    eager_polars = pl.from_arrow(data=excel_reader.load_sheet_eager(sheet_name))
+    assert isinstance(eager_polars, pl.DataFrame)
+    pl_assert_frame_equal(eager_polars, excel_reader.load_sheet(sheet_name).to_polars())
diff --git a/src/types/excelreader.rs b/src/types/excelreader.rs
@@ -1,9 +1,17 @@
+use std::fmt::Debug;
 use std::{fs::File, io::BufReader};
 
 use anyhow::{Context, Result};
-use calamine::{open_workbook_auto, Reader, Sheets};
-use pyo3::{pyclass, pymethods};
+use arrow::pyarrow::PyArrowConvert;
+use arrow::record_batch::RecordBatch;
+use calamine::{open_workbook_auto, CellType, DataTypeTrait, Range, Reader, Sheets};
+use pyo3::prelude::PyObject;
+use pyo3::{pyclass, pymethods, PyResult, Python};
 
+use crate::types::excelsheet::sheet_column_names_from_header_and_range;
+use crate::utils::arrow::arrow_schema_from_column_names_and_range;
+
+use super::excelsheet::record_batch_from_data_and_schema;
 use super::{
     excelsheet::{Header, Pagination},
     ExcelSheet,
@@ -30,6 +38,33 @@ impl ExcelReader {
             path: path.to_owned(),
         })
     }
+
+    /// Eagerly converts a sheet range into an arrow `RecordBatch`.
+    ///
+    /// Data starts at `header.offset() + pagination.offset()`; the limit is that offset plus
+    /// `pagination.n_rows()` when set, capped at `data.height()`, else `data.height()`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the arrow schema or the record batch cannot be built.
+    fn load_sheet_eager<DT: CellType + DataTypeTrait + Debug>(
+        data: Range<DT>,
+        pagination: Pagination,
+        header: Header,
+    ) -> Result<RecordBatch> {
+        let column_names = sheet_column_names_from_header_and_range(&header, &data);
+
+        let offset = header.offset() + pagination.offset();
+        let upper_bound = data.height();
+        // `offset + n_rows` could overflow `usize`: saturate so the cap below still applies.
+        let limit = pagination
+            .n_rows()
+            .map_or(upper_bound, |n_rows| offset.saturating_add(n_rows).min(upper_bound));
+        let schema = arrow_schema_from_column_names_and_range(&data, &column_names, offset)
+            .with_context(|| "could not build arrow schema")?;
+
+        record_batch_from_data_and_schema(schema, &data, offset, limit)
+    }
 }
 
 #[pymethods]
@@ -57,14 +92,43 @@ impl ExcelReader {
         let range = self
             .sheets
             .worksheet_range(&name)
-            .with_context(|| format!("Sheet {name} not found"))?
             .with_context(|| format!("Error while loading sheet {name}"))?;
 
         let header = Header::new(header_row, column_names);
         let pagination = Pagination::new(skip_rows, n_rows, &range)?;
+
         Ok(ExcelSheet::new(name, range, header, pagination))
     }
 
+    // Loads the sheet named `name` into a `RecordBatch` and hands it to Python as a pyarrow object.
+    #[pyo3(signature = (
+        name,
+        *,
+        header_row = 0,
+        column_names = None,
+        skip_rows = 0,
+        n_rows = None
+    ))]
+    pub fn load_sheet_by_name_eager(
+        &mut self,
+        name: String,
+        header_row: Option<usize>,
+        column_names: Option<Vec<String>>,
+        skip_rows: usize,
+        n_rows: Option<usize>,
+        py: Python<'_>,
+    ) -> PyResult<PyObject> {
+        let sheet_range = self
+            .sheets
+            .worksheet_range(&name)
+            .with_context(|| format!("Error while loading sheet {name}"))?;
+        let header = Header::new(header_row, column_names);
+        let pagination = Pagination::new(skip_rows, n_rows, &sheet_range)?;
+        ExcelReader::load_sheet_eager(sheet_range, pagination, header)
+            .with_context(|| "could not load sheet eagerly")?
+            .to_pyarrow(py)
+    }
+
     #[pyo3(signature = (
         idx,
         *,
@@ -101,4 +165,33 @@ impl ExcelReader {
         let pagination = Pagination::new(skip_rows, n_rows, &range)?;
         Ok(ExcelSheet::new(name, range, header, pagination))
     }
+
+    #[pyo3(signature = (
+        idx,
+        *,
+        header_row = 0,
+        column_names = None,
+        skip_rows = 0,
+        n_rows = None
+    ))]
+    pub fn load_sheet_by_idx_eager(
+        &mut self,
+        idx: usize,
+        header_row: Option<usize>,
+        column_names: Option<Vec<String>>,
+        skip_rows: usize,
+        n_rows: Option<usize>,
+        py: Python<'_>,
+    ) -> PyResult<PyObject> {
+        let sheet_range = self
+            .sheets
+            .worksheet_range_at(idx)
+            .with_context(|| format!("Sheet at idx {idx} not found"))?
+            .with_context(|| format!("Error while loading sheet at idx {idx}"))?;
+        let header = Header::new(header_row, column_names);
+        let pagination = Pagination::new(skip_rows, n_rows, &sheet_range)?;
+        ExcelReader::load_sheet_eager(sheet_range, pagination, header)
+            .with_context(|| "could not load sheet eagerly")?
+            .to_pyarrow(py)
+    }
 }