diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 1f79c824e46e..8af5d9069643 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -62,7 +62,10 @@ def _sources(source: FileSource) -> tuple[Any, bool]: if isinstance(src, (str, os.PathLike)) and not Path(src).exists(): src = os.path.expanduser(str(src)) # noqa: PTH111 sources.extend(files := glob(src, recursive=True)) # noqa: PTH207 - read_multiple_workbooks = bool(files) + if not files: + msg = f"no workbook found at path {src!r}" + raise FileNotFoundError(msg) + read_multiple_workbooks = True else: if isinstance(src, os.PathLike): src = str(src) @@ -107,6 +110,7 @@ def read_excel( *, sheet_id: None = ..., sheet_name: str, + table_name: str | None = ..., engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., read_options: dict[str, Any] | None = ..., @@ -127,6 +131,7 @@ def read_excel( *, sheet_id: None = ..., sheet_name: None = ..., + table_name: str | None = ..., engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., has_header: bool = ..., @@ -147,6 +152,7 @@ def read_excel( *, sheet_id: int, sheet_name: str, + table_name: str | None = ..., engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., read_options: dict[str, Any] | None = ..., @@ -169,6 +175,7 @@ def read_excel( *, sheet_id: Literal[0] | Sequence[int], sheet_name: None = ..., + table_name: str | None = ..., engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., read_options: dict[str, Any] | None = ..., @@ -189,6 +196,7 @@ def read_excel( *, sheet_id: int, sheet_name: None = ..., + table_name: str | None = ..., engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., read_options: dict[str, Any] | None = ..., @@ -209,6 +217,7 @@ def read_excel( *, sheet_id: None, sheet_name: list[str] | tuple[str], + table_name: str | None = ..., engine: ExcelSpreadsheetEngine = ..., engine_options: dict[str, Any] | None = ..., read_options: dict[str, Any] | None = ..., @@ -230,6 +239,7 @@ def read_excel( *, sheet_id: int | Sequence[int] | None = None, sheet_name: str | list[str] | tuple[str] | None = None, + table_name: str | None = None, engine: ExcelSpreadsheetEngine = "calamine", engine_options: dict[str, Any] | None = None, read_options: dict[str, Any] | None = None, @@ -268,6 +278,11 @@ def read_excel( sheet_name Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If more than one is given then a `{sheetname:frame,}` dict is returned. + table_name + Name of a specific table to read; note that table names are unique across + the workbook, so additionally specifying a sheet id or name is optional; + if one of those parameters *is* specified, an error will be raised if + the named table is not found in that particular sheet. engine : {'calamine', 'xlsx2csv', 'openpyxl'} Library used to parse the spreadsheet file; defaults to "calamine". @@ -382,6 +397,7 @@ def read_excel( src, sheet_id=sheet_id, sheet_name=sheet_name, + table_name=table_name, engine=engine, engine_options=engine_options, read_options=read_options, @@ -594,6 +610,7 @@ def read_ods( src, sheet_id=sheet_id, sheet_name=sheet_name, + table_name=None, engine="calamine", engine_options={}, read_options=None, @@ -619,6 +636,7 @@ def _read_spreadsheet( *, sheet_id: int | Sequence[int] | None, sheet_name: str | Sequence[str] | None, + table_name: str | None, engine: ExcelSpreadsheetEngine, engine_options: dict[str, Any] | None = None, read_options: dict[str, Any] | None = None, @@ -653,7 +671,7 @@ def _read_spreadsheet( try: # parse data from the indicated sheet(s) sheet_names, return_multiple_sheets = _get_sheet_names( - sheet_id, sheet_name, worksheets + sheet_id, sheet_name, table_name, worksheets ) parsed_sheets = { name: reader_fn( @@ -663,6 +681,7 @@ def _read_spreadsheet( read_options=read_options, raise_if_empty=raise_if_empty, columns=columns, + table_name=table_name, drop_empty_rows=drop_empty_rows, drop_empty_cols=drop_empty_cols, ) @@ -698,6 +717,7 @@ def _get_read_options( ) -> dict[str, Any]: """Normalise top-level parameters to engine-specific 'read_options' dict.""" read_options = (read_options or {}).copy() + if engine == "calamine": if ("use_columns" in read_options) and columns: msg = 'cannot specify both `columns` and `read_options["use_columns"]`' @@ -744,6 +764,7 @@ def _get_read_options( def _get_sheet_names( sheet_id: int | Sequence[int] | None, sheet_name: str | Sequence[str] | None, + table_name: str | None, worksheets: list[dict[str, Any]], ) -> tuple[list[str], bool]: """Establish sheets to read; indicate if we are returning a dict frames.""" @@ -753,7 +774,8 @@ def _get_sheet_names( sheet_names = [] if sheet_id is None and sheet_name is None: - sheet_names.append(worksheets[0]["name"]) + name = None if table_name else worksheets[0]["name"] + sheet_names.append(name) return_multiple_sheets = False elif sheet_id == 0: sheet_names.extend(ws["name"] for ws in worksheets) @@ -781,11 +803,12 @@ def _get_sheet_names( if (sheet_id == 0 or ws["index"] in ids or ws["name"] in names) } for idx in ids: - if (name := sheet_names_by_idx.get(idx)) is None: # type: ignore[assignment] + if (name := sheet_names_by_idx.get(idx)) is None: msg = f"no matching sheet found when `sheet_id` is {idx}" raise ValueError(msg) sheet_names.append(name) - return sheet_names, return_multiple_sheets + + return sheet_names, return_multiple_sheets # type: ignore[return-value] def _initialise_spreadsheet_parser( @@ -970,6 +993,7 @@ def _read_spreadsheet_openpyxl( read_options: dict[str, Any], schema_overrides: SchemaDict | None, columns: Sequence[int] | Sequence[str] | None, + table_name: str | None = None, drop_empty_rows: bool, drop_empty_cols: bool, raise_if_empty: bool, @@ -978,13 +1002,29 @@ def _read_spreadsheet_openpyxl( infer_schema_length = read_options.pop("infer_schema_length", None) has_header = read_options.pop("has_header", True) no_inference = infer_schema_length == 0 - ws = parser[sheet_name] + header: list[str | None] = [] + + if table_name and not sheet_name: + sheet_name, n_tables = None, 0 + for sheet in parser.worksheets: + n_tables += 1 + if table_name in sheet.tables: + ws, sheet_name = sheet, sheet.title + break + if sheet_name is None: + msg = ( + f"table named {table_name!r} not found in sheet {sheet_name!r}" + if n_tables + else f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})" + ) + raise RuntimeError(msg) + else: + ws = parser[sheet_name] # prefer detection of actual table objects; otherwise read # data in the used worksheet range, dropping null columns - header: list[str | None] = [] if tables := getattr(ws, "tables", None): - table = next(iter(tables.values())) + table = tables[table_name] if table_name else next(iter(tables.values())) rows = list(ws[table.ref]) if not rows: return _empty_frame(raise_if_empty) @@ -995,6 +1035,9 @@ def _read_spreadsheet_openpyxl( if table.totalsRowCount: rows = rows[: -table.totalsRowCount] rows_iter = rows + elif table_name: + msg = f"no named tables found in sheet {sheet_name!r} (looking for {table_name!r})" + raise RuntimeError(msg) else: if not has_header: if not (rows_iter := list(ws.iter_rows())): @@ -1046,6 +1089,7 @@ def _read_spreadsheet_calamine( read_options: dict[str, Any], schema_overrides: SchemaDict | None, columns: Sequence[int] | Sequence[str] | None, + table_name: str | None = None, drop_empty_rows: bool, drop_empty_cols: bool, raise_if_empty: bool, @@ -1056,11 +1100,14 @@ def _read_spreadsheet_calamine( fastexcel_version = parse_version(original_version := fastexcel.__version__) if fastexcel_version < (0, 9) and "schema_sample_rows" in read_options: - msg = f"a more recent version of `fastexcel` is required (>= 0.9; found {original_version})" + msg = f"a more recent version of `fastexcel` is required for 'schema_sample_rows' (>= 0.9; found {original_version})" raise ModuleUpgradeRequiredError(msg) if fastexcel_version < (0, 10, 2) and "use_columns" in read_options: - msg = f"a more recent version of `fastexcel` is required (>= 0.10.2; found {original_version})" + msg = f"a more recent version of `fastexcel` is required for 'use_columns' (>= 0.10.2; found {original_version})" raise ModuleUpgradeRequiredError(msg) + if table_name and fastexcel_version < (0, 12): + msg = f"a more recent version of `fastexcel` is required for 'table_name' (>= 0.12.0; found {original_version})" + raise ValueError(msg) if columns: read_options["use_columns"] = columns @@ -1069,6 +1116,7 @@ def _read_spreadsheet_calamine( if read_options.get("schema_sample_rows") == 0: # ref: https://github.com/ToucanToco/fastexcel/issues/236 read_options["dtypes"] = dict.fromkeys(range(16384), "string") + elif schema_overrides and fastexcel_version >= (0, 10): parser_dtypes = read_options.get("dtypes", {}) for name, dtype in schema_overrides.items(): @@ -1094,8 +1142,16 @@ def _read_spreadsheet_calamine( ws = parser.load_sheet_by_name(name=sheet_name, **read_options) df = ws.to_polars() else: - ws_arrow = parser.load_sheet_eager(sheet_name, **read_options) - df = from_arrow(ws_arrow) + if table_name: + xl_table = parser.load_table(table_name, **read_options) + if sheet_name and sheet_name != xl_table.sheet_name: + msg = f"table named {table_name!r} not found in sheet {sheet_name!r}" + raise RuntimeError(msg) + df = xl_table.to_polars() + else: + ws_arrow = parser.load_sheet_eager(sheet_name, **read_options) + df = from_arrow(ws_arrow) + if read_options.get("header_row", False) is None and not read_options.get( "column_names" ): @@ -1153,13 +1209,17 @@ def _read_spreadsheet_xlsx2csv( read_options: dict[str, Any], schema_overrides: SchemaDict | None, columns: Sequence[int] | Sequence[str] | None, + table_name: str | None = None, drop_empty_rows: bool, drop_empty_cols: bool, raise_if_empty: bool, ) -> pl.DataFrame: """Use the 'xlsx2csv' library to read data from the given worksheet.""" - csv_buffer = StringIO() + if table_name: + msg = "the `table_name` parameter is not supported by the 'xlsx2csv' engine" + raise ValueError(msg) + csv_buffer = StringIO() with warnings.catch_warnings(): # xlsx2csv version 0.8.4 throws a DeprecationWarning in Python 3.13 # https://github.com/dilshod/xlsx2csv/pull/287 diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index c1838351d86d..aae667312109 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -4,6 +4,7 @@ from collections import OrderedDict from datetime import date, datetime from io import BytesIO +from pathlib import Path from typing import TYPE_CHECKING, Any, Callable import pytest @@ -16,7 +17,6 @@ if TYPE_CHECKING: from collections.abc import Sequence - from pathlib import Path from polars._typing import ExcelSpreadsheetEngine, SchemaDict, SelectorType @@ -859,6 +859,41 @@ def test_excel_write_compound_types(engine: ExcelSpreadsheetEngine) -> None: ] +def test_excel_read_named_table_with_total_row(tmp_path: Path) -> None: + df = pl.DataFrame( + { + "x": ["aa", "bb", "cc"], + "y": [100, 325, -250], + "z": [975, -444, 123], + } + ) + # when we read back a named table object with a total row we expect the read + # to automatically omit that row as it is *not* part of the actual table data + wb_path = Path(tmp_path).joinpath("test_named_table_read.xlsx") + df.write_excel( + wb_path, + worksheet="data", + table_name="PolarsFrameTable", + column_totals=True, + ) + for engine in ("calamine", "openpyxl"): + xldf = pl.read_excel(wb_path, table_name="PolarsFrameTable", engine=engine) + assert_frame_equal(df, xldf) + + # xlsx2csv doesn't support reading named tables, so we see the + # column total if we don't filter it out after reading the data + with pytest.raises( + ValueError, + match="the `table_name` parameter is not supported by the 'xlsx2csv' engine", + ): + pl.read_excel(wb_path, table_name="PolarsFrameTable", engine="xlsx2csv") + + xldf = pl.read_excel(wb_path, sheet_name="data", engine="xlsx2csv") + assert_frame_equal(df, xldf.head(3)) + assert xldf.height == 4 + assert xldf.row(3) == (None, 0, 0) + + @pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl", "calamine"]) def test_excel_read_no_headers(engine: ExcelSpreadsheetEngine) -> None: df = pl.DataFrame( @@ -960,18 +995,24 @@ def test_excel_write_multiple_tables() -> None: # write multiple frames to multiple worksheets xls = BytesIO() with Workbook(xls) as wb: - df.write_excel(workbook=wb, worksheet="sheet1", position="A1") - df.write_excel(workbook=wb, worksheet="sheet1", position="A6") - df.write_excel(workbook=wb, worksheet="sheet2", position="A1") + df.rename({"colx": "colx0", "coly": "coly0", "colz": "colz0"}).write_excel( + workbook=wb, worksheet="sheet1", position="A1" + ) + df.rename({"colx": "colx1", "coly": "coly1", "colz": "colz1"}).write_excel( + workbook=wb, worksheet="sheet1", position="X10" + ) + df.rename({"colx": "colx2", "coly": "coly2", "colz": "colz2"}).write_excel( + workbook=wb, worksheet="sheet2", position="C25" + ) - # validate integration of externally-added formats + # also validate integration of externally-added formats fmt = wb.add_format({"bg_color": "#ffff00"}) - df.write_excel( + df.rename({"colx": "colx3", "coly": "coly3", "colz": "colz3"}).write_excel( workbook=wb, worksheet="sheet3", - position="A1", + position="D4", conditional_formats={ - "colz": { + "colz3": { "type": "formula", "criteria": "=C2=B2", "format": fmt, @@ -979,14 +1020,38 @@ def test_excel_write_multiple_tables() -> None: }, ) - table_names: set[str] = set() - for sheet in ("sheet1", "sheet2", "sheet3"): - table_names.update( - tbl["name"] for tbl in wb.get_worksheet_by_name(sheet).tables - ) + table_names = { + tbl["name"] + for sheet in wb.sheetnames + for tbl in wb.get_worksheet_by_name(sheet).tables + } assert table_names == {f"Frame{n}" for n in range(4)} assert pl.read_excel(xls, sheet_name="sheet3").rows() == [] + # test loading one of the written tables by name + for engine in ("calamine", "openpyxl"): + df1 = pl.read_excel( + xls, + sheet_name="sheet2", + table_name="Frame2", + engine=engine, + ) + df2 = pl.read_excel( + xls, + table_name="Frame2", + engine=engine, + ) + assert df1.columns == ["colx2", "coly2", "colz2"] + assert_frame_equal(df1, df2) + + # if we supply a sheet name (which is optional when using `table_name`), + # then the table name must be present in *that* sheet, or we raise an error + with pytest.raises( + RuntimeError, + match="table named 'Frame3' not found in sheet 'sheet1'", + ): + pl.read_excel(xls, sheet_name="sheet1", table_name="Frame3") + def test_excel_write_worksheet_object() -> None: # write to worksheet object