From 2306b877d8e12c243f05f55c9b1fa19302cae5ba Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 6 Nov 2024 05:13:56 -0800 Subject: [PATCH 1/4] Add a regression test demonstrating how to use duckdb in the presence of extension types --- python/python/tests/test_integration.py | 79 ++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/python/python/tests/test_integration.py b/python/python/tests/test_integration.py index 399565f102..49f13475fc 100644 --- a/python/python/tests/test_integration.py +++ b/python/python/tests/test_integration.py @@ -21,6 +21,54 @@ def test_duckdb_filter_on_rowid(tmp_path): assert actual.to_pydict() == expected.to_pydict() +class DuckDataset(pa.dataset.Dataset): + """ + Hacky way to wrap a lance dataset hide extension types. + + Usage: + >>> scanner = DuckDataset(lance.dataset("my_dataset.lance")) + >>> duckdb.sql("SELECT uid FROM scanner LIMIT 10;") + """ + + def __init__(self, ds): + self._ds = ds + fields = [ds.schema.field(i) for i in range(len(ds.schema))] + fields = [f.remove_metadata() for f in fields] + self.pruned_schema = pa.schema(fields) + + @staticmethod + def _filter_field(field: pa.Field) -> bool: + # Filter metadata + UNSUPPORTED_METADATA = [ + { + b"ARROW:extension:metadata": b"", + b"ARROW:extension:name": b"lance.arrow.encoded_image", + }, + { + b"ARROW:extension:metadata": b"", + b"ARROW:extension:name": b"lance.arrow.image_uri", + }, + ] + metadata_is_supported = all(field.metadata != m for m in UNSUPPORTED_METADATA) + type_is_supported = field.type not in { + pa.large_binary(), + } + return metadata_is_supported and type_is_supported + + @property + def schema(self): + return self._schema + + def __getattribute__(self, attr): + if attr == "schema": + return object.__getattribute__(self, "pruned_schema") + elif attr == "_filter_field": + return object.__getattribute__(self, "_filter_field") + else: + ds = super().__getattribute__("_ds") + return object.__getattribute__(ds, attr) + + def test_duckdb_pushdown_extension_types(tmp_path): # large_binary is reported by pyarrow as a substrait extension type. Datafusion # does not currently handle these extension types. This should be ok as long @@ -28,14 +76,41 @@ def test_duckdb_pushdown_extension_types(tmp_path): # # Lance works around this by removing any columns with extension types from the # schema it gives to duckdb. + # + # image is an extension type. DuckDb currently rejects anything that's an extension + # type. We can clumsily work around this by pretending its not an extension type. tab = pa.table( { "filterme": [1, 2, 3], "largebin": pa.array([b"123", b"456", b"789"], pa.large_binary()), "othercol": [4, 5, 6], - } + "image": pa.array( + [b"123", b"456", b"789"], + pa.binary(), + ), + }, + schema=pa.schema( + [ + pa.field("filterme", pa.int64()), + pa.field("largebin", pa.large_binary()), + pa.field("othercol", pa.int64()), + pa.field( + "image", + pa.binary(), + metadata={ + b"ARROW:extension:metadata": b"", + b"ARROW:extension:name": b"lance.arrow.encoded_image", + }, + ), + ] + ), ) ds = lance.write_dataset(tab, str(tmp_path)) # noqa: F841 + ds = DuckDataset(ds) + for field in ds.schema: + print(field) + print(field.metadata) + expected = tab.slice(1, 1) actual = duckdb.query("SELECT * FROM ds WHERE filterme = 2").fetch_arrow_table() assert actual.to_pydict() == expected.to_pydict() @@ -55,6 +130,8 @@ def test_duckdb_pushdown_extension_types(tmp_path): ): duckdb.query("SELECT * FROM ds WHERE largebin = '456'").fetchall() + # Need to subtract extension types from tab to use as our expected results + tab = pa.table(tab.columns, schema=ds.schema) # Unclear if all of these result in pushdown or not but they shouldn't error if # they do. for filt in [ From a1483e7cc3fec3e10414d4d9b797990aca4544c0 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 6 Nov 2024 05:15:24 -0800 Subject: [PATCH 2/4] Remove debug prints --- python/python/tests/test_integration.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/python/tests/test_integration.py b/python/python/tests/test_integration.py index 49f13475fc..172efd160f 100644 --- a/python/python/tests/test_integration.py +++ b/python/python/tests/test_integration.py @@ -107,9 +107,6 @@ def test_duckdb_pushdown_extension_types(tmp_path): ) ds = lance.write_dataset(tab, str(tmp_path)) # noqa: F841 ds = DuckDataset(ds) - for field in ds.schema: - print(field) - print(field.metadata) expected = tab.slice(1, 1) actual = duckdb.query("SELECT * FROM ds WHERE filterme = 2").fetch_arrow_table() From 27f58126b16a777f38ccd2222e138599a1dfff58 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 6 Nov 2024 05:20:19 -0800 Subject: [PATCH 3/4] Simplify --- python/python/tests/test_integration.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/python/python/tests/test_integration.py b/python/python/tests/test_integration.py index 172efd160f..a42f145a91 100644 --- a/python/python/tests/test_integration.py +++ b/python/python/tests/test_integration.py @@ -36,34 +36,9 @@ def __init__(self, ds): fields = [f.remove_metadata() for f in fields] self.pruned_schema = pa.schema(fields) - @staticmethod - def _filter_field(field: pa.Field) -> bool: - # Filter metadata - UNSUPPORTED_METADATA = [ - { - b"ARROW:extension:metadata": b"", - b"ARROW:extension:name": b"lance.arrow.encoded_image", - }, - { - b"ARROW:extension:metadata": b"", - b"ARROW:extension:name": b"lance.arrow.image_uri", - }, - ] - metadata_is_supported = all(field.metadata != m for m in UNSUPPORTED_METADATA) - type_is_supported = field.type not in { - pa.large_binary(), - } - return metadata_is_supported and type_is_supported - - @property - def schema(self): - return self._schema - def __getattribute__(self, attr): if attr == "schema": return object.__getattribute__(self, "pruned_schema") - elif attr == "_filter_field": - return object.__getattribute__(self, "_filter_field") else: ds = super().__getattribute__("_ds") return object.__getattribute__(ds, attr) From 4b790fc0039ea12fd5197c0e57f2627608be79d4 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 6 Nov 2024 05:22:45 -0800 Subject: [PATCH 4/4] Add test case ensuring we can query extension column --- python/python/tests/test_integration.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/python/tests/test_integration.py b/python/python/tests/test_integration.py index a42f145a91..4c1082b0ca 100644 --- a/python/python/tests/test_integration.py +++ b/python/python/tests/test_integration.py @@ -91,6 +91,10 @@ def test_duckdb_pushdown_extension_types(tmp_path): actual = duckdb.query("SELECT * FROM ds WHERE othercol = 4").fetch_arrow_table() assert actual.to_pydict() == expected.to_pydict() + expected = pa.table({"max(image)": [b"789"]}) + actual = duckdb.query("SELECT MAX(image) FROM ds").fetch_arrow_table() + assert actual.to_pydict() == expected.to_pydict() + # Not the best error message but hopefully this is short lived until datafusion # supports substrait extension types. with pytest.raises(