From ded97ef6810c027e82ca0306b9681338301bf090 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Sun, 2 Feb 2025 22:04:05 +0000 Subject: [PATCH] PyVortex array subclasses (#2194) Downcast a PyArray to an encoding-specific array when we return it. --- docs/python/api/arrays.rst | 51 +++++++++- docs/python/api/compress.rst | 4 + docs/python/api/encoding.rst | 14 --- docs/python/api/index.rst | 6 +- pyvortex/python/vortex/__init__.py | 25 ++++- pyvortex/python/vortex/_lib/arrays.pyi | 35 ++++++- pyvortex/python/vortex/_lib/encoding.pyi | 8 +- pyvortex/python/vortex/encoding.py | 3 - pyvortex/src/arrays/builtins/chunked.rs | 24 +++++ pyvortex/src/arrays/builtins/constant.rs | 21 +++++ pyvortex/src/arrays/builtins/mod.rs | 38 ++++++++ pyvortex/src/arrays/builtins/struct_.rs | 26 +++++ pyvortex/src/arrays/mod.rs | 115 ++++++++++++++++++++--- pyvortex/src/compress.rs | 4 +- pyvortex/src/dataset.rs | 42 +++++---- pyvortex/src/encoding/bool.rs | 83 ---------------- pyvortex/src/encoding/mod.rs | 16 +--- pyvortex/src/io.rs | 24 ++--- pyvortex/test/test_encoding.py | 21 ----- vortex-array/src/data/mod.rs | 5 +- 20 files changed, 371 insertions(+), 194 deletions(-) create mode 100644 docs/python/api/compress.rst delete mode 100644 docs/python/api/encoding.rst delete mode 100644 pyvortex/python/vortex/encoding.py create mode 100644 pyvortex/src/arrays/builtins/chunked.rs create mode 100644 pyvortex/src/arrays/builtins/constant.rs create mode 100644 pyvortex/src/arrays/builtins/mod.rs create mode 100644 pyvortex/src/arrays/builtins/struct_.rs delete mode 100644 pyvortex/src/encoding/bool.rs delete mode 100644 pyvortex/test/test_encoding.py diff --git a/docs/python/api/arrays.rst b/docs/python/api/arrays.rst index 67feaf0cc6..4f3af642a3 100644 --- a/docs/python/api/arrays.rst +++ b/docs/python/api/arrays.rst @@ -7,10 +7,10 @@ functions, serialized, and otherwise manipulated as a generic array. There are two ways of "downcasting" an array for more specific access patterns: -1. Into an encoding-specific array, like `vortex.encoding.BitPackedArray`.vortex. +1. Into an encoding-specific array, like `vortex.BitPackedArray`.vortex. 2. Into a type-specific array, like `vortex.array.BoolTypeArray`. -Be careful to note that :class:`vortex.encoding.BoolArray` represents an array that stores physical data +Be careful to note that :class:`vortex.BoolArray` represents an array that stores physical data as a bit-buffer of booleans, vs `vortex.array.BoolTypeArray` which represents any array that has a logical type of boolean. @@ -20,9 +20,52 @@ Factory Functions .. autofunction:: vortex.array -Type Classes ------------- +Base Class +---------- .. autoclass:: vortex.Array :members: :special-members: __len__ + + +Builtin Encodings +----------------- + +.. autoclass:: vortex.ChunkedArray + :members: + + +.. autoclass:: vortex.ConstantArray + :members: + + +.. autoclass:: vortex.NullArray + :members: + + +.. autoclass:: vortex.BoolArray + :members: + + +.. autoclass:: vortex.PrimitiveArray + :members: + + +.. autoclass:: vortex.VarBinArray + :members: + + +.. autoclass:: vortex.VarBinViewArray + :members: + + +.. autoclass:: vortex.StructArray + :members: + + +.. autoclass:: vortex.ListArray + :members: + + +.. autoclass:: vortex.ExtensionArray + :members: diff --git a/docs/python/api/compress.rst b/docs/python/api/compress.rst new file mode 100644 index 0000000000..cd5803b706 --- /dev/null +++ b/docs/python/api/compress.rst @@ -0,0 +1,4 @@ +Compression +=========== + +.. autofunction:: vortex.compress diff --git a/docs/python/api/encoding.rst b/docs/python/api/encoding.rst deleted file mode 100644 index 681a8156cc..0000000000 --- a/docs/python/api/encoding.rst +++ /dev/null @@ -1,14 +0,0 @@ -Encodings -========= - -Vortex arrays have both a logical data type and a physical encoding. Arrays in PyVortex are downcast to their -specific physical encoding where such a Python class exists, otherwise a base :class:`~vortex.Array` is used. - -Each encoding-specific class may have additional methods and properties that are specific to that encoding. -To be concise, we do not show the base class methods in this encoding-specific class documentation. - -.. autofunction:: vortex.compress - -.. autoclass:: vortex.encoding.BoolArray - :members: - :show-inheritance: \ No newline at end of file diff --git a/docs/python/api/index.rst b/docs/python/api/index.rst index 6d26b0ea85..cf35c4a172 100644 --- a/docs/python/api/index.rst +++ b/docs/python/api/index.rst @@ -6,8 +6,8 @@ Python API arrays dtypes - encoding + scalars + expr + compress io dataset - expr - scalars diff --git a/pyvortex/python/vortex/__init__.py b/pyvortex/python/vortex/__init__.py index 0d1fff0a94..2138a41d41 100644 --- a/pyvortex/python/vortex/__init__.py +++ b/pyvortex/python/vortex/__init__.py @@ -1,4 +1,16 @@ from . import _lib +from ._lib.arrays import ( + BoolArray, + ChunkedArray, + ConstantArray, + ExtensionArray, + ListArray, + NullArray, + PrimitiveArray, + StructArray, + VarBinArray, + VarBinViewArray, +) from ._lib.compress import compress from ._lib.dtype import ( BinaryDType, @@ -37,9 +49,20 @@ assert _lib, "Ensure we eagerly import the Vortex native library" __all__ = [ - "Array", "array", "compress", + # Arrays and builtin encodings + "Array", + "ConstantArray", + "ChunkedArray", + "NullArray", + "BoolArray", + "PrimitiveArray", + "VarBinArray", + "VarBinViewArray", + "StructArray", + "ListArray", + "ExtensionArray", # DTypes "DType", "NullDType", diff --git a/pyvortex/python/vortex/_lib/arrays.pyi b/pyvortex/python/vortex/_lib/arrays.pyi index d969c4bdfc..eb384b32b5 100644 --- a/pyvortex/python/vortex/_lib/arrays.pyi +++ b/pyvortex/python/vortex/_lib/arrays.pyi @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, final import numpy as np import pandas as pd @@ -32,3 +32,36 @@ class Array: def to_polars_dataframe(self) -> pl.DataFrame: ... def to_polars_series(self) -> pl.Series: ... def to_pylist(self) -> list: ... + +@final +class NullArray(Array): ... + +@final +class BoolArray(Array): ... + +@final +class PrimitiveArray(Array): ... + +@final +class VarBinArray(Array): ... + +@final +class VarBinViewArray(Array): ... + +@final +class StructArray(Array): + def field(self, name: str) -> Array: ... + +@final +class ListArray(Array): ... + +@final +class ExtensionArray(Array): ... + +@final +class ConstantArray(Array): + def scalar(self) -> vx.Scalar: ... + +@final +class ChunkedArray(Array): + def chunks(self) -> list[Array]: ... diff --git a/pyvortex/python/vortex/_lib/encoding.pyi b/pyvortex/python/vortex/_lib/encoding.pyi index 26caacc7a6..66ae41cbba 100644 --- a/pyvortex/python/vortex/_lib/encoding.pyi +++ b/pyvortex/python/vortex/_lib/encoding.pyi @@ -1,11 +1,5 @@ -from typing import Any, final +from typing import Any import pyarrow as pa -import vortex as vx - def _encode(obj: Any) -> pa.Array: ... -@final -class BoolArray(vx.Array): - def __new__(cls, array: vx.Array) -> BoolArray: ... - def true_count(self) -> int: ... diff --git a/pyvortex/python/vortex/encoding.py b/pyvortex/python/vortex/encoding.py deleted file mode 100644 index 37938cc50b..0000000000 --- a/pyvortex/python/vortex/encoding.py +++ /dev/null @@ -1,3 +0,0 @@ -from vortex._lib.encoding import BoolArray - -__all__ = ["BoolArray"] diff --git a/pyvortex/src/arrays/builtins/chunked.rs b/pyvortex/src/arrays/builtins/chunked.rs new file mode 100644 index 0000000000..fb45544c40 --- /dev/null +++ b/pyvortex/src/arrays/builtins/chunked.rs @@ -0,0 +1,24 @@ +use itertools::Itertools; +use pyo3::{pyclass, pymethods, Bound, PyRef, PyResult}; +use vortex::array::ChunkedEncoding; + +use crate::arrays::{ArraySubclass, AsArrayRef, PyArray}; + +/// Concrete class for arrays with `vortex.chunked` encoding. +#[pyclass(name = "ChunkedArray", module = "vortex", extends=PyArray, frozen)] +pub(crate) struct PyChunkedArray; + +impl ArraySubclass for PyChunkedArray { + type Encoding = ChunkedEncoding; +} + +#[pymethods] +impl PyChunkedArray { + pub fn chunks(self_: PyRef<'_, Self>) -> PyResult>> { + self_ + .as_array_ref() + .chunks() + .map(|chunk| PyArray::init(self_.py(), chunk)) + .try_collect() + } +} diff --git a/pyvortex/src/arrays/builtins/constant.rs b/pyvortex/src/arrays/builtins/constant.rs new file mode 100644 index 0000000000..e984868269 --- /dev/null +++ b/pyvortex/src/arrays/builtins/constant.rs @@ -0,0 +1,21 @@ +use pyo3::{pyclass, pymethods, Bound, PyRef, PyResult}; +use vortex::array::ConstantEncoding; + +use crate::arrays::{ArraySubclass, AsArrayRef, PyArray}; +use crate::scalar::PyScalar; + +/// Concrete class for arrays with `vortex.constant` encoding. +#[pyclass(name = "ConstantArray", module = "vortex", extends=PyArray, frozen)] +pub(crate) struct PyConstantArray; + +impl ArraySubclass for PyConstantArray { + type Encoding = ConstantEncoding; +} + +#[pymethods] +impl PyConstantArray { + /// Return the scalar value of the constant array. + pub fn scalar(self_: PyRef<'_, Self>) -> PyResult> { + PyScalar::init(self_.py(), self_.as_array_ref().scalar()) + } +} diff --git a/pyvortex/src/arrays/builtins/mod.rs b/pyvortex/src/arrays/builtins/mod.rs new file mode 100644 index 0000000000..26a9047f2d --- /dev/null +++ b/pyvortex/src/arrays/builtins/mod.rs @@ -0,0 +1,38 @@ +mod chunked; +mod constant; +mod struct_; + +pub(crate) use chunked::*; +pub(crate) use constant::*; +use pyo3::prelude::*; +pub(crate) use struct_::*; + +use crate::arrays::PyArray; + +/// Concrete class for arrays with `vortex.null` encoding. +#[pyclass(name = "NullArray", module = "vortex", extends=PyArray, frozen)] +pub(crate) struct PyNullArray; + +/// Concrete class for arrays with `vortex.bool` encoding. +#[pyclass(name = "BoolArray", module = "vortex", extends=PyArray, frozen)] +pub(crate) struct PyBoolArray; + +/// Concrete class for arrays with `vortex.primitive` encoding. +#[pyclass(name = "PrimitiveArray", module = "vortex", extends=PyArray, frozen)] +pub(crate) struct PyPrimitiveArray; + +/// Concrete class for arrays with `vortex.varbin` encoding. +#[pyclass(name = "VarBinArray", module = "vortex", extends=PyArray, frozen)] +pub(crate) struct PyVarBinArray; + +/// Concrete class for arrays with `vortex.varbinview` encoding. +#[pyclass(name = "VarBinViewArray", module = "vortex", extends=PyArray, frozen)] +pub(crate) struct PyVarBinViewArray; + +/// Concrete class for arrays with `vortex.list` encoding. +#[pyclass(name = "ListArray", module = "vortex", extends=PyArray, frozen)] +pub(crate) struct PyListArray; + +/// Concrete class for arrays with `vortex.ext` encoding. +#[pyclass(name = "ExtensionArray", module = "vortex", extends=PyArray, frozen)] +pub(crate) struct PyExtensionArray; diff --git a/pyvortex/src/arrays/builtins/struct_.rs b/pyvortex/src/arrays/builtins/struct_.rs new file mode 100644 index 0000000000..e75235b478 --- /dev/null +++ b/pyvortex/src/arrays/builtins/struct_.rs @@ -0,0 +1,26 @@ +use pyo3::exceptions::PyKeyError; +use pyo3::{pyclass, pymethods, Bound, PyRef, PyResult}; +use vortex::array::StructEncoding; +use vortex::variants::StructArrayTrait; + +use crate::arrays::{ArraySubclass, AsArrayRef, PyArray}; + +/// Concrete class for arrays with `vortex.struct` encoding. +#[pyclass(name = "StructArray", module = "vortex", extends=PyArray, frozen)] +pub(crate) struct PyStructArray; + +impl ArraySubclass for PyStructArray { + type Encoding = StructEncoding; +} + +#[pymethods] +impl PyStructArray { + /// Returns the given field of the struct array. + pub fn field<'py>(self_: PyRef<'py, Self>, name: &str) -> PyResult> { + let field = self_ + .as_array_ref() + .maybe_null_field_by_name(name) + .ok_or_else(|| PyKeyError::new_err(format!("Field name not found: {}", name)))?; + PyArray::init(self_.py(), field) + } +} diff --git a/pyvortex/src/arrays/mod.rs b/pyvortex/src/arrays/mod.rs index 7a437b7adb..53e887c0f0 100644 --- a/pyvortex/src/arrays/mod.rs +++ b/pyvortex/src/arrays/mod.rs @@ -1,3 +1,5 @@ +mod builtins; + use std::ops::Deref; use arrow::array::{Array as ArrowArray, ArrayRef}; @@ -5,12 +7,19 @@ use arrow::pyarrow::ToPyArrow; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::{IntoPyDict, PyList}; -use vortex::array::ChunkedArray; +use pyo3::PyClass; +use vortex::array::{ChunkedArray, ChunkedEncoding, ConstantEncoding, VarBinEncoding}; use vortex::arrow::{infer_data_type, IntoArrowArray}; use vortex::compute::{compare, fill_forward, scalar_at, slice, take, Operator}; +use vortex::dtype::DType; +use vortex::error::{VortexError, VortexExpect}; use vortex::mask::Mask; -use vortex::Array; +use vortex::{Array, Encoding}; +use crate::arrays::builtins::{ + PyBoolArray, PyChunkedArray, PyConstantArray, PyExtensionArray, PyListArray, PyNullArray, + PyPrimitiveArray, PyStructArray, PyVarBinArray, PyVarBinViewArray, +}; use crate::dtype::PyDType; use crate::install_module; use crate::python_repr::PythonRepr; @@ -23,6 +32,18 @@ pub(crate) fn init(py: Python, parent: &Bound) -> PyResult<()> { m.add_class::()?; + // Builtin encodings + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) } @@ -92,24 +113,76 @@ pub(crate) fn init(py: Python, parent: &Bound) -> PyResult<()> { /// ] #[pyclass(name = "Array", module = "vortex", sequence, subclass)] #[derive(Clone)] -pub struct PyArray(pub Array); +pub struct PyArray(Array); -impl PyArray { - pub fn inner(&self) -> &Array { +impl Deref for PyArray { + type Target = Array; + + fn deref(&self) -> &Self::Target { &self.0 } +} - pub fn into_inner(self) -> Array { - self.0 +impl PyArray { + /// Initialize a [`PyArray`] from a Vortex [`Array`], ensuring the correct subclass is + /// returned if possible. + // TODO(ngates): in future, we should use a Python registry to allow users to register + // additional subclasses for their own encodings. + pub fn init(py: Python, array: Array) -> PyResult> { + // Make sure we always downcast canonical arrays to their subclass. + // We use an exhaustive match here to ensure a compilation error if we ever add a new + // canonical encoding. + if array.is_canonical() { + return match array.dtype() { + DType::Null => Self::with_subclass(py, array, PyNullArray), + DType::Bool(_) => Self::with_subclass(py, array, PyBoolArray), + DType::Primitive(..) => Self::with_subclass(py, array, PyPrimitiveArray), + DType::Utf8(_) | DType::Binary(_) => { + Self::with_subclass(py, array, PyVarBinViewArray) + } + DType::Struct(..) => Self::with_subclass(py, array, PyStructArray), + DType::List(..) => Self::with_subclass(py, array, PyListArray), + DType::Extension(_) => Self::with_subclass(py, array, PyExtensionArray), + }; + } + + if array.is_encoding(ConstantEncoding::ID) { + return Self::with_subclass(py, array, PyConstantArray); + } + if array.is_encoding(ChunkedEncoding::ID) { + return Self::with_subclass(py, array, PyChunkedArray); + } + if array.is_encoding(VarBinEncoding::ID) { + return Self::with_subclass(py, array, PyVarBinArray); + } + + // For other arrays, we should check in a registry of encoding subclasses, these are + // discovered at runtime using entry points or manual registration. + + // Otherwise, we return the base type. + Bound::new(py, PyArray(array)) } -} -impl Deref for PyArray { - type Target = Array; + fn with_subclass>( + py: Python, + array: Array, + subclass: S, + ) -> PyResult> { + Ok(Bound::new( + py, + PyClassInitializer::from(PyArray(array)).add_subclass(subclass), + )? + .into_any() + .downcast_into::()?) + } - fn deref(&self) -> &Self::Target { + pub fn inner(&self) -> &Array { &self.0 } + + pub fn into_inner(self) -> Array { + self.0 + } } #[pymethods] @@ -545,3 +618,23 @@ impl PyArray { self.0.tree_display().to_string() } } + +/// A marker trait indicating a PyO3 class is a subclass of Vortex `Array`. +pub trait ArraySubclass: PyClass { + type Encoding: Encoding; +} + +/// Unwrap a downcasted Vortex array from a `PyRef`. +pub trait AsArrayRef { + fn as_array_ref(&self) -> &T; +} + +impl AsArrayRef<::Array> for PyRef<'_, A> +where + for<'a> &'a ::Array: TryFrom<&'a Array, Error = VortexError>, +{ + fn as_array_ref(&self) -> &::Array { + <&::Array>::try_from(self.as_super().inner()) + .vortex_expect("Failed to downcast array") + } +} diff --git a/pyvortex/src/compress.rs b/pyvortex/src/compress.rs index fe676b0dd3..b963cd20cc 100644 --- a/pyvortex/src/compress.rs +++ b/pyvortex/src/compress.rs @@ -46,8 +46,8 @@ pub(crate) fn init(py: Python, parent: &Bound) -> PyResult<()> { /// >>> str(vx.compress(a)) /// 'vortex.alp(0x11)(f64?, len=1000)' #[pyfunction] -pub fn compress(array: &Bound) -> PyResult { +pub fn compress<'py>(array: &'py Bound<'py, PyArray>) -> PyResult> { let compressor = SamplingCompressor::default(); let inner = compressor.compress(&array.borrow(), None)?.into_array(); - Ok(PyArray(inner)) + PyArray::init(array.py(), inner) } diff --git a/pyvortex/src/dataset.rs b/pyvortex/src/dataset.rs index a90a96b7f4..36e5617894 100644 --- a/pyvortex/src/dataset.rs +++ b/pyvortex/src/dataset.rs @@ -115,15 +115,14 @@ impl TokioFileDataset { columns: Option>>, row_filter: Option<&Bound<'_, PyExpr>>, indices: Option<&PyArray>, - ) -> PyResult { - let inner = read_array_from_reader( + ) -> PyResult { + Ok(read_array_from_reader( &self.vxf, projection_from_python(columns)?, filter_from_python(row_filter), indices.cloned().map(PyArray::into_inner), ) - .await?; - Ok(PyArray(inner)) + .await?) } async fn async_to_record_batch_reader( @@ -159,13 +158,17 @@ impl TokioFileDataset { } #[pyo3(signature = (*, columns = None, row_filter = None, indices = None))] - pub fn to_array( + pub fn to_array<'py>( &self, - columns: Option>>, - row_filter: Option<&Bound<'_, PyExpr>>, + py: Python<'py>, + columns: Option>>, + row_filter: Option<&Bound<'py, PyExpr>>, indices: Option<&PyArray>, - ) -> PyResult { - TOKIO_RUNTIME.block_on(self.async_to_array(columns, row_filter, indices)) + ) -> PyResult> { + PyArray::init( + py, + TOKIO_RUNTIME.block_on(self.async_to_array(columns, row_filter, indices))?, + ) } #[pyo3(signature = (*, columns = None, row_filter = None, indices = None))] @@ -204,15 +207,14 @@ impl ObjectStoreUrlDataset { columns: Option>>, row_filter: Option<&Bound<'_, PyExpr>>, indices: Option<&PyArray>, - ) -> PyResult { - let inner = read_array_from_reader( + ) -> PyResult { + Ok(read_array_from_reader( &self.vxf, projection_from_python(columns)?, filter_from_python(row_filter), indices.cloned().map(PyArray::into_inner), ) - .await?; - Ok(PyArray(inner)) + .await?) } async fn async_to_record_batch_reader( @@ -247,13 +249,17 @@ impl ObjectStoreUrlDataset { } #[pyo3(signature = (*, columns = None, row_filter = None, indices = None))] - pub fn to_array( + pub fn to_array<'py>( &self, - columns: Option>>, - row_filter: Option<&Bound<'_, PyExpr>>, + py: Python<'py>, + columns: Option>>, + row_filter: Option<&Bound<'py, PyExpr>>, indices: Option<&PyArray>, - ) -> PyResult { - TOKIO_RUNTIME.block_on(self.async_to_array(columns, row_filter, indices)) + ) -> PyResult> { + PyArray::init( + py, + TOKIO_RUNTIME.block_on(self.async_to_array(columns, row_filter, indices))?, + ) } #[pyo3(signature = (*, columns = None, row_filter = None, indices = None))] diff --git a/pyvortex/src/encoding/bool.rs b/pyvortex/src/encoding/bool.rs deleted file mode 100644 index 11d7f416df..0000000000 --- a/pyvortex/src/encoding/bool.rs +++ /dev/null @@ -1,83 +0,0 @@ -use pyo3::exceptions::PyValueError; -use pyo3::prelude::*; -use pyo3::{pyclass, pymethods, Bound, PyClass, PyResult}; -use vortex::array::BoolEncoding; -use vortex::error::{VortexError, VortexExpect}; -use vortex::{Array, Encoding}; - -use crate::arrays::PyArray; - -#[pyclass(name = "BoolArray", module = "vortex.encoding", extends=PyArray)] -pub struct PyBoolArray; - -impl ArraySubclass for PyBoolArray { - type Encoding = BoolEncoding; -} - -#[pymethods] -impl PyBoolArray { - /// Downcasts a :class:`vortex.Array` into a :class:`vortex.encoding.BoolArray`. - #[new] - pub fn new(array: &Bound<'_, PyArray>) -> PyResult<(Self, PyArray)> { - let array: Array = array.extract::()?.0; - - if array.encoding() != BoolEncoding::ID { - return Err(PyValueError::new_err(format!( - "Expected array with {} encoding, but found {}", - BoolEncoding::ID, - array.encoding(), - ))); - } - - Ok((PyBoolArray, PyArray(array))) - } - - /// Compute the number of true values in the array. - pub fn true_count(self_: PyRef<'_, Self>) -> PyResult { - self_ - .as_array_ref() - .statistics() - .compute_true_count() - .ok_or_else(|| PyValueError::new_err("Failed to compute true count")) - } -} - -/// A marker trait indicating a PyO3 class is a subclass of Vortex `Array`. -pub trait ArraySubclass: PyClass { - type Encoding: Encoding; -} - -/// Unwrap a downcasted Vortex array from a `PyRef`. -pub trait AsArrayRef { - fn as_array_ref(&self) -> &T; -} - -impl AsArrayRef<<::Encoding as Encoding>::Array> - for PyRef<'_, A> -where - for<'a> &'a <::Encoding as Encoding>::Array: - TryFrom<&'a Array, Error = VortexError>, -{ - fn as_array_ref(&self) -> &<::Encoding as Encoding>::Array { - <&<::Encoding as Encoding>::Array>::try_from(self.as_super().inner()) - .vortex_expect("Failed to downcast array") - } -} - -// TODO(ngates): requires newer PyO3 version -// /// Convert a `Bound` into a Vortex array. -// pub trait IntoArray { -// fn into_array(self) -> T; -// } -// -// impl<'py, A: ArraySubclass> IntoArray<<::Encoding as Encoding>::Array> -// for Bound<'py, A> -// where -// <::Encoding as Encoding>::Array: TryFrom, -// { -// fn into_array(self) -> <::Encoding as Encoding>::Array { -// let array = self.into_super().unwrap(); -// <&<::Encoding as Encoding>::Array>::try_from(&array.0) -// .vortex_expect("Failed to downcast array") -// } -// } diff --git a/pyvortex/src/encoding/mod.rs b/pyvortex/src/encoding/mod.rs index 85b3207234..98dc475476 100644 --- a/pyvortex/src/encoding/mod.rs +++ b/pyvortex/src/encoding/mod.rs @@ -1,5 +1,3 @@ -mod bool; - use arrow::array::{make_array, ArrayData as ArrowArrayData}; use arrow::datatypes::{DataType, Field}; use arrow::ffi_stream::ArrowArrayStreamReader; @@ -14,7 +12,6 @@ use vortex::error::{VortexError, VortexResult}; use vortex::{Array, IntoArray}; use crate::arrays::PyArray; -use crate::encoding::bool::PyBoolArray; use crate::install_module; pub(crate) fn init(py: Python, parent: &Bound) -> PyResult<()> { @@ -24,8 +21,6 @@ pub(crate) fn init(py: Python, parent: &Bound) -> PyResult<()> { m.add_function(wrap_pyfunction!(_encode, &m)?)?; - m.add_class::()?; - Ok(()) } @@ -41,7 +36,7 @@ pub fn _encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult> { let arrow_array = ArrowArrayData::from_pyarrow_bound(obj).map(make_array)?; let is_nullable = arrow_array.is_nullable(); let enc_array = Array::from_arrow(arrow_array, is_nullable); - Bound::new(obj.py(), PyArray(enc_array)) + PyArray::init(obj.py(), enc_array) } else if obj.is_instance(&chunked_array)? { let chunks: Vec> = obj.getattr("chunks")?.extract()?; let encoded_chunks = chunks @@ -56,9 +51,9 @@ pub fn _encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult> { .getattr("type") .and_then(|v| DataType::from_pyarrow_bound(&v)) .map(|dt| DType::from_arrow(&Field::new("_", dt, false)))?; - Bound::new( + PyArray::init( obj.py(), - PyArray(ChunkedArray::try_new(encoded_chunks, dtype)?.into_array()), + ChunkedArray::try_new(encoded_chunks, dtype)?.into_array(), ) } else if obj.is_instance(&table)? { let array_stream = ArrowArrayStreamReader::from_pyarrow_bound(obj)?; @@ -68,10 +63,7 @@ pub fn _encode<'py>(obj: &Bound<'py, PyAny>) -> PyResult> { .map(|b| b.map_err(VortexError::ArrowError)) .map(|b| b.and_then(Array::try_from)) .collect::>>()?; - Bound::new( - obj.py(), - PyArray(ChunkedArray::try_new(chunks, dtype)?.into_array()), - ) + PyArray::init(obj.py(), ChunkedArray::try_new(chunks, dtype)?.into_array()) } else { Err(PyValueError::new_err( "Cannot convert object to Vortex array", diff --git a/pyvortex/src/io.rs b/pyvortex/src/io.rs index 3c65038d09..c2ed56b1dd 100644 --- a/pyvortex/src/io.rs +++ b/pyvortex/src/io.rs @@ -122,14 +122,14 @@ pub(crate) fn init(py: Python, parent: &Bound) -> PyResult<()> { /// >>> # b.to_arrow_array() #[pyfunction] #[pyo3(signature = (path, *, projection = None, row_filter = None, indices = None))] -pub fn read_path( - path: Bound, - projection: Option>>, - row_filter: Option<&Bound>, +pub fn read_path<'py>( + path: Bound<'py, PyString>, + projection: Option>>, + row_filter: Option<&Bound<'py, PyExpr>>, indices: Option<&PyArray>, -) -> PyResult { +) -> PyResult> { let dataset = TOKIO_RUNTIME.block_on(TokioFileDataset::try_new(path.extract()?))?; - dataset.to_array(projection, row_filter, indices) + dataset.to_array(path.py(), projection, row_filter, indices) } /// Read a vortex struct array from a URL. @@ -176,14 +176,14 @@ pub fn read_path( /// #[pyfunction] #[pyo3(signature = (url, *, projection = None, row_filter = None, indices = None))] -pub fn read_url( - url: Bound, - projection: Option>>, - row_filter: Option<&Bound>, +pub fn read_url<'py>( + url: Bound<'py, PyString>, + projection: Option>>, + row_filter: Option<&Bound<'py, PyExpr>>, indices: Option<&PyArray>, -) -> PyResult { +) -> PyResult> { let dataset = TOKIO_RUNTIME.block_on(ObjectStoreUrlDataset::try_new(url.extract()?))?; - dataset.to_array(projection, row_filter, indices) + dataset.to_array(url.py(), projection, row_filter, indices) } /// Write a vortex struct array to the local filesystem. diff --git a/pyvortex/test/test_encoding.py b/pyvortex/test/test_encoding.py deleted file mode 100644 index c28a5e1ef8..0000000000 --- a/pyvortex/test/test_encoding.py +++ /dev/null @@ -1,21 +0,0 @@ -import pytest - -import vortex as vx -from vortex.encoding import BoolArray - - -def test_bool_array(): - arr = vx.array([True, False, True, None]) - assert arr.dtype == vx.bool_(nullable=True) - - # Downcast to a BoolArray - # TODO(ngates): I think this should be automatic if we have a registered Python class for the encoding - arr = BoolArray(arr) - assert str(arr) == "vortex.bool(0x02)(bool?, len=4)" - - # Test the bool-specific true_count method - assert arr.true_count() == 2 - - # Fail to downcast a non-boolean array BoolArray - with pytest.raises(ValueError): - BoolArray(vx.array([1, 2, 3])) diff --git a/vortex-array/src/data/mod.rs b/vortex-array/src/data/mod.rs index 51b5d223b8..df78fca3ef 100644 --- a/vortex-array/src/data/mod.rs +++ b/vortex-array/src/data/mod.rs @@ -10,8 +10,8 @@ use vortex_flatbuffers::FlatBuffer; use vortex_scalar::Scalar; use crate::array::{ - BoolEncoding, ChunkedArray, ExtensionEncoding, NullEncoding, PrimitiveEncoding, StructEncoding, - VarBinEncoding, VarBinViewEncoding, + BoolEncoding, ChunkedArray, ExtensionEncoding, ListEncoding, NullEncoding, PrimitiveEncoding, + StructEncoding, VarBinEncoding, VarBinViewEncoding, }; use crate::compute::scalar_at; use crate::encoding::{Encoding, EncodingId}; @@ -176,6 +176,7 @@ impl Array { || self.is_encoding(BoolEncoding.id()) || self.is_encoding(PrimitiveEncoding.id()) || self.is_encoding(StructEncoding.id()) + || self.is_encoding(ListEncoding.id()) || self.is_encoding(VarBinViewEncoding.id()) || self.is_encoding(ExtensionEncoding.id()) }