Skip to content

Commit

Permalink
Array Metadata (#1985)
Browse files Browse the repository at this point in the history
The current ArrayMetadata system was put together somewhat quickly and
uses `serde` + Flexbuffers to serialize all metadata.

The end state looks like this:

- [ ] An array has a fixed 8 bytes of metadata. If it needs more, it
should use a buffer (made possible by #1743)
- [ ] Rkyv can optionally be used to help with serde for these bytes.
- [ ] No eager deserialization of metadata is performed, although arrays
should validate metadata in the `ValidateVTable` (see #1979).

To support 8-byte metadata, we need to:
- [ ] Move scalars and scalar values out of metadata (e.g.
ConstantArray)
- [ ] Move shift from FoR into BitPacking (this is a bit cheeky and not
strictly necessary, but it leaves FoR with just an 8-byte PValue for
metadata, and shifting feels like it belongs in BitPacking anyway).
- [ ] All other metadata should easily fit into 8 bytes.
  • Loading branch information
gatesn authored Jan 22, 2025
1 parent e1a7d47 commit 852f1d2
Show file tree
Hide file tree
Showing 59 changed files with 1,004 additions and 521 deletions.
122 changes: 122 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,12 @@ prost-build = "0.13.0"
prost-types = "0.13.0"
pyo3 = { version = ">= 0.22", features = ["extension-module", "abi3-py310"] }
pyo3-log = ">= 0.11"
rancor = "0.1.0"
rand = "0.8.5"
rayon = "1.10.0"
regex = "1.11.0"
reqwest = { version = "0.12.0", features = ["blocking"] }
rkyv = { version = "0.8", features = ["little_endian", "pointer_width_32", "bytecheck"] }
rstest = "0.24"
serde = "1.0.197"
serde_json = "1.0.116"
Expand Down
4 changes: 2 additions & 2 deletions docs/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Vortex array:
>>> parquet = pq.read_table("_static/example.parquet")
>>> vtx = vortex.array(parquet)
>>> vtx.nbytes
141069
141057

Compress
^^^^^^^^
Expand All @@ -46,7 +46,7 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the

>>> cvtx = vortex.compress(vtx)
>>> cvtx.nbytes
16791
15888
>>> cvtx.nbytes / vtx.nbytes
0.11...

Expand Down
26 changes: 14 additions & 12 deletions encodings/alp/src/alp/array.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::fmt::{Debug, Display};
use std::fmt::Debug;

use serde::{Deserialize, Serialize};
use vortex_array::array::PrimitiveArray;
Expand All @@ -10,27 +10,22 @@ use vortex_array::validity::{ArrayValidity, LogicalValidity, ValidityVTable};
use vortex_array::variants::{PrimitiveArrayTrait, VariantsVTable};
use vortex_array::visitor::{ArrayVisitor, VisitorVTable};
use vortex_array::{
impl_encoding, ArrayDType, ArrayData, ArrayLen, Canonical, IntoArrayData, IntoCanonical,
impl_encoding, ArrayDType, ArrayData, ArrayLen, Canonical, DeserializeMetadata, IntoArrayData,
IntoCanonical, SerdeMetadata,
};
use vortex_dtype::{DType, PType};
use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult};

use crate::alp::{alp_encode, decompress, Exponents};

impl_encoding!("vortex.alp", ids::ALP, ALP);
impl_encoding!("vortex.alp", ids::ALP, ALP, SerdeMetadata<ALPMetadata>);

/// Metadata carried by an ALP-encoded array.
///
/// Derives `Serialize`/`Deserialize`, and is used as the payload of
/// `SerdeMetadata<ALPMetadata>` when the encoding is declared.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ALPMetadata {
// The `Exponents` value (fields `e` and `f`) recorded for this array.
pub(crate) exponents: Exponents,
// Metadata describing the patches; `None` when the array has no patches.
pub(crate) patches: Option<PatchesMetadata>,
}

/// Displays [`ALPMetadata`] by delegating to its `Debug` implementation, so
/// formatter flags (such as `#`) are forwarded unchanged.
impl Display for ALPMetadata {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        <Self as Debug>::fmt(self, formatter)
    }
}

impl ALPArray {
pub fn try_new(
encoded: ArrayData,
Expand Down Expand Up @@ -60,7 +55,7 @@ impl ALPArray {
Self::try_from_parts(
dtype,
length,
ALPMetadata { exponents, patches },
SerdeMetadata(ALPMetadata { exponents, patches }),
None,
Some(children.into()),
Default::default(),
Expand All @@ -75,6 +70,12 @@ impl ALPArray {
}
}

fn metadata(&self) -> ALPMetadata {
SerdeMetadata::<ALPMetadata>::deserialize(self.as_ref().metadata_bytes())
.vortex_expect("ALPMetadata metadata")
.0
}

pub fn encoded(&self) -> ArrayData {
self.as_ref()
.child(0, &self.encoded_dtype(), self.len())
Expand Down Expand Up @@ -156,6 +157,7 @@ impl StatisticsVTable<ALPArray> for ALPEncoding {}
mod tests {
use vortex_array::patches::PatchesMetadata;
use vortex_array::test_harness::check_metadata;
use vortex_array::SerdeMetadata;
use vortex_dtype::PType;

use crate::{ALPMetadata, Exponents};
Expand All @@ -165,13 +167,13 @@ mod tests {
fn test_alp_metadata() {
check_metadata(
"alp.metadata",
ALPMetadata {
SerdeMetadata(ALPMetadata {
patches: Some(PatchesMetadata::new(usize::MAX, PType::U64)),
exponents: Exponents {
e: u8::MAX,
f: u8::MAX,
},
},
}),
);
}
}
Loading

0 comments on commit 852f1d2

Please sign in to comment.