Skip to content

Commit

Permalink
perf: make miniblock decoding cheaper (#3438)
Browse files Browse the repository at this point in the history
This fixes a few performance bottlenecks on 2.1 take operations

* Change the protobuf config to generate `bytes::Bytes` instead of
`Vec`. This helps avoid some expensive FSST symbol table clones.
* Moka cache lookups during initialization are expensive. Instead of one
cache lookup per page, we now do one cache lookup per column.
* Our current scheduling approach for miniblock was slow. There were
many switches to calculate info about the repetition index. We now
precompute that during initialization. In addition, we now search the
repetition index with a binary search instead of a full scan.
  • Loading branch information
westonpace authored Feb 12, 2025
1 parent c70d1d2 commit c054697
Show file tree
Hide file tree
Showing 18 changed files with 1,024 additions and 293 deletions.
79 changes: 75 additions & 4 deletions rust/lance-datagen/src/generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ impl From<u32> for Dimension {
}

/// A trait for anything that can generate arrays of data
pub trait ArrayGenerator: Send + Sync {
pub trait ArrayGenerator: Send + Sync + std::fmt::Debug {
/// Generate an array of the given length
///
/// # Arguments
Expand Down Expand Up @@ -92,6 +92,7 @@ pub trait ArrayGenerator: Send + Sync {
fn element_size_bytes(&self) -> Option<ByteCount>;
}

#[derive(Debug)]
pub struct CycleNullGenerator {
generator: Box<dyn ArrayGenerator>,
validity: Vec<bool>,
Expand Down Expand Up @@ -139,6 +140,7 @@ impl ArrayGenerator for CycleNullGenerator {
}
}

#[derive(Debug)]
pub struct MetadataGenerator {
generator: Box<dyn ArrayGenerator>,
metadata: HashMap<String, String>,
Expand Down Expand Up @@ -166,6 +168,7 @@ impl ArrayGenerator for MetadataGenerator {
}
}

#[derive(Debug)]
pub struct NullGenerator {
generator: Box<dyn ArrayGenerator>,
null_probability: f64,
Expand Down Expand Up @@ -245,6 +248,10 @@ impl ArrayGenerator for NullGenerator {
}
}

/// Returns whatever metadata the wrapped `generator` reports, unchanged.
fn metadata(&self) -> Option<HashMap<String, String>> {
self.generator.metadata()
}

/// Returns the data type reported by the wrapped `generator`.
fn data_type(&self) -> &DataType {
self.generator.data_type()
}
Expand Down Expand Up @@ -349,6 +356,23 @@ where
element_size_bytes: Option<ByteCount>,
}

// Hand-written `Debug` for `FnGen`.
//
// The output lists `data_type`, `array_type`, `repeat`, `leftover_count` and
// `element_size_bytes`, in that order; no other field is printed.
impl<T, ArrayType, F> std::fmt::Debug for FnGen<T, ArrayType, F>
where
    T: Copy + Default,
    ArrayType: arrow_array::Array + From<Vec<T>>,
    F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T,
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("FnGen");
        out.field("data_type", &self.data_type);
        out.field("array_type", &self.array_type);
        out.field("repeat", &self.repeat);
        out.field("leftover_count", &self.leftover_count);
        out.field("element_size_bytes", &self.element_size_bytes);
        out.finish()
    }
}

impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> FnGen<T, ArrayType, F>
where
T: Copy + Default,
Expand Down Expand Up @@ -422,6 +446,7 @@ impl From<u64> for Seed {
}
}

#[derive(Debug)]
pub struct CycleVectorGenerator {
underlying_gen: Box<dyn ArrayGenerator>,
dimension: Dimension,
Expand Down Expand Up @@ -470,7 +495,7 @@ impl ArrayGenerator for CycleVectorGenerator {
}
}

#[derive(Default)]
#[derive(Debug, Default)]
/// Field-less generator type; its `ArrayGenerator` implementation follows below.
///
/// NOTE(review): presumably yields pseudo-random UUID values — confirm against the impl.
pub struct PseudoUuidGenerator {}

impl ArrayGenerator for PseudoUuidGenerator {
Expand All @@ -497,7 +522,7 @@ impl ArrayGenerator for PseudoUuidGenerator {
}
}

#[derive(Default)]
#[derive(Debug, Default)]
/// Field-less generator type; its `ArrayGenerator` implementation follows below.
///
/// NOTE(review): presumably yields hex-encoded pseudo-UUID values — confirm against the impl.
pub struct PseudoUuidHexGenerator {}

impl ArrayGenerator for PseudoUuidHexGenerator {
Expand All @@ -524,7 +549,7 @@ impl ArrayGenerator for PseudoUuidHexGenerator {
}
}

#[derive(Default)]
#[derive(Debug, Default)]
/// Field-less generator type; its `ArrayGenerator` implementation follows below.
///
/// NOTE(review): presumably yields random boolean values — confirm against the impl.
pub struct RandomBooleanGenerator {}

impl ArrayGenerator for RandomBooleanGenerator {
Expand Down Expand Up @@ -558,6 +583,14 @@ pub struct RandomBytesGenerator<T: ArrowPrimitiveType + Send + Sync> {
data_type: DataType,
}

// Hand-written `Debug` for `RandomBytesGenerator`; only `data_type` is printed.
impl<T> std::fmt::Debug for RandomBytesGenerator<T>
where
    T: ArrowPrimitiveType + Send + Sync,
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("RandomBytesGenerator");
        out.field("data_type", &self.data_type);
        out.finish()
    }
}

impl<T: ArrowPrimitiveType + Send + Sync> RandomBytesGenerator<T> {
fn new(data_type: DataType) -> Self {
Self {
Expand Down Expand Up @@ -597,6 +630,7 @@ impl<T: ArrowPrimitiveType + Send + Sync> ArrayGenerator for RandomBytesGenerato

// This is pretty much the same thing as RandomBinaryGenerator but we can't use that
// because there is no ArrowPrimitiveType for FixedSizeBinary
#[derive(Debug)]
pub struct RandomFixedSizeBinaryGenerator {
data_type: DataType,
size: i32,
Expand Down Expand Up @@ -636,6 +670,7 @@ impl ArrayGenerator for RandomFixedSizeBinaryGenerator {
}
}

#[derive(Debug)]
pub struct RandomIntervalGenerator {
unit: IntervalUnit,
data_type: DataType,
Expand Down Expand Up @@ -688,6 +723,7 @@ impl ArrayGenerator for RandomIntervalGenerator {
Some(ByteCount::from(12))
}
}
#[derive(Debug)]
pub struct RandomBinaryGenerator {
bytes_per_element: ByteCount,
scale_to_utf8: bool,
Expand Down Expand Up @@ -776,6 +812,7 @@ impl ArrayGenerator for RandomBinaryGenerator {
}
}

#[derive(Debug)]
pub struct VariableRandomBinaryGenerator {
lengths_gen: Box<dyn ArrayGenerator>,
data_type: DataType,
Expand Down Expand Up @@ -830,6 +867,18 @@ pub struct CycleBinaryGenerator<T: ByteArrayType> {
idx: usize,
}

// Hand-written `Debug` for `CycleBinaryGenerator`.
//
// Prints `values`, `lengths`, `data_type`, `width` and `idx`, in that order.
impl<T> std::fmt::Debug for CycleBinaryGenerator<T>
where
    T: ByteArrayType,
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("CycleBinaryGenerator");
        out.field("values", &self.values);
        out.field("lengths", &self.lengths);
        out.field("data_type", &self.data_type);
        out.field("width", &self.width);
        out.field("idx", &self.idx);
        out.finish()
    }
}

impl<T: ByteArrayType> CycleBinaryGenerator<T> {
pub fn from_strings(values: &[&str]) -> Self {
if values.is_empty() {
Expand Down Expand Up @@ -905,6 +954,15 @@ pub struct FixedBinaryGenerator<T: ByteArrayType> {
array_type: PhantomData<T>,
}

// Hand-written `Debug` for `FixedBinaryGenerator`.
//
// Prints `value` and `data_type`; the `array_type` marker field is left out.
impl<T> std::fmt::Debug for FixedBinaryGenerator<T>
where
    T: ByteArrayType,
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("FixedBinaryGenerator");
        out.field("value", &self.value);
        out.field("data_type", &self.data_type);
        out.finish()
    }
}

impl<T: ByteArrayType> FixedBinaryGenerator<T> {
pub fn new(value: Vec<u8>) -> Self {
Self {
Expand Down Expand Up @@ -954,6 +1012,16 @@ pub struct DictionaryGenerator<K: ArrowDictionaryKeyType> {
key_width: u64,
}

// Hand-written `Debug` for `DictionaryGenerator`.
//
// Prints `generator`, `data_type` and `key_width`, in that order.
impl<K> std::fmt::Debug for DictionaryGenerator<K>
where
    K: ArrowDictionaryKeyType,
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("DictionaryGenerator");
        out.field("generator", &self.generator);
        out.field("data_type", &self.data_type);
        out.field("key_width", &self.key_width);
        out.finish()
    }
}

impl<K: ArrowDictionaryKeyType> DictionaryGenerator<K> {
fn new(generator: Box<dyn ArrayGenerator>) -> Self {
let key_type = Box::new(K::DATA_TYPE);
Expand Down Expand Up @@ -993,6 +1061,7 @@ impl<K: ArrowDictionaryKeyType + Send + Sync> ArrayGenerator for DictionaryGener
}
}

#[derive(Debug)]
struct RandomListGenerator {
field: Arc<Field>,
child_field: Arc<Field>,
Expand Down Expand Up @@ -1069,6 +1138,7 @@ impl ArrayGenerator for RandomListGenerator {
}
}

#[derive(Debug)]
// Field-less, module-private generator type; its `ArrayGenerator` implementation follows below.
// NOTE(review): presumably yields all-null arrays — confirm against the impl.
struct NullArrayGenerator {}

impl ArrayGenerator for NullArrayGenerator {
Expand All @@ -1089,6 +1159,7 @@ impl ArrayGenerator for NullArrayGenerator {
}
}

#[derive(Debug)]
struct RandomStructGenerator {
fields: Fields,
data_type: DataType,
Expand Down
1 change: 1 addition & 0 deletions rust/lance-encoding/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ fn main() -> Result<()> {
let mut prost_build = prost_build::Config::new();
prost_build.protoc_arg("--experimental_allow_proto3_optional");
prost_build.enable_type_names();
prost_build.bytes(["."]); // Enable Bytes type for all messages to avoid Vec clones.
prost_build.compile_protos(&["./protos/encodings.proto"], &["./protos"])?;

Ok(())
Expand Down
8 changes: 8 additions & 0 deletions rust/lance-encoding/src/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,14 @@ impl LanceBuffer {
}
}

/// Convert a buffer into a bytes::Bytes object
///
/// An owned buffer is handed to `bytes::Bytes` through `Into`. A borrowed
/// buffer is first turned back into a `Vec<u8>` with `into_vec`; when that
/// is refused, the bytes are copied into a fresh `Vec<u8>` instead of
/// panicking.
pub fn into_bytes(self) -> bytes::Bytes {
    match self {
        Self::Owned(buf) => buf.into(),
        Self::Borrowed(buf) => buf
            .into_vec::<u8>()
            // NOTE(review): assumes `Borrowed` wraps an arrow `Buffer`, whose
            // `into_vec` returns the buffer back as the `Err` value when it
            // cannot be reclaimed (e.g. shared or offset). Fall back to a
            // copy rather than unwrapping.
            .unwrap_or_else(|unreclaimed| unreclaimed.as_slice().to_vec())
            .into(),
    }
}

/// Convert into a borrowed buffer, this is a zero-copy operation
///
/// This is often called before cloning the buffer
Expand Down
Loading

0 comments on commit c054697

Please sign in to comment.