From 70fbfefa84f4bb9a7b99bf897f4231361eee1412 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 3 Apr 2024 17:30:53 -0400 Subject: [PATCH] small unpacking & benchmark improvements (#190) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``` ~/g/vortex ❯❯❯ cargo bench --bench bitpacking ✘ 130 Compiling vortex-fastlanes v0.1.0 (/Users/will/git/vortex/vortex-fastlanes) Finished `bench` profile [optimized] target(s) in 3.29s Running benches/bitpacking.rs (target/release/deps/bitpacking-274b666c83950221) Gnuplot not found, using plotters backend bitpack_1M time: [55.836 µs 56.008 µs 56.193 µs] change: [-1.3761% -0.8817% -0.3672%] (p = 0.00 < 0.05) Change within noise threshold. Found 2 outliers among 100 measurements (2.00%) 2 (2.00%) high mild unpack_1M time: [216.54 µs 232.54 µs 251.50 µs] change: [-9.0508% -1.5376% +6.7838%] (p = 0.71 > 0.05) No change in performance detected. Found 13 outliers among 100 measurements (13.00%) 3 (3.00%) high mild 10 (10.00%) high severe unpack_1M_singles time: [2.2917 ms 2.3220 ms 2.3539 ms] change: [-5.6770% -3.6532% -1.6082%] (p = 0.00 < 0.05) Performance has improved. Found 1 outliers among 100 measurements (1.00%) 1 (1.00%) high mild unpack_1024_alloc time: [218.64 µs 232.55 µs 248.50 µs] change: [-8.6257% -1.5167% +5.9634%] (p = 0.69 > 0.05) No change in performance detected. Found 12 outliers among 100 measurements (12.00%) 4 (4.00%) high mild 8 (8.00%) high severe unpack_1024_noalloc time: [43.505 ns 43.651 ns 43.806 ns] change: [+0.5580% +1.0303% +1.4821%] (p = 0.00 < 0.05) Change within noise threshold. unpack_single time: [4.9364 ns 4.9507 ns 4.9647 ns] change: [+0.3408% +0.7604% +1.1548%] (p = 0.00 < 0.05) Change within noise threshold. 
Found 1 outliers among 100 measurements (1.00%) 1 (1.00%) high mild ``` --- vortex-fastlanes/benches/bitpacking.rs | 6 +++- vortex-fastlanes/src/bitpacking/compress.rs | 36 +++++++++++---------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/vortex-fastlanes/benches/bitpacking.rs b/vortex-fastlanes/benches/bitpacking.rs index c253b841d1..34611dcabf 100644 --- a/vortex-fastlanes/benches/bitpacking.rs +++ b/vortex-fastlanes/benches/bitpacking.rs @@ -39,8 +39,12 @@ fn pack_unpack(c: &mut Criterion) { // 1024 elements pack into `128 * bits` bytes let packed_1024 = &packed[0..128 * bits]; + c.bench_function("unpack_1024_alloc", |b| { + b.iter(|| black_box(unpack_primitive::(&packed, bits, values.len()))); + }); + let mut output: Vec = Vec::with_capacity(1024); - c.bench_function("unpack_1024", |b| { + c.bench_function("unpack_1024_noalloc", |b| { b.iter(|| { output.clear(); TryBitPack::try_unpack_into(packed_1024, bits, &mut output).unwrap(); diff --git a/vortex-fastlanes/src/bitpacking/compress.rs b/vortex-fastlanes/src/bitpacking/compress.rs index 2ba89361ac..78c8c317bf 100644 --- a/vortex-fastlanes/src/bitpacking/compress.rs +++ b/vortex-fastlanes/src/bitpacking/compress.rs @@ -212,31 +212,33 @@ pub fn unpack_primitive( } // How many fastlanes vectors we will process. - let num_chunks = length / 1024; + let num_chunks = (length + 1023) / 1024; + let bytes_per_chunk = 128 * bit_width; + assert_eq!( + packed.len(), + num_chunks * bytes_per_chunk, + "Invalid packed length: got {}, expected {}", + packed.len(), + num_chunks * bytes_per_chunk + ); // Allocate a result vector. - let mut output = Vec::with_capacity(length); - - // Loop over all but the last chunk. - let bytes_per_chunk = 128 * bit_width; + let mut output = Vec::with_capacity(num_chunks * 1024); + // Loop over all the chunks. 
(0..num_chunks).for_each(|i| { let chunk: &[u8] = &packed[i * bytes_per_chunk..][0..bytes_per_chunk]; TryBitPack::try_unpack_into(chunk, bit_width, &mut output).unwrap(); }); - // Handle the final chunk which may contain padding. - let last_chunk_size = length % 1024; - if last_chunk_size > 0 { - let mut last_output = Vec::with_capacity(1024); - TryBitPack::try_unpack_into( - &packed[num_chunks * bytes_per_chunk..], - bit_width, - &mut last_output, - ) - .unwrap(); - output.extend_from_slice(&last_output[..last_chunk_size]); - } + // The final chunk may have had padding + output.truncate(length); + // For small vectors, the overhead of rounding up is more noticeable. + // Shrink to fit may or may not reallocate depending on the implementation. + // But for very small vectors, the reallocation is cheap enough even if it does happen. + if output.len() < 1024 { + output.shrink_to_fit(); + } output }