From 70fbfefa84f4bb9a7b99bf897f4231361eee1412 Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Wed, 3 Apr 2024 17:30:53 -0400
Subject: [PATCH] small unpacking & benchmark improvements (#190)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

```
~/g/vortex ❯❯❯ cargo bench --bench bitpacking                                                                                                             ✘ 130
   Compiling vortex-fastlanes v0.1.0 (/Users/will/git/vortex/vortex-fastlanes)
    Finished `bench` profile [optimized] target(s) in 3.29s
     Running benches/bitpacking.rs (target/release/deps/bitpacking-274b666c83950221)
Gnuplot not found, using plotters backend
bitpack_1M              time:   [55.836 µs 56.008 µs 56.193 µs]
                        change: [-1.3761% -0.8817% -0.3672%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild

unpack_1M               time:   [216.54 µs 232.54 µs 251.50 µs]
                        change: [-9.0508% -1.5376% +6.7838%] (p = 0.71 > 0.05)
                        No change in performance detected.
Found 13 outliers among 100 measurements (13.00%)
  3 (3.00%) high mild
  10 (10.00%) high severe

unpack_1M_singles       time:   [2.2917 ms 2.3220 ms 2.3539 ms]
                        change: [-5.6770% -3.6532% -1.6082%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high mild

unpack_1024_alloc       time:   [218.64 µs 232.55 µs 248.50 µs]
                        change: [-8.6257% -1.5167% +5.9634%] (p = 0.69 > 0.05)
                        No change in performance detected.
Found 12 outliers among 100 measurements (12.00%)
  4 (4.00%) high mild
  8 (8.00%) high severe

unpack_1024_noalloc     time:   [43.505 ns 43.651 ns 43.806 ns]
                        change: [+0.5580% +1.0303% +1.4821%] (p = 0.00 < 0.05)
                        Change within noise threshold.

unpack_single           time:   [4.9364 ns 4.9507 ns 4.9647 ns]
                        change: [+0.3408% +0.7604% +1.1548%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high mild
```
---
 vortex-fastlanes/benches/bitpacking.rs      |  6 +++-
 vortex-fastlanes/src/bitpacking/compress.rs | 36 +++++++++++----------
 2 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/vortex-fastlanes/benches/bitpacking.rs b/vortex-fastlanes/benches/bitpacking.rs
index c253b841d1..34611dcabf 100644
--- a/vortex-fastlanes/benches/bitpacking.rs
+++ b/vortex-fastlanes/benches/bitpacking.rs
@@ -39,8 +39,12 @@ fn pack_unpack(c: &mut Criterion) {
 
     // 1024 elements pack into `128 * bits` bytes
     let packed_1024 = &packed[0..128 * bits];
+    c.bench_function("unpack_1024_alloc", |b| {
+        b.iter(|| black_box(unpack_primitive::<u32>(&packed, bits, values.len())));
+    });
+
     let mut output: Vec<u32> = Vec::with_capacity(1024);
-    c.bench_function("unpack_1024", |b| {
+    c.bench_function("unpack_1024_noalloc", |b| {
         b.iter(|| {
             output.clear();
             TryBitPack::try_unpack_into(packed_1024, bits, &mut output).unwrap();
diff --git a/vortex-fastlanes/src/bitpacking/compress.rs b/vortex-fastlanes/src/bitpacking/compress.rs
index 2ba89361ac..78c8c317bf 100644
--- a/vortex-fastlanes/src/bitpacking/compress.rs
+++ b/vortex-fastlanes/src/bitpacking/compress.rs
@@ -212,31 +212,33 @@ pub fn unpack_primitive<T: NativePType + TryBitPack>(
     }
 
     // How many fastlanes vectors we will process.
-    let num_chunks = length / 1024;
+    let num_chunks = (length + 1023) / 1024;
+    let bytes_per_chunk = 128 * bit_width;
+    assert_eq!(
+        packed.len(),
+        num_chunks * bytes_per_chunk,
+        "Invalid packed length: got {}, expected {}",
+        packed.len(),
+        num_chunks * bytes_per_chunk
+    );
 
     // Allocate a result vector.
-    let mut output = Vec::with_capacity(length);
-
-    // Loop over all but the last chunk.
-    let bytes_per_chunk = 128 * bit_width;
+    let mut output = Vec::with_capacity(num_chunks * 1024);
+    // Loop over all the chunks.
     (0..num_chunks).for_each(|i| {
         let chunk: &[u8] = &packed[i * bytes_per_chunk..][0..bytes_per_chunk];
         TryBitPack::try_unpack_into(chunk, bit_width, &mut output).unwrap();
     });
 
-    // Handle the final chunk which may contain padding.
-    let last_chunk_size = length % 1024;
-    if last_chunk_size > 0 {
-        let mut last_output = Vec::with_capacity(1024);
-        TryBitPack::try_unpack_into(
-            &packed[num_chunks * bytes_per_chunk..],
-            bit_width,
-            &mut last_output,
-        )
-        .unwrap();
-        output.extend_from_slice(&last_output[..last_chunk_size]);
-    }
+    // The final chunk may have had padding
+    output.truncate(length);
 
+    // For small vectors, the overhead of rounding up is more noticeable.
+    // Shrink to fit may or may not reallocate depending on the implementation.
+    // But for very small vectors, the reallocation is cheap enough even if it does happen.
+    if output.len() < 1024 {
+        output.shrink_to_fit();
+    }
     output
 }