U64 words (#114)

fixes decompression on 32 bit architectures
mwlon · Oct 29, 2023 · 4ce1182 · 4ce1182
1 parent 852b501
commit 4ce1182
Show file tree

Hide file tree

Showing 11 changed files with 113 additions and 100 deletions.
diff --git a/pco/src/bit_reader.rs b/pco/src/bit_reader.rs
@@ -2,7 +2,7 @@ use std::cmp::min;
 use std::mem;
 
 use crate::bits;
-use crate::constants::{Bitlen, BYTES_PER_WORD, WORD_BITLEN};
+use crate::constants::Bitlen;
 use crate::errors::{PcoError, PcoResult};
 use crate::read_write_uint::ReadWriteUint;
 
@@ -16,49 +16,53 @@ pub fn make_extension_for(slice: &[u8], padding: usize) -> Vec<u8> {
   res
 }
 
+// Q: Why u64?
+// A: It's the largest data type most instruction sets have support for (and
+//    can do few-cycle/SIMD ops on). e.g. even 32-bit wasm has 64-bit ints and
+//    opcodes.
 #[inline]
-pub fn word_at(src: &[u8], byte_idx: usize) -> usize {
-  let raw_bytes = unsafe { *(src.as_ptr().add(byte_idx) as *const [u8; BYTES_PER_WORD]) };
-  usize::from_le_bytes(raw_bytes)
+pub fn u64_at(src: &[u8], byte_idx: usize) -> u64 {
+  let raw_bytes = unsafe { *(src.as_ptr().add(byte_idx) as *const [u8; 8]) };
+  u64::from_le_bytes(raw_bytes)
 }
 
 #[inline]
-pub fn read_uint_at<U: ReadWriteUint, const MAX_EXTRA_WORDS: usize>(
+pub fn read_uint_at<U: ReadWriteUint, const MAX_EXTRA_U64S: usize>(
   src: &[u8],
   mut byte_idx: usize,
   bits_past_byte: Bitlen,
   n: Bitlen,
 ) -> U {
   // Q: Why is this fast?
-  // A: The 0..MAX_EXTRA_WORDS can be unrolled at compile time and interact
+  // A: The 0..MAX_EXTRA_U64S can be unrolled at compile time and interact
   //    freely with an outer loop, allowing really fast SIMD stuff.
   //
   // Q: Why does this work?
-  // A: We set MAX_EXTRA_WORDS so that e.g. on 64 bit architectures,
-  //    0  to 57  bit reads -> 0 extra words
-  //    58 to 113 bit reads -> 1 extra words
-  //    113 to 128 bit reads -> 2 extra words
-  //    During the 1st word (prior to the loop), we read all bytes from the
-  //    current word. Due to our bit packing, up to the first 7 of these may
+  // A: We set MAX_EXTRA_U64S so that, on any architecture,
+  //    0   to 57  bit reads -> 0 extra u64's
+  //    58  to 113 bit reads -> 1 extra u64's
+  //    114 to 128 bit reads -> 2 extra u64's
+  //    During the 1st u64 (prior to the loop), we read all bytes from the
+  //    current u64. Due to our bit packing, up to the first 7 of these may
   //    be useless, so we can read up to (64 - 7) = 57 bits safely from a
-  //    single word. We right shift by only up to 7 bits, which is safe.
+  //    single u64. We right shift by only up to 7 bits, which is safe.
   //
-  //    For the 2nd word, we skip only 7 bytes forward. This will overlap with
-  //    the 1st word by 1 byte, which seems useless, but allows us to avoid one
+  //    For the 2nd u64, we skip only 7 bytes forward. This will overlap with
+  //    the 1st u64 by 1 byte, which seems useless, but allows us to avoid one
   //    nasty case: left shifting by U::BITS (a panic). This could happen e.g.
   //    with 64-bit reads when we start out byte-aligned (bits_past_byte=0).
   //
-  //    For the 3rd word and onward, we skip 8 bytes forward. Due to how we
-  //    handled the 2nd word, the most we'll every need to shift by is
+  //    For the 3rd u64 and onward, we skip 8 bytes forward. Due to how we
+  //    handled the 2nd u64, the most we'll ever need to shift by is
   //    precision - 8, which is safe.
-  let mut res = U::from_word(word_at(src, byte_idx) >> bits_past_byte);
-  let mut processed = min(n, WORD_BITLEN - 8 - bits_past_byte);
-  byte_idx += BYTES_PER_WORD - 1;
-
-  for _ in 0..MAX_EXTRA_WORDS {
-    res |= U::from_word(word_at(src, byte_idx)) << processed;
-    processed += WORD_BITLEN;
-    byte_idx += BYTES_PER_WORD;
+  let mut res = U::from_u64(u64_at(src, byte_idx) >> bits_past_byte);
+  let mut processed = min(n, 56 - bits_past_byte);
+  byte_idx += 7;
+
+  for _ in 0..MAX_EXTRA_U64S {
+    res |= U::from_u64(u64_at(src, byte_idx)) << processed;
+    processed += 64;
+    byte_idx += 8;
   }
 
   bits::lowest_bits(res, n)
@@ -201,7 +205,7 @@ impl<'a> BitReader<'a> {
 
   pub fn read_uint<U: ReadWriteUint>(&mut self, n: Bitlen) -> U {
     self.refill();
-    let res = match U::MAX_EXTRA_WORDS {
+    let res = match U::MAX_EXTRA_U64S {
       0 => read_uint_at::<U, 0>(
         self.current_stream,
         self.stale_byte_idx,
@@ -221,8 +225,8 @@ impl<'a> BitReader<'a> {
         n,
       ),
       _ => panic!(
-        "[BitReader] data type too large (extra words {} > 2)",
-        U::MAX_EXTRA_WORDS
+        "[BitReader] data type too large (extra u64's {} > 2)",
+        U::MAX_EXTRA_U64S
       ),
     };
     self.consume(n);

diff --git a/pco/src/bit_writer.rs b/pco/src/bit_writer.rs
@@ -1,8 +1,8 @@
 use std::io::Write;
 
-use crate::bit_reader::word_at;
+use crate::bit_reader::u64_at;
 use crate::bits;
-use crate::constants::{Bitlen, BYTES_PER_WORD, WORD_BITLEN};
+use crate::constants::Bitlen;
 use crate::errors::{PcoError, PcoResult};
 use crate::read_write_uint::ReadWriteUint;
 
@@ -13,15 +13,15 @@ use crate::read_write_uint::ReadWriteUint;
 // BitWriter (wrapping BitBuffer, generic to W) to reduce binary size
 
 #[inline]
-pub fn write_word_to(word: usize, byte_idx: usize, dst: &mut [u8]) {
+pub fn write_u64_to(x: u64, byte_idx: usize, dst: &mut [u8]) {
   unsafe {
-    let target = dst.as_mut_ptr().add(byte_idx) as *mut [u8; BYTES_PER_WORD];
-    *target = word.to_le_bytes();
+    let target = dst.as_mut_ptr().add(byte_idx) as *mut [u8; 8];
+    *target = x.to_le_bytes();
   };
 }
 
 #[inline]
-pub fn write_uint_to<U: ReadWriteUint, const MAX_EXTRA_WORDS: Bitlen>(
+pub fn write_uint_to<U: ReadWriteUint, const MAX_EXTRA_U64S: Bitlen>(
   x: U,
   mut byte_idx: usize,
   bits_past_byte: Bitlen,
@@ -30,15 +30,18 @@ pub fn write_uint_to<U: ReadWriteUint, const MAX_EXTRA_WORDS: Bitlen>(
 ) {
   // See bit_reader for an explanation of why this is fast and how it works.
   let x = bits::lowest_bits(x, n);
-  let word = word_at(dst, byte_idx) | (x.to_usize() << bits_past_byte);
-  write_word_to(word, byte_idx, dst);
-  let mut processed = WORD_BITLEN - 8 - bits_past_byte;
-  byte_idx += BYTES_PER_WORD - 1;
-
-  for _ in 0..MAX_EXTRA_WORDS {
-    write_word_to((x >> processed).to_usize(), byte_idx, dst);
-    processed += WORD_BITLEN;
-    byte_idx += BYTES_PER_WORD;
+  write_u64_to(
+    u64_at(dst, byte_idx) | (x.to_u64() << bits_past_byte),
+    byte_idx,
+    dst,
+  );
+  let mut processed = 56 - bits_past_byte;
+  byte_idx += 7;
+
+  for _ in 0..MAX_EXTRA_U64S {
+    write_u64_to((x >> processed).to_u64(), byte_idx, dst);
+    processed += 64;
+    byte_idx += 8;
   }
 }
 
@@ -98,7 +101,7 @@ impl<W: Write> BitWriter<W> {
 
   pub fn write_uint<U: ReadWriteUint>(&mut self, x: U, n: Bitlen) {
     self.refill();
-    match U::MAX_EXTRA_WORDS {
+    match U::MAX_EXTRA_U64S {
       0 => write_uint_to::<U, 0>(
         x,
         self.stale_byte_idx,
@@ -121,8 +124,8 @@ impl<W: Write> BitWriter<W> {
         &mut self.buf,
       ),
       _ => panic!(
-        "[BitWriter] data type too large (extra words {} > 2)",
-        U::MAX_EXTRA_WORDS
+        "[BitWriter] data type too large (extra u64's {} > 2)",
+        U::MAX_EXTRA_U64S
       ),
     }
     self.consume(n);

diff --git a/pco/src/bits.rs b/pco/src/bits.rs
@@ -3,11 +3,11 @@ use crate::data_types::UnsignedLike;
 use crate::read_write_uint::ReadWriteUint;
 
 #[inline]
-pub fn lowest_bits<U: ReadWriteUint>(word: U, n: Bitlen) -> U {
+pub fn lowest_bits<U: ReadWriteUint>(x: U, n: Bitlen) -> U {
   if n >= U::BITS {
-    word
+    x
   } else {
-    word & ((U::ONE << n) - U::ONE)
+    x & ((U::ONE << n) - U::ONE)
   }
 }
 

diff --git a/pco/src/constants.rs b/pco/src/constants.rs
@@ -19,7 +19,8 @@ pub const BITS_TO_ENCODE_N_BINS: Bitlen = 15;
 
 // padding
 pub const HEADER_PADDING: usize = 1;
-pub const OVERSHOOT_PADDING: usize = MAX_SUPPORTED_PRECISION_BYTES + BYTES_PER_WORD + 1;
+// + 9 because we might read an extra u64 (8 bytes), plus 1 for good measure
+pub const OVERSHOOT_PADDING: usize = MAX_SUPPORTED_PRECISION_BYTES + 9;
 // Chunk meta padding is enough for one full batch of bins; this should also
 // generously cover the data needed to read the other parts of chunk meta.
 pub const CHUNK_META_PADDING: usize =
@@ -31,11 +32,6 @@ pub const PAGE_LATENT_META_PADDING: usize =
 pub const PAGE_PADDING: usize =
   FULL_BATCH_SIZE * (MAX_SUPPORTED_PRECISION_BYTES + MAX_ANS_BYTES) + OVERSHOOT_PADDING;
 
-// native architecture info
-pub const WORD_SIZE: usize = usize::BITS as usize;
-pub const WORD_BITLEN: Bitlen = usize::BITS as Bitlen;
-pub const BYTES_PER_WORD: usize = WORD_SIZE / 8;
-
 // cutoffs and legal parameter values
 pub const AUTO_DELTA_LIMIT: usize = 1100;
 pub const MAX_ANS_BITS: Bitlen = 14;

diff --git a/pco/src/data_types/floats.rs b/pco/src/data_types/floats.rs
@@ -62,7 +62,6 @@ macro_rules! impl_float_number {
 
     impl NumberLike for $t {
       const DTYPE_BYTE: u8 = $header_byte;
-      const PHYSICAL_BITS: usize = $bits;
       const IS_FLOAT: bool = true;
 
       type Unsigned = $unsigned;

diff --git a/pco/src/data_types/mod.rs b/pco/src/data_types/mod.rs
@@ -90,12 +90,12 @@ pub trait UnsignedLike:
 
   /// Converts a `u64` into this type, truncating higher bits if this type
   /// is narrower than 64 bits.
-  fn from_word(word: usize) -> Self;
+  fn from_u64(x: u64) -> Self;
 
   fn leading_zeros(self) -> Bitlen;
 
   /// Converts the unsigned integer to a `u64`, truncating higher bits if necessary.
-  fn to_usize(self) -> usize;
+  fn to_u64(self) -> u64;
 
   fn wrapping_add(self, other: Self) -> Self;
   fn wrapping_sub(self, other: Self) -> Self;
@@ -142,7 +142,6 @@ pub trait NumberLike: Copy + Debug + Display + Default + PartialEq + 'static {
   /// This must match the number of bytes in the `to_bytes` and `from_bytes`
   /// implementations.
   /// Note that booleans have 8 physical bits (not 1).
-  const PHYSICAL_BITS: usize;
   const IS_FLOAT: bool = false;
 
   /// The unsigned integer this type can convert between to do

diff --git a/pco/src/data_types/signeds.rs b/pco/src/data_types/signeds.rs
@@ -4,7 +4,6 @@ macro_rules! impl_signed {
   ($t: ty, $unsigned: ty, $header_byte: expr) => {
     impl NumberLike for $t {
       const DTYPE_BYTE: u8 = $header_byte;
-      const PHYSICAL_BITS: usize = Self::BITS as usize;
 
       type Unsigned = $unsigned;
 

diff --git a/pco/src/data_types/unsigneds.rs b/pco/src/data_types/unsigneds.rs
@@ -13,8 +13,8 @@ macro_rules! impl_unsigned {
       type Float = $float;
 
       #[inline]
-      fn from_word(word: usize) -> Self {
-        word as Self
+      fn from_u64(x: u64) -> Self {
+        x as Self
       }
 
       #[inline]
@@ -23,8 +23,8 @@ macro_rules! impl_unsigned {
       }
 
       #[inline]
-      fn to_usize(self) -> usize {
-        self as usize
+      fn to_u64(self) -> u64 {
+        self as u64
       }
 
       #[inline]
@@ -94,7 +94,6 @@ macro_rules! impl_unsigned_number {
   ($t: ty, $signed: ty, $float: ty, $header_byte: expr) => {
     impl NumberLike for $t {
       const DTYPE_BYTE: u8 = $header_byte;
-      const PHYSICAL_BITS: usize = Self::BITS as usize;
 
       type Unsigned = Self;