Skip to content

Commit

Permalink
Api revamp (#113)
Browse files Browse the repository at this point in the history
  • Loading branch information
mwlon authored Oct 28, 2023
1 parent 4e4c354 commit 852b501
Show file tree
Hide file tree
Showing 61 changed files with 2,599 additions and 3,412 deletions.
10 changes: 5 additions & 5 deletions bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,11 @@ Compression ratio is reported with 3 significant figures.

| dataset | compression speed / (million/s) | decompression speed / (million/s) | compression ratio |
|--------------------|---------------------------------|-----------------------------------|-------------------|
| `f64_decimal` | 12 | 92 | 4.67 |
| `f64_slow_cosine` | 15 | 120 | 4.35 |
| `i64_lomax05_reg` | 16 | 190 | 4.62 |
| `i64_sparse` | 46 | 210 | 792 |
| `micros_millis` | 12 | 160 | 2.08 |
| `f64_decimal` | 12 | 96 | 4.67 |
| `f64_slow_cosine` | 16 | 120 | 4.35 |
| `i64_lomax05_reg` | 19 | 200 | 4.62 |
| `i64_sparse` | 37 | 170 | 792 |
| `micros_millis` | 12 | 180 | 2.08 |

`i64` and `f64` are each 8 bytes, so these speeds are in the ballpark of 1 GB/s.
For reference, on the same hardware and heavy-tail integers dataset, ZStandard
Expand Down
5 changes: 2 additions & 3 deletions bench/src/codecs/pco.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use anyhow::{anyhow, Result};

#[derive(Clone, Debug, Default)]
pub struct PcoConfig {
compressor_config: pco::CompressorConfig,
compressor_config: pco::ChunkConfig,
}

impl CodecInternal for PcoConfig {
Expand Down Expand Up @@ -49,9 +49,8 @@ impl CodecInternal for PcoConfig {
}

fn compress<T: Dtype>(&self, nums: &[T]) -> Vec<u8> {
let c_config = self.compressor_config.clone();
let pco_nums = T::slice_to_pco(nums);
pco::standalone::simple_compress(pco_nums, c_config).expect("invalid config")
pco::standalone::simple_compress(pco_nums, &self.compressor_config).expect("invalid config")
}

fn decompress<T: Dtype>(&self, bytes: &[u8]) -> Vec<T> {
Expand Down
8 changes: 2 additions & 6 deletions pco/src/ans/decoding.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
use crate::ans::spec::Spec;
use crate::ans::{AnsState, Token};

use crate::constants::Bitlen;
use crate::data_types::UnsignedLike;
use crate::errors::PcoResult;

use crate::ChunkLatentMetadata;
use crate::ChunkLatentMeta;

#[derive(Clone, Debug)]
pub struct Node {
Expand Down Expand Up @@ -43,9 +41,7 @@ impl Decoder {
Self { nodes }
}

pub fn from_latent_meta<U: UnsignedLike>(
latent_meta: &ChunkLatentMetadata<U>,
) -> PcoResult<Self> {
pub fn from_latent_meta<U: UnsignedLike>(latent_meta: &ChunkLatentMeta<U>) -> PcoResult<Self> {
let weights = latent_meta
.bins
.iter()
Expand Down
23 changes: 14 additions & 9 deletions pco/src/ans/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ pub(crate) type Token = u16;
#[cfg(test)]
mod tests {
use crate::ans::spec::Spec;
use crate::ans::{Decoder, Encoder, Token};
use crate::ans::{AnsState, Decoder, Encoder, Token};
use crate::bit_reader;
use crate::bit_reader::BitReader;
use crate::bit_words::PaddedBytes;
use crate::bit_writer::BitWriter;
use crate::errors::PcoResult;

Expand All @@ -32,25 +32,30 @@ mod tests {
to_write.push((state, bitlen));
state = new_state;
}
let mut writer = BitWriter::default();

let mut bytes = Vec::new();
let mut writer = BitWriter::new(&mut bytes, 5);
for (word, bitlen) in to_write.into_iter().rev() {
writer.write_diff(word, bitlen);
writer.write_uint(word, bitlen);
writer.flush()?;
}
writer.finish_byte();
writer.flush()?;
drop(writer);
assert_eq!(bytes.len(), expected_byte_len);
let final_state = state;
let table_size = 1 << encoder.size_log();

// DECODE
let decoder = Decoder::new(spec);
let bytes = writer.drain_bytes();
assert_eq!(bytes.len(), expected_byte_len);
let bit_words = PaddedBytes::from(bytes);
let mut reader = BitReader::from(&bit_words);
let extension = bit_reader::make_extension_for(&bytes, 100);
let mut reader = BitReader::new(&bytes, &extension);
let mut decoded = Vec::new();
let mut state_idx = final_state - table_size;
for _ in 0..tokens.len() {
let node = decoder.get_node(state_idx);
decoded.push(node.token);
state_idx = node.next_state_idx_base + reader.read_small(node.bits_to_read)?;
state_idx = node.next_state_idx_base + reader.read_uint::<AnsState>(node.bits_to_read);
}

assert_eq!(decoded, tokens);
Expand Down
46 changes: 31 additions & 15 deletions pco/src/auto.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
use std::cmp::min;

use crate::chunk_config::{ChunkConfig, PagingSpec};
use crate::constants::{AUTO_DELTA_LIMIT, MAX_AUTO_DELTA_COMPRESSION_LEVEL};
use crate::data_types::NumberLike;
use crate::standalone::Compressor;
use crate::CompressorConfig;
use std::cmp::min;
use crate::errors::PcoResult;
use crate::wrapped::FileCompressor;

/// Automatically makes an educated guess for the best delta encoding
/// order for compression, based on `nums` and `compression_level`.
///
/// This incurs some compute cost, since it tries different configurations on
/// a subset of the numbers to determine which one is most likely to do well.
/// See [`CompressorConfig`] for information about compression levels.
pub fn auto_delta_encoding_order<T: NumberLike>(nums: &[T], compression_level: usize) -> usize {
/// See [`ChunkConfig`] for information about compression levels.
pub fn auto_delta_encoding_order<T: NumberLike>(
nums: &[T],
compression_level: usize,
) -> PcoResult<usize> {
let mut sampled_nums;
let head_nums = if nums.len() < AUTO_DELTA_LIMIT {
nums
Expand All @@ -36,20 +41,24 @@ pub fn auto_delta_encoding_order<T: NumberLike>(nums: &[T], compression_level: u
// Taking deltas of a large dataset won't change the GCD,
// so we don't need to waste compute here inferring GCDs just to
// determine the best delta order.
let config = CompressorConfig {
let config = ChunkConfig {
delta_encoding_order: Some(delta_encoding_order),
compression_level: min(
compression_level,
MAX_AUTO_DELTA_COMPRESSION_LEVEL,
),
use_gcds: false,
use_float_mult: true,
paging_spec: PagingSpec::default(),
};
let mut compressor = Compressor::<T>::from_config(config).unwrap();
compressor.header().unwrap();
compressor.chunk(head_nums).unwrap(); // only unreachable errors
let size = compressor.byte_size();
let fc = FileCompressor::default();
let cc = fc.chunk_compressor(head_nums, &config)?;
let size_estimate = cc.chunk_meta_size_hint() + cc.page_size_hint(0);
let mut dst = Vec::with_capacity(size_estimate);
cc.write_chunk_meta(&mut dst)?;
cc.write_page(0, &mut dst)?;

let size = dst.len();
if size < best_size {
best_order = delta_encoding_order;
best_size = size;
Expand All @@ -58,7 +67,8 @@ pub fn auto_delta_encoding_order<T: NumberLike>(nums: &[T], compression_level: u
break;
}
}
best_order

Ok(best_order)
}

#[cfg(test)]
Expand All @@ -78,13 +88,16 @@ mod tests {
linear_trend.push(i);
quadratic_trend.push(i * i);
}
assert_eq!(auto_delta_encoding_order(&no_trend, 3), 0);
assert_eq!(
auto_delta_encoding_order(&linear_trend, 3),
auto_delta_encoding_order(&no_trend, 3).unwrap(),
0
);
assert_eq!(
auto_delta_encoding_order(&linear_trend, 3).unwrap(),
1
);
assert_eq!(
auto_delta_encoding_order(&quadratic_trend, 3),
auto_delta_encoding_order(&quadratic_trend, 3).unwrap(),
2
);
}
Expand All @@ -94,6 +107,9 @@ mod tests {
let mut nums = Vec::with_capacity(2000);
nums.resize(1000, 77);
nums.resize(2000, 78);
assert_eq!(auto_delta_encoding_order(&nums, 3), 1);
assert_eq!(
auto_delta_encoding_order(&nums, 3).unwrap(),
1
);
}
}
Loading

0 comments on commit 852b501

Please sign in to comment.