From 497d75337df2f2e8be74f5ffa2e43fa3eb5b753c Mon Sep 17 00:00:00 2001 From: mwlon Date: Tue, 31 Oct 2023 08:56:07 -0400 Subject: [PATCH] page sizes work, added todo --- bench/src/codecs/pco.rs | 8 ++++++++ bench/src/main.rs | 14 ++++++-------- pco/src/chunk_meta.rs | 6 ++++++ pco/src/standalone/simple.rs | 28 +++++++++++++++++----------- pco/src/wrapped/chunk_compressor.rs | 3 ++- 5 files changed, 39 insertions(+), 20 deletions(-) diff --git a/bench/src/codecs/pco.rs b/bench/src/codecs/pco.rs index b5d0efcf..7e673833 100644 --- a/bench/src/codecs/pco.rs +++ b/bench/src/codecs/pco.rs @@ -1,6 +1,7 @@ use crate::codecs::CodecInternal; use crate::dtypes::Dtype; use anyhow::{anyhow, Result}; +use pco::PagingSpec; #[derive(Clone, Debug, Default)] pub struct PcoConfig { @@ -22,6 +23,10 @@ impl CodecInternal for PcoConfig { .unwrap_or("auto".to_string()), "use_gcds" => self.compressor_config.use_gcds.to_string(), "use_float_mult" => self.compressor_config.use_float_mult.to_string(), + "page_size" => match self.compressor_config.paging_spec { + PagingSpec::EqualPagesUpTo(page_size) => page_size.to_string(), + _ => panic!("unexpected paging spec"), + }, _ => panic!("bad conf"), } } @@ -43,6 +48,9 @@ impl CodecInternal for PcoConfig { } "use_gcds" => self.compressor_config.use_gcds = value.parse::().unwrap(), "use_float_mult" => self.compressor_config.use_float_mult = value.parse::().unwrap(), + "page_size" => { + self.compressor_config.paging_spec = PagingSpec::EqualPagesUpTo(value.parse().unwrap()) + } _ => return Err(anyhow!("unknown conf: {}", key)), } Ok(()) diff --git a/bench/src/main.rs b/bench/src/main.rs index 1feb27bf..74b5261b 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -136,14 +136,12 @@ fn print_stats(mut stats: Vec, opt: &Opt) { .or_default() .add_assign(stat.clone()); } - stats.extend( - opt.codecs.iter().map(|codec| { - let codec = codec.to_string(); - let mut stat = aggregate_by_codec.get(&codec).cloned().unwrap(); - stat.codec = codec; - stat - }) - ); + stats.extend(opt.codecs.iter().map(|codec| { + let codec = codec.to_string(); + let mut stat = aggregate_by_codec.get(&codec).cloned().unwrap(); + stat.codec = codec; + stat + })); stats.push(aggregate); let table = Table::new(stats) .with(Style::rounded()) diff --git a/pco/src/chunk_meta.rs b/pco/src/chunk_meta.rs index 3ad1a945..0ce4376e 100644 --- a/pco/src/chunk_meta.rs +++ b/pco/src/chunk_meta.rs @@ -112,6 +112,12 @@ impl ChunkLatentMeta { ans_size_log, ))); } + if ans_size_log > MAX_ANS_BITS { + return Err(PcoError::corruption(format!( + "ANS size log ({}) should not be greater than {}", + ans_size_log, MAX_ANS_BITS, + ))); + } let mut bins = Vec::with_capacity(n_bins); while bins.len() < n_bins { diff --git a/pco/src/standalone/simple.rs b/pco/src/standalone/simple.rs index 45b56bef..2f03aece 100644 --- a/pco/src/standalone/simple.rs +++ b/pco/src/standalone/simple.rs @@ -1,29 +1,35 @@ -use crate::bits; use crate::chunk_config::ChunkConfig; use crate::data_types::NumberLike; use crate::errors::PcoResult; use crate::standalone::compressor::FileCompressor; use crate::standalone::decompressor::FileDecompressor; - -const DEFAULT_CHUNK_SIZE: usize = 1_000_000; +use crate::PagingSpec; /// Takes in a slice of numbers and an exact configuration and returns /// compressed bytes. /// /// Will return an error if the compressor config is invalid. +/// This will use the `PagingSpec` in `ChunkConfig` to decide where to split +/// chunks. +/// For standalone, the concepts of chunk and page are conflated since each +/// chunk has exactly one page. pub fn simple_compress(nums: &[T], config: &ChunkConfig) -> PcoResult> { let mut dst = Vec::new(); let file_compressor = FileCompressor::default(); file_compressor.write_header(&mut dst)?; - let n_chunks = bits::ceil_div(nums.len(), DEFAULT_CHUNK_SIZE); - if n_chunks > 0 { - let n_per_chunk = bits::ceil_div(nums.len(), n_chunks); - for chunk in nums.chunks(n_per_chunk) { - let chunk_compressor = file_compressor.chunk_compressor(chunk, config)?; - dst.reserve(chunk_compressor.chunk_size_hint()); - chunk_compressor.write_chunk(&mut dst)?; - } + // here we use the paging spec to determine chunks; each chunk has 1 page + let page_sizes = config.paging_spec.page_sizes(nums.len())?; + let mut start = 0; + let mut this_chunk_config = config.clone(); + for &page_size in &page_sizes { + let end = start + page_size; + this_chunk_config.paging_spec = PagingSpec::ExactPageSizes(vec![page_size]); + let chunk_compressor = + file_compressor.chunk_compressor(&nums[start..end], &this_chunk_config)?; + dst.reserve(chunk_compressor.chunk_size_hint()); + chunk_compressor.write_chunk(&mut dst)?; + start = end; } file_compressor.write_footer(&mut dst)?; diff --git a/pco/src/wrapped/chunk_compressor.rs b/pco/src/wrapped/chunk_compressor.rs index b8410b92..94df1821 100644 --- a/pco/src/wrapped/chunk_compressor.rs +++ b/pco/src/wrapped/chunk_compressor.rs @@ -395,7 +395,6 @@ pub(crate) fn new( let table = CompressionTable::from(trained.infos); let encoder = ans::Encoder::from_bins(trained.ans_size_log, &bins)?; - println!("encoder size log {}", encoder.size_log()); latent_metas.push(ChunkLatentMeta { bins, @@ -554,6 +553,8 @@ impl ChunkCompressor { let mut writer = BitWriter::new(dst, PAGE_PADDING); + // TODO why doesn't this take page_idx? Am I doing repeated work + // (or worse)? let dissected_src = self.dissect_unsigneds()?; let mut latent_metas = Vec::with_capacity(self.n_latents);