Skip to content

Commit

Permalink
[TST] Benchmark metadata filtering (#2903)
Browse files Browse the repository at this point in the history
## Description of changes

*Summarize the changes made by this PR.*
 - Improvements & Bug fixes
	 - N/A
 - New functionality
	 - Benchmark metadata filtering operator

## Test plan
*How are these changes tested?*

- [x] Tests pass locally with `pytest` for python, `yarn test` for js, `cargo test` for rust

## Documentation Changes
*Are all docstrings for user-facing APIs updated if required? Do we need to make documentation changes in the [docs repository](https://github.com/chroma-core/docs)?*
N/A
  • Loading branch information
Sicheng-Pan authored Oct 7, 2024
1 parent 091609c commit ff69c4d
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 9 deletions.
11 changes: 2 additions & 9 deletions .github/workflows/_rust-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,5 @@ jobs:
uses: actions/checkout@v3
- name: Setup
uses: ./.github/actions/rust
- name: Cache test datasets
uses: actions/cache@v4
with:
path: /home/runner/.cache/chroma-test-datasets
key: chroma-test-datasets
- name: Build
run: cargo build --verbose
- name: Test benches
run: cargo nextest run --benches
- name: Test compile
run: cargo bench --no-run
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions rust/worker/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,9 @@ shuttle = { workspace = true }
proptest = { workspace = true }
proptest-state-machine = { workspace = true }
criterion = { workspace = true }

chroma-test = { workspace = true }

[[bench]]
name = "metadata_filtering"
harness = false
157 changes: 157 additions & 0 deletions rust/worker/benches/metadata_filtering.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
use std::collections::HashMap;
use std::iter::once;

use chroma_test::benchmark::{bench, tokio_multi_thread};
use chroma_test::log::{offset_as_id, random_document, random_embedding, LogGenerator};
use chroma_test::segment::CompactSegment;
use chroma_types::{
BooleanOperator, Chunk, DirectDocumentComparison, DirectWhereComparison, DocumentOperator,
MetadataValue, Operation, OperationRecord, PrimitiveOperator, UpdateMetadataValue, Where,
WhereChildren, WhereComparison,
};
use criterion::Criterion;
use criterion::{criterion_group, criterion_main};
use worker::execution::operator::Operator;
use worker::execution::operators::metadata_filtering::{
MetadataFilteringInput, MetadataFilteringOperator,
};

// Length argument passed to `random_document` for every generated record.
const DOCUMENT_LENGTH: usize = 64;
// Dimension argument passed to `random_embedding` for every generated record.
const EMBEDDING_DIMENSION: usize = 6;
// Moduli used by `modulo_metadata`: each prime `p` yields a `modulo_{p}` metadata key
// holding `id % p`, so different keys partition the records into different bucket counts.
const PRIMES: [usize; 8] = [2, 3, 5, 7, 11, 13, 17, 19];

/// Builds the metadata map attached to the record with the given `id`.
///
/// The map contains one `modulo_{p}` entry per prime `p` in `PRIMES`, holding
/// `id % p`, plus a `val` entry holding `id` itself. All values are stored as
/// `UpdateMetadataValue::Int`.
fn modulo_metadata(id: usize) -> HashMap<String, UpdateMetadataValue> {
    // One slot per prime plus the trailing `val` entry.
    let mut metadata = HashMap::with_capacity(PRIMES.len() + 1);
    for prime in PRIMES {
        let remainder = id % prime;
        metadata.insert(
            format!("modulo_{prime}"),
            UpdateMetadataValue::Int(remainder as i64),
        );
    }
    metadata.insert("val".to_string(), UpdateMetadataValue::Int(id as i64));
    metadata
}

/// Produces the `Add` operation record for log offset `id`.
///
/// The record id comes from `offset_as_id`, the embedding and document are
/// freshly generated random values, and the metadata comes from
/// `modulo_metadata`. No encoding is set.
fn log_generator(id: usize) -> OperationRecord {
    // Helpers are invoked in the same order as the struct fields are declared
    // below, so the sequence of random draws stays unchanged.
    let record_id = offset_as_id(id);
    let embedding = random_embedding(EMBEDDING_DIMENSION);
    let metadata = modulo_metadata(id);
    let document = random_document(DOCUMENT_LENGTH);

    OperationRecord {
        id: record_id,
        embedding: Some(embedding),
        encoding: None,
        metadata: Some(metadata),
        document: Some(document),
        operation: Operation::Add,
    }
}

/// Returns the labelled filters exercised by the benchmark.
///
/// Each entry pairs a short label (used in the benchmark name) with an optional
/// `Where` clause. The final `"$true"` entry carries `None`, i.e. no clause at
/// all is handed to the operator.
fn baseline_where_clauses() -> Vec<(&'static str, Option<Where>)> {
    use BooleanOperator::*;
    use DocumentOperator::*;
    use MetadataValue::*;
    use PrimitiveOperator::*;
    use WhereComparison::*;

    // Shorthand for `<key> <operator> <integer value>` metadata comparisons.
    let compare = |key: &str, operator: PrimitiveOperator, value| {
        Where::DirectWhereComparison(DirectWhereComparison {
            key: key.to_string(),
            comparison: Primitive(operator, Int(value)),
        })
    };

    let filtered = vec![
        ("$eq", compare("modulo_11", Equal, 6)),
        ("$ne", compare("modulo_11", NotEqual, 6)),
        ("$gt-small", compare("modulo_11", GreaterThan, 6)),
        // `val` holds the raw id (see `modulo_metadata`), so this compares ids against 0.
        ("$gt-large", compare("val", GreaterThan, 0)),
        (
            "$and-[$ne, $eq]",
            Where::WhereChildren(WhereChildren {
                operator: And,
                children: vec![
                    compare("modulo_11", NotEqual, 6),
                    compare("modulo_2", Equal, 0),
                ],
            }),
        ),
        (
            "$contains",
            Where::DirectWhereDocumentComparison(DirectDocumentComparison {
                document: random_document(4),
                operator: Contains,
            }),
        ),
    ];

    filtered
        .into_iter()
        .map(|(label, clause)| (label, Some(clause)))
        .chain(once(("$true", None)))
        .collect()
}

/// Benchmarks `MetadataFilteringOperator` over compacted segments of several sizes.
///
/// For each record count a fresh `CompactSegment` is populated from
/// `log_generator`, and every filter from `baseline_where_clauses` is then
/// benchmarked against it under the name
/// `metadata-filtering-<record_count>-<label>`.
fn bench_metadata_filtering(criterion: &mut Criterion) {
    let runtime = tokio_multi_thread();
    let logen = LogGenerator {
        generator: log_generator,
    };

    // Measured routine: run the operator on a prepared input and require success.
    let run_filter = |filter_input| async move {
        MetadataFilteringOperator::new()
            .run(&filter_input)
            .await
            .expect("Metadata filtering should not fail.");
    };

    for record_count in [1000, 10000, 100000] {
        // Populate the segment once per size; it is shared by all filters below.
        let mut segment = CompactSegment::default();
        runtime.block_on(async { segment.populate_with_generator(record_count, &logen).await });

        for (label, filter) in baseline_where_clauses() {
            let bench_name = format!("metadata-filtering-{}-{}", record_count, label);

            // Builds a fresh operator input from clones of the segment handles and the filter.
            let make_input = || {
                MetadataFilteringInput::new(
                    segment.blockfile_provider.clone(),
                    segment.record.clone(),
                    segment.metadata.clone(),
                    Chunk::new(Vec::new().into()),
                    None,
                    filter.clone(),
                    None,
                    None,
                )
            };

            bench(
                bench_name.as_str(),
                criterion,
                &runtime,
                make_input,
                run_filter,
            );
        }
    }
}

// Declare the `benches` group containing the metadata filtering benchmark.
criterion_group!(benches, bench_metadata_filtering);
// Generate the benchmark binary's entry point; the matching `[[bench]]` entry
// in Cargo.toml sets `harness = false` so this replaces the default harness.
criterion_main!(benches);
2 changes: 2 additions & 0 deletions rust/worker/src/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -650,11 +650,13 @@ mod tests {
use chroma_blockstore::arrow::config::TEST_MAX_BLOCK_SIZE_BYTES;
#[cfg(test)]
use chroma_cache::new_cache_for_test;
#[cfg(debug_assertions)]
use chroma_proto::debug_client::DebugClient;
use chroma_storage::{local::LocalStorage, Storage};
use tempfile::tempdir;

#[tokio::test]
#[cfg(debug_assertions)]
async fn gracefully_handles_panics() {
let sysdb = TestSysDb::new();
let log = InMemoryLog::new();
Expand Down

0 comments on commit ff69c4d

Please sign in to comment.