Skip to content

Commit

Permalink
docs: better README
Browse files Browse the repository at this point in the history
  • Loading branch information
nishaq503 committed Nov 30, 2024
1 parent 815ff10 commit 5d5c4ce
Show file tree
Hide file tree
Showing 18 changed files with 101 additions and 106 deletions.
32 changes: 15 additions & 17 deletions crates/abd-clam/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ This means that the API is not yet stable and breaking changes may occur frequen

CLAM is a library crate so you can add it to your crate using `cargo add abd-clam@<version>`.

### Cakes: Nearest Neighbor Search
### `Cakes`: Nearest Neighbor Search

```rust
use abd_clam::{
Expand Down Expand Up @@ -44,20 +44,18 @@ let labels: Vec<bool> = rows.iter().map(|v| v[0] > 0.0).collect();
// We use the `Euclidean` metric for this example.
let metric = abd_clam::metric::Euclidean;

// We can create a `Dataset` object. We make it mutable here so we can reorder it after building the tree.
let data = FlatVec::new(rows).unwrap();
// We can create a `Dataset` object and assign metadata.
let data = FlatVec::new(rows).unwrap().with_metadata(&labels).unwrap();

// We can assign the labels as metadata to the dataset.
let data = data.with_metadata(&labels).unwrap();

// We define the criteria for building the tree to partition the `Cluster`s until each contains a single point.
// We define the criteria for building the tree to partition the `Cluster`s
// until each contains a single point.
let criteria = |c: &Ball<_>| c.cardinality() > 1;

// Now we create a tree.
let root = Ball::new_tree(&data, &metric, &criteria, Some(seed));

// We will use the origin as our query.
let query: Vec<f32> = vec![0.0; dimensionality];
let query = vec![0_f32; dimensionality];

// We can now perform Ranged Nearest Neighbors search on the tree.
let radius = 0.05;
Expand All @@ -67,7 +65,8 @@ let rnn_results: Vec<(usize, f32)> = root.search(&data, &metric, &query, alg);
// KNN search is also supported.
let k = 10;

// The `KnnRepeatedRnn` algorithm starts RNN search with a small radius and increases it until it finds `k` neighbors.
// The `KnnRepeatedRnn` algorithm starts RNN search with a small radius and
// increases it until it finds `k` neighbors.
let alg = Algorithm::KnnRepeatedRnn(k, 2.0);
let knn_results: Vec<(usize, f32)> = root.search(&data, &metric, &query, alg);

Expand All @@ -79,9 +78,6 @@ let knn_results: Vec<(usize, f32)> = root.search(&data, &metric, &query, alg);
let alg = Algorithm::KnnDepthFirst(k);
let knn_results: Vec<(usize, f32)> = root.search(&data, &metric, &query, alg);

// We can borrow the reordered labels from the model.
let labels: &[bool] = data.metadata();

// We can use the results to get the labels of the points that are within the
// radius of the query point.
let rnn_labels: Vec<bool> = rnn_results.iter().map(|&(i, _)| labels[i]).collect();
Expand All @@ -91,7 +87,7 @@ let rnn_labels: Vec<bool> = rnn_results.iter().map(|&(i, _)| labels[i]).collect(
let knn_labels: Vec<bool> = knn_results.iter().map(|&(i, _)| labels[i]).collect();
```

### Compression and Compressive Search
### `PanCakes`: Compression and Compressive Search

We also support compression of certain datasets and trees to reduce memory usage.
We can then perform compressive search on the compressed dataset without having to decompress the whole dataset.
Expand Down Expand Up @@ -128,7 +124,8 @@ let (metadata, data) = symagen::random_edits::generate_clumped_data(
.into_iter()
.map(|(m, d)| (m, Sequence::from(d)))
.unzip::<_, _, Vec<_>, Vec<_>>();
// Create a `FlatVec` dataset from the sequence data.

// We create a `FlatVec` dataset from the sequence data and assign metadata.
let data = FlatVec::new(data).unwrap().with_metadata(&metadata).unwrap();

// The dataset will use the `levenshtein` distance metric.
Expand Down Expand Up @@ -162,14 +159,15 @@ codec_data.write_to(&codec_path).unwrap();
let squishy_ball_path = temp_dir.path().join("strings.squishy_ball");
squishy_ball.write_to(&squishy_ball_path).unwrap();

// We can perform compressed search on the compressed dataset.
// We can perform compressive search on the compressed dataset.
let query = &Sequence::from(seed_string);
let radius = 2;
let k = 10;

let alg = Algorithm::RnnClustered(radius);
let results: Vec<(usize, u16)> = squishy_ball.par_search(&codec_data, &metric, query, alg);
assert!(!results.is_empty());

let k = 10;
let alg = Algorithm::KnnRepeatedRnn(k, 2);
let results: Vec<(usize, u16)> = squishy_ball.par_search(&codec_data, &metric, query, alg);
assert_eq!(results.len(), k);
Expand All @@ -183,7 +181,7 @@ let results: Vec<(usize, u16)> = squishy_ball.par_search(&codec_data, &metric, q
assert_eq!(results.len(), k);

// The dataset can be deserialized from disk.
let mut flat_data: FlatVec<Sequence, String> = FlatVec::read_from(&flat_path).unwrap();
let flat_data: FlatVec<Sequence, String> = FlatVec::read_from(&flat_path).unwrap();

// The tree can be deserialized from disk.
let ball: Ball<u16> = Ball::read_from(&ball_path).unwrap();
Expand Down
6 changes: 3 additions & 3 deletions crates/abd-clam/benches/ann_benchmarks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::{
};

use abd_clam::{
cakes::OffBall,
cakes::PermutedBall,
cluster::{adapter::ParBallAdapter, BalancedBall, ParPartition},
metric::{Cosine, Euclidean, ParMetric},
Ball, Cluster, Dataset, FlatVec,
Expand Down Expand Up @@ -122,12 +122,12 @@ fn run_search<M: ParMetric<Row<f32>, f32>>(
) {
let criteria = |c: &Ball<_>| c.cardinality() > 1;
let ball = Ball::par_new_tree(data, metric, &criteria, seed);
let (perm_ball, perm_data) = OffBall::par_from_ball_tree(ball.clone(), data.clone(), metric);
let (perm_ball, perm_data) = PermutedBall::par_from_ball_tree(ball.clone(), data.clone(), metric);

let criteria = |c: &BalancedBall<_>| c.cardinality() > 1;
let balanced_ball = BalancedBall::par_new_tree(data, metric, &criteria, seed).into_ball();
let (perm_balanced_ball, perm_balanced_data) =
OffBall::par_from_ball_tree(balanced_ball.clone(), data.clone(), metric);
PermutedBall::par_from_ball_tree(balanced_ball.clone(), data.clone(), metric);

let root_radius = ball.radius();
let radii = radii_fractions.iter().map(|&r| r * root_radius).collect::<Vec<_>>();
Expand Down
6 changes: 3 additions & 3 deletions crates/abd-clam/benches/genomic_search.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! Benchmark for genomic search.
use abd_clam::{
cakes::OffBall,
cakes::PermutedBall,
cluster::{adapter::ParBallAdapter, BalancedBall, ParPartition},
metric::Levenshtein,
pancakes::{Sequence, SquishyBall},
Expand Down Expand Up @@ -58,13 +58,13 @@ fn genomic_search(c: &mut Criterion) {

let criteria = |c: &Ball<u16>| c.cardinality() > 1;
let ball = Ball::par_new_tree(&data, &metric, &criteria, seed);
let (perm_ball, perm_data) = OffBall::par_from_ball_tree(ball.clone(), data.clone(), &metric);
let (perm_ball, perm_data) = PermutedBall::par_from_ball_tree(ball.clone(), data.clone(), &metric);
let (squishy_ball, dec_data) = SquishyBall::par_from_ball_tree(ball.clone(), data.clone(), &metric);

let criteria = |c: &BalancedBall<u16>| c.cardinality() > 1;
let balanced_ball = BalancedBall::par_new_tree(&data, &metric, &criteria, seed).into_ball();
let (perm_balanced_ball, perm_balanced_data) =
OffBall::par_from_ball_tree(balanced_ball.clone(), data.clone(), &metric);
PermutedBall::par_from_ball_tree(balanced_ball.clone(), data.clone(), &metric);
let (squishy_balanced_ball, dec_balanced_data) =
SquishyBall::par_from_ball_tree(balanced_ball.clone(), data.clone(), &metric);

Expand Down
6 changes: 3 additions & 3 deletions crates/abd-clam/benches/utils/compare_permuted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//! on permuted datasets.
use abd_clam::{
cakes::{Algorithm, OffBall, ParSearchable, Searchable},
cakes::{Algorithm, ParSearchable, PermutedBall, Searchable},
metric::ParMetric,
pancakes::{CodecData, Decodable, Encodable, ParCompressible, SquishyBall},
Ball,
Expand Down Expand Up @@ -42,8 +42,8 @@ pub fn compare_permuted<I, U, Co, M>(
metric: &M,
ball_data: (&Ball<U>, &Co),
balanced_ball_data: (&Ball<U>, &Co),
perm_ball_data: (&OffBall<U, Ball<U>>, &Co),
perm_balanced_ball_data: (&OffBall<U, Ball<U>>, &Co),
perm_ball_data: (&PermutedBall<U, Ball<U>>, &Co),
perm_balanced_ball_data: (&PermutedBall<U, Ball<U>>, &Co),
dec_ball_data: Option<(&SquishyBall<U, Ball<U>>, &CodecData<I, usize>)>,
dec_balanced_ball_data: Option<(&SquishyBall<U, Ball<U>>, &CodecData<I, usize>)>,
queries: &[I],
Expand Down
6 changes: 3 additions & 3 deletions crates/abd-clam/benches/vector_search.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! Benchmark for vector search.
use abd_clam::{
cakes::OffBall,
cakes::PermutedBall,
cluster::{adapter::ParBallAdapter, BalancedBall, ParPartition},
metric::{Euclidean, ParMetric},
Ball, Cluster, Dataset, FlatVec, Metric,
Expand Down Expand Up @@ -59,12 +59,12 @@ fn run_search<M: ParMetric<Row<f32>, f32>>(
let criteria = |c: &Ball<_>| c.cardinality() > 1;

let ball = Ball::par_new_tree(data, metric, &criteria, seed);
let (perm_ball, perm_data) = OffBall::par_from_ball_tree(ball.clone(), data.clone(), metric);
let (perm_ball, perm_data) = PermutedBall::par_from_ball_tree(ball.clone(), data.clone(), metric);

let criteria = |c: &BalancedBall<_>| c.cardinality() > 1;
let balanced_ball = BalancedBall::par_new_tree(data, metric, &criteria, seed).into_ball();
let (perm_balanced_ball, perm_balanced_data) =
OffBall::par_from_ball_tree(balanced_ball.clone(), data.clone(), metric);
PermutedBall::par_from_ball_tree(balanced_ball.clone(), data.clone(), metric);

utils::compare_permuted(
c,
Expand Down
6 changes: 3 additions & 3 deletions crates/abd-clam/src/cakes/cluster/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@ use crate::{cluster::ParCluster, dataset::ParDataset, metric::ParMetric, Ball, C
mod permuted_ball;
mod searchable;

pub use permuted_ball::OffBall;
pub use permuted_ball::PermutedBall;
pub use searchable::{ParSearchable, Searchable};

impl<I, T: Number, D: Dataset<I>, M: Metric<I, T>> Searchable<I, T, D, M> for Ball<T> {}
impl<I, T: Number, D: Dataset<I>, S: Cluster<T>, M: Metric<I, T>> Searchable<I, T, D, M> for OffBall<T, S> {}
impl<I, T: Number, D: Dataset<I>, S: Cluster<T>, M: Metric<I, T>> Searchable<I, T, D, M> for PermutedBall<T, S> {}

impl<I: Send + Sync, T: Number, D: ParDataset<I>, M: ParMetric<I, T>> ParSearchable<I, T, D, M> for Ball<T> {}
impl<I: Send + Sync, T: Number, D: ParDataset<I>, S: ParCluster<T>, M: ParMetric<I, T>> ParSearchable<I, T, D, M>
for OffBall<T, S>
for PermutedBall<T, S>
{
}
44 changes: 22 additions & 22 deletions crates/abd-clam/src/cakes/cluster/permuted_ball.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use crate::{
#[derive(Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "bitcode", derive(bitcode::Encode, bitcode::Decode))]
#[cfg_attr(feature = "bitcode", bitcode(recursive))]
pub struct OffBall<T: Number, S: Cluster<T>> {
pub struct PermutedBall<T: Number, S: Cluster<T>> {
/// The `Cluster` type that the `PermutedBall` is based on.
source: S,
/// The children of the `Cluster`.
Expand All @@ -32,7 +32,7 @@ pub struct OffBall<T: Number, S: Cluster<T>> {
params: Offset,
}

impl<T: Number, S: Cluster<T> + core::fmt::Debug> core::fmt::Debug for OffBall<T, S> {
impl<T: Number, S: Cluster<T> + core::fmt::Debug> core::fmt::Debug for PermutedBall<T, S> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
f.debug_struct("PermutedBall")
.field("source", &self.source)
Expand All @@ -42,21 +42,21 @@ impl<T: Number, S: Cluster<T> + core::fmt::Debug> core::fmt::Debug for OffBall<T
}
}

impl<T: Number, S: Cluster<T>> PartialEq for OffBall<T, S> {
impl<T: Number, S: Cluster<T>> PartialEq for PermutedBall<T, S> {
fn eq(&self, other: &Self) -> bool {
self.params.offset == other.params.offset && self.cardinality() == other.cardinality()
}
}

impl<T: Number, S: Cluster<T>> Eq for OffBall<T, S> {}
impl<T: Number, S: Cluster<T>> Eq for PermutedBall<T, S> {}

impl<T: Number, S: Cluster<T>> PartialOrd for OffBall<T, S> {
impl<T: Number, S: Cluster<T>> PartialOrd for PermutedBall<T, S> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}

impl<T: Number, S: Cluster<T>> Ord for OffBall<T, S> {
impl<T: Number, S: Cluster<T>> Ord for PermutedBall<T, S> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.params
.offset
Expand All @@ -65,21 +65,21 @@ impl<T: Number, S: Cluster<T>> Ord for OffBall<T, S> {
}
}

impl<T: Number, S: Cluster<T>> std::hash::Hash for OffBall<T, S> {
impl<T: Number, S: Cluster<T>> std::hash::Hash for PermutedBall<T, S> {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
(self.params.offset, self.cardinality()).hash(state);
}
}

impl<T: Number, S: Cluster<T>> OffBall<T, S> {
impl<T: Number, S: Cluster<T>> PermutedBall<T, S> {
/// Returns the offset of the `Cluster`.
#[must_use]
pub const fn offset(&self) -> usize {
self.params.offset
}
}

impl<T: Number, S: Cluster<T>> Cluster<T> for OffBall<T, S> {
impl<T: Number, S: Cluster<T>> Cluster<T> for PermutedBall<T, S> {
fn depth(&self) -> usize {
self.source.depth()
}
Expand Down Expand Up @@ -147,7 +147,7 @@ impl<T: Number, S: Cluster<T>> Cluster<T> for OffBall<T, S> {
}
}

impl<T: Number, S: ParCluster<T>> ParCluster<T> for OffBall<T, S> {
impl<T: Number, S: ParCluster<T>> ParCluster<T> for PermutedBall<T, S> {
fn par_indices(&self) -> impl ParallelIterator<Item = usize> {
(self.params.offset..(self.params.offset + self.cardinality())).into_par_iter()
}
Expand Down Expand Up @@ -183,7 +183,7 @@ impl<I: Send + Sync, T: Number, D: ParDataset<I>, S: ParCluster<T>> ParParams<I,
}
}

impl<I, T: Number, D: Dataset<I> + Permutable> BallAdapter<I, T, D, D, Offset> for OffBall<T, Ball<T>> {
impl<I, T: Number, D: Dataset<I> + Permutable> BallAdapter<I, T, D, D, Offset> for PermutedBall<T, Ball<T>> {
/// Creates a new `PermutedBall` tree from a `Ball` tree.
fn from_ball_tree<M: Metric<I, T>>(ball: Ball<T>, mut data: D, metric: &M) -> (Self, D) {
let mut root = Self::adapt_tree_iterative(ball, None, &data, metric);
Expand All @@ -194,7 +194,7 @@ impl<I, T: Number, D: Dataset<I> + Permutable> BallAdapter<I, T, D, D, Offset> f
}

impl<I: Send + Sync, T: Number, D: ParDataset<I> + Permutable> ParBallAdapter<I, T, D, D, Offset>
for OffBall<T, Ball<T>>
for PermutedBall<T, Ball<T>>
{
/// Creates a new `PermutedBall` tree from a `Ball` tree.
fn par_from_ball_tree<M: ParMetric<I, T>>(ball: Ball<T>, mut data: D, metric: &M) -> (Self, D) {
Expand All @@ -205,7 +205,7 @@ impl<I: Send + Sync, T: Number, D: ParDataset<I> + Permutable> ParBallAdapter<I,
}
}

impl<I, T: Number, D: Dataset<I> + Permutable, S: Cluster<T>> Adapter<I, T, D, D, S, Offset> for OffBall<T, S> {
impl<I, T: Number, D: Dataset<I> + Permutable, S: Cluster<T>> Adapter<I, T, D, D, S, Offset> for PermutedBall<T, S> {
fn new_adapted<M: Metric<I, T>>(
source: S,
children: Vec<(usize, T, Box<Self>)>,
Expand Down Expand Up @@ -258,7 +258,7 @@ fn new_index(i: usize, indices: &[usize], offset: usize) -> usize {
}

impl<I: Send + Sync, T: Number, D: ParDataset<I> + Permutable, S: ParCluster<T>> ParAdapter<I, T, D, D, S, Offset>
for OffBall<T, S>
for PermutedBall<T, S>
{
fn par_new_adapted<M: ParMetric<I, T>>(
source: S,
Expand All @@ -272,7 +272,7 @@ impl<I: Send + Sync, T: Number, D: ParDataset<I> + Permutable, S: ParCluster<T>>
}

#[cfg(feature = "csv")]
impl<T: Number, S: crate::cluster::Csv<T>> crate::cluster::Csv<T> for OffBall<T, S> {
impl<T: Number, S: crate::cluster::Csv<T>> crate::cluster::Csv<T> for PermutedBall<T, S> {
fn header(&self) -> Vec<String> {
let mut header = self.source.header();
header.push("offset".to_string());
Expand All @@ -291,13 +291,13 @@ impl<T: Number, S: crate::cluster::Csv<T>> crate::cluster::Csv<T> for OffBall<T,
}

#[cfg(feature = "csv")]
impl<T: Number, S: crate::cluster::ParCsv<T>> crate::cluster::ParCsv<T> for OffBall<T, S> {}
impl<T: Number, S: crate::cluster::ParCsv<T>> crate::cluster::ParCsv<T> for PermutedBall<T, S> {}

#[cfg(feature = "bitcode")]
impl<T: Number, S: crate::cluster::ClusterIO<T>> crate::cluster::ClusterIO<T> for OffBall<T, S> {}
impl<T: Number, S: crate::cluster::ClusterIO<T>> crate::cluster::ClusterIO<T> for PermutedBall<T, S> {}

#[cfg(feature = "bitcode")]
impl<T: Number, S: crate::cluster::ParClusterIO<T>> crate::cluster::ParClusterIO<T> for OffBall<T, S> {}
impl<T: Number, S: crate::cluster::ParClusterIO<T>> crate::cluster::ParClusterIO<T> for PermutedBall<T, S> {}

#[cfg(test)]
mod tests {
Expand All @@ -310,11 +310,11 @@ mod tests {
Ball, Cluster, Dataset, FlatVec, Metric,
};

use super::OffBall;
use super::PermutedBall;

type Fv = FlatVec<Vec<i32>, usize>;
type B = Ball<i32>;
type Ob = OffBall<i32, B>;
type Ob = PermutedBall<i32, B>;

fn gen_tiny_data() -> Fv {
let items = vec![vec![1, 2], vec![3, 4], vec![5, 6], vec![7, 8], vec![11, 12]];
Expand Down Expand Up @@ -342,10 +342,10 @@ mod tests {

let ball = Ball::new_tree(&data, &metric, &criteria, seed);

let (root, perm_data) = OffBall::from_ball_tree(ball.clone(), data.clone(), &metric);
let (root, perm_data) = PermutedBall::from_ball_tree(ball.clone(), data.clone(), &metric);
assert!(check_permutation(&root, &perm_data, &metric));

let (root, perm_data) = OffBall::par_from_ball_tree(ball, data, &metric);
let (root, perm_data) = PermutedBall::par_from_ball_tree(ball, data, &metric);
assert!(check_permutation(&root, &perm_data, &metric));

Ok(())
Expand Down
Loading

0 comments on commit 5d5c4ce

Please sign in to comment.