Skip to content

Commit

Permalink
chore: consolidated features for disk-io
Browse files: browse the repository at this point in the history
  • Loading branch information
nishaq503 committed Nov 30, 2024
1 parent 5130765 commit d788f98
Show file tree
Hide file tree
Showing 18 changed files with 86 additions and 77 deletions.
4 changes: 2 additions & 2 deletions Earthfile
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ build:
# This target formats the project.
fmt:
FROM +chef-cook
RUN cargo fmt --all --all-features -- --check && rye fmt --all --check
RUN cargo fmt --all -- --check && rye fmt --all --check

# This target lints the project.
lint:
Expand All @@ -77,7 +77,7 @@ fix:
# This target runs the tests.
test:
FROM +chef-cook
RUN cargo test --release --lib --bins --examples --tests --all-features
RUN cargo test -r
# TODO: switch to --all, blocked on https://github.com/astral-sh/rye/issues/853
RUN rye test --package abd-distances

Expand Down
15 changes: 6 additions & 9 deletions crates/abd-clam/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ authors = [
"Oliver McLaughlin <[email protected]>",
]
edition = "2021"
rust-version = "1.79"
rust-version = "1.83"
description = "Clustering, Learning and Approximation with Manifolds"
license = "MIT"
readme = "./README.md"
Expand Down Expand Up @@ -43,8 +43,7 @@ ndarray-npy = { workspace = true, optional = true }

# For:
# - CHAODA
smartcore = { git = "https://github.com/smartcorelib/smartcore.git", rev = "239c00428f7448d30b78bf8653923f6bc0e2c29b", features = ["serde"], optional = true }
ordered-float = { version = "4.2.2", optional = true }
smartcore = { version = "0.4", features = ["serde"], optional = true }

# For:
# - MSA
Expand All @@ -58,20 +57,18 @@ flate2 = { workspace = true, optional = true }
[dev-dependencies]
symagen = { workspace = true }
bitcode = { workspace = true }
criterion = { version = "0.5.1", features = ["html_reports"] }
criterion = { version = "0.5", features = ["html_reports"] }
tempdir = "0.3.7"
float-cmp = "0.10.0"
test-case = "3.2.1"
statistical = "1.0.0"

[features]
csv = ["dep:csv"]
bitcode = ["dep:bitcode", "dep:flate2"]
ndarray-bindings = ["dep:ndarray", "dep:ndarray-npy"]
chaoda = ["dep:smartcore", "dep:ordered-float", "bitcode"]
disk-io = ["dep:csv", "dep:bitcode", "dep:flate2", "dep:ndarray", "dep:ndarray-npy"]
chaoda = ["dep:smartcore"]
mbed = ["chaoda"]
msa = ["dep:stringzilla"]
all = ["csv", "bitcode", "ndarray-bindings", "chaoda", "mbed", "msa"]
all = ["disk-io", "chaoda", "mbed", "msa"]

[[bench]]
name = "vector_search"
Expand Down
14 changes: 7 additions & 7 deletions crates/abd-clam/src/cakes/cluster/permuted_ball.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ use crate::{
/// - `T`: The type of the distance values.
/// - `S`: The `Cluster` type that the `PermutedBall` is based on.
#[derive(Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "bitcode", derive(bitcode::Encode, bitcode::Decode))]
#[cfg_attr(feature = "bitcode", bitcode(recursive))]
#[cfg_attr(feature = "disk-io", derive(bitcode::Encode, bitcode::Decode))]
#[cfg_attr(feature = "disk-io", bitcode(recursive))]
pub struct PermutedBall<T: Number, S: Cluster<T>> {
/// The `Cluster` type that the `PermutedBall` is based on.
source: S,
Expand Down Expand Up @@ -155,7 +155,7 @@ impl<T: Number, S: ParCluster<T>> ParCluster<T> for PermutedBall<T, S> {

/// Parameters for adapting the `PermutedBall`.
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "bitcode", derive(bitcode::Encode, bitcode::Decode))]
#[cfg_attr(feature = "disk-io", derive(bitcode::Encode, bitcode::Decode))]
pub struct Offset {
/// The offset of the slice of indices of the `Cluster` in the reordered
/// dataset.
Expand Down Expand Up @@ -271,7 +271,7 @@ impl<I: Send + Sync, T: Number, D: ParDataset<I> + Permutable, S: ParCluster<T>>
}
}

#[cfg(feature = "csv")]
#[cfg(feature = "disk-io")]
impl<T: Number, S: crate::cluster::Csv<T>> crate::cluster::Csv<T> for PermutedBall<T, S> {
fn header(&self) -> Vec<String> {
let mut header = self.source.header();
Expand All @@ -290,13 +290,13 @@ impl<T: Number, S: crate::cluster::Csv<T>> crate::cluster::Csv<T> for PermutedBa
}
}

#[cfg(feature = "csv")]
#[cfg(feature = "disk-io")]
/// Empty impl: `PermutedBall` takes every `ParCsv` method from the trait's
/// provided defaults, as long as its source cluster `S` is itself `ParCsv`.
impl<T, S> crate::cluster::ParCsv<T> for PermutedBall<T, S>
where
    T: Number,
    S: crate::cluster::ParCsv<T>,
{
}

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
/// Empty impl: `PermutedBall` takes every `ClusterIO` method from the trait's
/// provided defaults, as long as its source cluster `S` is itself `ClusterIO`.
impl<T, S> crate::cluster::ClusterIO<T> for PermutedBall<T, S>
where
    T: Number,
    S: crate::cluster::ClusterIO<T>,
{
}

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
/// Empty impl: `PermutedBall` takes every `ParClusterIO` method from the
/// trait's provided defaults, as long as its source cluster `S` is itself
/// `ParClusterIO`.
impl<T, S> crate::cluster::ParClusterIO<T> for PermutedBall<T, S>
where
    T: Number,
    S: crate::cluster::ParClusterIO<T>,
{
}

#[cfg(test)]
Expand Down
13 changes: 6 additions & 7 deletions crates/abd-clam/src/chaoda/graph/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,12 @@
use core::cmp::Reverse;

use std::collections::{BinaryHeap, HashMap};
use std::collections::HashMap;

use distances::Number;
use ordered_float::OrderedFloat;
use rayon::prelude::*;

use crate::{cluster::ParCluster, dataset::ParDataset, metric::ParMetric, Cluster, Dataset, Metric};
use crate::{cluster::ParCluster, dataset::ParDataset, metric::ParMetric, Cluster, Dataset, Metric, SizedHeap};

use super::Vertex;

Expand Down Expand Up @@ -90,10 +89,10 @@ impl<'a, T: Number, S: Cluster<T>> Graph<'a, T, S> {
// `Vertex`es are selected by highest score and then by shallowest depth.
let mut candidates = clusters
.into_iter()
.zip(scores.into_iter().map(OrderedFloat))
.zip(scores)
.filter(|(c, _)| c.is_leaf() || c.depth() >= min_depth)
.map(|(c, s)| (s, Reverse(c)))
.collect::<BinaryHeap<_>>();
.collect::<SizedHeap<_>>();

let mut clusters = vec![];
while let Some((_, Reverse(v))) = candidates.pop() {
Expand Down Expand Up @@ -239,10 +238,10 @@ impl<'a, T: Number, S: ParCluster<T>> Graph<'a, T, S> {
// `Vertex`es are selected by highest score and then by shallowest depth.
let mut candidates = clusters
.into_iter()
.zip(scores.into_iter().map(OrderedFloat))
.zip(scores)
.filter(|(c, _)| c.is_leaf() || c.depth() >= min_depth)
.map(|(c, s)| (s, Reverse(c)))
.collect::<BinaryHeap<_>>();
.collect::<SizedHeap<_>>();

let mut clusters = vec![];
while let Some((_, Reverse(v))) = candidates.pop() {
Expand Down
12 changes: 6 additions & 6 deletions crates/abd-clam/src/core/cluster/ball.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ use super::{partition::ParPartition, Cluster, ParCluster, Partition, LFD};
/// A metric-`Ball` is a collection of items that are within a certain distance
/// of a center.
#[derive(Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "bitcode", derive(bitcode::Encode, bitcode::Decode))]
#[cfg_attr(feature = "bitcode", bitcode(recursive))]
#[cfg_attr(feature = "disk-io", derive(bitcode::Encode, bitcode::Decode))]
#[cfg_attr(feature = "disk-io", bitcode(recursive))]
pub struct Ball<T: Number> {
/// Parameters used for creating the `Ball`.
depth: usize,
Expand Down Expand Up @@ -268,7 +268,7 @@ impl<T: Number> ParPartition<T> for Ball<T> {
}
}

#[cfg(feature = "csv")]
#[cfg(feature = "disk-io")]
impl<T: Number> super::Csv<T> for Ball<T> {
fn header(&self) -> Vec<String> {
vec![
Expand All @@ -295,13 +295,13 @@ impl<T: Number> super::Csv<T> for Ball<T> {
}
}

#[cfg(feature = "csv")]
#[cfg(feature = "disk-io")]
/// Empty impl: `Ball` takes every `ParCsv` method from the trait's provided
/// defaults.
impl<T> super::ParCsv<T> for Ball<T>
where
    T: Number,
{
}

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
/// Empty impl: `Ball` takes every `ClusterIO` method from the trait's provided
/// defaults.
impl<T> super::ClusterIO<T> for Ball<T>
where
    T: Number,
{
}

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
/// Empty impl: `Ball` takes every `ParClusterIO` method from the trait's
/// provided defaults.
impl<T> super::ParClusterIO<T> for Ball<T>
where
    T: Number,
{
}

#[cfg(test)]
Expand Down
8 changes: 4 additions & 4 deletions crates/abd-clam/src/core/cluster/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use rayon::prelude::*;

use super::{Cluster, ParCluster};

#[cfg(feature = "csv")]
#[cfg(feature = "disk-io")]
/// Write a tree to a CSV file.
pub trait Csv<T: Number>: Cluster<T> {
/// Returns the names of the columns in the CSV file.
Expand Down Expand Up @@ -45,7 +45,7 @@ pub trait Csv<T: Number>: Cluster<T> {
}
}

#[cfg(feature = "csv")]
#[cfg(feature = "disk-io")]
/// Parallel version of `Csv`.
pub trait ParCsv<T: Number>: Csv<T> + ParCluster<T> {
/// Parallel version of `Csv::write_to_csv`.
Expand Down Expand Up @@ -84,7 +84,7 @@ pub trait ParCsv<T: Number>: Csv<T> + ParCluster<T> {
}
}

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
/// Reading and writing `Cluster` trees to disk using `bitcode`.
pub trait ClusterIO<T: Number>: Cluster<T> {
/// Writes the `Cluster` to disk in binary format using `bitcode`.
Expand Down Expand Up @@ -116,7 +116,7 @@ pub trait ClusterIO<T: Number>: Cluster<T> {
}
}

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
/// Parallel version of `ClusterIO`.
pub trait ParClusterIO<T: Number>: ParCluster<T> + ClusterIO<T> {
/// Parallel version of `ClusterIO::write_to`.
Expand Down
9 changes: 4 additions & 5 deletions crates/abd-clam/src/core/cluster/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ use super::{
pub mod adapter;
mod balanced_ball;
mod ball;
mod io;
mod lfd;
mod partition;

Expand All @@ -20,12 +19,12 @@ pub use ball::Ball;
pub use lfd::LFD;
pub use partition::{ParPartition, Partition};

#[cfg(feature = "csv")]
pub use io::{Csv, ParCsv};
#[cfg(feature = "disk-io")]
mod io;

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
#[allow(clippy::module_name_repetitions)]
pub use io::{ClusterIO, ParClusterIO};
pub use io::{ClusterIO, Csv, ParClusterIO, ParCsv};

/// A `Cluster` is a collection of "similar" items in a dataset.
///
Expand Down
18 changes: 9 additions & 9 deletions crates/abd-clam/src/core/dataset/flat_vec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ use super::{AssociatesMetadata, AssociatesMetadataMut, Dataset, ParDataset, Perm
/// - `I`: The type of the items in the dataset.
/// - `Me`: The type of the metadata associated with the items.
#[derive(Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "bitcode", derive(bitcode::Encode, bitcode::Decode))]
#[cfg_attr(feature = "bitcode", bitcode(recursive))]
#[cfg_attr(feature = "disk-io", derive(bitcode::Encode, bitcode::Decode))]
#[cfg_attr(feature = "disk-io", bitcode(recursive))]
pub struct FlatVec<I, Me> {
/// The items in the dataset.
items: Vec<I>,
Expand Down Expand Up @@ -223,13 +223,13 @@ impl<I, Me> Permutable for FlatVec<I, Me> {
}
}

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
/// Empty impl: `FlatVec` takes every `DatasetIO` method from the trait's
/// provided defaults.
impl<I, Me> super::DatasetIO<I> for FlatVec<I, Me> {}

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
/// Empty impl: `FlatVec` takes every `ParDatasetIO` method from the trait's
/// provided defaults; both the item and metadata types must be `Send + Sync`.
impl<I, Me> super::ParDatasetIO<I> for FlatVec<I, Me>
where
    I: Send + Sync,
    Me: Send + Sync,
{
}

#[cfg(feature = "ndarray-bindings")]
#[cfg(feature = "disk-io")]
impl<T: ndarray_npy::ReadableElement + Copy> FlatVec<Vec<T>, usize> {
/// Reads a `FlatVec` from a `.npy` file.
///
Expand All @@ -250,7 +250,7 @@ impl<T: ndarray_npy::ReadableElement + Copy> FlatVec<Vec<T>, usize> {
}
}

#[cfg(feature = "ndarray-bindings")]
#[cfg(feature = "disk-io")]
impl<T: ndarray_npy::WritableElement + Copy> FlatVec<Vec<T>, usize> {
/// Writes the `FlatVec` to a `.npy` file in the given directory.
///
Expand Down Expand Up @@ -282,7 +282,7 @@ impl<T: ndarray_npy::WritableElement + Copy> FlatVec<Vec<T>, usize> {
}
}

#[cfg(feature = "csv")]
#[cfg(feature = "disk-io")]
impl<T: std::str::FromStr + Copy> FlatVec<Vec<T>, usize> {
/// Reads a `FlatVec` from a `.csv` file.
///
Expand Down Expand Up @@ -321,7 +321,7 @@ impl<T: std::str::FromStr + Copy> FlatVec<Vec<T>, usize> {
}
}

#[cfg(feature = "csv")]
#[cfg(feature = "disk-io")]
impl<T: std::string::ToString + Copy, M> FlatVec<Vec<T>, M> {
/// Writes the `FlatVec` to a `.csv` file with the given path.
///
Expand Down Expand Up @@ -376,7 +376,7 @@ mod tests {
Ok(())
}

#[cfg(feature = "ndarray-bindings")]
#[cfg(feature = "disk-io")]
#[test]
fn npy_io() -> Result<(), String> {
let items = vec![vec![1, 2], vec![3, 4], vec![5, 6]];
Expand Down
6 changes: 2 additions & 4 deletions crates/abd-clam/src/core/dataset/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
use super::{Dataset, ParDataset};

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
/// For writing and reading datasets to and from disk.
pub trait DatasetIO<I>: Dataset<I> {
/// Writes the `Dataset` to disk in binary format using `bitcode`.
Expand Down Expand Up @@ -34,7 +34,7 @@ pub trait DatasetIO<I>: Dataset<I> {
}
}

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
/// Parallel version of `DatasetIO`.
pub trait ParDatasetIO<I: Send + Sync>: DatasetIO<I> + ParDataset<I> {
/// Parallel version of `DatasetIO::write_to`.
Expand All @@ -45,7 +45,6 @@ pub trait ParDatasetIO<I: Send + Sync>: DatasetIO<I> + ParDataset<I> {
///
/// - If the dataset cannot be encoded.
/// - If the file cannot be written.
#[cfg(feature = "bitcode")]
fn par_write_to<P: AsRef<std::path::Path>>(&self, path: &P) -> Result<(), String>
where
Self: bitcode::Encode,
Expand All @@ -61,7 +60,6 @@ pub trait ParDatasetIO<I: Send + Sync>: DatasetIO<I> + ParDataset<I> {
///
/// - If the file cannot be read.
/// - If the dataset cannot be decoded.
#[cfg(feature = "bitcode")]
fn par_read_from<P: AsRef<std::path::Path>>(path: &P) -> Result<Self, String>
where
Self: bitcode::Decode,
Expand Down
6 changes: 4 additions & 2 deletions crates/abd-clam/src/core/dataset/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ use super::{metric::ParMetric, Metric};

mod associates_metadata;
mod flat_vec;
mod io;
mod permutable;
mod sized_heap;

Expand All @@ -17,7 +16,10 @@ pub use flat_vec::FlatVec;
pub use permutable::Permutable;
pub use sized_heap::SizedHeap;

#[cfg(feature = "bitcode")]
#[cfg(feature = "disk-io")]
mod io;

#[cfg(feature = "disk-io")]
#[allow(clippy::module_name_repetitions)]
pub use io::{DatasetIO, ParDatasetIO};

Expand Down
Loading

0 comments on commit d788f98

Please sign in to comment.