diff --git a/Cargo.lock b/Cargo.lock index 0479eff..ceb3204 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -427,7 +427,7 @@ dependencies = [ [[package]] name = "gathers" -version = "0.3.0" +version = "0.3.1" dependencies = [ "argh", "criterion", diff --git a/Cargo.toml b/Cargo.toml index f114952..c9ecd5c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gathers" -version = "0.3.0" +version = "0.3.1" edition = "2021" authors = ["Keming "] license = "Apache-2.0" @@ -8,7 +8,7 @@ readme = "README.md" repository = "https://github.com/kemingy/gathers" description = "Clustering algorithms." documentation = "https://docs.rs/gathers" -keywords = ["clustering"] +keywords = ["cluster", "kmeans", "rabitq", "machine-learning", "vector-search"] categories = ["algorithms", "science"] [dependencies] diff --git a/README.md b/README.md index 8c02017..56227ea 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,25 @@ [![crates.io](https://img.shields.io/crates/v/gathers.svg)](https://crates.io/crates/gathers) [![docs.rs](https://docs.rs/gathers/badge.svg)](https://docs.rs/gathers) -Clustering algorithm implementation: +Clustering algorithm implementation in Rust and binding to Python. + +For Python users, check the [Python README](./python/README.md). - [x] K-means - [x] PyO3 binding - [x] RaBitQ assignment - [x] Parallel with Rayon +- [x] `x86` & `x86_64` SIMD acceleration - [ ] mini batch K-means - [ ] Hierarchical K-means +- [ ] `arm` & `aarch64` SIMD acceleration + +## Installation + +```sh +cargo add gathers +``` + +## Usage + +Check the [docs](https://docs.rs/gathers) and [main.rs](./src/main.rs). 
diff --git a/python/README.md b/python/README.md index 3d9fe5c..f07f96a 100644 --- a/python/README.md +++ b/python/README.md @@ -17,7 +17,7 @@ import numpy as np gathers = Gathers(verbose=True) rng = np.random.default_rng() -data = rng.random((1000, 64), dtype=np.float32) +data = rng.random((1000, 64), dtype=np.float32) # only support float32 centroids = gathers.fit(data, 10) labels = gathers.batch_assign(data, centroids) print(labels) diff --git a/src/kmeans.rs b/src/kmeans.rs index 61e2d85..9ca954e 100644 --- a/src/kmeans.rs +++ b/src/kmeans.rs @@ -154,6 +154,7 @@ pub struct KMeans { tolerance: f32, distance: Distance, use_residual: bool, + use_default_config: bool, } impl Default for KMeans { @@ -164,12 +165,21 @@ impl Default for KMeans { tolerance: 1e-4, distance: Distance::default(), use_residual: false, + use_default_config: true, } } } impl KMeans { /// Create a new KMeans instance. + /// + /// # Arguments + /// + /// * `n_cluster` - number of clusters, recommend to be a number in [sqrt(n) * 4, sqrt(n) * 8] + /// * `max_iter` - max number of iterations + /// * `tolerance` - convergence tolerance, stop when the diff is less than this value + /// * `distance` - distance metric + /// * `use_residual` - use residual for more accurate L2 distance computations, only work for L2 pub fn new( n_cluster: u32, max_iter: u32, @@ -192,17 +202,25 @@ impl KMeans { tolerance, distance, use_residual, + use_default_config: false, } } /// Fit the KMeans configurations to the given vectors and return the centroids. 
pub fn fit(&self, mut vecs: Vec, dim: usize) -> Vec { let num = vecs.len() / dim; - debug!("num of points: {}", num); - if num < self.n_cluster as usize { + + // auto-config the `n_cluster` if it's initialized with `default()` + let n_cluster = match self.use_default_config { + true => (((num as f32).sqrt() as u32) * 4).min((num / MIN_POINTS_PER_CENTROID) as u32), + false => self.n_cluster, + }; + debug!("num of points: {}, num of clusters: {}", num, n_cluster); + + if num < n_cluster as usize { panic!("number of samples must be greater than n_cluster"); } - if num < self.n_cluster as usize * MIN_POINTS_PER_CENTROID { + if num < n_cluster as usize * MIN_POINTS_PER_CENTROID { panic!("too few samples for n_cluster"); } @@ -213,13 +231,13 @@ impl KMeans { } // subsample - if num > MAX_POINTS_PER_CENTROID * self.n_cluster as usize { - let n_sample = MAX_POINTS_PER_CENTROID * self.n_cluster as usize; + if num > MAX_POINTS_PER_CENTROID * n_cluster as usize { + let n_sample = MAX_POINTS_PER_CENTROID * n_cluster as usize; debug!("subsample to {} points", n_sample); vecs = as_continuous_vec(&subsample(n_sample, &vecs, dim)); } - let mut centroids = as_continuous_vec(&subsample(self.n_cluster as usize, &vecs, dim)); + let mut centroids = as_continuous_vec(&subsample(n_cluster as usize, &vecs, dim)); if self.distance == Distance::NegativeDotProduct { centroids.chunks_mut(dim).for_each(normalize); } diff --git a/src/lib.rs b/src/lib.rs index 54a8bf9..2ae9da0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,25 @@ //! Clustering algorithms for Rust. +//! +//! ## Examples +//! +//! ``` +//! use gathers::kmeans::{KMeans, rabitq_assign}; +//! use gathers::utils::as_continuous_vec; +//! # use rand::Rng; +//! # let mut rng = rand::thread_rng(); +//! # let vecs = (0..1000).map(|_| (0..32).map(|_| rng.gen::<f32>()).collect::<Vec<f32>>()).collect::<Vec<Vec<f32>>>(); +//! +//! +//! let kmeans = KMeans::default(); +//! let num = vecs.len(); +//! let dim = vecs[0].len(); +//! +//! // fit +//! 
let centroids = kmeans.fit(as_continuous_vec(&vecs), dim); +//! // predict +//! let mut labels = vec![0; num]; +//! rabitq_assign(&as_continuous_vec(&vecs), &centroids, dim, &mut labels); +//! ``` #![deny(missing_docs)] diff --git a/src/simd.rs b/src/simd.rs index d8b1938..5db48eb 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -3,7 +3,8 @@ use crate::rabitq::THETA_LOG_DIM; /// Compute the squared Euclidean distance between two vectors. -/// Code refer to https://github.com/nmslib/hnswlib/blob/master/hnswlib/space_l2.h +/// +/// Code refer to <https://github.com/nmslib/hnswlib/blob/master/hnswlib/space_l2.h> /// /// # Safety /// @@ -425,7 +426,7 @@ pub unsafe fn vector_binarize_query(vec: &[u8], binary: &mut [u64]) { /// Compute the binary dot product of two vectors. /// -/// Refer to: https://github.com/komrad36/popcount +/// Refer to: <https://github.com/komrad36/popcount> /// /// # Safety /// diff --git a/src/utils.rs b/src/utils.rs index 1076f52..0be6e47 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -28,7 +28,7 @@ where } } -/// Convert a 2-D Vec<Vec<T>> to a 1-D continuous vector. +/// Convert a 2-D `Vec<Vec<T>>` to a 1-D continuous vector. #[inline] pub fn as_continuous_vec(mat: &[Vec]) -> Vec where @@ -37,7 +37,7 @@ where mat.iter().flat_map(|v| v.iter().cloned()).collect() } -/// Convert a 1-D continuous vector to a 2-D Vec<Vec<T>>. +/// Convert a 1-D continuous vector to a 2-D `Vec<Vec<T>>`. #[inline] pub fn as_matrix(vecs: &[T], dim: usize) -> Vec> where