diff --git a/Cargo.lock b/Cargo.lock
index 0479eff..ceb3204 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -427,7 +427,7 @@ dependencies = [
 
 [[package]]
 name = "gathers"
-version = "0.3.0"
+version = "0.3.1"
 dependencies = [
  "argh",
  "criterion",
diff --git a/Cargo.toml b/Cargo.toml
index f114952..c9ecd5c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gathers"
-version = "0.3.0"
+version = "0.3.1"
 edition = "2021"
 authors = ["Keming <kemingy94@gmail.com>"]
 license = "Apache-2.0"
@@ -8,7 +8,7 @@ readme = "README.md"
 repository = "https://github.com/kemingy/gathers"
 description = "Clustering algorithms."
 documentation = "https://docs.rs/gathers"
-keywords = ["clustering"]
+keywords = ["cluster", "kmeans", "rabitq", "machine-learning", "vector-search"]
 categories = ["algorithms", "science"]
 
 [dependencies]
diff --git a/README.md b/README.md
index 8c02017..56227ea 100644
--- a/README.md
+++ b/README.md
@@ -4,11 +4,25 @@
 [![crates.io](https://img.shields.io/crates/v/gathers.svg)](https://crates.io/crates/gathers)
 [![docs.rs](https://docs.rs/gathers/badge.svg)](https://docs.rs/gathers)
 
-Clustering algorithm implementation:
+Clustering algorithms implemented in Rust, with Python bindings.
+
+For Python users, check the [Python README](./python/README.md).
 
 - [x] K-means
 - [x] PyO3 binding
 - [x] RaBitQ assignment
 - [x] Parallel with Rayon
+- [x] `x86` & `x86_64` SIMD acceleration
 - [ ] mini batch K-means
 - [ ] Hierarchical K-means
+- [ ] `arm` & `aarch64` SIMD acceleration
+
+## Installation
+
+```sh
+cargo add gathers
+```
+
+## Usage
+
+Check the [docs](https://docs.rs/gathers) and [main.rs](./src/main.rs).
diff --git a/python/README.md b/python/README.md
index 3d9fe5c..f07f96a 100644
--- a/python/README.md
+++ b/python/README.md
@@ -17,7 +17,7 @@ import numpy as np
 
 gathers = Gathers(verbose=True)
 rng = np.random.default_rng()
-data = rng.random((1000, 64), dtype=np.float32)
+data = rng.random((1000, 64), dtype=np.float32)  # only float32 is supported
 centroids = gathers.fit(data, 10)
 labels = gathers.batch_assign(data, centroids)
 print(labels)
diff --git a/src/kmeans.rs b/src/kmeans.rs
index 61e2d85..9ca954e 100644
--- a/src/kmeans.rs
+++ b/src/kmeans.rs
@@ -154,6 +154,7 @@ pub struct KMeans {
     tolerance: f32,
     distance: Distance,
     use_residual: bool,
+    use_default_config: bool,
 }
 
 impl Default for KMeans {
@@ -164,12 +165,21 @@ impl Default for KMeans {
             tolerance: 1e-4,
             distance: Distance::default(),
             use_residual: false,
+            use_default_config: true,
         }
     }
 }
 
 impl KMeans {
     /// Create a new KMeans instance.
+    ///
+    /// # Arguments
+    ///
+    /// * `n_cluster` - number of clusters, recommended to be in the range `[sqrt(n) * 4, sqrt(n) * 8]`, where `n` is the number of vectors
+    /// * `max_iter` - max number of iterations
+    /// * `tolerance` - convergence tolerance, stop when the diff is less than this value
+    /// * `distance` - distance metric
+    /// * `use_residual` - use residuals for more accurate L2 distance computations; only works with L2 distance
     pub fn new(
         n_cluster: u32,
         max_iter: u32,
@@ -192,17 +202,25 @@ impl KMeans {
             tolerance,
             distance,
             use_residual,
+            use_default_config: false,
         }
     }
 
     /// Fit the KMeans configurations to the given vectors and return the centroids.
     pub fn fit(&self, mut vecs: Vec<f32>, dim: usize) -> Vec<f32> {
         let num = vecs.len() / dim;
-        debug!("num of points: {}", num);
-        if num < self.n_cluster as usize {
+
+        // auto-config the `n_cluster` if it's initialized with `default()`
+        let n_cluster = match self.use_default_config {
+            true => (((num as f32).sqrt() as u32) * 4).min((num / MIN_POINTS_PER_CENTROID) as u32),
+            false => self.n_cluster,
+        };
+        debug!("num of points: {}, num of clusters: {}", num, n_cluster);
+
+        if num < n_cluster as usize {
             panic!("number of samples must be greater than n_cluster");
         }
-        if num < self.n_cluster as usize * MIN_POINTS_PER_CENTROID {
+        if num < n_cluster as usize * MIN_POINTS_PER_CENTROID {
             panic!("too few samples for n_cluster");
         }
 
@@ -213,13 +231,13 @@ impl KMeans {
         }
 
         // subsample
-        if num > MAX_POINTS_PER_CENTROID * self.n_cluster as usize {
-            let n_sample = MAX_POINTS_PER_CENTROID * self.n_cluster as usize;
+        if num > MAX_POINTS_PER_CENTROID * n_cluster as usize {
+            let n_sample = MAX_POINTS_PER_CENTROID * n_cluster as usize;
             debug!("subsample to {} points", n_sample);
             vecs = as_continuous_vec(&subsample(n_sample, &vecs, dim));
         }
 
-        let mut centroids = as_continuous_vec(&subsample(self.n_cluster as usize, &vecs, dim));
+        let mut centroids = as_continuous_vec(&subsample(n_cluster as usize, &vecs, dim));
         if self.distance == Distance::NegativeDotProduct {
             centroids.chunks_mut(dim).for_each(normalize);
         }
diff --git a/src/lib.rs b/src/lib.rs
index 54a8bf9..2ae9da0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,25 @@
 //! Clustering algorithms for Rust.
+//!
+//! ## Examples
+//!
+//! ```
+//! use gathers::kmeans::{KMeans, rabitq_assign};
+//! use gathers::utils::as_continuous_vec;
+//! # use rand::Rng;
+//! # let mut rng = rand::thread_rng();
+//! # let vecs = (0..1000).map(|_| (0..32).map(|_| rng.gen::<f32>()).collect::<Vec<f32>>()).collect::<Vec<Vec<f32>>>();
+//!
+//!
+//! let kmeans = KMeans::default();
+//! let num = vecs.len();
+//! let dim = vecs[0].len();
+//!
+//! // fit
+//! let centroids = kmeans.fit(as_continuous_vec(&vecs), dim);
+//! // predict
+//! let mut labels = vec![0; num];
+//! rabitq_assign(&as_continuous_vec(&vecs), &centroids, dim, &mut labels);
+//! ```
 
 #![deny(missing_docs)]
 
diff --git a/src/simd.rs b/src/simd.rs
index d8b1938..5db48eb 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -3,7 +3,8 @@
 use crate::rabitq::THETA_LOG_DIM;
 
 /// Compute the squared Euclidean distance between two vectors.
-/// Code refer to https://github.com/nmslib/hnswlib/blob/master/hnswlib/space_l2.h
+///
+/// Refer to: <https://github.com/nmslib/hnswlib/blob/master/hnswlib/space_l2.h>
 ///
 /// # Safety
 ///
@@ -425,7 +426,7 @@ pub unsafe fn vector_binarize_query(vec: &[u8], binary: &mut [u64]) {
 
 /// Compute the binary dot product of two vectors.
 ///
-/// Refer to: https://github.com/komrad36/popcount
+/// Refer to: <https://github.com/komrad36/popcount>
 ///
 /// # Safety
 ///
diff --git a/src/utils.rs b/src/utils.rs
index 1076f52..0be6e47 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -28,7 +28,7 @@ where
     }
 }
 
-/// Convert a 2-D Vec<Vec<T>> to a 1-D continuous vector.
+/// Convert a 2-D `Vec<Vec<T>>` to a 1-D continuous vector.
 #[inline]
 pub fn as_continuous_vec<T>(mat: &[Vec<T>]) -> Vec<T>
 where
@@ -37,7 +37,7 @@ where
     mat.iter().flat_map(|v| v.iter().cloned()).collect()
 }
 
-/// Convert a 1-D continuous vector to a 2-D Vec<Vec<T>>.
+/// Convert a 1-D continuous vector to a 2-D `Vec<Vec<T>>`.
 #[inline]
 pub fn as_matrix<T>(vecs: &[T], dim: usize) -> Vec<Vec<T>>
 where