diff --git a/Cargo.lock b/Cargo.lock index dab3206d7..8f4aef017 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1472,6 +1472,7 @@ dependencies = [ "dashmap", "flat", "hnsw", + "inverted", "ivf", "k_means", "log", @@ -1513,6 +1514,16 @@ dependencies = [ "ulock-sys", ] +[[package]] +name = "inverted" +version = "0.0.0" +dependencies = [ + "base", + "common", + "quantization", + "storage", +] + [[package]] name = "io-lifetimes" version = "1.0.11" diff --git a/crates/base/src/index.rs b/crates/base/src/index.rs index e0be3fba4..9965a2df7 100644 --- a/crates/base/src/index.rs +++ b/crates/base/src/index.rs @@ -105,10 +105,11 @@ pub struct IndexOptions { impl IndexOptions { fn validate_self(&self) -> Result<(), ValidationError> { - match (self.vector.v, &self.indexing) { - (VectorKind::Vecf32, _) => Ok(()), - (VectorKind::Vecf16, _) => Ok(()), + match (self.vector.v, self.vector.d, &self.indexing) { + (VectorKind::Vecf32, _, _) => Ok(()), + (VectorKind::Vecf16, _, _) => Ok(()), ( + _, _, IndexingOptions::Flat(FlatIndexingOptions { quantization: QuantizationOptions::Trivial(_), @@ -123,6 +124,7 @@ impl IndexOptions { .. }), ) => Ok(()), + (VectorKind::SVecf32, DistanceKind::Dot, IndexingOptions::InvertedIndex(_)) => Ok(()), _ => Err(ValidationError::new("not valid index options")), } } @@ -259,6 +261,7 @@ pub enum IndexingOptions { Flat(FlatIndexingOptions), Ivf(IvfIndexingOptions), Hnsw(HnswIndexingOptions), + InvertedIndex(InvertedIndexingOptions), } impl IndexingOptions { @@ -294,10 +297,21 @@ impl Validate for IndexingOptions { Self::Flat(x) => x.validate(), Self::Ivf(x) => x.validate(), Self::Hnsw(x) => x.validate(), + Self::InvertedIndex(_) => Ok(()), } } } +#[derive(Debug, Clone, Serialize, Deserialize, Validate)] +#[serde(deny_unknown_fields)] +pub struct InvertedIndexingOptions {} + +impl Default for InvertedIndexingOptions { + fn default() -> Self { + Self {} + } +} + #[derive(Debug, Clone, Serialize, Deserialize, Validate)] #[serde(deny_unknown_fields)] pub struct FlatIndexingOptions { diff --git a/crates/base/src/vector/svecf32.rs b/crates/base/src/vector/svecf32.rs index be71b5f3f..6a2934fa8 100644 --- a/crates/base/src/vector/svecf32.rs +++ b/crates/base/src/vector/svecf32.rs @@ -138,12 +138,12 @@ impl<'a> SVecf32Borrowed<'a> { } #[inline(always)] - pub fn indexes(&self) -> &[u32] { + pub fn indexes(&self) -> &'a [u32] { self.indexes } #[inline(always)] - pub fn values(&self) -> &[F32] { + pub fn values(&self) -> &'a [F32] { self.values } diff --git a/crates/index/Cargo.toml b/crates/index/Cargo.toml index 752e49b66..6f3cc5231 100644 --- a/crates/index/Cargo.toml +++ b/crates/index/Cargo.toml @@ -28,6 +28,7 @@ storage = { path = "../storage" } # algorithms flat = { path = "../flat" } hnsw = { path = "../hnsw" } +inverted = { path = "../inverted" } ivf = { path = "../ivf" } [lints] diff --git a/crates/index/src/indexing/sealed.rs b/crates/index/src/indexing/sealed.rs index 5507b296d..27aee6dff 100644 --- a/crates/index/src/indexing/sealed.rs +++ b/crates/index/src/indexing/sealed.rs @@ -4,6 +4,7 @@ use base::operator::*; use base::search::*; use flat::Flat; use hnsw::Hnsw; +use inverted::InvertedIndex; use ivf::Ivf; use std::path::Path; @@ -11,6 +12,7 @@ pub enum SealedIndexing { Flat(Flat), Ivf(Ivf), Hnsw(Hnsw), + InvertedIndex(InvertedIndex), } impl SealedIndexing { @@ -23,6 +25,9 @@ impl SealedIndexing { IndexingOptions::Flat(_) => Self::Flat(Flat::create(path, options, source)), IndexingOptions::Ivf(_) => Self::Ivf(Ivf::create(path, options, source)), IndexingOptions::Hnsw(_) => Self::Hnsw(Hnsw::create(path, options, source)), + IndexingOptions::InvertedIndex(_) => { + Self::InvertedIndex(InvertedIndex::create(path, options, source)) + } } } @@ -31,6 +36,7 @@ impl SealedIndexing { IndexingOptions::Flat(_) => Self::Flat(Flat::open(path)), IndexingOptions::Ivf(_) => Self::Ivf(Ivf::open(path)), IndexingOptions::Hnsw(_) => Self::Hnsw(Hnsw::open(path)), + IndexingOptions::InvertedIndex(_) => Self::InvertedIndex(InvertedIndex::open(path)), } } @@ -43,6 +49,7 @@ impl SealedIndexing { SealedIndexing::Flat(x) => x.vbase(vector, opts), SealedIndexing::Ivf(x) => x.vbase(vector, opts), SealedIndexing::Hnsw(x) => x.vbase(vector, opts), + SealedIndexing::InvertedIndex(x) => x.vbase(vector, opts), } } @@ -51,6 +58,7 @@ impl SealedIndexing { SealedIndexing::Flat(x) => x.len(), SealedIndexing::Ivf(x) => x.len(), SealedIndexing::Hnsw(x) => x.len(), + SealedIndexing::InvertedIndex(x) => x.len(), } } @@ -59,6 +67,7 @@ impl SealedIndexing { SealedIndexing::Flat(x) => x.vector(i), SealedIndexing::Ivf(x) => x.vector(i), SealedIndexing::Hnsw(x) => x.vector(i), + SealedIndexing::InvertedIndex(x) => x.vector(i), } } @@ -67,6 +76,7 @@ impl SealedIndexing { SealedIndexing::Flat(x) => x.payload(i), SealedIndexing::Ivf(x) => x.payload(i), SealedIndexing::Hnsw(x) => x.payload(i), + SealedIndexing::InvertedIndex(x) => x.payload(i), } } } diff --git a/crates/index/src/lib.rs b/crates/index/src/lib.rs index 78016f835..c60202eec 100644 --- a/crates/index/src/lib.rs +++ b/crates/index/src/lib.rs @@ -25,6 +25,7 @@ use common::dir_ops::sync_walk_from_dir; use common::file_atomic::FileAtomic; use crossbeam::atomic::AtomicCell; use crossbeam::channel::Sender; +use inverted::operator::OperatorInvertedIndex; use ivf::operator::OperatorIvf; use parking_lot::Mutex; use quantization::operator::OperatorQuantization; @@ -41,9 +42,16 @@ use storage::OperatorStorage; use thiserror::Error; use validator::Validate; -pub trait Op: Operator + OperatorQuantization + OperatorStorage + OperatorIvf {} +pub trait Op: + Operator + OperatorQuantization + OperatorStorage + OperatorIvf + OperatorInvertedIndex +{ +} -impl Op for T {} +impl< + T: Operator + OperatorQuantization + OperatorStorage + OperatorIvf + OperatorInvertedIndex, + > Op for T +{ +} #[derive(Debug, Error)] #[error("The index view is outdated.")] diff --git a/crates/index/src/segment/sealed.rs b/crates/index/src/segment/sealed.rs index 70e1083d9..a77344e85 100644 --- a/crates/index/src/segment/sealed.rs +++ b/crates/index/src/segment/sealed.rs @@ -121,6 +121,7 @@ impl SealedSegment { SealedIndexing::Flat(x) => x, SealedIndexing::Ivf(x) => x, SealedIndexing::Hnsw(x) => x, + SealedIndexing::InvertedIndex(x) => x, } } } diff --git a/crates/inverted/Cargo.toml b/crates/inverted/Cargo.toml new file mode 100644 index 000000000..080323f8a --- /dev/null +++ b/crates/inverted/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "inverted" +version.workspace = true +edition.workspace = true + +[dependencies] +base = { path = "../base" } +common = { path = "../common" } +quantization = { path = "../quantization" } +storage = { path = "../storage" } + +[lints] +workspace = true diff --git a/crates/inverted/src/lib.rs b/crates/inverted/src/lib.rs new file mode 100644 index 000000000..1ddbce58b --- /dev/null +++ b/crates/inverted/src/lib.rs @@ -0,0 +1,154 @@ +#![allow(clippy::len_without_is_empty)] + +pub mod operator; + +use self::operator::OperatorInvertedIndex; +use base::index::{IndexOptions, SearchOptions}; +use base::operator::Borrowed; +use base::scalar::{ScalarLike, F32}; +use base::search::{Collection, Element, Payload, Source, Vectors}; +use common::json::Json; +use common::mmap_array::MmapArray; +use common::remap::RemappedCollection; +use storage::Storage; + +use std::collections::BinaryHeap; +use std::fs::create_dir; +use std::path::Path; + +const ZERO: F32 = F32(0.0); + +#[allow(dead_code)] +pub struct InvertedIndex { + storage: O::Storage, + payloads: MmapArray, + indexes: Json>, + offsets: Json>, + scores: Json>, +} + +impl InvertedIndex { + pub fn create(path: impl AsRef, options: IndexOptions, source: &impl Source) -> Self { + let remapped = RemappedCollection::from_source(source); + from_nothing(path, options, &remapped) + } + + pub fn open(path: impl AsRef) -> Self { + open(path) + } + + pub fn vbase<'a>( + &'a self, + vector: Borrowed<'a, O>, + _: &'a SearchOptions, + ) -> (Vec, Box<(dyn Iterator + 'a)>) { + let mut doc_score = vec![ZERO; self.payloads.len()]; + for (token, val) in O::to_index_vec(vector) { + let start = self.offsets[token as usize]; + let end = self.offsets[token as usize + 1]; + for i in (start as usize)..(end as usize) { + doc_score[self.indexes[i] as usize] += self.scores[i] * val; + } + } + let mut candidates: BinaryHeap<(F32, Payload)> = doc_score + .iter() + .enumerate() + .map(|(i, score)| (*score, self.payload(i as u32))) + .collect::>() + .into(); + + ( + Vec::new(), + Box::new(std::iter::from_fn(move || { + candidates.pop().map(|(score, payload)| Element { + distance: -score, + payload, + }) + })), + ) + } + + pub fn len(&self) -> u32 { + self.storage.len() + } + + pub fn vector(&self, i: u32) -> Borrowed<'_, O> { + self.storage.vector(i) + } + + pub fn payload(&self, i: u32) -> Payload { + self.payloads[i as usize] + } +} + +fn from_nothing( + path: impl AsRef, + opts: IndexOptions, + collection: &impl Collection, +) -> InvertedIndex { + create_dir(path.as_ref()).expect("failed to create path for inverted index"); + + let mut token_collection = vec![Vec::new(); opts.vector.dims as usize]; + for i in 0..collection.len() { + for (token, score) in O::to_index_vec(collection.vector(i)) { + token_collection[token as usize].push((i, score.to_f())); + } + } + let (indexes, offsets, scores) = build_compressed_matrix(token_collection); + + let storage = O::Storage::create(path.as_ref().join("storage"), collection); + let payloads = MmapArray::create( + path.as_ref().join("payloads"), + (0..collection.len()).map(|i| collection.payload(i)), + ); + let json_index = Json::create(path.as_ref().join("indexes"), indexes); + let json_offset = Json::create(path.as_ref().join("offsets"), offsets); + let json_score = Json::create(path.as_ref().join("scores"), scores); + InvertedIndex { + storage, + payloads, + indexes: json_index, + offsets: json_offset, + scores: json_score, + } +} + +fn open(path: impl AsRef) -> InvertedIndex { + let storage = O::Storage::open(path.as_ref().join("storage")); + let payloads = MmapArray::open(path.as_ref().join("payloads")); + let offsets = Json::open(path.as_ref().join("offsets")); + let indexes = Json::open(path.as_ref().join("indexes")); + let scores = Json::open(path.as_ref().join("scores")); + InvertedIndex { + storage, + payloads, + indexes, + offsets, + scores, + } +} + +fn build_compressed_matrix( + token_collection: Vec>, +) -> (Vec, Vec, Vec) { + let mut indexes = Vec::new(); + let mut offsets = Vec::new(); + let mut scores = Vec::new(); + + let mut last: u32 = 0; + offsets.push(0); + for doc_scores in token_collection.iter() { + if doc_scores.is_empty() { + offsets.push(last); + continue; + } + for (id, score) in doc_scores { + indexes.push(*id); + scores.push(*score); + } + last += doc_scores.len() as u32; + offsets.push(last); + } + + (indexes, offsets, scores) +} diff --git a/crates/inverted/src/operator.rs b/crates/inverted/src/operator.rs new file mode 100644 index 000000000..1685064ca --- /dev/null +++ b/crates/inverted/src/operator.rs @@ -0,0 +1,40 @@ +use base::operator::*; +use base::scalar::F32; +use quantization::operator::OperatorQuantization; +use storage::OperatorStorage; + +use std::iter::{zip, Empty}; + +pub trait OperatorInvertedIndex: OperatorQuantization + OperatorStorage { + fn to_index_vec(vec: Borrowed<'_, Self>) -> impl Iterator + '_; +} + +impl OperatorInvertedIndex for SVecf32Dot { + fn to_index_vec(vec: Borrowed<'_, Self>) -> impl Iterator + '_ { + zip(vec.indexes().iter().copied(), vec.values().iter().copied()) + } +} + +macro_rules! unimpl_operator_inverted_index { + ($t:ty) => { + impl OperatorInvertedIndex for $t { + fn to_index_vec(_: Borrowed<'_, Self>) -> impl Iterator + '_ { + #![allow(unreachable_code)] + unimplemented!() as Empty<(u32, F32)> + } + } + }; +} + +unimpl_operator_inverted_index!(SVecf32Cos); +unimpl_operator_inverted_index!(SVecf32L2); +unimpl_operator_inverted_index!(BVecf32Cos); +unimpl_operator_inverted_index!(BVecf32Dot); +unimpl_operator_inverted_index!(BVecf32Jaccard); +unimpl_operator_inverted_index!(BVecf32L2); +unimpl_operator_inverted_index!(Vecf32Cos); +unimpl_operator_inverted_index!(Vecf32Dot); +unimpl_operator_inverted_index!(Vecf32L2); +unimpl_operator_inverted_index!(Vecf16Cos); +unimpl_operator_inverted_index!(Vecf16Dot); +unimpl_operator_inverted_index!(Vecf16L2);