Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
address commments
Browse files Browse the repository at this point in the history
Signed-off-by: Keming <[email protected]>
kemingy committed Jul 23, 2024
1 parent 72c791e commit 64df757
Showing 12 changed files with 73 additions and 142 deletions.
38 changes: 7 additions & 31 deletions crates/base/src/index.rs
Original file line number Diff line number Diff line change
@@ -105,10 +105,11 @@ pub struct IndexOptions {

impl IndexOptions {
fn validate_self(&self) -> Result<(), ValidationError> {
match (self.vector.v, &self.indexing) {
(VectorKind::Vecf32, _) => Ok(()),
(VectorKind::Vecf16, _) => Ok(()),
match (self.vector.v, self.vector.d, &self.indexing) {
(VectorKind::Vecf32, _, _) => Ok(()),
(VectorKind::Vecf16, _, _) => Ok(()),
(
_,
_,
IndexingOptions::Flat(FlatIndexingOptions {
quantization: QuantizationOptions::Trivial(_),
@@ -118,15 +119,12 @@ impl IndexOptions {
quantization: QuantizationOptions::Trivial(_),
..
})
| IndexingOptions::Inverted(InvertedIndexingOptions {
quantization: QuantizationOptions::Trivial(_),
..
})
| IndexingOptions::Hnsw(HnswIndexingOptions {
quantization: QuantizationOptions::Trivial(_),
..
}),
) => Ok(()),
(VectorKind::SVecf32, DistanceKind::Dot, IndexingOptions::Inverted()) => Ok(()),
_ => Err(ValidationError::new("not valid index options")),
}
}
@@ -263,7 +261,7 @@ pub enum IndexingOptions {
Flat(FlatIndexingOptions),
Ivf(IvfIndexingOptions),
Hnsw(HnswIndexingOptions),
Inverted(InvertedIndexingOptions),
Inverted(),
}

impl IndexingOptions {
@@ -279,12 +277,6 @@ impl IndexingOptions {
};
x
}
pub fn unwrap_inverted(self) -> InvertedIndexingOptions {
let IndexingOptions::Inverted(x) = self else {
unreachable!()
};
x
}
pub fn unwrap_hnsw(self) -> HnswIndexingOptions {
let IndexingOptions::Hnsw(x) = self else {
unreachable!()
@@ -305,7 +297,7 @@ impl Validate for IndexingOptions {
Self::Flat(x) => x.validate(),
Self::Ivf(x) => x.validate(),
Self::Hnsw(x) => x.validate(),
Self::Inverted(x) => x.validate(),
Self::Inverted() => Ok(()),
}
}
}
@@ -352,22 +344,6 @@ impl Default for IvfIndexingOptions {
}
}

#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
#[serde(deny_unknown_fields)]
pub struct InvertedIndexingOptions {
#[serde(default)]
#[validate(nested)]
pub quantization: QuantizationOptions,
}

impl Default for InvertedIndexingOptions {
fn default() -> Self {
Self {
quantization: Default::default(),
}
}
}

#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
#[serde(deny_unknown_fields)]
pub struct HnswIndexingOptions {
7 changes: 0 additions & 7 deletions crates/base/src/vector/bvecf32.rs
Original file line number Diff line number Diff line change
@@ -140,13 +140,6 @@ impl<'a> VectorBorrowed for BVecf32Borrowed<'a> {
}
}

#[inline(always)]
fn to_index_vec(&self) -> Vec<(u32, Self::Scalar)> {
(0..self.dims())
.map(|i| (i, F32(self.get(i) as u32 as f32)))
.collect()
}

#[inline(always)]
fn to_vec(&self) -> Vec<F32> {
self.iter().map(|i| F32(i as u32 as f32)).collect()
2 changes: 0 additions & 2 deletions crates/base/src/vector/mod.rs
Original file line number Diff line number Diff line change
@@ -41,8 +41,6 @@ pub trait VectorBorrowed: Copy + PartialEq + PartialOrd {

fn to_vec(&self) -> Vec<Self::Scalar>;

fn to_index_vec(&self) -> Vec<(u32, Self::Scalar)>;

fn length(&self) -> F32;

fn function_normalize(&self) -> Self::Owned;
5 changes: 0 additions & 5 deletions crates/base/src/vector/svecf32.rs
Original file line number Diff line number Diff line change
@@ -171,11 +171,6 @@ impl<'a> VectorBorrowed for SVecf32Borrowed<'a> {
}
}

#[inline(always)]
fn to_index_vec(&self) -> Vec<(u32, Self::Scalar)> {
std::iter::zip(self.indexes.to_vec(), self.values.to_vec()).collect()
}

#[inline(always)]
fn to_vec(&self) -> Vec<F32> {
let mut dense = vec![F32::zero(); self.dims as usize];
10 changes: 0 additions & 10 deletions crates/base/src/vector/vecf16.rs
Original file line number Diff line number Diff line change
@@ -99,16 +99,6 @@ impl<'a> VectorBorrowed for Vecf16Borrowed<'a> {
Vecf16Owned(self.0.to_vec())
}

#[inline(always)]
fn to_index_vec(&self) -> Vec<(u32, Self::Scalar)> {
self.0
.iter()
.copied()
.enumerate()
.map(|(i, x)| (i as u32, x))
.collect()
}

#[inline(always)]
fn to_vec(&self) -> Vec<F16> {
self.0.to_vec()
10 changes: 0 additions & 10 deletions crates/base/src/vector/vecf32.rs
Original file line number Diff line number Diff line change
@@ -99,16 +99,6 @@ impl<'a> VectorBorrowed for Vecf32Borrowed<'a> {
Vecf32Owned(self.0.to_vec())
}

#[inline(always)]
fn to_index_vec(&self) -> Vec<(u32, Self::Scalar)> {
self.0
.iter()
.copied()
.enumerate()
.map(|(i, x)| (i as u32, x))
.collect()
}

#[inline(always)]
fn to_vec(&self) -> Vec<F32> {
self.0.to_vec()
4 changes: 2 additions & 2 deletions crates/index/src/indexing/sealed.rs
Original file line number Diff line number Diff line change
@@ -25,7 +25,7 @@ impl<O: Op> SealedIndexing<O> {
IndexingOptions::Flat(_) => Self::Flat(Flat::create(path, options, source)),
IndexingOptions::Ivf(_) => Self::Ivf(Ivf::create(path, options, source)),
IndexingOptions::Hnsw(_) => Self::Hnsw(Hnsw::create(path, options, source)),
IndexingOptions::Inverted(_) => Self::Inverted(Inverted::create(path, options, source)),
IndexingOptions::Inverted() => Self::Inverted(Inverted::create(path, options, source)),
}
}

@@ -34,7 +34,7 @@ impl<O: Op> SealedIndexing<O> {
IndexingOptions::Flat(_) => Self::Flat(Flat::open(path)),
IndexingOptions::Ivf(_) => Self::Ivf(Ivf::open(path)),
IndexingOptions::Hnsw(_) => Self::Hnsw(Hnsw::open(path)),
IndexingOptions::Inverted(_) => Self::Inverted(Inverted::open(path)),
IndexingOptions::Inverted() => Self::Inverted(Inverted::open(path)),
}
}

11 changes: 9 additions & 2 deletions crates/index/src/lib.rs
Original file line number Diff line number Diff line change
@@ -25,6 +25,7 @@ use common::dir_ops::sync_walk_from_dir;
use common::file_atomic::FileAtomic;
use crossbeam::atomic::AtomicCell;
use crossbeam::channel::Sender;
use inverted::operator::OperatorInverted;
use ivf::operator::OperatorIvf;
use parking_lot::Mutex;
use quantization::operator::OperatorQuantization;
@@ -41,9 +42,15 @@ use storage::OperatorStorage;
use thiserror::Error;
use validator::Validate;

pub trait Op: Operator + OperatorQuantization + OperatorStorage + OperatorIvf {}
pub trait Op:
Operator + OperatorQuantization + OperatorStorage + OperatorIvf + OperatorInverted
{
}

impl<T: Operator + OperatorQuantization + OperatorStorage + OperatorIvf> Op for T {}
impl<T: Operator + OperatorQuantization + OperatorStorage + OperatorIvf + OperatorInverted> Op
for T
{
}

#[derive(Debug, Error)]
#[error("The index view is outdated.")]
69 changes: 18 additions & 51 deletions crates/inverted/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,31 +1,28 @@
#![allow(clippy::len_without_is_empty)]

use base::distance::DistanceKind;
pub mod operator;

use self::operator::OperatorInverted;
use base::index::{IndexOptions, SearchOptions};
use base::operator::Borrowed;
use base::operator::Operator;
use base::scalar::{ScalarLike, F32};
use base::search::{Collection, Element, Payload, Source, Vectors};
use base::vector::{VectorBorrowed, VectorKind};
use common::dir_ops::sync_dir;
use common::json::Json;
use common::mmap_array::MmapArray;
use common::remap::RemappedCollection;
use quantization::{operator::OperatorQuantization, Quantization};
use storage::{OperatorStorage, Storage};
use storage::Storage;

use std::collections::BTreeMap;
use std::fs::create_dir;
use std::path::Path;

pub trait OperatorInverted: Operator + OperatorQuantization + OperatorStorage {}

impl<T: Operator + OperatorQuantization + OperatorStorage> OperatorInverted for T {}
const ZERO: F32 = F32(0.0);
const NEGATIVE_ONE: F32 = F32(-1.0);

#[allow(dead_code)]
pub struct Inverted<O: OperatorInverted> {
storage: O::Storage,
quantization: Quantization<O>,
payloads: MmapArray<Payload>,
indexes: Json<Vec<u32>>,
offsets: Json<Vec<u32>>,
@@ -34,12 +31,6 @@ pub struct Inverted<O: OperatorInverted> {

impl<O: OperatorInverted> Inverted<O> {
pub fn create(path: impl AsRef<Path>, options: IndexOptions, source: &impl Source<O>) -> Self {
if options.vector.v != VectorKind::SVecf32 {
panic!("inverted index only supports `SVecf32` vectors");
}
if options.vector.d != DistanceKind::Dot {
panic!("inverted index only supports `Dot` distance");
}
let remapped = RemappedCollection::from_source(source);
from_nothing(path, options, &remapped)
}
@@ -51,41 +42,28 @@ impl<O: OperatorInverted> Inverted<O> {
pub fn vbase<'a>(
&'a self,
vector: Borrowed<'a, O>,
opts: &'a SearchOptions,
_: &'a SearchOptions,
) -> (Vec<Element>, Box<(dyn Iterator<Item = Element> + 'a)>) {
const ZERO: F32 = F32(0.0);
let mut doc_score = vec![ZERO; self.payloads.len()];
for (token, _) in vector.to_index_vec() {
for (token, _) in O::to_index_vec(vector) {
let start = self.offsets[token as usize];
let end = self.offsets[token as usize + 1];
for i in (start as usize)..(end as usize) {
doc_score[self.indexes[i] as usize] += self.scores[i];
}
}
let candidates: Vec<usize> = doc_score
let mut candidates: Vec<Element> = doc_score
.iter()
.enumerate()
.filter(|&(_, score)| *score > ZERO)
.map(|(i, _)| i)
.map(|(i, score)| Element {
distance: *score * NEGATIVE_ONE, // use negative score to match the negative dot product distance
payload: self.payloads[i],
})
.collect();
let mut reranker = self.quantization.inverted_rerank(vector, opts, move |u| {
(
-doc_score[u as usize], // use negative score to match the negative dot product distance
self.payloads[u as usize],
)
});
for i in candidates {
reranker.push(i as u32, ());
}
(
Vec::new(),
Box::new(std::iter::from_fn(move || {
reranker.pop().map(|(dis_u, _, payload_u)| Element {
distance: dis_u,
payload: payload_u,
})
})),
)
candidates.sort_by(|a, b| a.distance.cmp(&b.distance));

(Vec::new(), Box::new(candidates.into_iter()))
}

pub fn len(&self) -> u32 {
@@ -103,15 +81,14 @@ impl<O: OperatorInverted> Inverted<O> {

fn from_nothing<O: OperatorInverted>(
path: impl AsRef<Path>,
options: IndexOptions,
_: IndexOptions,
collection: &impl Collection<O>,
) -> Inverted<O> {
create_dir(path.as_ref()).expect("failed to create path for inverted index");

let inverted_options = options.indexing.clone().unwrap_inverted();
let mut token_collection = BTreeMap::new();
for i in 0..collection.len() {
for (token, score) in collection.vector(i).to_index_vec() {
for (token, score) in O::to_index_vec(collection.vector(i)) {
token_collection
.entry(token)
.or_insert_with(Vec::new)
@@ -121,13 +98,6 @@ fn from_nothing<O: OperatorInverted>(
let (indexes, offsets, scores) = build_compressed_matrix(token_collection);

let storage = O::Storage::create(path.as_ref().join("storage"), collection);
let quantization = Quantization::create(
path.as_ref().join("quantization"),
options.vector,
inverted_options.quantization,
collection,
|vector| vector.own(),
);
let payloads = MmapArray::create(
path.as_ref().join("payloads"),
(0..collection.len()).map(|i| collection.payload(i)),
@@ -138,7 +108,6 @@ fn from_nothing<O: OperatorInverted>(
sync_dir(path);
Inverted {
storage,
quantization,
payloads,
indexes: json_index,
offsets: json_offset,
@@ -148,14 +117,12 @@ fn from_nothing<O: OperatorInverted>(

fn open<O: OperatorInverted>(path: impl AsRef<Path>) -> Inverted<O> {
let storage = O::Storage::open(path.as_ref().join("storage"));
let quantization = Quantization::open(path.as_ref().join("quantization"));
let payloads = MmapArray::open(path.as_ref().join("payloads"));
let offsets = Json::open(path.as_ref().join("offsets"));
let indexes = Json::open(path.as_ref().join("indexes"));
let scores = Json::open(path.as_ref().join("scores"));
Inverted {
storage,
quantization,
payloads,
indexes,
offsets,
37 changes: 37 additions & 0 deletions crates/inverted/src/operator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
use base::operator::*;
use base::scalar::F32;
use quantization::operator::OperatorQuantization;
use storage::OperatorStorage;

pub trait OperatorInverted: OperatorQuantization + OperatorStorage {
fn to_index_vec(vec: Borrowed<'_, Self>) -> Vec<(u32, F32)>;
}

impl OperatorInverted for SVecf32Dot {
fn to_index_vec(vec: Borrowed<'_, Self>) -> Vec<(u32, F32)> {
std::iter::zip(vec.indexes().to_vec(), vec.values().to_vec()).collect()
}
}

macro_rules! unimpl_operator_inverted {
($t:ty) => {
impl OperatorInverted for $t {
fn to_index_vec(_: Borrowed<'_, Self>) -> Vec<(u32, F32)> {
unimplemented!()
}
}
};
}

unimpl_operator_inverted!(SVecf32Cos);
unimpl_operator_inverted!(SVecf32L2);
unimpl_operator_inverted!(BVecf32Cos);
unimpl_operator_inverted!(BVecf32Dot);
unimpl_operator_inverted!(BVecf32Jaccard);
unimpl_operator_inverted!(BVecf32L2);
unimpl_operator_inverted!(Vecf32Cos);
unimpl_operator_inverted!(Vecf32Dot);
unimpl_operator_inverted!(Vecf32L2);
unimpl_operator_inverted!(Vecf16Cos);
unimpl_operator_inverted!(Vecf16Dot);
unimpl_operator_inverted!(Vecf16L2);
13 changes: 0 additions & 13 deletions crates/quantization/src/lib.rs
Original file line number Diff line number Diff line change
@@ -179,19 +179,6 @@ impl<O: OperatorQuantization> Quantization<O> {
}
}

pub fn inverted_rerank<'a, T: 'a>(
&'a self,
vector: Borrowed<'a, O>,
opts: &'a SearchOptions,
r: impl Fn(u32) -> (F32, T) + 'a,
) -> Box<dyn Reranker<T> + 'a> {
use Quantizer::*;
match &*self.train {
Trivial(x) => x.inverted_rerank(vector, opts, r),
_ => panic!("Inverted rerank does not supported non-trivial quantization"),
}
}

pub fn ivf_naive_rerank<'a, T: 'a>(
&'a self,
vector: Borrowed<'a, O>,
9 changes: 0 additions & 9 deletions crates/quantization/src/trivial/mod.rs
Original file line number Diff line number Diff line change
@@ -51,15 +51,6 @@ impl<O: OperatorTrivialQuantization> TrivialQuantizer<O> {
Box::new(DisabledReranker::new(r))
}

pub fn inverted_rerank<'a, T: 'a>(
&'a self,
_: Borrowed<'a, O>,
_: &'a SearchOptions,
r: impl Fn(u32) -> (F32, T) + 'a,
) -> Box<dyn Reranker<T> + 'a> {
Box::new(DisabledReranker::new(r))
}

pub fn ivf_naive_rerank<'a, T: 'a>(
&'a self,
_: Borrowed<'a, O>,

0 comments on commit 64df757

Please sign in to comment.