Skip to content

Commit

Permalink
Merge pull request #29 from oramasearch/feat/dump-db-vector-index
Browse files Browse the repository at this point in the history
Dump db vector index
  • Loading branch information
allevo authored Dec 13, 2024
2 parents c627a72 + 98e1152 commit 853b807
Show file tree
Hide file tree
Showing 8 changed files with 696 additions and 138 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ criterion = { version = "0.5.1", features = ["async_tokio"] }
http-body-util = "0.1.2"
mime = "0.3.17"
tower = { version = "0.5.1", features = ["util"] }
rand = "0.8.5"

[[bin]]
name = "rustorama"
Expand Down
50 changes: 50 additions & 0 deletions src/capped_heap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use std::{cmp::Reverse, collections::BinaryHeap};

/// A bounded collection that retains only the `limit` entries with the largest keys.
///
/// Entries are kept in a min-heap (a [`BinaryHeap`] of [`Reverse`]d `(key, value)`
/// pairs), so the smallest retained entry is always at the top and is the one that
/// gets evicted when a larger key arrives.
pub struct CappedHeap<K, V> {
    heap: BinaryHeap<Reverse<(K, V)>>,
    limit: usize,
}

impl<K: Ord, V: Ord> CappedHeap<K, V> {
    /// Creates an empty heap that holds at most `limit` entries.
    ///
    /// With a `limit` of zero every insertion is discarded.
    pub fn new(limit: usize) -> Self {
        Self {
            heap: BinaryHeap::new(),
            limit,
        }
    }

    /// Offers a `(key, value)` entry to the heap.
    ///
    /// While fewer than `limit` entries are stored the entry is always accepted.
    /// Once the heap is full, the entry replaces the current smallest entry only
    /// when `key` is strictly greater than that entry's key; on equal keys the
    /// entry already stored is kept.
    pub fn insert(&mut self, key: K, value: V) {
        if self.heap.len() < self.limit {
            self.heap.push(Reverse((key, value)));
            return;
        }

        // Overwrite the minimum in place: the heap is re-balanced once when the
        // `PeekMut` guard is dropped, instead of once for `pop` and again for `push`.
        if let Some(mut smallest) = self.heap.peek_mut() {
            if key > smallest.0.0 {
                *smallest = Reverse((key, value));
            }
        }
    }

    /// Consumes the heap and returns the retained entries, largest `(key, value)` first.
    ///
    /// Each entry is still wrapped in [`Reverse`]; destructure it as
    /// `Reverse((key, value))` to get the pair back.
    pub fn into_top(self) -> Vec<Reverse<(K, V)>> {
        // Ascending order of `Reverse<T>` is descending order of `T`.
        self.heap.into_sorted_vec()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Inserting more entries than the limit keeps only the largest keys,
    /// returned largest first.
    #[test]
    fn test_capped_heap() {
        let mut heap = CappedHeap::new(3);

        heap.insert(1, 1);
        heap.insert(2, 2);
        heap.insert(3, 3);
        heap.insert(4, 4);

        let top = heap.into_top();

        assert_eq!(top.len(), 3);
        assert_eq!(top, vec![Reverse((4, 4)), Reverse((3, 3)), Reverse((2, 2))]);
    }

    /// A heap that never reaches its limit returns every inserted entry.
    #[test]
    fn test_capped_heap_under_limit() {
        let mut heap = CappedHeap::new(5);

        heap.insert(1, 1);
        heap.insert(2, 2);

        assert_eq!(heap.into_top(), vec![Reverse((2, 2)), Reverse((1, 1))]);
    }

    /// A key equal to the current minimum does not evict the stored entry.
    #[test]
    fn test_capped_heap_equal_key_is_not_replaced() {
        let mut heap = CappedHeap::new(1);

        heap.insert(1, 10);
        heap.insert(1, 20);

        assert_eq!(heap.into_top(), vec![Reverse((1, 10))]);
    }

    /// A zero limit discards every insertion.
    #[test]
    fn test_capped_heap_zero_limit() {
        let mut heap: CappedHeap<i32, i32> = CappedHeap::new(0);

        heap.insert(1, 1);

        assert!(heap.into_top().is_empty());
    }
}
32 changes: 15 additions & 17 deletions src/collection_manager/sides/read.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::{
cmp::Reverse,
collections::{BinaryHeap, HashMap, HashSet},
collections::{HashMap, HashSet},
fmt::Debug,
ops::Deref,
path::PathBuf,
Expand All @@ -16,6 +16,7 @@ use tokio::sync::{RwLock, RwLockReadGuard};
use tracing::{debug, error, info, instrument};

use crate::{
capped_heap::CappedHeap,
collection_manager::{
dto::{
FacetDefinition, FacetResult, FieldId, Filter, SearchMode, SearchParams, SearchResult,
Expand Down Expand Up @@ -81,8 +82,10 @@ impl CollectionsReader {

document_storage: Arc::clone(&self.document_storage),

vector_index: VectorIndex::try_new(VectorIndexConfig {})
.context("Cannot create vector index during collection creation")?,
vector_index: VectorIndex::try_new(VectorIndexConfig {
base_path: collection_data_dir.join("vectors"),
})
.context("Cannot create vector index during collection creation")?,
fields_per_model: Default::default(),

string_index: StringIndex::new(self.posting_id_generator.clone()),
Expand Down Expand Up @@ -647,24 +650,19 @@ impl CollectionReader {
}

fn top_n(map: HashMap<DocumentId, f32>, n: usize) -> Vec<TokenScore> {
// A min-heap of size `n` to keep track of the top N elements
let mut heap: BinaryHeap<Reverse<(NotNan<f32>, DocumentId)>> = BinaryHeap::with_capacity(n);
let mut capped_heap = CappedHeap::new(n);

for (key, value) in map {
// Insert into the heap if it's not full, or replace the smallest element if the current one is larger
if heap.len() < n {
heap.push(Reverse((NotNan::new(value).unwrap(), key)));
} else if let Some(Reverse((min_value, _))) = heap.peek() {
if value > *min_value.as_ref() {
heap.pop();
heap.push(Reverse((NotNan::new(value).unwrap(), key)));
}
}
let k = match NotNan::new(value) {
Ok(k) => k,
Err(_) => continue,
};
let v = key;
capped_heap.insert(k, v);
}

// Collect results into a sorted Vec (optional sorting based on descending values)
let result: Vec<TokenScore> = heap
.into_sorted_vec()
let result: Vec<TokenScore> = capped_heap
.into_top()
.into_iter()
.map(|Reverse((value, key))| TokenScore {
document_id: key,
Expand Down
121 changes: 0 additions & 121 deletions src/indexes/vector.rs

This file was deleted.

Loading

0 comments on commit 853b807

Please sign in to comment.