Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: initialize this project #1

Merged
merged 22 commits into from
Dec 2, 2024
52 changes: 52 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
name: Rust

on:
  push:
    branches:
      - main
  pull_request:

env:
  CARGO_TERM_COLOR: always
  # Disable full debug symbol generation to speed up CI builds and keep memory down.
  # "1" means line tables only, which is enough for readable panic backtraces.
  # NOTE(review): Swatinem/rust-cache@v2 folds RUSTFLAGS into its cache key, so
  # keep this value consistent across jobs — TODO confirm this is the env var the
  # original cache-key comment referred to.
  RUSTFLAGS: "-C debuginfo=1"
  RUST_BACKTRACE: "1"
  # According to https://matklad.github.io/2021/09/04/fast-rust-builds.html,
  # CI builds are faster with incremental compilation disabled.
  CARGO_INCREMENTAL: "0"
  CARGO_BUILD_JOBS: "1"

jobs:
  lint:
    runs-on: ubuntu-24.04
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4
      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: 1.83
          components: rustfmt, clippy
      - uses: Swatinem/rust-cache@v2
      - name: Check formatting
        run: cargo fmt -- --check
      - name: Check clippy
        run: cargo clippy --tests --benches -- -D warnings

  test:
    strategy:
      matrix:
        machine:
          - ubuntu-24.04
          # - ubuntu-2404-4x-arm64
          - macos-14
    runs-on: ${{ matrix.machine }}
    # Match the lint job's timeout so a hung test cannot hold a runner indefinitely.
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4
      - uses: actions-rust-lang/setup-rust-toolchain@v1
        with:
          toolchain: 1.83
      - uses: rui314/setup-mold@v1
      - uses: Swatinem/rust-cache@v2
      - name: Run tests
        run: cargo test
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,8 @@ Cargo.lock
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
#.idea/

# Added by cargo

/target
30 changes: 30 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[package]
name = "ocra"
version = "0.1.0"
authors = ["[email protected]"]
description = "OCRA: A Rust implementation of Cache in arrow-rs' ObjectStore interface"
edition = "2021"
license-file = "LICENSE"
keywords = ["cache", "object-store", "arrow"]
categories = ["caching"]

[dependencies]
async-trait = "0.1"
# Tilde (~) requirements pin the minor version (e.g. "~1.9" means >=1.9, <2.0
# for a 1.x crate, and >=0.4.0, <0.5.0 for a 0.x crate) so a `cargo update`
# cannot pull in a semver-incompatible 0.x bump.
bytes = "~1.9"
futures = "0.3"
log = "~0.4"
moka = { version = "~0.12", features = ["future"] }
num_cpus = "1.16"
# NOTE(review): "~0.11" resolves to >=0.11.0, <0.12.0; widen the range once a
# compatible object_store 0.12 is verified.
object_store = "~0.11"
sysinfo = "~0.32"
# Only the `sync` feature is needed by the library itself; tests/benches pull
# in the full runtime via dev-dependencies below.
tokio = { version = "1", features = ["sync"] }

[dev-dependencies]
criterion = { version = "~0.5", features = ["async_tokio"] }
rand = "~0.8"
tempfile = "~3.14"
tokio = { version = "1", features = ["full"] }

[[bench]]
# Criterion benchmarks need `harness = false` so criterion's own main() runs
# instead of the default libtest bench harness.
name = "memory"
harness = false
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# ocra
OCRA: Object-store Cache in Rust for All
# OCRA

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need an okra emoji lol

**OCRA**: (**A**) (**R**)ust (**C**)ache implementation using the _arrow-rs_ [(**O**)bjectStore](https://docs.rs/object_store/latest/object_store/) trait.
85 changes: 85 additions & 0 deletions benches/memory.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
//! Benchmark for in-memory page cache.
//!
//!

use std::{fs::File, io::Write, sync::Arc};

use criterion::{criterion_group, criterion_main, Criterion};
use object_store::{path::Path, ObjectStore};
use rand::Rng;

use ocra::{memory::InMemoryCache, paging::PageCache};

/// Measure hot-read latency of [`InMemoryCache`] for several page sizes.
///
/// A 1 GiB file of random bytes is written to a temp path, every page is
/// loaded once to warm the cache, then the benchmark reads random pages.
/// The loader closure in the benchmark panics, which asserts that every
/// benchmarked read is served from the warm cache.
fn memory_cache_bench(c: &mut Criterion) {
    let rt = tokio::runtime::Runtime::new().unwrap();
    let mut rng = rand::thread_rng();

    const FILE_SIZE: usize = 1024 * 1024 * 1024;
    // TODO: support other object stores later
    let store: Arc<dyn ObjectStore> = Arc::new(object_store::local::LocalFileSystem::new());
    let temp_file = tempfile::NamedTempFile::new().unwrap().into_temp_path();

    // Fill the backing file with FILE_SIZE random bytes, 128 KiB at a time.
    {
        let mut out = File::create(temp_file.to_str().unwrap()).unwrap();
        let mut chunk = vec![0_u8; 128 * 1024];
        for _ in 0..FILE_SIZE / (128 * 1024) {
            rng.fill(&mut chunk[..]);
            out.write_all(&chunk).unwrap();
        }
    }

    for page_size in &[1024 * 1024, 8 * 1024 * 1024] {
        // Capacity slightly exceeds the file so every page stays resident.
        let cache = Arc::new(InMemoryCache::new(FILE_SIZE + 32 * 1024, *page_size));
        let location = Path::from(temp_file.to_str().unwrap());

        // Warm up the cache: fetch every page once through the object store.
        println!("Starting warm up cache with page size: {}", page_size);
        rt.block_on(async {
            let path = location.clone();
            for page_idx in 0..FILE_SIZE / page_size {
                let loader = {
                    let store = store.clone();
                    let location = path.clone();
                    async move {
                        store
                            .get_range(&location, page_idx * page_size..(page_idx + 1) * page_size)
                            .await
                    }
                };
                let data = cache.get_with(&path, page_idx as u32, loader).await.unwrap();
                assert!(!data.is_empty());
            }
        });
        println!("Warm up cache done");

        let bench_name = format!("memory_cache,warm,page_size={}", page_size);
        c.bench_function(bench_name.as_str(), |b| {
            b.to_async(&rt).iter(|| {
                let mut rng = rand::thread_rng();
                let cache = cache.clone();
                let path = location.clone();
                async move {
                    let page_id = rng.gen_range(0..FILE_SIZE / page_size);

                    // The loader must never run: every page was warmed above.
                    let _data = cache
                        .get_with(&path, page_id as u32, async {
                            panic!("Should not be called page_id={}", page_id)
                        })
                        .await
                        .unwrap();
                }
            })
        });
    }
}

criterion_group!(
    name = benches;
    config = Criterion::default().significance_level(0.1).sample_size(10);
    targets = memory_cache_bench
);

criterion_main!(benches);
40 changes: 40 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
//! **OCRA**: (**A**) (**R**)ust (**C**)ache implementation for *arrow-rs*
//! [(**O**)bjectStore](object_store::ObjectStore).
//!
//! This crate provides `ObjectStore` implementations that layer a cache
//! in front of an existing store.
//!
//! For example, [`ReadThroughCache`] wraps an existing `ObjectStore`
//! instance with a [`PageCache`](paging::PageCache):
//!
//! ```no_run
//! # use std::sync::Arc;
//! # use tokio::runtime::Runtime;
//! use object_store::{ObjectStore, local::LocalFileSystem, path::Path};
//! use ocra::{ReadThroughCache, memory::InMemoryCache};
//!
//! # let mut rt = Runtime::new().unwrap();
//! # rt.block_on(async {
//! let fs = Arc::new(LocalFileSystem::new());
//! // Use 75% of system memory for cache
//! let memory_cache = Arc::new(InMemoryCache::with_sys_memory(0.75).build());
//! let cached_store: Arc<dyn ObjectStore> = Arc::new(ReadThroughCache::new(fs, memory_cache));
//!
//! // Now you can use `cached_store` as a regular ObjectStore
//! let path = Path::from("my-key");
//! let data = cached_store.get_range(&path, 1024..2048).await.unwrap();
//! # })
//! ```

// Public cache building blocks.
pub mod memory;
pub mod paging;

// Internal read-through wrapper; its public type is re-exported below.
mod read_through;

// A dedicated error module is planned but not yet wired in.
// pub mod error;

// We reuse `object_store` Error and Result to make this crate work well
// with the rest of object_store implementations.
pub use object_store::{Error, Result};

pub use read_through::ReadThroughCache;
Loading