diff --git a/.github/workflows/cargo-publish.yml b/.github/workflows/cargo-publish.yml index 8172b1e584..0bcea7042d 100644 --- a/.github/workflows/cargo-publish.yml +++ b/.github/workflows/cargo-publish.yml @@ -8,7 +8,7 @@ on: workflow_dispatch: inputs: tag: - description: 'Tag to publish (e.g., v1.0.0)' + description: "Tag to publish (e.g., v1.0.0)" required: true type: string @@ -24,7 +24,7 @@ jobs: env: # Need up-to-date compilers for kernels CC: clang-18 - CXX: clang-18 + CXX: clang++-18 defaults: run: working-directory: . @@ -53,5 +53,5 @@ jobs: - uses: albertlockett/publish-crates@v2.2 with: registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }} - args: '--all-features' + args: "--all-features" path: . diff --git a/.github/workflows/ci-benchmarks.yml b/.github/workflows/ci-benchmarks.yml index 1b87ec69e0..bf6c4ee59f 100644 --- a/.github/workflows/ci-benchmarks.yml +++ b/.github/workflows/ci-benchmarks.yml @@ -13,7 +13,7 @@ jobs: env: # Need up-to-date compilers for kernels CC: clang-18 - CXX: clang-18 + CXX: clang++-18 defaults: run: shell: bash diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index e716a0d6f6..f26a22b7de 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -39,7 +39,7 @@ jobs: env: # Need up-to-date compilers for kernels CC: clang-18 - CXX: clang-18 + CXX: clang++-18 steps: - uses: actions/checkout@v4 with: @@ -67,8 +67,9 @@ jobs: sudo apt install -y protobuf-compiler libssl-dev - name: Lint Rust run: | + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` cargo fmt --all -- --check - cargo clippy --locked --all-features --tests -- -D warnings + cargo clippy --locked --features ${ALL_FEATURES} --tests -- -D warnings - name: Build run: | python -m venv venv diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4a927d9a53..d9255d55ff 100644 --- a/.github/workflows/rust.yml 
+++ b/.github/workflows/rust.yml @@ -46,8 +46,9 @@ jobs: sudo apt install -y protobuf-compiler libssl-dev - name: Run clippy run: | + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` cargo clippy --version - cargo clippy --locked --all-features --tests --benches -- -D warnings + cargo clippy --locked --features ${ALL_FEATURES} --tests --benches -- -D warnings linux-build: runs-on: "ubuntu-24.04" timeout-minutes: 45 @@ -59,7 +60,7 @@ jobs: env: # Need up-to-date compilers for kernels CC: clang - CXX: clang + CXX: clang++ steps: - uses: actions/checkout@v4 # pin the toolchain version to avoid surprises @@ -81,13 +82,18 @@ jobs: - name: Run tests if: ${{ matrix.toolchain == 'stable' }} run: | - cargo llvm-cov --locked --workspace --codecov --output-path coverage.codecov --all-features + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + cargo llvm-cov --locked --workspace --codecov --output-path coverage.codecov --features ${ALL_FEATURES} - name: Build tests (nightly) - run: cargo test --locked --all-features --workspace --no-run + if: ${{ matrix.toolchain != 'stable' }} + run: | + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + cargo test --locked --features ${ALL_FEATURES} --workspace --no-run - name: Run tests (nightly) if: ${{ matrix.toolchain != 'stable' }} run: | - cargo test --all-features --workspace + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + cargo test --features ${ALL_FEATURES} --workspace - name: Upload coverage to Codecov if: ${{ matrix.toolchain == 'stable' }} uses: codecov/codecov-action@v4 @@ -113,20 +119,22 @@ jobs: 
sudo apt install -y protobuf-compiler libssl-dev pkg-config - name: Build tests run: | - cargo test --locked --all-features --no-run + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + cargo test --locked --features ${ALL_FEATURES} --no-run - name: Start DynamoDB local for tests run: | docker run -d -e AWS_ACCESS_KEY_ID=DUMMYKEY -e AWS_SECRET_ACCESS_KEY=DUMMYKEY -p 8000:8000 amazon/dynamodb-local - name: Run tests run: | - cargo test --locked --all-features + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + cargo test --locked --features ${ALL_FEATURES} build-no-lock: runs-on: ubuntu-24.04 timeout-minutes: 30 env: # Need up-to-date compilers for kernels CC: clang - CXX: clang + CXX: clang++ steps: - uses: actions/checkout@v4 # Remote cargo.lock to force a fresh build @@ -139,7 +147,9 @@ jobs: sudo apt update sudo apt install -y protobuf-compiler libssl-dev - name: Build all - run: cargo build --benches --all-features --tests + run: | + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + cargo build --benches --features ${ALL_FEATURES} --tests mac-build: runs-on: "macos-14" timeout-minutes: 45 @@ -165,11 +175,14 @@ jobs: run: | rustup update ${{ matrix.toolchain }} && rustup default ${{ matrix.toolchain }} - name: Build tests - run: cargo test --locked --all-features --no-run + run: | + cargo test --locked --features fp16kernels,cli,tensorflow,dynamodb,substrait --no-run - name: Run tests - run: cargo test --all-features + run: | + cargo test --features fp16kernels,cli,tensorflow,dynamodb,substrait - name: Check benchmarks - run: cargo check --benches --all-features + run: | + cargo check --benches --features 
fp16kernels,cli,tensorflow,dynamodb,substrait windows-build: runs-on: windows-latest defaults: @@ -203,7 +216,7 @@ jobs: env: # Need up-to-date compilers for kernels CC: clang - CXX: clang + CXX: clang++ steps: - uses: actions/checkout@v4 with: @@ -218,4 +231,6 @@ jobs: with: toolchain: ${{ matrix.msrv }} - name: cargo +${{ matrix.msrv }} check - run: cargo check --workspace --tests --benches --all-features + run: | + ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` + cargo check --workspace --tests --benches --features ${ALL_FEATURES} diff --git a/Cargo.lock b/Cargo.lock index bde2af8f09..c428391884 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1313,6 +1313,15 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +[[package]] +name = "cmake" +version = "0.1.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c682c223677e0e5b6b7f63a64b9351844c3f1b1678a68b7ee617e30fb082620e" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.3" @@ -3462,7 +3471,6 @@ dependencies = [ "pretty_assertions", "prost 0.12.6", "prost 0.13.3", - "prost-build 0.13.3", "prost-types 0.13.3", "rand", "random_word", @@ -3617,6 +3625,7 @@ dependencies = [ "prost 0.13.3", "prost-build 0.13.3", "prost-types 0.13.3", + "protobuf-src", "rand", "rand_xoshiro", "rstest", @@ -3655,6 +3664,7 @@ dependencies = [ "prost 0.13.3", "prost-build 0.13.3", "prost-types 0.13.3", + "protobuf-src", "rand", "snafu 0.7.5", "test-log", @@ -3694,6 +3704,7 @@ dependencies = [ "prost 0.13.3", "prost-build 0.13.3", "prost-types 0.13.3", + "protobuf-src", "rand", "roaring", "snafu 0.7.5", @@ -3751,6 +3762,7 @@ dependencies = [ "pprof", "prost 0.13.3", "prost-build 0.13.3", + "protobuf-src", "rand", "random_word", "rayon", @@ -3800,7 +3812,6 @@ 
dependencies = [ "pin-project", "pprof", "prost 0.13.3", - "prost-build 0.13.3", "rand", "shellexpand", "snafu 0.7.5", @@ -3894,6 +3905,7 @@ dependencies = [ "prost 0.13.3", "prost-build 0.13.3", "prost-types 0.13.3", + "protobuf-src", "rand", "rangemap", "roaring", @@ -5168,6 +5180,15 @@ dependencies = [ "prost 0.13.3", ] +[[package]] +name = "protobuf-src" +version = "2.1.0+27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7edafa3bcc668fa93efafcbdf58d7821bbda0f4b458ac7fae3d57ec0fec8167" +dependencies = [ + "cmake", +] + [[package]] name = "quanta" version = "0.12.3" diff --git a/python/Cargo.lock b/python/Cargo.lock index 3fe87b7edd..238c9f3e39 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3063,7 +3063,6 @@ dependencies = [ "pin-project", "prost 0.12.6", "prost 0.13.3", - "prost-build 0.13.3", "prost-types 0.13.3", "rand", "roaring", @@ -3330,7 +3329,6 @@ dependencies = [ "path_abs", "pin-project", "prost 0.13.3", - "prost-build 0.13.3", "rand", "shellexpand", "snafu 0.7.5", diff --git a/python/python/lance/ray/sink.py b/python/python/lance/ray/sink.py index 863a9fd014..20765f3e83 100644 --- a/python/python/lance/ray/sink.py +++ b/python/python/lance/ray/sink.py @@ -46,23 +46,8 @@ def _pd_to_arrow( tbl.schema = tbl.schema.remove_metadata() return tbl elif isinstance(df, pa.Table): - fields = df.schema.names - new_columns = [] - new_fields = [] - for field in fields: - col = df[field] - new_field = pa.field(field, col.type) - if ( - pa.types.is_null(col.type) - and schema.field_by_name(field).type == pa.string() - ): - new_field = pa.field(field, pa.string()) - col = pa.compute.if_else(pa.compute.is_null(col), NONE_ARROW_STR, col) - new_columns.append(col) - new_fields.append(new_field) - new_schema = pa.schema(fields=new_fields) - new_table = pa.Table.from_arrays(new_columns, schema=new_schema) - return new_table + if schema is not None: + return df.cast(schema) return df @@ -439,6 +424,7 @@ def write_lance( 
output_uri: str, *, schema: Optional[pa.Schema] = None, + mode: Literal["create", "append", "overwrite"] = "create", transform: Optional[ Callable[[pa.Table], Union[pa.Table, Generator[None, pa.Table, None]]] ] = None, @@ -485,7 +471,9 @@ def write_lance( ), batch_size=max_rows_per_file, ).write_datasink( - LanceCommitter(output_uri, schema=schema, storage_options=storage_options) + LanceCommitter( + output_uri, schema=schema, mode=mode, storage_options=storage_options + ) ) diff --git a/python/python/tests/test_ray.py b/python/python/tests/test_ray.py index 54f1c42492..4c135c28be 100644 --- a/python/python/tests/test_ray.py +++ b/python/python/tests/test_ray.py @@ -138,3 +138,26 @@ def f(row): assert len(pylist) == 10 for item in pylist: assert item is None + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_ray_write_lance_none_str_datasink(tmp_path: Path): + def f(row): + return { + "id": row["id"], + "str": None, + } + + schema = pa.schema([pa.field("id", pa.int64()), pa.field("str", pa.string())]) + + sink = LanceDatasink(tmp_path, schema=schema) + (ray.data.range(10).map(f).write_datasink(sink)) + ds = lance.dataset(tmp_path) + assert ds.count_rows() == 10 + assert ds.schema == schema + + tbl = ds.to_table() + pylist = tbl["str"].to_pylist() + assert len(pylist) == 10 + for item in pylist: + assert item is None diff --git a/rust/lance-encoding-datafusion/Cargo.toml b/rust/lance-encoding-datafusion/Cargo.toml index 3cccdc9ec6..ffc608ed30 100644 --- a/rust/lance-encoding-datafusion/Cargo.toml +++ b/rust/lance-encoding-datafusion/Cargo.toml @@ -42,9 +42,17 @@ lance-datagen.workspace = true [build-dependencies] prost-build.workspace = true +protobuf-src = { version = "2.1", optional = true } [target.'cfg(target_os = "linux")'.dev-dependencies] pprof = { workspace = true } +[features] +protoc = ["dep:protobuf-src"] + +[package.metadata.docs.rs] +# docs.rs uses an older version of Ubuntu that does not have the necessary protoc version +features = 
["protoc"] + [lints] workspace = true diff --git a/rust/lance-encoding-datafusion/build.rs b/rust/lance-encoding-datafusion/build.rs index 8d89a39ac3..9d0206e201 100644 --- a/rust/lance-encoding-datafusion/build.rs +++ b/rust/lance-encoding-datafusion/build.rs @@ -6,6 +6,10 @@ use std::io::Result; fn main() -> Result<()> { println!("cargo:rerun-if-changed=protos"); + #[cfg(feature = "protoc")] + // Use vendored protobuf compiler if requested. + std::env::set_var("PROTOC", protobuf_src::protoc()); + let mut prost_build = prost_build::Config::new(); prost_build.extern_path(".lance.encodings", "::lance_encoding::format::pb"); prost_build.protoc_arg("--experimental_allow_proto3_optional"); diff --git a/rust/lance-encoding/Cargo.toml b/rust/lance-encoding/Cargo.toml index e43a7c634e..27955b8340 100644 --- a/rust/lance-encoding/Cargo.toml +++ b/rust/lance-encoding/Cargo.toml @@ -56,10 +56,18 @@ rand_xoshiro = "0.6.0" [build-dependencies] prost-build.workspace = true +protobuf-src = { version = "2.1", optional = true } [target.'cfg(target_os = "linux")'.dev-dependencies] pprof = { workspace = true } +[features] +protoc = ["dep:protobuf-src"] + +[package.metadata.docs.rs] +# docs.rs uses an older version of Ubuntu that does not have the necessary protoc version +features = ["protoc"] + [[bench]] name = "decoder" harness = false diff --git a/rust/lance-encoding/build.rs b/rust/lance-encoding/build.rs index 1f030d6d7f..4c9929a978 100644 --- a/rust/lance-encoding/build.rs +++ b/rust/lance-encoding/build.rs @@ -6,6 +6,10 @@ use std::io::Result; fn main() -> Result<()> { println!("cargo:rerun-if-changed=protos"); + #[cfg(feature = "protoc")] + // Use vendored protobuf compiler if requested. 
+ std::env::set_var("PROTOC", protobuf_src::protoc()); + let mut prost_build = prost_build::Config::new(); prost_build.protoc_arg("--experimental_allow_proto3_optional"); prost_build.enable_type_names(); diff --git a/rust/lance-file/Cargo.toml b/rust/lance-file/Cargo.toml index eabb950e08..17fd79801d 100644 --- a/rust/lance-file/Cargo.toml +++ b/rust/lance-file/Cargo.toml @@ -51,10 +51,18 @@ test-log.workspace = true [build-dependencies] prost-build.workspace = true +protobuf-src = { version = "2.1", optional = true } [target.'cfg(target_os = "linux")'.dev-dependencies] pprof = { workspace = true } +[features] +protoc = ["dep:protobuf-src"] + +[package.metadata.docs.rs] +# docs.rs uses an older version of Ubuntu that does not have the necessary protoc version +features = ["protoc"] + [[bench]] name = "reader" harness = false diff --git a/rust/lance-file/build.rs b/rust/lance-file/build.rs index dd004147ec..05b791fac3 100644 --- a/rust/lance-file/build.rs +++ b/rust/lance-file/build.rs @@ -6,6 +6,10 @@ use std::io::Result; fn main() -> Result<()> { println!("cargo:rerun-if-changed=protos"); + #[cfg(feature = "protoc")] + // Use vendored protobuf compiler if requested. 
+ std::env::set_var("PROTOC", protobuf_src::protoc()); + let mut prost_build = prost_build::Config::new(); prost_build.protoc_arg("--experimental_allow_proto3_optional"); prost_build.extern_path(".lance.encodings", "::lance_encoding::format::pb"); diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index e6cf51d2d7..f28d900539 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -73,16 +73,22 @@ datafusion-sql.workspace = true random_word = { version = "0.4.3", features = ["en"] } [features] +protoc = ["dep:protobuf-src"] tokenizer-lindera = ["lindera", "lindera-tantivy", "tokenizer-common"] tokenizer-jieba = ["jieba-rs", "tokenizer-common"] tokenizer-common = [] [build-dependencies] prost-build.workspace = true +protobuf-src = { version = "2.1", optional = true } [target.'cfg(target_os = "linux")'.dev-dependencies] pprof.workspace = true +[package.metadata.docs.rs] +# docs.rs uses an older version of Ubuntu that does not have the necessary protoc version +features = ["protoc"] + [[bench]] name = "find_partitions" harness = false diff --git a/rust/lance-index/build.rs b/rust/lance-index/build.rs index 8a31fbf600..402ef5012c 100644 --- a/rust/lance-index/build.rs +++ b/rust/lance-index/build.rs @@ -7,6 +7,10 @@ use std::io::Result; fn main() -> Result<()> { println!("cargo:rerun-if-changed=protos"); + #[cfg(feature = "protoc")] + // Use vendored protobuf compiler if requested. 
+ std::env::set_var("PROTOC", protobuf_src::protoc()); + let mut prost_build = prost_build::Config::new(); prost_build.protoc_arg("--experimental_allow_proto3_optional"); prost_build.compile_protos(&["./protos/index.proto"], &["./protos"])?; diff --git a/rust/lance-io/Cargo.toml b/rust/lance-io/Cargo.toml index c416f7556d..cd4c184eb7 100644 --- a/rust/lance-io/Cargo.toml +++ b/rust/lance-io/Cargo.toml @@ -53,9 +53,6 @@ tempfile.workspace = true test-log.workspace = true mockall.workspace = true -[build-dependencies] -prost-build.workspace = true - [target.'cfg(target_os = "linux")'.dev-dependencies] pprof.workspace = true diff --git a/rust/lance-io/src/lib.rs b/rust/lance-io/src/lib.rs index 8e7a7694d8..b61c820f88 100644 --- a/rust/lance-io/src/lib.rs +++ b/rust/lance-io/src/lib.rs @@ -200,6 +200,16 @@ impl ReadBatchParams { )), } } + + pub fn to_offsets_total(&self, total: u32) -> PrimitiveArray<UInt32Type> { + match self { + Self::Indices(indices) => indices.clone(), + Self::Range(r) => UInt32Array::from_iter_values(r.start as u32..r.end as u32), + Self::RangeFull => UInt32Array::from_iter_values(0_u32..total), + Self::RangeTo(r) => UInt32Array::from_iter_values(0..r.end as u32), + Self::RangeFrom(r) => UInt32Array::from_iter_values(r.start as u32..total), + } + } } #[cfg(test)] diff --git a/rust/lance-table/Cargo.toml b/rust/lance-table/Cargo.toml index f469676041..233073cb81 100644 --- a/rust/lance-table/Cargo.toml +++ b/rust/lance-table/Cargo.toml @@ -57,10 +57,16 @@ pprof = { workspace = true } [build-dependencies] prost-build.workspace = true +protobuf-src = { version = "2.1", optional = true } [features] dynamodb = ["aws-sdk-dynamodb", "lazy_static"] dynamodb_tests = ["dynamodb"] +protoc = ["dep:protobuf-src"] + +[package.metadata.docs.rs] +# docs.rs uses an older version of Ubuntu that does not have the necessary protoc version +features = ["protoc"] [[bench]] name = "row_id_index" diff --git a/rust/lance-table/build.rs b/rust/lance-table/build.rs index 
e0d0c15393..c4b2cc52dc 100644 --- a/rust/lance-table/build.rs +++ b/rust/lance-table/build.rs @@ -6,6 +6,10 @@ use std::io::Result; fn main() -> Result<()> { println!("cargo:rerun-if-changed=protos"); + #[cfg(feature = "protoc")] + // Use vendored protobuf compiler if requested. + std::env::set_var("PROTOC", protobuf_src::protoc()); + let mut prost_build = prost_build::Config::new(); prost_build.extern_path(".lance.file", "::lance_file::format::pb"); prost_build.protoc_arg("--experimental_allow_proto3_optional"); diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index d368654731..35eb75acbb 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -82,9 +82,6 @@ pprof.workspace = true # Need this so we can prevent dynamic linking in binaries (see cli feature) lzma-sys = { version = "0.1" } -[build-dependencies] -prost-build.workspace = true - [dev-dependencies] lance-test-macros = { workspace = true } lance-datagen = { workspace = true } @@ -111,6 +108,12 @@ tensorflow = ["tfrecord", "prost_old"] dynamodb = ["lance-table/dynamodb", "aws-sdk-dynamodb"] dynamodb_tests = ["dynamodb"] substrait = ["lance-datafusion/substrait"] +protoc = [ + "lance-encoding/protoc", + "lance-file/protoc", + "lance-index/protoc", + "lance-table/protoc", +] [[bin]] name = "lq" diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs index a3aa9af1c7..71f590498d 100644 --- a/rust/lance/src/dataset/fragment.rs +++ b/rust/lance/src/dataset/fragment.rs @@ -1981,12 +1981,7 @@ impl FragmentReader { let merged = if self.with_row_addr as usize + self.with_row_id as usize == self.output_schema.fields.len() { - let selected_rows = params - .slice(0, total_num_rows as usize) - .unwrap() - .to_offsets() - .unwrap() - .len(); + let selected_rows = params.to_offsets_total(total_num_rows).len(); let tasks = (0..selected_rows) .step_by(batch_size as usize) .map(move |offset| { @@ -2389,6 +2384,71 @@ mod tests { } } + #[tokio::test] + async fn 
test_rowid_rowaddr_only() { + let test_dir = tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + // Creates 400 rows in 10 fragments + let mut dataset = create_dataset(test_uri, LanceFileVersion::Legacy).await; + // Delete last 20 rows in first fragment + dataset.delete("i >= 20").await.unwrap(); + // Last fragment has 20 rows but 40 addressable rows + let fragment = &dataset.get_fragments()[0]; + assert_eq!(fragment.metadata.num_rows().unwrap(), 20); + + // Test with take_range (all rows addressable) + for (with_row_id, with_row_address) in [(false, true), (true, false), (true, true)] { + let reader = fragment + .open( + &fragment.schema().project::<&str>(&[]).unwrap(), + FragReadConfig::default() + .with_row_id(with_row_id) + .with_row_address(with_row_address), + None, + ) + .await + .unwrap(); + for valid_range in [0..40, 20..40] { + reader + .take_range(valid_range, 100) + .unwrap() + .buffered(1) + .try_collect::<Vec<_>>() + .await + .unwrap(); + } + for invalid_range in [0..41, 41..42] { + assert!(reader.take_range(invalid_range, 100).is_err()); + } + } + + // Test with read_range (only non-deleted rows addressable) + for (with_row_id, with_row_address) in [(false, true), (true, false), (true, true)] { + let reader = fragment + .open( + &fragment.schema().project::<&str>(&[]).unwrap(), + FragReadConfig::default() + .with_row_id(with_row_id) + .with_row_address(with_row_address), + None, + ) + .await + .unwrap(); + for valid_range in [0..20, 0..10, 10..20] { + reader + .read_range(valid_range, 100) + .unwrap() + .buffered(1) + .try_collect::<Vec<_>>() + .await + .unwrap(); + } + for invalid_range in [0..21, 21..22] { + assert!(reader.read_range(invalid_range, 100).is_err()); + } + } + } + #[rstest] #[tokio::test] async fn test_fragment_take_range_deletions( diff --git a/rust/lance/src/index/vector/utils.rs b/rust/lance/src/index/vector/utils.rs index 0d8eaad5b4..34d01ec319 100644 --- a/rust/lance/src/index/vector/utils.rs +++ 
b/rust/lance/src/index/vector/utils.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use arrow_array::{cast::AsArray, FixedSizeListArray}; use lance_core::datatypes::Schema; +use log::info; use snafu::{location, Location}; use tokio::sync::Mutex; @@ -86,11 +87,21 @@ pub async fn maybe_sample_training_data( let num_rows = dataset.count_rows(None).await?; let batch = if num_rows > sample_size_hint { let projection = dataset.schema().project(&[column])?; - dataset.sample(sample_size_hint, &projection).await? + let batch = dataset.sample(sample_size_hint, &projection).await?; + info!( + "Sample training data: retrieved {} rows by sampling", + batch.num_rows() + ); + batch } else { let mut scanner = dataset.scan(); scanner.project(&[column])?; - scanner.try_into_batch().await? + let batch = scanner.try_into_batch().await?; + info!( + "Sample training data: retrieved {} rows scanning full datasets", + batch.num_rows() + ); + batch }; let array = batch.column_by_name(column).ok_or(Error::Index {