fix: filter out null values when sampling for index training #3404

Open · wants to merge 4 commits into base: main
88 changes: 87 additions & 1 deletion rust/lance/src/index/vector/ivf.rs

@@ -1741,21 +1741,28 @@ mod tests {
use std::ops::Range;

use arrow_array::types::UInt64Type;
use arrow_array::{Float32Array, RecordBatchIterator, RecordBatchReader, UInt64Array};
use arrow_array::{
make_array, Float32Array, RecordBatchIterator, RecordBatchReader, UInt64Array,
};
use arrow_buffer::{BooleanBuffer, NullBuffer};
use arrow_schema::Field;
use itertools::Itertools;
use lance_core::utils::address::RowAddress;
use lance_core::ROW_ID;
use lance_datagen::{array, gen, Dimension, RowCount};
use lance_index::vector::sq::builder::SQBuildParams;
use lance_linalg::distance::l2_distance_batch;
use lance_testing::datagen::{
generate_random_array, generate_random_array_with_range, generate_random_array_with_seed,
generate_scaled_random_array, sample_without_replacement,
};
use rand::{seq::SliceRandom, thread_rng};
use rstest::rstest;
use tempfile::tempdir;

use crate::dataset::InsertBuilder;
use crate::index::prefilter::DatasetPreFilter;
use crate::index::vector::IndexFileVersion;
use crate::index::vector_index_details;
use crate::index::{vector::VectorIndexParams, DatasetIndexExt, DatasetIndexInternalExt};

@@ -2215,6 +2222,85 @@ mod tests {
.await;
}

#[rstest]
#[tokio::test]
async fn test_create_index_nulls(
Contributor: I'm thinking, should we add some tests for verifying recall? Then we can know whether flat search handles nulls well. It might be good to modify this test (https://github.com/lancedb/lance/blob/main/rust/lance/src/index/vector/ivf/v2.rs) so that half the rows contain nulls.

Contributor Author: I was thinking, is there a way to count the rows that are present in the index? I assume if a vector is null then we don't write it to the index file, right?

Contributor Author: I have updated the test so it asserts we can use search to get all the non-null vectors back. But I am not getting the results I expect. I could use your advice on what the expected behavior of these indices should be when there are lots of null vectors.

Contributor: It seems there is no way to count that right now; it could be easy for the v3 index by counting the number of rows in the storage file.

Contributor Author: @BubbleCal Could you help me make sense of the output of this test? https://github.com/lancedb/lance/actions/runs/12918160780/job/36026117407?pr=3404 I was expecting search to only return non-null rows, but it seems like we are getting some null vectors in the results.

// We test L2 and Dot, because L2 PQ uses residuals while Dot doesn't,
// so they have slightly different code paths.
#[values(
VectorIndexParams::with_ivf_pq_params(
MetricType::L2,
IvfBuildParams::new(2),
PQBuildParams::new(2, 4),
),
VectorIndexParams::with_ivf_pq_params(
MetricType::Dot,
IvfBuildParams::new(2),
PQBuildParams::new(2, 4),
),
VectorIndexParams::ivf_flat(1, MetricType::Dot),
VectorIndexParams::with_ivf_hnsw_pq_params(
MetricType::Dot,
IvfBuildParams::new(2),
HnswBuildParams::default(),
PQBuildParams::new(2, 4)
),
VectorIndexParams::with_ivf_hnsw_sq_params(
MetricType::Dot,
IvfBuildParams::new(2),
HnswBuildParams::default(),
SQBuildParams::default()
)
)]
mut index_params: VectorIndexParams,
#[values(IndexFileVersion::Legacy, IndexFileVersion::V3)] index_version: IndexFileVersion,
) {
index_params.version(index_version);

let nrows = 2_000;
let data = gen()
.col("vec", array::rand_vec::<Float32Type>(Dimension::from(16)))
.into_batch_rows(RowCount::from(nrows))
.unwrap();

// Make every other row null
let null_buffer = (0..nrows).map(|i| i % 2 == 0).collect::<BooleanBuffer>();
let null_buffer = NullBuffer::new(null_buffer);
let vectors = data["vec"]
.clone()
.to_data()
.into_builder()
.nulls(Some(null_buffer))
.build()
.unwrap();
let vectors = make_array(vectors);
let num_non_null = vectors.len() - vectors.logical_null_count();
let data = RecordBatch::try_new(data.schema(), vec![vectors]).unwrap();

let mut dataset = InsertBuilder::new("memory://")
.execute(vec![data])
.await
.unwrap();

// Create index
dataset
.create_index(&["vec"], IndexType::Vector, None, &index_params, false)
.await
.unwrap();

let query = vec![0.0; 16].into_iter().collect::<Float32Array>();
let results = dataset
.scan()
.nearest("vec", &query, 2_000)
.unwrap()
.nprobs(2)
.try_into_batch()
.await
.unwrap();
assert_eq!(results.num_rows(), num_non_null);
assert_eq!(results["vec"].logical_null_count(), 0);
}
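The recall check the reviewer suggests can be sketched independently of Lance. This is a minimal, std-only illustration of the metric itself, not the project's test code; the `recall` helper and the example row ids are hypothetical:

```rust
use std::collections::HashSet;

/// Recall@k: the fraction of exact (flat-search) neighbors that the
/// approximate index also returned.
fn recall(ground_truth: &[u64], retrieved: &[u64]) -> f32 {
    let gt: HashSet<u64> = ground_truth.iter().copied().collect();
    let hits = retrieved.iter().filter(|id| gt.contains(id)).count();
    hits as f32 / ground_truth.len() as f32
}

fn main() {
    // Ground truth would come from an exact scan over non-null rows only;
    // `approx` would come from the IVF index under test.
    let exact = vec![1, 3, 5, 7];
    let approx = vec![1, 3, 5, 9];
    println!("{}", recall(&exact, &approx)); // 0.75
}
```

In the scenario discussed above, building the ground truth from a flat scan that filters nulls first would make a recall below 1.0 attributable to the index rather than to null rows leaking into the result set.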

#[tokio::test]
async fn test_create_ivf_pq_cosine() {
let test_dir = tempdir().unwrap();
1 change: 1 addition & 0 deletions rust/lance/src/index/vector/pq.rs
@@ -447,6 +447,7 @@ pub async fn build_pq_model(
"Finished loading training data in {:02} seconds",
start.elapsed().as_secs_f32()
);
debug_assert_eq!(training_data.logical_null_count(), 0);

info!(
"starting to compute partitions for PQ training, sample size: {}",
59 changes: 57 additions & 2 deletions rust/lance/src/index/vector/utils.rs
@@ -9,7 +9,7 @@ use log::info;
use snafu::{location, Location};
use tokio::sync::Mutex;

use crate::dataset::Dataset;
use crate::dataset::{Dataset, ProjectionRequest, TakeBuilder};
use crate::{Error, Result};

/// Get the vector dimension of the given column in the schema.
@@ -107,17 +107,72 @@ pub async fn maybe_sample_training_data(
sample_size_hint: usize,
) -> Result<FixedSizeListArray> {
let num_rows = dataset.count_rows(None).await?;
let batch = if num_rows > sample_size_hint {

let is_nullable = dataset
.schema()
.field(column)
.ok_or(Error::Index {
message: format!(
"Sample training data: column {} does not exist in schema",
column
),
location: location!(),
})?
.nullable;

let batch = if num_rows > sample_size_hint && !is_nullable {
let projection = dataset.schema().project(&[column])?;
let batch = dataset.sample(sample_size_hint, &projection).await?;
info!(
"Sample training data: retrieved {} rows by sampling",
batch.num_rows()
);
batch
} else if num_rows > sample_size_hint && is_nullable {
// Need to filter out null values
// Use a scan to collect row ids. Then sample from the row ids. Then do take.
let row_addrs = dataset
.scan()
.filter_expr(datafusion_expr::col(column).is_not_null())
.with_row_address()
.project::<&str>(&[])?
.try_into_batch()
.await?;
debug_assert_eq!(row_addrs.num_columns(), 1);
debug_assert_eq!(row_addrs["_rowaddr"].logical_null_count(), 0);
let row_addrs = row_addrs
.column(0)
.as_any()
.downcast_ref::<arrow::array::UInt64Array>()
.ok_or(Error::Index {
message: format!(
"Sample training data: column {} is not a UInt64Array",
column
),
location: location!(),
})?;

let batch = TakeBuilder::try_new_from_addresses(
Arc::new(dataset.clone()),
row_addrs.values().to_vec(),
Arc::new(
ProjectionRequest::from_columns([column], dataset.schema())
.into_projection_plan(dataset.schema())?,
),
)?
.execute()
.await?;
info!(
"Sample training data: retrieved {} rows by sampling after filtering out nulls",
batch.num_rows()
);
batch
} else {
let mut scanner = dataset.scan();
scanner.project(&[column])?;
if is_nullable {
scanner.filter_expr(datafusion_expr::col(column).is_not_null());
}
let batch = scanner.try_into_batch().await?;
info!(
"Sample training data: retrieved {} rows scanning full datasets",
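The scan-then-take strategy in the patched `maybe_sample_training_data` (filter for non-null rows, collect their addresses, sample, then take) can be illustrated with a std-only sketch. Here `Option<Vec<f32>>` stands in for an Arrow array with a null buffer, and `sample_non_null` is a hypothetical helper, not the Lance API:

```rust
// Rows are Option<Vec<f32>>; None models a null vector.
fn sample_non_null(rows: &[Option<Vec<f32>>], sample_size: usize) -> Vec<Vec<f32>> {
    // 1. Scan: collect the "row ids" of non-null rows (mirrors the
    //    `is_not_null` filter plus `with_row_address` scan in the patch).
    let non_null_ids: Vec<usize> = rows
        .iter()
        .enumerate()
        .filter_map(|(i, row)| row.as_ref().map(|_| i))
        .collect();
    // 2. Sample from the surviving ids. The real code samples without
    //    replacement; truncation keeps this sketch deterministic.
    let chosen = &non_null_ids[..sample_size.min(non_null_ids.len())];
    // 3. Take: materialize only the selected rows.
    chosen.iter().map(|&i| rows[i].clone().unwrap()).collect()
}

fn main() {
    let rows = vec![Some(vec![1.0]), None, Some(vec![2.0]), None];
    let sampled = sample_non_null(&rows, 2);
    println!("{}", sampled.len()); // 2
}
```

The design choice this mirrors: sampling first and filtering second could return fewer rows than requested (or none), so the patch filters to non-null row addresses before sampling, guaranteeing the training set is both null-free and as large as the data allows.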