diff --git a/CHANGELOG.md b/CHANGELOG.md index d2508a05e..1385fe1fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add a new build mode, `FAISS_OPT_LEVEL=avx512_spr`, which enables the use of advanced AVX-512 instructions introduced with Intel(R) Sapphire Rapids (#2404)[https://github.com/opensearch-project/k-NN/pull/2404] - Add cosine similarity support for faiss engine (#2376)[https://github.com/opensearch-project/k-NN/pull/2376] - Add concurrency optimizations with native memory graph loading and force eviction (#2265) [https://github.com/opensearch-project/k-NN/pull/2345] - +- Add derived source feature for vector fields (#2449)[https://github.com/opensearch-project/k-NN/pull/2449] ### Enhancements - Introduced a writing layer in native engines where relies on the writing interface to process IO. (#2241)[https://github.com/opensearch-project/k-NN/pull/2241] - Allow method parameter override for training based indices (#2290) https://github.com/opensearch-project/k-NN/pull/2290] diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsReader.java b/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsReader.java index 233197ae6..ef9eba126 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsReader.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN9120Codec/DerivedSourceStoredFieldsReader.java @@ -9,6 +9,7 @@ import lombok.Setter; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.index.StoredFieldVisitor; +import org.opensearch.index.fieldvisitor.FieldsVisitor; import org.opensearch.knn.index.codec.derivedsource.DerivedSourceStoredFieldVisitor; import org.opensearch.knn.index.codec.derivedsource.DerivedSourceVectorInjector; @@ -25,7 +26,15 @@ public class DerivedSourceStoredFieldsReader extends StoredFieldsReader { @Override public void document(int docId, StoredFieldVisitor storedFieldVisitor) throws IOException { - if (shouldInject) { + // If the visitor has explicitly indicated it does not need the fields, we should not inject them + boolean isVisitorNeedFields = true; + if (storedFieldVisitor instanceof FieldsVisitor) { + isVisitorNeedFields = derivedSourceVectorInjector.shouldInject( + ((FieldsVisitor) storedFieldVisitor).includes(), + ((FieldsVisitor) storedFieldVisitor).excludes() + ); + } + if (shouldInject && isVisitorNeedFields) { delegate.document(docId, new DerivedSourceStoredFieldVisitor(storedFieldVisitor, docId, derivedSourceVectorInjector)); return; } @@ -47,6 +56,13 @@ public void close() throws IOException { delegate.close(); } + /** + * For merging, we need to tell the derived source stored fields reader to skip injecting the source. Otherwise, + * on merge we will end up just writing the source to disk + * + * @param storedFieldsReader stored fields reader to wrap + * @return wrapped stored fields reader + */ public static StoredFieldsReader wrapForMerge(StoredFieldsReader storedFieldsReader) { if (storedFieldsReader instanceof DerivedSourceStoredFieldsReader) { StoredFieldsReader storedFieldsReaderClone = storedFieldsReader.clone(); diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldVisitor.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldVisitor.java index 41a01c15c..9610eff68 100644 --- a/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldVisitor.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceStoredFieldVisitor.java @@ -25,7 +25,6 @@ public class DerivedSourceStoredFieldVisitor extends StoredFieldVisitor { @Override public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException { - // TODO: Add skip condition here if the delegate specifies which fields are not required for source if (fieldInfo.name.equals(SourceFieldMapper.NAME)) { delegate.binaryField(fieldInfo, derivedSourceVectorInjector.injectVectors(documentId, value)); return; diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceVectorInjector.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceVectorInjector.java index 520b0bbf5..c7789a1cd 100644 --- a/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceVectorInjector.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceVectorInjector.java @@ -20,8 +20,10 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; /** * This class is responsible for injecting vectors into the source of a document. From a high level, it uses alternative @@ -31,6 +33,7 @@ public class DerivedSourceVectorInjector { private final List perFieldDerivedVectorInjectors; + private final Set fieldNames; /** * Constructor for DerivedSourceVectorInjector. @@ -46,10 +49,12 @@ public DerivedSourceVectorInjector( ) throws IOException { DerivedSourceReaders derivedSourceReaders = derivedSourceReadersSupplier.getReaders(segmentReadState); this.perFieldDerivedVectorInjectors = new ArrayList<>(); + this.fieldNames = new HashSet<>(); for (FieldInfo fieldInfo : fieldsToInjectVector) { this.perFieldDerivedVectorInjectors.add( PerFieldDerivedVectorInjectorFactory.create(fieldInfo, derivedSourceReaders, segmentReadState) ); + this.fieldNames.add(fieldInfo.name); } } @@ -84,4 +89,34 @@ public byte[] injectVectors(Integer docId, byte[] sourceAsBytes) throws IOExcept builder.close(); return BytesReference.toBytes(BytesReference.bytes(builder)); } + + /** + * Whether or not to inject vectors based on what fields are explicitly required + * + * @param includes List of fields that are required to be injected + * @param excludes List of fields that are not required to be injected + * @return true if vectors should be injected, false otherwise + */ + public boolean shouldInject(String[] includes, String[] excludes) { + // If any of the vector fields are explicitly required we should inject + if (includes != null) { + for (String includedField : includes) { + if (fieldNames.contains(includedField)) { + return true; + } + } + } + + // If all of the vector fields are explicitly excluded we should not inject + if (excludes != null) { + int excludedVectorFieldCount = 0; + for (String excludedField : excludes) { + if (fieldNames.contains(excludedField)) { + excludedVectorFieldCount++; + } + } + return excludedVectorFieldCount >= fieldNames.size(); + } + return true; + } } diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java index f3c72a27c..6dce3f12a 100644 --- a/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java @@ -5,6 +5,7 @@ package org.opensearch.knn.index.codec.derivedsource; +import lombok.AllArgsConstructor; import lombok.extern.log4j.Log4j2; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.PostingsEnum; @@ -26,22 +27,13 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; @Log4j2 +@AllArgsConstructor public class NestedPerFieldDerivedVectorInjector implements PerFieldDerivedVectorInjector { private final FieldInfo childFieldInfo; private final DerivedSourceReaders derivedSourceReaders; private final SegmentReadState segmentReadState; - public NestedPerFieldDerivedVectorInjector( - FieldInfo childFieldInfo, - DerivedSourceReaders derivedSourceReaders, - SegmentReadState segmentReadState - ) { - this.childFieldInfo = childFieldInfo; - this.derivedSourceReaders = derivedSourceReaders; - this.segmentReadState = segmentReadState; - } - @Override public void inject(Integer parentDocId, Map sourceAsMap) throws IOException { // Setup the iterator. Return if not-relevant diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToDocIdIterator.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToDocIdIterator.java index 6729c9d09..2d1dffa7f 100644 --- a/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToDocIdIterator.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToDocIdIterator.java @@ -68,15 +68,6 @@ public int firstChild() { return previousParentDocId + 1; } - /** - * Get the number of children for this parent. - * - * @return number of children for this parent - */ - public int numChildren() { - return children.size(); - } - /** * Get the next child for this parent *