Skip to content

Commit

Permalink
Have one score definition for cosinesimilarity
Browse files Browse the repository at this point in the history
Currently we have different score calculations for cosine similarity:
for example, script score, approximate search, and exact search each use a different formula
to convert distance into a cosine similarity score that is aligned with the OpenSearch
score. To keep it consistent, we will use one definition, the one used
by Lucene as its standard definition of cosine similarity, for all search types.

Signed-off-by: Vijayan Balasubramanian <[email protected]>
  • Loading branch information
VijayanB committed Dec 27, 2024
1 parent c728f02 commit 37d132a
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
* Allow validation for non knn index only after 2.17.0 (#2315)[https://github.com/opensearch-project/k-NN/pull/2315]
* Release query vector memory after execution (#2346)[https://github.com/opensearch-project/k-NN/pull/2346]
* Fix shard level rescoring disabled setting flag (#2352)[https://github.com/opensearch-project/k-NN/pull/2352]
* Use one formula to calculate cosine similarity (#2357)[https://github.com/opensearch-project/k-NN/pull/2357]
### Infrastructure
* Updated C++ version in JNI from c++11 to c++17 [#2259](https://github.com/opensearch-project/k-NN/pull/2259)
* Upgrade bytebuddy and objenesis version to match OpenSearch core and, update github ci runner for macos [#2279](https://github.com/opensearch-project/k-NN/pull/2279)
Expand Down
5 changes: 4 additions & 1 deletion src/main/java/org/opensearch/knn/index/SpaceType.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@ public float scoreToDistanceTranslation(float score) {
COSINESIMIL("cosinesimil") {
/**
 * Converts a raw cosine-space score into the final search score.
 *
 * The conversion mirrors the cosine scoring used by Lucene, see
 * https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73
 *
 * @param rawScore raw score, where raw score = 1 - cosine(x, y)
 * @return (2 - rawScore) / 2, floored at 0
 */
@Override
public float scoreTranslation(float rawScore) {
    // Lucene scores cosine as max((1 + cosine) / 2, 0); substituting cosine = 1 - rawScore
    // turns the numerator into 2 - rawScore.
    final float luceneStyleScore = (2.0F - rawScore) / 2.0F;
    return Math.max(luceneStyleScore, 0.0F);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,12 @@ public CosineSimilarity(Object query, MappedFieldType fieldType) {
/**
 * Builds the scoring function applied to a (query, document) vector pair for the cosine space.
 *
 * The query is validated against {@code SpaceType.COSINESIMIL} and its squared magnitude is
 * computed once up front so that it can be reused for every scored document.
 *
 * @param processedQuery query vector to validate and precompute the squared magnitude for
 * @return function yielding max((1 + s) / 2, 0), where s is the value returned by
 *         {@code KNNScoringUtil.cosinesimilOptimized}
 */
protected BiFunction<float[], float[], Float> getScoringMethod(final float[] processedQuery) {
    SpaceType.COSINESIMIL.validateVector(processedQuery);
    final float queryMagnitudeSquared = getVectorMagnitudeSquared(processedQuery);
    // Same formula as Lucene's cosine scoring, so every search type reports the same score:
    // https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73
    return (float[] q, float[] v) -> {
        final float similarity = KNNScoringUtil.cosinesimilOptimized(q, v, queryMagnitudeSquared);
        return Math.max((1.0F + similarity) / 2.0F, 0.0F);
    };
}
}

Expand Down
58 changes: 58 additions & 0 deletions src/test/java/org/opensearch/knn/index/NmslibIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,64 @@ public void testEndToEnd() throws Exception {
fail("Graphs are not getting evicted");
}

/**
 * Indexes the same vector twice into one NMSLIB HNSW index that uses the cosine space type,
 * changing the approximate-threshold index setting to -1 between the two writes, and then
 * checks that a k-NN query returns both documents with (nearly) the same score.
 */
public void testEndToEnd_withApproxAndExactSearch_inSameIndex_ForCosineSpaceType() throws Exception {
    final String index = "test-index-1";
    final String field = "test-field-1";
    final SpaceType cosineSpace = SpaceType.COSINESIMIL;
    final float[] sourceVector = testData.indexData.vectors[0];
    final Integer vectorDimension = sourceVector.length;

    // Mapping with a single knn_vector field: HNSW method, NMSLIB engine, cosine space type
    final XContentBuilder mappingBuilder = XContentFactory.jsonBuilder()
        .startObject()
        .startObject("properties")
        .startObject(field)
        .field("type", "knn_vector")
        .field("dimension", vectorDimension)
        .field(KNNConstants.METHOD_PARAMETER_SPACE_TYPE, cosineSpace.getValue())
        .startObject(KNNConstants.KNN_METHOD)
        .field(KNNConstants.NAME, KNNConstants.METHOD_HNSW)
        .field(KNNConstants.KNN_ENGINE, KNNEngine.NMSLIB.getName())
        .endObject()
        .endObject()
        .endObject()
        .endObject();

    Map<String, Object> mappingMap = xContentBuilderToMap(mappingBuilder);
    final String mappingJson = mappingBuilder.toString();

    createKnnIndex(index, buildKNNIndexSettings(0), mappingJson);

    // First copy of the vector, written before the threshold setting is changed
    addKnnDoc(index, randomAlphaOfLength(5), field, Floats.asList(sourceVector).toArray());

    // Make the document visible and confirm the count
    refreshAllIndices();
    assertEquals(1, getDocCount(index));

    // Set the approximate threshold to -1 so that graph building is skipped from here on
    updateIndexSettings(index, Settings.builder().put(KNNSettings.INDEX_KNN_ADVANCED_APPROXIMATE_THRESHOLD, -1));

    // Second copy of the same vector under a different random id
    addKnnDoc(index, randomAlphaOfLength(5), field, Floats.asList(sourceVector).toArray());
    assertEquals(2, getDocCount(index));

    // Query for both documents
    final int k = 2;
    final Response searchResponse = searchKNNIndex(
        index,
        KNNQueryBuilder.builder().fieldName(field).vector(testData.queries[0]).k(k).build(),
        k
    );
    final String body = EntityUtils.toString(searchResponse.getEntity());
    final List<KNNResult> hits = parseSearchResponse(body, field);
    assertEquals(k, hits.size());

    final List<Float> scores = parseSearchResponseScore(body, field);

    // Both documents hold the same vector, so their scores should be identical
    assertEquals(scores.get(0), scores.get(1), 0.001);

    // Clean up
    deleteKNNIndex(index);
}

@SneakyThrows
private void validateSearch(
final String indexName,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ public void testCosineSimilarity_whenValid_thenSucceed() {
getMappingConfigForMethodMapping(knnMethodContext, 3)
);
KNNScoringSpace.CosineSimilarity cosineSimilarity = new KNNScoringSpace.CosineSimilarity(arrayListQueryObject, fieldType);
assertEquals(2F, cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat), 0.1F);
assertEquals(1F, cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat), 0.1F);

// invalid zero vector
final List<Float> queryZeroVector = List.of(0.0f, 0.0f, 0.0f);
Expand Down

0 comments on commit 37d132a

Please sign in to comment.