diff --git a/CHANGELOG.md b/CHANGELOG.md
index b92edd850..4e54cee1c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.11...2.x)
 ### Features
+Adding z-score normalization for hybrid query [#470](https://github.com/opensearch-project/neural-search/pull/470)
 ### Enhancements
 ### Bug Fixes
 ### Infrastructure
diff --git a/build.gradle b/build.gradle
index 335157549..f654ada5f 100644
--- a/build.gradle
+++ b/build.gradle
@@ -19,7 +19,7 @@ apply plugin: "com.diffplug.spotless"
 apply plugin: 'io.freefair.lombok'
 
 def pluginName = 'opensearch-neural-search'
-def pluginDescription = 'A plugin that adds dense neural retrieval into the OpenSearch ecosytem'
+def pluginDescription = 'A plugin that adds dense neural retrieval into the OpenSearch ecosystem'
 def projectPath = 'org.opensearch'
 def pathToPlugin = 'neuralsearch.plugin'
 def pluginClassName = 'NeuralSearch'
@@ -219,6 +219,12 @@ integTest {
     if (System.getProperty("test.debug") != null) {
         jvmArgs '-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005'
     }
+
+    systemProperty 'log4j2.configurationFile', "${projectDir}/src/test/resources/log4j2-test.xml"
+
+    // Set this to true if you want to see the logs in the terminal test output.
+    // note: if left false the log output will still show in your IDE
+    testLogging.showStandardStreams = true
 }
 
 testClusters.integTest {
diff --git a/scripts/Dockerfile b/scripts/Dockerfile
new file mode 100644
index 000000000..985796015
--- /dev/null
+++ b/scripts/Dockerfile
@@ -0,0 +1,9 @@
+# Build with the following command
+# docker build --tag="opensearch-zscore-test:2.11.0" .
+FROM opensearchproject/opensearch:2.11.0
+# Remove previous neural search plugin and install new one
+RUN /usr/share/opensearch/bin/opensearch-plugin remove opensearch-neural-search
+# Make sure the build is present; to create it run `./gradlew clean assemble -Dopensearch.version="2.11.0"`
+# Then copy it locally `cp ../build/distributions/opensearch-neural-search-2.11.0.0-SNAPSHOT.zip .`
+COPY opensearch-neural-search-2.11.0.0-SNAPSHOT.zip /usr/share/opensearch/
+RUN /usr/share/opensearch/bin/opensearch-plugin install -b file:/usr/share/opensearch/opensearch-neural-search-2.11.0.0-SNAPSHOT.zip
diff --git a/scripts/cleanup-previous-docker.sh b/scripts/cleanup-previous-docker.sh
new file mode 100755
index 000000000..48d797f31
--- /dev/null
+++ b/scripts/cleanup-previous-docker.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#### Procedure to clean up all previous experiments with docker compose ####
+
+# Stop the container(s) using the following command:
+docker-compose down
+
+# Delete all containers using the following command:
+docker rm -f $(docker ps -a -q)
+
+# Delete all volumes using the following command:
+docker volume rm $(docker volume ls -q)
+
+# Restart the containers using the following command:
+docker-compose up
+
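Putting the comments in scripts/Dockerfile and the compose file together, the intended end-to-end flow for standing up a test cluster appears to be the following. This is a sketch assembled from those comments; the copy destination and the `-d` flag are assumptions, not a script shipped with this change:

```bash
# Build the plugin distribution against the target OpenSearch version
./gradlew clean assemble -Dopensearch.version="2.11.0"

# Stage the zip next to the Dockerfile and build the custom image
cp build/distributions/opensearch-neural-search-2.11.0.0-SNAPSHOT.zip scripts/
cd scripts
docker build --tag="opensearch-zscore-test:2.11.0" .

# Start the single-node cluster plus dashboards; security is disabled, so plain HTTP works
docker-compose up -d
curl localhost:9200
```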
diff --git a/scripts/docker-compose.yml b/scripts/docker-compose.yml
new file mode 100644
index 000000000..2b0d0c14c
--- /dev/null
+++ b/scripts/docker-compose.yml
@@ -0,0 +1,48 @@
+version: '3'
+services:
+  opensearch-node1: # This is also the hostname of the container within the Docker network (i.e. https://opensearch-node1/)
+    #image: opensearchproject/opensearch:2.11.0
+    image: opensearch-zscore-test:2.11.0
+    container_name: opensearch-node1
+    environment:
+      - cluster.name=opensearch-cluster # Name the cluster
+      - node.name=opensearch-node1 # Name the node that will run in this container
+      - discovery.seed_hosts=opensearch-node1 # Nodes to look for when discovering the cluster
+      - cluster.initial_cluster_manager_nodes=opensearch-node1 # Nodes eligible to serve as cluster manager
+      - bootstrap.memory_lock=true # Disable JVM heap memory swapping
+      - plugins.security.disabled=true # Disable security plugin so it's easy to test
+      - "DISABLE_INSTALL_DEMO_CONFIG=true" # Prevent installation of the demo security configuration
+      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # Set min and max JVM heap sizes to at least 50% of system RAM
+    ulimits:
+      memlock:
+        soft: -1 # Set memlock to unlimited (no soft or hard limit)
+        hard: -1
+      nofile:
+        soft: 65536 # Maximum number of open files for the opensearch user - set to at least 65536
+        hard: 65536
+    volumes:
+      - opensearch-data1:/usr/share/opensearch/data # Creates volume called opensearch-data1 and mounts it to the container
+    ports:
+      - 9200:9200 # REST API
+      - 9600:9600 # Performance Analyzer
+    networks:
+      - opensearch-net # All of the containers will join the same Docker bridge network
+  opensearch-dashboards:
+    image: opensearchproject/opensearch-dashboards:2.11.0 # Make sure the version of opensearch-dashboards matches the version of opensearch installed on other nodes
+    container_name: opensearch-dashboards
+    ports:
+      - 5601:5601 # Map host port 5601 to container port 5601
+    expose:
+      - "5601" # Expose port 5601 for web access to OpenSearch Dashboards
+    environment:
+      - 'OPENSEARCH_HOSTS=["http://opensearch-node1:9200"]' # Define the OpenSearch nodes that OpenSearch Dashboards will query (Plain HTTP, security disabled)
+      - "DISABLE_SECURITY_DASHBOARDS_PLUGIN=true" # disables security dashboards plugin in OpenSearch Dashboards
+    networks:
+      - opensearch-net
+
+volumes:
+  opensearch-data1:
+
+networks:
+  opensearch-net:
+
diff --git a/scripts/model_setup_beir.md b/scripts/model_setup_beir.md
new file mode 100644
index 000000000..2e26890a8
--- /dev/null
+++ b/scripts/model_setup_beir.md
@@ -0,0 +1,462 @@
+```
+# First create the model group to use
+PUT /_cluster/settings
+{
+  "persistent" : {
+    "plugins.ml_commons.model_access_control_enabled" : true
+  }
+}
+
+PUT /_cluster/settings
+{
+  "persistent" : {
+    "plugins.ml_commons.only_run_on_ml_node" : false
+  }
+}
+
+PUT /_cluster/settings
+{
+  "persistent" : {
+    "plugins.ml_commons.allow_registering_model_via_url" : true
+  }
+}
+
+PUT /_cluster/settings
+{
+  "persistent" : {
+    "plugins.ml_commons.allow_registering_model_via_local_file" : true
+  }
+}
+
+# Or everything at the same time
+PUT /_cluster/settings
+{
+  "persistent" : {
+    "plugins.ml_commons.model_access_control_enabled" : true,
+    "plugins.ml_commons.only_run_on_ml_node" : false,
+    "plugins.ml_commons.allow_registering_model_via_url" : true,
+    "plugins.ml_commons.allow_registering_model_via_local_file" : true
+  }
+}
+
+POST /_plugins/_ml/model_groups/_register
+{
+  "name": "test_model_group_public",
+  "description": "This is a public model group",
+  "model_access_mode": "public"
+}
+
+# returned model_group_id = eRS4y4sBRT_C2Oekj20-
+
+# register model
+POST /_plugins/_ml/models/_register
+{
+  "name": "huggingface/sentence-transformers/all-MiniLM-L12-v2",
+  "version": "1.0.1",
+  "model_format": "TORCH_SCRIPT",
+  "model_group_id": "eRS4y4sBRT_C2Oekj20-"
+}
"eRS4y4sBRT_C2Oekj20-" +} +# returned task_id=exS6y4sBRT_C2OekYW2T + +GET /_plugins/_ml/tasks/exS6y4sBRT_C2OekYW2T +# returned model_id=fBS6y4sBRT_C2Oeka22E + +POST /_plugins/_ml/models/fBS6y4sBRT_C2Oeka22E/_load +# returned task_id=fRS7y4sBRT_C2Oekp20I + +GET /_plugins/_ml/tasks/fRS7y4sBRT_C2Oekp20I + +PUT _ingest/pipeline/nlp-pipeline +{ + "description": "An example neural search pipeline", + "processors" : [ + { + "text_embedding": { + "model_id": "fBS6y4sBRT_C2Oeka22E", + "field_map": { + "passage_text": "passage_embedding" + } + } + } + ] +} + +DELETE /scifact + +PUT /scifact +{ + "settings": { + "index.knn": true, + "default_pipeline": "nlp-pipeline" + }, + "mappings": { + "properties": { + "passage_embedding": { + "type": "knn_vector", + "dimension": 384, + "method": { + "name":"hnsw", + "engine":"lucene", + "space_type": "l2", + "parameters":{ + "m":16, + "ef_construction": 512 + } + } + }, + "passage_text": { + "type": "text" + }, + "passage_key": { + "type": "text" + }, + "passage_title": { + "type": "text" + } + } + } +} + +PUT /_search/pipeline/norm-minmax-pipeline +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean", + "parameters": { + "weights": [ + 1.0 + ] + } + } + } + } + ] +} + +PUT /_search/pipeline/norm-minmax-pipeline-hybrid +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean", + "parameters": { + "weights": [ + 0.4, + 0.3, + 0.3 + ] + } + } + } + } + ] +} + +GET scifact/_search?search_pipeline=norm-minmax-pipeline +``` + +Can also be created outside the console via curl +```bash +PORT=50365 +HOST=localhost +URL="$HOST:$PORT" + +curl -XPUT -H "Content-Type: application/json" $URL/_ingest/pipeline/nlp-pipeline -d ' +{ + "description": "An example neural search pipeline", + "processors" : [ + { + "text_embedding": { + "model_id": "AXA30IsByAqY8FkWHdIF", + "field_map": { + "passage_text": "passage_embedding" + } + } + } + ] +}' + +curl -XDELETE $URL/$INDEX + +curl -XPUT -H "Content-Type: application/json" $URL/scifact -d ' +{ + "settings": { + "index.knn": true, + "default_pipeline": "nlp-pipeline" + }, + "mappings": { + "properties": { + "passage_embedding": { + "type": "knn_vector", + "dimension": 384, + "method": { + "name":"hnsw", + "engine":"lucene", + "space_type": "l2", + "parameters":{ + "m":16, + "ef_construction": 512 + } + } + }, + "passage_text": { + "type": "text" + }, + "passage_key": { + "type": "text" + }, + "passage_title": { + "type": "text" + } + } + } +}' + +curl -XPUT -H "Content-Type: application/json" $URL/_search/pipeline/norm-minmax-pipeline-hybrid -d ' +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean", + "parameters": { + "weights": [ + 0.4, + 0.3, + 0.3 + ] + } + } + } + } + ] +}' + +curl -XPUT -H "Content-Type: application/json" $URL/_search/pipeline/norm-zscore-pipeline-hybrid -d ' +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "z_score" + }, + "combination": { + "technique": "arithmetic_mean", + "parameters": { + 
"weights": [ + 0.4, + 0.3, + 0.3 + ] + } + } + } + } + ] +}' + +``` + +To use later with +```bash +PORT=50365 +MODEL_ID="AXA30IsByAqY8FkWHdIF" +pipenv run python test_opensearch.py --dataset=scifact --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT --os_index="scifact" --operation=ingest +pipenv run python test_opensearch.py --dataset=scifact --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT --os_index="scifact" --operation=evaluate --method=bm25 +pipenv run python test_opensearch.py --dataset=scifact --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT --os_index="scifact" --operation=evaluate --method=neural --pipelines=norm-minmax-pipeline --os_model_id=$MODEL_ID +pipenv run python test_opensearch.py --dataset=scifact --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT --os_index="scifact" --operation=evaluate --method=hybrid --pipelines=norm-minmax-pipeline-hybrid --os_model_id=$MODEL_ID +pipenv run python test_opensearch.py --dataset=scifact --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT --os_index="scifact" --operation=evaluate --method=hybrid --pipelines=norm-zscore-pipeline-hybrid --os_model_id=$MODEL_ID +``` + +To follow the Amazon approach +# Via the opensearch dashboard console +``` +PUT /_cluster/settings +{ + "persistent" : { + "plugins.ml_commons.model_access_control_enabled" : true, + "plugins.ml_commons.only_run_on_ml_node" : false, + "plugins.ml_commons.allow_registering_model_via_url" : true, + "plugins.ml_commons.allow_registering_model_via_local_file" : true + } +} + +POST /_plugins/_ml/model_groups/_register +{ + "name": "test_model_group_public", + "description": "This is a public model group", + "model_access_mode": "public" +} + +# returned model_group_id = eRS4y4sBRT_C2Oekj20- +# register model +POST /_plugins/_ml/models/_register +{ + "name": "huggingface/sentence-transformers/msmarco-distilbert-base-tas-b", + "version": "1.0.1", + "model_format": "TORCH_SCRIPT", + "model_group_id": "eRS4y4sBRT_C2Oekj20-" +} +# returned task_id=exS6y4sBRT_C2OekYW2T + +GET /_plugins/_ml/tasks/exS6y4sBRT_C2OekYW2T +# returned model_id=fBS6y4sBRT_C2Oeka22E + +POST /_plugins/_ml/models/fBS6y4sBRT_C2Oeka22E/_load +# returned task_id=fRS7y4sBRT_C2Oekp20I + +GET /_plugins/_ml/tasks/fRS7y4sBRT_C2Oekp20I +# Wait until successful +``` + +# via shell +```bash +PORT=9200 +HOST=localhost +URL="$HOST:$PORT" +INDEX="quora" +DATASET="quora" +MODEL_ID="dLrx6IsB4n5WT8oPiuAq" + +curl -XPUT -H "Content-Type: application/json" $URL/_ingest/pipeline/nlp-pipeline -d ' +{ + "description": "An example neural search pipeline", + "processors" : [ + { + "text_embedding": { + "model_id": "dLrx6IsB4n5WT8oPiuAq", + "field_map": { + "passage_text": "passage_embedding" + } + } + } + ] +}' + +curl -XDELETE $URL/$INDEX + +curl -XPUT -H "Content-Type: application/json" $URL/$INDEX -d ' +{ + "settings": { + "index.knn": true, + "default_pipeline": "nlp-pipeline", + "number_of_shards": 4 + }, + "mappings": { + "properties": { + "passage_embedding": { + "type": "knn_vector", + "dimension": 768, + "method": { + "name": "hnsw", + "engine": "nmslib", + "space_type": "innerproduct", + "parameters":{} + } + }, + "passage_text": { + "type": "text" + }, + 
"title_key": { + "type": "text", "analyzer" : "english" + }, + "text_key": { + "type": "text", "analyzer" : "english" + } + } + } +}' + +curl -XPUT -H "Content-Type: application/json" $URL/_search/pipeline/norm-minmax-pipeline-hybrid -d ' +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean" + } + } + } + ] +}' + +curl -XPUT -H "Content-Type: application/json" $URL/_search/pipeline/norm-ltwo-pipeline-hybrid -d ' +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "l2" + }, + "combination": { + "technique": "arithmetic_mean" + } + } + } + ] +}' + +curl -XPUT -H "Content-Type: application/json" $URL/_search/pipeline/norm-zscore-pipeline-hybrid -d ' +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "z_score" + }, + "combination": { + "technique": "arithmetic_mean" + } + } + } + ] +}' + +curl -XPUT -H "Content-Type: application/json" $URL/_search/pipeline/norm-zscore--with-negatives-pipeline-hybrid -d ' +{ + "description": "Post processor for hybrid search", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "z_score" + }, + "combination": { + "technique": "arithmetic_mean_with_negatives_support" + } + } + } + ] +}' + + +pipenv run python test_opensearch.py --dataset=$DATASET --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT --os_index=$INDEX --operation=ingest +pipenv run python test_opensearch.py --dataset=$DATASET --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT --os_index=$INDEX --operation=evaluate --method=bm25 > /tmp/bm25-results.log +pipenv run python test_opensearch.py --dataset=$DATASET --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT --os_index=$INDEX --operation=evaluate --method=neural --pipelines=norm-minmax-pipeline-hybrid --os_model_id=$MODEL_ID > /tmp/neural-results.log +pipenv run python test_opensearch.py --dataset=$DATASET --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT --os_index=$INDEX --operation=evaluate --method=hybrid --pipelines=norm-minmax-pipeline-hybrid --os_model_id=$MODEL_ID > /tmp/min-max-results.log +pipenv run python test_opensearch.py --dataset=$DATASET --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT --os_index=$INDEX --operation=evaluate --method=hybrid --pipelines=norm-ltwo-pipeline-hybrid --os_model_id=$MODEL_ID > /tmp/l2-results.log +pipenv run python test_opensearch.py --dataset=$DATASET --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT --os_index=$INDEX --operation=evaluate --method=hybrid --pipelines=norm-zscore-pipeline-hybrid --os_model_id=$MODEL_ID > /tmp/zscore-results.log +pipenv run python test_opensearch.py --dataset=$DATASET --dataset_url="https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip" --os_host=localhost --os_port=$PORT 
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/combination/ArithmeticMeanScoreCombinationWithNegativeSupportTechnique.java b/src/main/java/org/opensearch/neuralsearch/processor/combination/ArithmeticMeanScoreCombinationWithNegativeSupportTechnique.java
new file mode 100644
index 000000000..07ab5151c
--- /dev/null
+++ b/src/main/java/org/opensearch/neuralsearch/processor/combination/ArithmeticMeanScoreCombinationWithNegativeSupportTechnique.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.neuralsearch.processor.combination;
+
+import lombok.ToString;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Abstracts combination of scores based on the arithmetic mean method, without discarding negative scores
+ */
+@ToString(onlyExplicitlyIncluded = true)
+public class ArithmeticMeanScoreCombinationWithNegativeSupportTechnique implements ScoreCombinationTechnique {
+    @ToString.Include
+    public static final String TECHNIQUE_NAME = "arithmetic_mean_with_negatives_support";
+    public static final String PARAM_NAME_WEIGHTS = "weights";
+    private static final Set<String> SUPPORTED_PARAMS = Set.of(PARAM_NAME_WEIGHTS);
+    private static final Float ZERO_SCORE = 0.0f;
+    private final List<Float> weights;
+    private final ScoreCombinationUtil scoreCombinationUtil;
+
+    public ArithmeticMeanScoreCombinationWithNegativeSupportTechnique(
+        final Map<String, Object> params,
+        final ScoreCombinationUtil combinationUtil
+    ) {
+        scoreCombinationUtil = combinationUtil;
+        scoreCombinationUtil.validateParams(params, SUPPORTED_PARAMS);
+        weights = scoreCombinationUtil.getWeights(params);
+    }
+
+    /**
+     * Arithmetic mean method for combining scores.
+     * score = (weight1*score1 + weight2*score2 + ... + weightN*scoreN)/(weight1 + weight2 + ... + weightN)
+     *
+     * Unlike the existing arithmetic mean technique, scores are not filtered: negative scores,
+     * which z-score normalization can produce, contribute to the weighted sum as well
+     */
+    @Override
+    public float combine(final float[] scores) {
+        scoreCombinationUtil.validateIfWeightsMatchScores(scores, weights);
+        float combinedScore = 0.0f;
+        float sumOfWeights = 0;
+        for (int indexOfSubQuery = 0; indexOfSubQuery < scores.length; indexOfSubQuery++) {
+            float score = scores[indexOfSubQuery];
+            float weight = scoreCombinationUtil.getWeightForSubQuery(weights, indexOfSubQuery);
+            score = score * weight;
+            combinedScore += score;
+            sumOfWeights += weight;
+        }
+        if (sumOfWeights == 0.0f) {
+            return ZERO_SCORE;
+        }
+        return combinedScore / sumOfWeights;
+    }
+}
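The technique's name says what changed: unlike the pre-existing `arithmetic_mean`, negative inputs stay in the weighted sum, which matters because z-score normalization routinely produces them. A worked example with hypothetical normalized sub-query scores (1.0, -1.0, 0.98) and the [0.4, 0.3, 0.3] weights used in the BEIR pipelines above:

```latex
\[
\text{combined} = \frac{0.4 \cdot 1.0 + 0.3 \cdot (-1.0) + 0.3 \cdot 0.98}{0.4 + 0.3 + 0.3} = 0.394
\]
```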
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/combination/ScoreCombinationFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/combination/ScoreCombinationFactory.java
index f05d24823..217d69573 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/combination/ScoreCombinationFactory.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/combination/ScoreCombinationFactory.java
@@ -26,7 +26,9 @@ public class ScoreCombinationFactory {
         HarmonicMeanScoreCombinationTechnique.TECHNIQUE_NAME,
         params -> new HarmonicMeanScoreCombinationTechnique(params, scoreCombinationUtil),
         GeometricMeanScoreCombinationTechnique.TECHNIQUE_NAME,
-        params -> new GeometricMeanScoreCombinationTechnique(params, scoreCombinationUtil)
+        params -> new GeometricMeanScoreCombinationTechnique(params, scoreCombinationUtil),
+        ArithmeticMeanScoreCombinationWithNegativeSupportTechnique.TECHNIQUE_NAME,
+        params -> new ArithmeticMeanScoreCombinationWithNegativeSupportTechnique(params, scoreCombinationUtil)
     );
 
     /**
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/combination/ScoreCombiner.java b/src/main/java/org/opensearch/neuralsearch/processor/combination/ScoreCombiner.java
index 0293efae6..5b9cd4378 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/combination/ScoreCombiner.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/combination/ScoreCombiner.java
@@ -26,8 +26,6 @@
 @Log4j2
 public class ScoreCombiner {
 
-    private static final Float ZERO_SCORE = 0.0f;
-
     /**
      * Performs score combination based on input combination technique. Mutates input object by updating combined scores
     * Main steps we're doing for combination:
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/normalization/ScoreNormalizationFactory.java b/src/main/java/org/opensearch/neuralsearch/processor/normalization/ScoreNormalizationFactory.java
index 667c237c7..c42df96fe 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/normalization/ScoreNormalizationFactory.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/normalization/ScoreNormalizationFactory.java
@@ -19,7 +19,9 @@ public class ScoreNormalizationFactory {
         MinMaxScoreNormalizationTechnique.TECHNIQUE_NAME,
         new MinMaxScoreNormalizationTechnique(),
         L2ScoreNormalizationTechnique.TECHNIQUE_NAME,
-        new L2ScoreNormalizationTechnique()
+        new L2ScoreNormalizationTechnique(),
+        ZScoreNormalizationTechnique.TECHNIQUE_NAME,
+        new ZScoreNormalizationTechnique()
     );
 
     /**
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/normalization/ZScoreNormalizationTechnique.java b/src/main/java/org/opensearch/neuralsearch/processor/normalization/ZScoreNormalizationTechnique.java
new file mode 100644
index 000000000..fc97e8a4b
--- /dev/null
+++ b/src/main/java/org/opensearch/neuralsearch/processor/normalization/ZScoreNormalizationTechnique.java
@@ -0,0 +1,168 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.neuralsearch.processor.normalization;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
+
+import lombok.ToString;
+
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.opensearch.neuralsearch.processor.CompoundTopDocs;
+
+import com.google.common.primitives.Floats;
+
+/**
+ * Implementation of the z-score normalization technique for hybrid query.
+ * This is currently modeled on the existing normalization techniques {@link L2ScoreNormalizationTechnique} and {@link MinMaxScoreNormalizationTechnique}.
+ * However, this class, like the originals, requires significant work to improve style and ease of use; see the TODO items below
+ */
+/*
+TODO: Some todo items that apply here but also to the original normalization techniques on which this is modeled, {@link L2ScoreNormalizationTechnique} and {@link MinMaxScoreNormalizationTechnique}
+1. Random access into an abstract List is bad practice, both stylistically and from a performance perspective, and should be removed
+2. Identical sub-queries and their distribution between shards are currently completely implicit, based on ordering, and should be made explicit, based on an identifier
+3. numOfSubQueries is calculated implicitly instead of having a more explicit upstream indicator/metadata for it
+ */
+@ToString(onlyExplicitlyIncluded = true)
+public class ZScoreNormalizationTechnique implements ScoreNormalizationTechnique {
+    @ToString.Include
+    public static final String TECHNIQUE_NAME = "z_score";
+    private static final float SINGLE_RESULT_SCORE = 1.0f;
+
+    @Override
+    public void normalize(final List<CompoundTopDocs> queryTopDocs) {
+        /*
+        TODO: There is an implicit assumption in this calculation that probably needs to be made clearer by passing some metadata with the results.
+        Currently we assume that any single non-empty shard result contains an entry for every sub-query, with 0 hits where a sub-query matched nothing.
+         */
+        final Optional<CompoundTopDocs> maybeCompoundTopDocs = queryTopDocs.stream()
+            .filter(Objects::nonNull)
+            .filter(topDocs -> topDocs.getTopDocs().size() > 0)
+            .findAny();
+
+        final int numOfSubQueries = maybeCompoundTopDocs.map(compoundTopDocs -> compoundTopDocs.getTopDocs().size()).orElse(0);
+
+        // to be done for each subquery
+        float[] sumPerSubquery = findScoreSumPerSubQuery(queryTopDocs, numOfSubQueries);
+        long[] elementsPerSubquery = findNumberOfElementsPerSubQuery(queryTopDocs, numOfSubQueries);
+        float[] meanPerSubQuery = findMeanPerSubquery(sumPerSubquery, elementsPerSubquery);
+        float[] stdPerSubquery = findStdPerSubquery(queryTopDocs, meanPerSubQuery, elementsPerSubquery, numOfSubQueries);
+
+        // do normalization using actual score and z-scores for corresponding sub query
+        for (CompoundTopDocs compoundQueryTopDocs : queryTopDocs) {
+            if (Objects.isNull(compoundQueryTopDocs)) {
+                continue;
+            }
+            List<TopDocs> topDocsPerSubQuery = compoundQueryTopDocs.getTopDocs();
+            for (int j = 0; j < topDocsPerSubQuery.size(); j++) {
+                TopDocs subQueryTopDoc = topDocsPerSubQuery.get(j);
+                for (ScoreDoc scoreDoc : subQueryTopDoc.scoreDocs) {
+                    scoreDoc.score = normalizeSingleScore(scoreDoc.score, stdPerSubquery[j], meanPerSubQuery[j]);
+                }
+            }
+        }
+    }
+
+    private static float[] findScoreSumPerSubQuery(final List<CompoundTopDocs> queryTopDocs, final int numOfScores) {
+        final float[] sumOfScorePerSubQuery = new float[numOfScores];
+        Arrays.fill(sumOfScorePerSubQuery, 0);
+        // TODO: make this syntactically clearer regarding performance by avoiding List.get(j) with an abstract List type
+        for (CompoundTopDocs compoundQueryTopDocs : queryTopDocs) {
+            if (Objects.isNull(compoundQueryTopDocs)) {
+                continue;
+            }
+            List<TopDocs> topDocsPerSubQuery = compoundQueryTopDocs.getTopDocs();
+            for (int j = 0; j < topDocsPerSubQuery.size(); j++) {
+                sumOfScorePerSubQuery[j] += sumScoreDocsArray(topDocsPerSubQuery.get(j).scoreDocs);
+            }
+        }
+
+        return sumOfScorePerSubQuery;
+    }
+
+    private static long[] findNumberOfElementsPerSubQuery(final List<CompoundTopDocs> queryTopDocs, final int numOfScores) {
+        final long[] numberOfElementsPerSubQuery = new long[numOfScores];
+        Arrays.fill(numberOfElementsPerSubQuery, 0);
+        // TODO: make this syntactically clearer regarding performance by avoiding List.get(j) with an abstract List type
+        for (CompoundTopDocs compoundQueryTopDocs : queryTopDocs) {
+            if (Objects.isNull(compoundQueryTopDocs)) {
+                continue;
+            }
+            List<TopDocs> topDocsPerSubQuery = compoundQueryTopDocs.getTopDocs();
+            for (int j = 0; j < topDocsPerSubQuery.size(); j++) {
+                numberOfElementsPerSubQuery[j] += topDocsPerSubQuery.get(j).totalHits.value;
+            }
+        }
+
+        return numberOfElementsPerSubQuery;
+    }
+
+    private static float[] findMeanPerSubquery(final float[] sumPerSubquery, final long[] elementsPerSubquery) {
+        final float[] meanPerSubQuery = new float[elementsPerSubquery.length];
+        for (int i = 0; i < elementsPerSubquery.length; i++) {
+            if (elementsPerSubquery[i] == 0) {
+                meanPerSubQuery[i] = 0;
+            } else {
+                meanPerSubQuery[i] = sumPerSubquery[i] / elementsPerSubquery[i];
+            }
+        }
+
+        return meanPerSubQuery;
+    }
+
+    private static float[] findStdPerSubquery(
+        final List<CompoundTopDocs> queryTopDocs,
+        final float[] meanPerSubQuery,
+        final long[] elementsPerSubquery,
+        final int numOfScores
+    ) {
+        final double[] deltaSumPerSubquery = new double[numOfScores];
+        Arrays.fill(deltaSumPerSubquery, 0);
+        // TODO: make this syntactically clearer regarding performance by avoiding List.get(j) with an abstract List type
+        for (CompoundTopDocs compoundQueryTopDocs : queryTopDocs) {
+            if (Objects.isNull(compoundQueryTopDocs)) {
+                continue;
+            }
+            List<TopDocs> topDocsPerSubQuery = compoundQueryTopDocs.getTopDocs();
+            for (int j = 0; j < topDocsPerSubQuery.size(); j++) {
+                for (ScoreDoc scoreDoc : topDocsPerSubQuery.get(j).scoreDocs) {
+                    deltaSumPerSubquery[j] += Math.pow(scoreDoc.score - meanPerSubQuery[j], 2);
+                }
+            }
+        }
+
+        final float[] stdPerSubQuery = new float[numOfScores];
+        for (int i = 0; i < deltaSumPerSubquery.length; i++) {
+            if (elementsPerSubquery[i] == 0) {
+                stdPerSubQuery[i] = 0;
+            } else {
+                stdPerSubQuery[i] = (float) Math.sqrt(deltaSumPerSubquery[i] / elementsPerSubquery[i]);
+            }
+        }
+
+        return stdPerSubQuery;
+    }
+
+    private static float sumScoreDocsArray(final ScoreDoc[] scoreDocs) {
+        float sum = 0;
+        for (ScoreDoc scoreDoc : scoreDocs) {
+            sum += scoreDoc.score;
+        }
+
+        return sum;
+    }
+
+    private static float normalizeSingleScore(final float score, final float standardDeviation, final float mean) {
+        // edge case when there is a single result or all scores are identical: the standard deviation is zero, so return a fixed score
+        if (Floats.compare(mean, score) == 0) {
+            return SINGLE_RESULT_SCORE;
+        }
+        return (score - mean) / standardDeviation;
+    }
+}
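The normalization implemented above is the population z-score. Working the formula through the three-score distribution used in the unit tests later in this change reproduces the expected values asserted there:

```latex
\[
z_i = \frac{s_i - \mu}{\sigma}, \qquad
\mu = \frac{0.9 + 0.7 + 0.1}{3} \approx 0.5667, \qquad
\sigma = \sqrt{\frac{(0.9-\mu)^2 + (0.7-\mu)^2 + (0.1-\mu)^2}{3}} \approx 0.3399
\]
\[
z = \{\, 0.9806,\ 0.3922,\ -1.3728 \,\}
\]
```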
diff --git a/src/test/java/org/opensearch/neuralsearch/common/BaseNeuralSearchIT.java b/src/test/java/org/opensearch/neuralsearch/common/BaseNeuralSearchIT.java
index 33cdff9a0..6b213a9fd 100644
--- a/src/test/java/org/opensearch/neuralsearch/common/BaseNeuralSearchIT.java
+++ b/src/test/java/org/opensearch/neuralsearch/common/BaseNeuralSearchIT.java
@@ -772,6 +772,21 @@ private String registerModelGroup() {
         return modelGroupId;
     }
 
+    protected List<Map<String, Object>> getNestedHits(Map<String, Object> searchResponseAsMap) {
+        Map<String, Object> hitsMap = (Map<String, Object>) searchResponseAsMap.get("hits");
+        return (List<Map<String, Object>>) hitsMap.get("hits");
+    }
+
+    protected Map<String, Object> getTotalHits(Map<String, Object> searchResponseAsMap) {
+        Map<String, Object> hitsMap = (Map<String, Object>) searchResponseAsMap.get("hits");
+        return (Map<String, Object>) hitsMap.get("total");
+    }
+
+    protected Optional<Float> getMaxScore(Map<String, Object> searchResponseAsMap) {
+        Map<String, Object> hitsMap = (Map<String, Object>) searchResponseAsMap.get("hits");
+        return hitsMap.get("max_score") == null ? Optional.empty() : Optional.of(((Double) hitsMap.get("max_score")).floatValue());
+    }
+
     /**
      * Enumeration for types of pipeline processors, used to lookup resources like create
      * processor request as those are type specific
diff --git a/src/test/java/org/opensearch/neuralsearch/plugin/NeuralSearchTests.java b/src/test/java/org/opensearch/neuralsearch/plugin/NeuralSearchTests.java
index 69791681e..fe84559c6 100644
--- a/src/test/java/org/opensearch/neuralsearch/plugin/NeuralSearchTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/plugin/NeuralSearchTests.java
@@ -66,6 +66,7 @@ public void testProcessors() {
             null,
             mock(IngestService.class),
             null,
+            null,
             null
         );
         Map<String, Processor.Factory> processors = plugin.getProcessors(processorParams);
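The helpers promoted into BaseNeuralSearchIT above unpack the raw search-response map. A sketch of a typical call site follows; the `search(...)` overload and pipeline constant mirror HybridQueryZScoreIT below, and this exact snippet does not appear in the change:

```java
// Hypothetical call site inside an IT that extends BaseNeuralSearchIT
Map<String, Object> searchResponseAsMap = search(INDEX_NAME, queryBuilder, null, 5, Map.of("search_pipeline", SEARCH_PIPELINE));
List<Map<String, Object>> hits = getNestedHits(searchResponseAsMap); // one map per hit, with "_id", "_score", "_source"
Map<String, Object> total = getTotalHits(searchResponseAsMap);       // e.g. {"value": 2, "relation": "eq"}
Optional<Float> maxScore = getMaxScore(searchResponseAsMap);         // empty when "max_score" is null
```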
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/HybridQueryZScoreIT.java b/src/test/java/org/opensearch/neuralsearch/processor/HybridQueryZScoreIT.java
new file mode 100644
index 000000000..6c939e1e9
--- /dev/null
+++ b/src/test/java/org/opensearch/neuralsearch/processor/HybridQueryZScoreIT.java
@@ -0,0 +1,206 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.neuralsearch.processor;
+
+import static org.opensearch.neuralsearch.TestUtils.DELTA_FOR_SCORE_ASSERTION;
+import static org.opensearch.neuralsearch.TestUtils.createRandomVector;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.IntStream;
+
+import lombok.SneakyThrows;
+
+import org.junit.After;
+import org.junit.Before;
+import org.opensearch.index.query.BoolQueryBuilder;
+import org.opensearch.index.query.QueryBuilders;
+import org.opensearch.index.query.TermQueryBuilder;
+import org.opensearch.knn.index.SpaceType;
+import org.opensearch.neuralsearch.common.BaseNeuralSearchIT;
+import org.opensearch.neuralsearch.processor.normalization.ZScoreNormalizationTechnique;
+import org.opensearch.neuralsearch.query.HybridQueryBuilder;
+import org.opensearch.neuralsearch.query.NeuralQueryBuilder;
+
+import com.google.common.primitives.Floats;
+
+public class HybridQueryZScoreIT extends BaseNeuralSearchIT {
+    private static final String TEST_BASIC_VECTOR_DOC_FIELD_INDEX_NAME = "test-neural-vector-doc-field-index";
+    private static final String TEST_QUERY_TEXT = "greetings";
+    private static final String TEST_QUERY_TEXT4 = "place";
+    private static final String TEST_QUERY_TEXT5 = "welcome";
+    private static final String TEST_DOC_TEXT1 = "Hello world";
+    private static final String TEST_DOC_TEXT2 = "Hi to this place";
+    private static final String TEST_KNN_VECTOR_FIELD_NAME_1 = "test-knn-vector-1";
+    private static final String TEST_KNN_VECTOR_FIELD_NAME_2 = "test-knn-vector-2";
+    private static final String TEST_TEXT_FIELD_NAME_1 = "test-text-field-1";
+
+    private static final int TEST_DIMENSION = 768;
+    private static final SpaceType TEST_SPACE_TYPE = SpaceType.L2;
+    private final float[] testVector1 = createRandomVector(TEST_DIMENSION);
+    private final float[] testVector2 = createRandomVector(TEST_DIMENSION);
+    private final static String RELATION_EQUAL_TO = "eq";
+    private static final String SEARCH_PIPELINE = "phase-results-pipeline";
+
+    @Before
+    public void setUp() throws Exception {
+        super.setUp();
+        updateClusterSettings();
+        prepareModel();
+        createSearchPipeline(
+            SEARCH_PIPELINE,
+            ZScoreNormalizationTechnique.TECHNIQUE_NAME,
+            DEFAULT_COMBINATION_METHOD,
+            Map.of(PARAM_NAME_WEIGHTS, "[0.5,0.5]")
+        );
+    }
+
+    @After
+    @SneakyThrows
+    public void tearDown() {
+        super.tearDown();
+        deleteSearchPipeline(SEARCH_PIPELINE);
+        /* this is required to minimize chance of model not being deployed due to open memory CB,
+         * this happens in case we leave model from previous test case. We use new model for every test, and old model
+         * can be undeployed and deleted to free resources after each test case execution.
+         */
+        findDeployedModels().forEach(this::deleteModel);
+    }
+
+    @Override
+    public boolean isUpdateClusterSettings() {
+        return false;
+    }
+
+    @Override
+    protected boolean preserveClusterUponCompletion() {
+        return true;
+    }
+
+    /**
+     * Tests complex query with multiple nested sub-queries:
+     * {
+     *     "query": {
+     *         "hybrid": {
+     *             "queries": [
+     *                 {
+     *                     "bool": {
+     *                         "should": [
+     *                             {
+     *                                 "term": {
+     *                                     "text": "word1"
+     *                                 }
+     *                             },
+     *                             {
+     *                                 "term": {
+     *                                     "text": "word2"
+     *                                 }
+     *                             }
+     *                         ]
+     *                     }
+     *                 },
+     *                 {
+     *                     "term": {
+     *                         "text": "word3"
+     *                     }
+     *                 }
+     *             ]
+     *         }
+     *     }
+     * }
+     */
+    @SneakyThrows
+    public void testComplexQuery_withZScoreNormalization() {
+        initializeIndexIfNotExist();
+
+        TermQueryBuilder termQueryBuilder2 = QueryBuilders.termQuery(TEST_TEXT_FIELD_NAME_1, TEST_QUERY_TEXT4);
+        TermQueryBuilder termQueryBuilder3 = QueryBuilders.termQuery(TEST_TEXT_FIELD_NAME_1, TEST_QUERY_TEXT5);
+        BoolQueryBuilder boolQueryBuilder = new BoolQueryBuilder();
+        boolQueryBuilder.should(termQueryBuilder2).should(termQueryBuilder3);
+
+        String modelId = getDeployedModelId();
+        NeuralQueryBuilder neuralQueryBuilder = new NeuralQueryBuilder(
+            TEST_KNN_VECTOR_FIELD_NAME_1,
+            TEST_QUERY_TEXT,
+            "",
+            modelId,
+            5,
+            null,
+            null
+        );
+
+        HybridQueryBuilder hybridQueryBuilderNeuralThenTerm = new HybridQueryBuilder();
+        hybridQueryBuilderNeuralThenTerm.add(neuralQueryBuilder);
+        hybridQueryBuilderNeuralThenTerm.add(boolQueryBuilder);
+
+        final Map<String, Object> searchResponseAsMap = search(
+            TEST_BASIC_VECTOR_DOC_FIELD_INDEX_NAME,
+            hybridQueryBuilderNeuralThenTerm,
+            null,
+            5,
+            Map.of("search_pipeline", SEARCH_PIPELINE)
+        );
+
+        assertEquals(2, getHitCount(searchResponseAsMap));
+
+        List<Map<String, Object>> hits1NestedList = getNestedHits(searchResponseAsMap);
+        List<String> ids = new ArrayList<>();
+        List<Double> scores = new ArrayList<>();
+        for (Map<String, Object> oneHit : hits1NestedList) {
+            ids.add((String) oneHit.get("_id"));
+            scores.add((Double) oneHit.get("_score"));
+        }
+
+        assertEquals(2, scores.size());
+        // by design, when there are only two results, z-score normalization assigns them scores of 1 and -1 respectively;
+        // the combination logic with weights should then make it doc1Score: (1 * w1 + 0.98 * w2)/(w1 + w2), doc2Score: -1 ~ 0
+        assertEquals(0.9999, scores.get(0).floatValue(), DELTA_FOR_SCORE_ASSERTION);
+        assertEquals(0, scores.get(1).floatValue(), DELTA_FOR_SCORE_ASSERTION);
+
+        // verify that scores are in desc order
+        assertTrue(IntStream.range(0, scores.size() - 1).noneMatch(idx -> scores.get(idx) < scores.get(idx + 1)));
+        // verify that all ids are unique
+        assertEquals(Set.copyOf(ids).size(), ids.size());
+
+        Map<String, Object> total = getTotalHits(searchResponseAsMap);
+        assertNotNull(total.get("value"));
+        assertEquals(2, total.get("value"));
+        assertNotNull(total.get("relation"));
+        assertEquals(RELATION_EQUAL_TO, total.get("relation"));
+    }
+
+    private void initializeIndexIfNotExist() throws IOException {
+        if (!indexExists(TEST_BASIC_VECTOR_DOC_FIELD_INDEX_NAME)) {
+            prepareKnnIndex(
+                TEST_BASIC_VECTOR_DOC_FIELD_INDEX_NAME,
+                List.of(
+                    new KNNFieldConfig(TEST_KNN_VECTOR_FIELD_NAME_1, TEST_DIMENSION, TEST_SPACE_TYPE),
+                    new KNNFieldConfig(TEST_KNN_VECTOR_FIELD_NAME_2, TEST_DIMENSION, TEST_SPACE_TYPE)
+                ),
+                1
+            );
+
+            addKnnDoc(
+                TEST_BASIC_VECTOR_DOC_FIELD_INDEX_NAME,
+                "1",
+                List.of(TEST_KNN_VECTOR_FIELD_NAME_1, TEST_KNN_VECTOR_FIELD_NAME_2),
+                List.of(Floats.asList(testVector1).toArray(), Floats.asList(testVector1).toArray()),
+                Collections.singletonList(TEST_TEXT_FIELD_NAME_1),
+                Collections.singletonList(TEST_DOC_TEXT1)
+            );
+            addKnnDoc(
+                TEST_BASIC_VECTOR_DOC_FIELD_INDEX_NAME,
+                "2",
+                List.of(TEST_KNN_VECTOR_FIELD_NAME_1, TEST_KNN_VECTOR_FIELD_NAME_2),
+                List.of(Floats.asList(testVector2).toArray(), Floats.asList(testVector2).toArray()),
+                Collections.singletonList(TEST_TEXT_FIELD_NAME_1),
+                Collections.singletonList(TEST_DOC_TEXT2)
+            );
+            assertEquals(2, getDocCount(TEST_BASIC_VECTOR_DOC_FIELD_INDEX_NAME));
+        }
+    }
+}
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/NormalizationProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/NormalizationProcessorIT.java
index 86e75f736..149e44509 100644
--- a/src/test/java/org/opensearch/neuralsearch/processor/NormalizationProcessorIT.java
+++ b/src/test/java/org/opensearch/neuralsearch/processor/NormalizationProcessorIT.java
@@ -12,7 +12,6 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
-import java.util.Optional;
 import java.util.Set;
 import java.util.stream.IntStream;
 
@@ -365,21 +364,6 @@ private void initializeIndexIfNotExist(String indexName) throws IOException {
         }
     }
 
-    private List<Map<String, Object>> getNestedHits(Map<String, Object> searchResponseAsMap) {
-        Map<String, Object> hitsMap = (Map<String, Object>) searchResponseAsMap.get("hits");
-        return (List<Map<String, Object>>) hitsMap.get("hits");
-    }
-
-    private Map<String, Object> getTotalHits(Map<String, Object> searchResponseAsMap) {
-        Map<String, Object> hitsMap = (Map<String, Object>) searchResponseAsMap.get("hits");
-        return (Map<String, Object>) hitsMap.get("total");
-    }
-
-    private Optional<Float> getMaxScore(Map<String, Object> searchResponseAsMap) {
-        Map<String, Object> hitsMap = (Map<String, Object>) searchResponseAsMap.get("hits");
-        return hitsMap.get("max_score") == null ? Optional.empty() : Optional.of(((Double) hitsMap.get("max_score")).floatValue());
-    }
-
     private void assertQueryResults(Map<String, Object> searchResponseAsMap, int totalExpectedDocQty, boolean assertMinScore) {
         assertQueryResults(searchResponseAsMap, totalExpectedDocQty, assertMinScore, Range.between(0.5f, 1.0f));
     }
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/normalization/ZScoreNormalizationTechniqueTests.java b/src/test/java/org/opensearch/neuralsearch/processor/normalization/ZScoreNormalizationTechniqueTests.java
new file mode 100644
index 000000000..702aead59
--- /dev/null
+++ b/src/test/java/org/opensearch/neuralsearch/processor/normalization/ZScoreNormalizationTechniqueTests.java
@@ -0,0 +1,187 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.neuralsearch.processor.normalization;
+
+import java.util.List;
+
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.TotalHits;
+import org.opensearch.neuralsearch.processor.CompoundTopDocs;
+import org.opensearch.neuralsearch.query.OpenSearchQueryTestCase;
+
+public class ZScoreNormalizationTechniqueTests extends OpenSearchQueryTestCase {
+    private static final float DELTA_FOR_ASSERTION = 0.0001f;
+
+    /**
+     * A z-score measures the relative distance from the center of the distribution in units of standard deviation
+     * and hence can also be negative. It uses the formula (score - mean_score)/std.
+     * When only two values are available, their z-scores will be 1 and -1 respectively.
+     * For more information regarding z-score you can check this link
+     * https://www.z-table.com/
+     */
+    public void testNormalization_whenResultFromOneShardOneSubQuery_thenSuccessful() {
+        ZScoreNormalizationTechnique normalizationTechnique = new ZScoreNormalizationTechnique();
+        List<CompoundTopDocs> compoundTopDocs = List.of(
+            new CompoundTopDocs(
+                new TotalHits(2, TotalHits.Relation.EQUAL_TO),
+                List.of(
+                    new TopDocs(
+                        new TotalHits(2, TotalHits.Relation.EQUAL_TO),
+                        new ScoreDoc[] { new ScoreDoc(2, 0.5f), new ScoreDoc(4, 0.2f) }
+                    )
+                )
+            )
+        );
+        normalizationTechnique.normalize(compoundTopDocs);
+
+        // since we only have two scores of 0.5 and 0.2 their z-score numbers will be 1 and -1
+        CompoundTopDocs expectedCompoundDocs = new CompoundTopDocs(
+            new TotalHits(2, TotalHits.Relation.EQUAL_TO),
+            List.of(
+                new TopDocs(new TotalHits(2, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] { new ScoreDoc(2, 1.0f), new ScoreDoc(4, -1.0f) })
+            )
+        );
+        assertNotNull(compoundTopDocs);
+        assertEquals(1, compoundTopDocs.size());
+        assertNotNull(compoundTopDocs.get(0).getTopDocs());
+        assertCompoundTopDocs(
+            new TopDocs(expectedCompoundDocs.getTotalHits(), expectedCompoundDocs.getScoreDocs().toArray(new ScoreDoc[0])),
+            compoundTopDocs.get(0).getTopDocs().get(0)
+        );
+    }
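The ±1 assertion above is not specific to the scores 0.5 and 0.2: for any two distinct scores a > b, the population mean and standard deviation collapse so that the z-scores are exactly +1 and -1:

```latex
\[
\mu = \frac{a + b}{2}, \qquad
\sigma = \sqrt{\frac{(a-\mu)^2 + (b-\mu)^2}{2}} = \frac{a - b}{2}, \qquad
z_a = \frac{a - \mu}{\sigma} = 1, \quad z_b = \frac{b - \mu}{\sigma} = -1
\]
```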
+    /**
+     * A z-score measures the relative distance from the center of the distribution in units of standard deviation
+     * and hence can also be negative. It uses the formula (score - mean_score)/std.
+     * When only two values are available, their z-scores will be 1 and -1 respectively, as we see in the first sub-query, which returns only two document scores.
+     * When there are more than two document scores, as in the last sub-query, the distribution is no longer binary and the results depend on where the center of gravity of the distribution lies.
+     * For more information regarding z-score you can check this link
+     * https://www.z-table.com/
+     */
+    public void testNormalization_whenResultFromOneShardMultipleSubQueries_thenSuccessful() {
+        ZScoreNormalizationTechnique normalizationTechnique = new ZScoreNormalizationTechnique();
+        List<CompoundTopDocs> compoundTopDocs = List.of(
+            new CompoundTopDocs(
+                new TotalHits(3, TotalHits.Relation.EQUAL_TO),
+                List.of(
+                    new TopDocs(
+                        new TotalHits(2, TotalHits.Relation.EQUAL_TO),
+                        new ScoreDoc[] { new ScoreDoc(2, 0.5f), new ScoreDoc(4, 0.2f) }
+                    ),
+                    new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[0]),
+                    new TopDocs(
+                        new TotalHits(3, TotalHits.Relation.EQUAL_TO),
+                        new ScoreDoc[] { new ScoreDoc(3, 0.9f), new ScoreDoc(4, 0.7f), new ScoreDoc(2, 0.1f) }
+                    )
+                )
+            )
+        );
+        normalizationTechnique.normalize(compoundTopDocs);
+
+        CompoundTopDocs expectedCompoundDocs = new CompoundTopDocs(
+            new TotalHits(3, TotalHits.Relation.EQUAL_TO),
+            List.of(
+                new TopDocs(
+                    new TotalHits(2, TotalHits.Relation.EQUAL_TO),
+                    // Calculated based on the formula (score - mean_score)/std
+                    new ScoreDoc[] { new ScoreDoc(2, 1.0f), new ScoreDoc(4, -1.0f) }
+                ),
+                new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[0]),
+                new TopDocs(
+                    new TotalHits(3, TotalHits.Relation.EQUAL_TO),
+                    // Calculated based on the formula (score - mean_score)/std for the values of mean_score = (0.9 + 0.7 + 0.1)/3 ~ 0.56,
+                    // std = sqrt(((0.9 - 0.56)^2 + (0.7 - 0.56)^2 + (0.1 - 0.56)^2)/3)
+                    new ScoreDoc[] { new ScoreDoc(3, 0.98058068f), new ScoreDoc(4, 0.39223227f), new ScoreDoc(2, -1.37281295f) }
+                )
+            )
+        );
+        assertNotNull(compoundTopDocs);
+        assertEquals(1, compoundTopDocs.size());
+        assertNotNull(compoundTopDocs.get(0).getTopDocs());
+        for (int i = 0; i < expectedCompoundDocs.getTopDocs().size(); i++) {
+            assertCompoundTopDocs(expectedCompoundDocs.getTopDocs().get(i), compoundTopDocs.get(0).getTopDocs().get(i));
+        }
+    }
+
+    public void testNormalization_whenResultFromMultipleShardsMultipleSubQueries_thenSuccessful() {
+        ZScoreNormalizationTechnique normalizationTechnique = new ZScoreNormalizationTechnique();
+        List<CompoundTopDocs> compoundTopDocs = List.of(
+            new CompoundTopDocs(
+                new TotalHits(3, TotalHits.Relation.EQUAL_TO),
+                List.of(
+                    new TopDocs(
+                        new TotalHits(2, TotalHits.Relation.EQUAL_TO),
+                        new ScoreDoc[] { new ScoreDoc(2, 0.5f), new ScoreDoc(4, 0.2f) }
+                    ),
+                    new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[0]),
+                    new TopDocs(
+                        new TotalHits(3, TotalHits.Relation.EQUAL_TO),
+                        new ScoreDoc[] { new ScoreDoc(3, 0.9f), new ScoreDoc(4, 0.7f), new ScoreDoc(2, 0.1f) }
+                    )
+                )
+            ),
+            new CompoundTopDocs(
+                new TotalHits(2, TotalHits.Relation.EQUAL_TO),
+                List.of(
+                    new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[0]),
+                    new TopDocs(
+                        new TotalHits(2, TotalHits.Relation.EQUAL_TO),
+                        new ScoreDoc[] { new ScoreDoc(7, 2.9f), new ScoreDoc(9, 0.7f) }
+                    )
+                )
+            )
+        );
+        normalizationTechnique.normalize(compoundTopDocs);
+
+        CompoundTopDocs expectedCompoundDocsShard1 = new CompoundTopDocs(
+            new TotalHits(3, TotalHits.Relation.EQUAL_TO),
+            List.of(
+                new TopDocs(
+                    new TotalHits(2, TotalHits.Relation.EQUAL_TO),
+                    new ScoreDoc[] { new ScoreDoc(2, 1.0f), new ScoreDoc(4, -1.0f) }
+                ),
+                new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[0]),
+                new TopDocs(
+                    new TotalHits(3, TotalHits.Relation.EQUAL_TO),
+                    new ScoreDoc[] { new ScoreDoc(3, 0.98058068f), new ScoreDoc(4, 0.39223227f), new ScoreDoc(2, -1.37281295f) }
+                )
+            )
+        );
+
+        CompoundTopDocs expectedCompoundDocsShard2 = new CompoundTopDocs(
+            new TotalHits(2, TotalHits.Relation.EQUAL_TO),
+            List.of(
+                new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[0]),
+                new TopDocs(new TotalHits(2, TotalHits.Relation.EQUAL_TO), new ScoreDoc[] { new ScoreDoc(7, 1.0f), new ScoreDoc(9, -1.0f) })
+            )
+        );
+
+        assertNotNull(compoundTopDocs);
+        assertEquals(2, compoundTopDocs.size());
+        assertNotNull(compoundTopDocs.get(0).getTopDocs());
+        for (int i = 0; i < expectedCompoundDocsShard1.getTopDocs().size(); i++) {
+            assertCompoundTopDocs(expectedCompoundDocsShard1.getTopDocs().get(i), compoundTopDocs.get(0).getTopDocs().get(i));
+        }
+        assertNotNull(compoundTopDocs.get(1).getTopDocs());
+        for (int i = 0; i < expectedCompoundDocsShard2.getTopDocs().size(); i++) {
+            assertCompoundTopDocs(expectedCompoundDocsShard2.getTopDocs().get(i), compoundTopDocs.get(1).getTopDocs().get(i));
+        }
+    }
+
+    private void assertCompoundTopDocs(TopDocs expected, TopDocs actual) {
+        assertEquals(expected.totalHits.value, actual.totalHits.value);
+        assertEquals(expected.totalHits.relation, actual.totalHits.relation);
+        assertEquals(expected.scoreDocs.length, actual.scoreDocs.length);
+        for (int i = 0; i < expected.scoreDocs.length; i++) {
+            assertEquals(expected.scoreDocs[i].score, actual.scoreDocs[i].score, DELTA_FOR_ASSERTION);
+            assertEquals(expected.scoreDocs[i].doc, actual.scoreDocs[i].doc);
+            assertEquals(expected.scoreDocs[i].shardIndex, actual.scoreDocs[i].shardIndex);
+        }
+    }
+}
diff --git a/src/test/java/org/opensearch/neuralsearch/query/HybridQueryIT.java b/src/test/java/org/opensearch/neuralsearch/query/HybridQueryIT.java
index eec6955ff..229374730 100644
--- a/src/test/java/org/opensearch/neuralsearch/query/HybridQueryIT.java
+++ b/src/test/java/org/opensearch/neuralsearch/query/HybridQueryIT.java
@@ -13,7 +13,6 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
-import java.util.Optional;
 import java.util.Set;
 import java.util.stream.IntStream;
 
@@ -267,19 +266,4 @@ private void initializeIndexIfNotExist(String indexName) throws IOException {
             assertEquals(3, getDocCount(TEST_MULTI_DOC_INDEX_NAME));
         }
     }
-
-    private List<Map<String, Object>> getNestedHits(Map<String, Object> searchResponseAsMap) {
-        Map<String, Object> hitsMap = (Map<String, Object>) searchResponseAsMap.get("hits");
-        return (List<Map<String, Object>>) hitsMap.get("hits");
-    }
-
-    private Map<String, Object> getTotalHits(Map<String, Object> searchResponseAsMap) {
-        Map<String, Object> hitsMap = (Map<String, Object>) searchResponseAsMap.get("hits");
-        return (Map<String, Object>) hitsMap.get("total");
-    }
-
-    private Optional<Float> getMaxScore(Map<String, Object> searchResponseAsMap) {
-        Map<String, Object> hitsMap = (Map<String, Object>) searchResponseAsMap.get("hits");
-        return hitsMap.get("max_score") == null ? Optional.empty() : Optional.of(((Double) hitsMap.get("max_score")).floatValue());
-    }
 }
diff --git a/src/test/resources/log4j2-test.xml b/src/test/resources/log4j2-test.xml
new file mode 100644
index 000000000..32c8f6bc7
--- /dev/null
+++ b/src/test/resources/log4j2-test.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Configuration status="WARN">
+    <Appenders>
+        <Console name="Console" target="SYSTEM_OUT">
+            <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
+        </Console>
+    </Appenders>
+    <Loggers>
+        <Root level="info">
+            <AppenderRef ref="Console"/>
+        </Root>
+    </Loggers>
+</Configuration>
\ No newline at end of file