From 6474e9f574991c5281b537b7cc42b582f4128dcb Mon Sep 17 00:00:00 2001
From: Julio Perez
Date: Mon, 27 Jan 2025 23:37:12 -0500
Subject: [PATCH 1/4] add reranker to search

---
 client/src/nv_ingest_client/util/milvus.py | 93 +++++++++++++++++++++-
 1 file changed, 92 insertions(+), 1 deletion(-)

diff --git a/client/src/nv_ingest_client/util/milvus.py b/client/src/nv_ingest_client/util/milvus.py
index 8a3fce06..a1fef538 100644
--- a/client/src/nv_ingest_client/util/milvus.py
+++ b/client/src/nv_ingest_client/util/milvus.py
@@ -19,6 +19,7 @@
 import time
 from urllib.parse import urlparse
 from typing import Union, Dict
+import requests
 
 
 def _dict_to_params(collections_dict: dict, write_params: dict):
@@ -761,6 +762,13 @@
     model_name: str = "nvidia/nv-embedqa-e5-v5",
     output_fields: List[str] = ["text", "source", "content_metadata"],
     gpu_search: bool = True,
+    nv_ranker: bool = False,
+    nv_ranker_endpoint: str = "http://localhost:8015",
+    nv_ranker_model_name: str = "nvidia/llama-3.2-nv-rerankqa-1b-v2",
+    nv_ranker_nvidia_api_key: str = "",
+    nv_ranker_truncate: str = "END",
+    nv_ranker_top_k: int = 5,
+    nv_ranker_max_batch_size: int = 64,
 ):
     """
     This function takes the input queries and conducts a hybrid/dense
@@ -792,7 +800,20 @@
         The path where the sparse model has been loaded.
     model_name : str, optional
         The name of the dense embedding model available in the NIM embedding endpoint.
-
+    nv_ranker : bool
+        Set to True to use the nvidia reranker.
+    nv_ranker_endpoint : str
+        The endpoint to the nvidia reranker
+    nv_ranker_model_name : str
+        The name of the model hosted in the nvidia reranker
+    nv_ranker_nvidia_api_key : str
+        The nvidia reranker api key, necessary when using non-local assets
+    nv_ranker_truncate : str [`END`, `NONE`]
+        Truncate the incoming texts if length is longer than the model allows.
+    nv_ranker_max_batch_size : int
+        Max size for the number of candidates to rerank.
+    nv_ranker_top_k : int
+        The number of candidates to return after reranking.
     Returns
     -------
     List
@@ -819,6 +840,22 @@
         )
     else:
         results = dense_retrieval(queries, collection_name, client, embed_model, top_k, output_fields=output_fields)
+    if nv_ranker:
+        rerank_results = []
+        for query, candidates in zip(queries, results):
+            rerank_results.append(
+                nv_rerank(
+                    query,
+                    candidates,
+                    reranker_endpoint=nv_ranker_endpoint,
+                    model_name=nv_ranker_model_name,
+                    nvidia_api_key=nv_ranker_nvidia_api_key,
+                    truncate=nv_ranker_truncate,
+                    topk=nv_ranker_top_k,
+                    max_batch_size=nv_ranker_max_batch_size,
+                )
+            )
+        results = rerank_results
 
     return results
 
@@ -850,3 +887,57 @@ def remove_records(source_name: str, collection_name: str, milvus_uri: str = "ht
         filter=f'(source["source_name"] == "{source_name}")',
     )
     return result_ids
+
+
+def nv_rerank(
+    query,
+    candidates,
+    reranker_endpoint: str = "http://localhost:8015",
+    model_name: str = "nvidia/llama-3.2-nv-rerankqa-1b-v2",
+    nvidia_api_key: str = "",
+    truncate: str = "END",
+    max_batch_size: int = 64,
+    topk: int = 5,
+):
+    """
+    This function allows a user to rerank a set of candidates using the nvidia reranker nim.
+
+    Parameters
+    ----------
+    query : str
+        Query the candidates are supposed to answer.
+    candidates : list
+        List of the candidates to rerank.
+    reranker_endpoint : str
+        The endpoint to the nvidia reranker
+    model_name : str
+        The name of the model hosted in the nvidia reranker
+    nvidia_api_key : str
+        The nvidia reranker api key, necessary when using non-local assets
+    truncate : str [`END`, `NONE`]
+        Truncate the incoming texts if length is longer than the model allows.
+    max_batch_size : int
+        Max size for the number of candidates to rerank.
+    topk : int
+        The number of candidates to return after reranking.
+
+    Returns
+    -------
+    List
+        List of candidates, reranked by relevance.
+ """ + # reranker = NVIDIARerank(base_url=reranker_endpoint, nvidia_api_key=nvidia_api_key, top_n=top_k) + headers = {"accept": "application/json", "Content-Type": "application/json"} + texts = [] + map_candidates = {} + for idx, candidate in enumerate(candidates): + map_candidates[idx] = candidate + texts.append({"text": candidate["entity"]["text"]}) + payload = {"model": model_name, "query": {"text": query}, "passages": texts, "truncate": truncate} + response = requests.post(f"{reranker_endpoint}/v1/ranking", headers=headers, json=payload) + + rank_results = [] + for rank_vals in response.json()["rankings"]: + idx = rank_vals["index"] + rank_results.append(map_candidates[idx]) + return rank_results From 618480cae80d67b0a71cb1f9b39f208a559d4861 Mon Sep 17 00:00:00 2001 From: Julio Perez Date: Mon, 27 Jan 2025 23:40:31 -0500 Subject: [PATCH 2/4] add reranker docker container setup to compose --- docker-compose.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docker-compose.yaml b/docker-compose.yaml index 8d9c307a..dcca20fc 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -125,6 +125,26 @@ services: capabilities: [gpu] runtime: nvidia + reranker: + # NIM ON + image: ${RERANKER_IMAGE:-nvcr.io/nim/nvidia/llama-3.2-nv-rerankqa-1b-v2}:${RERANKER_TAG:-1.3.0} + shm_size: 16gb + ports: + - "8015:8000" + environment: + - NIM_HTTP_API_PORT=8000 + - NIM_TRITON_LOG_VERBOSE=1 + - NGC_API_KEY=${NIM_NGC_API_KEY:-${NGC_API_KEY:-ngcapikey}} + - CUDA_VISIBLE_DEVICES=0 + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["1"] + capabilities: [gpu] + runtime: nvidia + nv-ingest-ms-runtime: image: nvcr.io/ohlfw0olaadg/ea-participants/nv-ingest:24.10.1 build: From 315c6c3677ff654cf7cd4d1ff7164893bc6d3f99 Mon Sep 17 00:00:00 2001 From: Julio Perez Date: Tue, 28 Jan 2025 15:04:52 -0500 Subject: [PATCH 3/4] add reranker to retrieval profile --- docker-compose.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/docker-compose.yaml b/docker-compose.yaml
index dcca20fc..e82b2e74 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -144,6 +144,8 @@ services:
               device_ids: ["1"]
               capabilities: [gpu]
     runtime: nvidia
+    profiles:
+      - retrieval
 
   nv-ingest-ms-runtime:
     image: nvcr.io/ohlfw0olaadg/ea-participants/nv-ingest:24.10.1

From 6a63535d58cab0deb619308c8011caf31c32d838 Mon Sep 17 00:00:00 2001
From: Julio Perez
Date: Tue, 28 Jan 2025 16:14:01 -0500
Subject: [PATCH 4/4] allow milvus reranking via api

---
 client/src/nv_ingest_client/util/milvus.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/client/src/nv_ingest_client/util/milvus.py b/client/src/nv_ingest_client/util/milvus.py
index a1fef538..b5d12c8f 100644
--- a/client/src/nv_ingest_client/util/milvus.py
+++ b/client/src/nv_ingest_client/util/milvus.py
@@ -763,7 +763,7 @@ def nvingest_retrieval(
     output_fields: List[str] = ["text", "source", "content_metadata"],
     gpu_search: bool = True,
     nv_ranker: bool = False,
-    nv_ranker_endpoint: str = "http://localhost:8015",
+    nv_ranker_endpoint: str = "http://localhost:8015/v1/ranking",
     nv_ranker_model_name: str = "nvidia/llama-3.2-nv-rerankqa-1b-v2",
     nv_ranker_nvidia_api_key: str = "",
     nv_ranker_truncate: str = "END",
@@ -892,7 +892,7 @@ def remove_records(source_name: str, collection_name: str, milvus_uri: str = "ht
 def nv_rerank(
     query,
     candidates,
-    reranker_endpoint: str = "http://localhost:8015",
+    reranker_endpoint: str = "http://localhost:8015/v1/ranking",
     model_name: str = "nvidia/llama-3.2-nv-rerankqa-1b-v2",
     nvidia_api_key: str = "",
     truncate: str = "END",
@@ -928,14 +928,15 @@
     """
     # reranker = NVIDIARerank(base_url=reranker_endpoint, nvidia_api_key=nvidia_api_key, top_n=top_k)
     headers = {"accept": "application/json", "Content-Type": "application/json"}
+    if nvidia_api_key:
+        headers["Authorization"] = f"Bearer {nvidia_api_key}"
     texts = []
     map_candidates = {}
     for idx, candidate in enumerate(candidates):
         map_candidates[idx] = candidate
         texts.append({"text": candidate["entity"]["text"]})
     payload = {"model": model_name, "query": {"text": query}, "passages": texts, "truncate": truncate}
-    response = requests.post(f"{reranker_endpoint}/v1/ranking", headers=headers, json=payload)
-
+    response = requests.post(f"{reranker_endpoint}", headers=headers, json=payload)
    rank_results = []
     for rank_vals in response.json()["rankings"]:
         idx = 
rank_vals["index"]