From 6474e9f574991c5281b537b7cc42b582f4128dcb Mon Sep 17 00:00:00 2001
From: Julio Perez
Date: Mon, 27 Jan 2025 23:37:12 -0500
Subject: [PATCH 1/4] add reranker to search

---
 client/src/nv_ingest_client/util/milvus.py | 93 +++++++++++++++++++++-
 1 file changed, 92 insertions(+), 1 deletion(-)

diff --git a/client/src/nv_ingest_client/util/milvus.py b/client/src/nv_ingest_client/util/milvus.py
index 8a3fce06..a1fef538 100644
--- a/client/src/nv_ingest_client/util/milvus.py
+++ b/client/src/nv_ingest_client/util/milvus.py
@@ -19,6 +19,7 @@
 import time
 from urllib.parse import urlparse
 from typing import Union, Dict
+import requests
 
 
 def _dict_to_params(collections_dict: dict, write_params: dict):
@@ -761,6 +762,13 @@
     model_name: str = "nvidia/nv-embedqa-e5-v5",
     output_fields: List[str] = ["text", "source", "content_metadata"],
     gpu_search: bool = True,
+    nv_ranker: bool = False,
+    nv_ranker_endpoint: str = "http://localhost:8015",
+    nv_ranker_model_name: str = "nvidia/llama-3.2-nv-rerankqa-1b-v2",
+    nv_ranker_nvidia_api_key: str = "",
+    nv_ranker_truncate: str = "END",
+    nv_ranker_top_k: int = 5,
+    nv_ranker_max_batch_size: int = 64,
 ):
     """
     This function takes the input queries and conducts a hybrid/dense
@@ -792,7 +800,20 @@
         The path where the sparse model has been loaded.
     model_name : str, optional
         The name of the dense embedding model available in the NIM embedding endpoint.
-
+    nv_ranker : bool
+        Set to True to use the nvidia reranker.
+    nv_ranker_endpoint : str
+        The endpoint to the nvidia reranker
+    nv_ranker_model_name : str
+        The name of the model hosted in the nvidia reranker
+    nv_ranker_nvidia_api_key : str
+        The nvidia reranker api key, necessary when using non-local assets
+    nv_ranker_truncate : str [`END`, `NONE`]
+        Truncate the incoming texts if length is longer than the model allows.
+    nv_ranker_max_batch_size : int
+        Max size for the number of candidates to rerank.
+    nv_ranker_top_k : int
+        The number of candidates to return after reranking.
     Returns
     -------
     List
@@ -819,6 +840,22 @@
         )
     else:
         results = dense_retrieval(queries, collection_name, client, embed_model, top_k, output_fields=output_fields)
+    if nv_ranker:
+        rerank_results = []
+        for query, candidates in zip(queries, results):
+            rerank_results.append(
+                nv_rerank(
+                    query,
+                    candidates,
+                    reranker_endpoint=nv_ranker_endpoint,
+                    model_name=nv_ranker_model_name,
+                    nvidia_api_key=nv_ranker_nvidia_api_key,
+                    truncate=nv_ranker_truncate,
+                    topk=nv_ranker_top_k,
+                    max_batch_size=nv_ranker_max_batch_size,
+                )
+            )
+        results = rerank_results
 
     return results
 
@@ -850,3 +887,57 @@ def remove_records(source_name: str, collection_name: str, milvus_uri: str = "ht
         filter=f'(source["source_name"] == "{source_name}")',
     )
     return result_ids
+
+
+def nv_rerank(
+    query,
+    candidates,
+    reranker_endpoint: str = "http://localhost:8015",
+    model_name: str = "nvidia/llama-3.2-nv-rerankqa-1b-v2",
+    nvidia_api_key: str = "",
+    truncate: str = "END",
+    max_batch_size: int = 64,
+    topk: int = 5,
+):
+    """
+    This function allows a user to rerank a set of candidates using the nvidia reranker nim.
+
+    Parameters
+    ----------
+    query : str
+        Query the candidates are supposed to answer.
+    candidates : list
+        List of the candidates to rerank.
+    reranker_endpoint : str
+        The endpoint to the nvidia reranker
+    model_name : str
+        The name of the model hosted in the nvidia reranker
+    nvidia_api_key : str
+        The nvidia reranker api key, necessary when using non-local assets
+    truncate : str [`END`, `NONE`]
+        Truncate the incoming texts if length is longer than the model allows.
+    max_batch_size : int
+        Max size for the number of candidates to rerank.
+    topk : int
+        The number of candidates to return after reranking.
+
+    Returns
+    -------
+    List
+        List of candidates, reranked by relevance.
+ """ + # reranker = NVIDIARerank(base_url=reranker_endpoint, nvidia_api_key=nvidia_api_key, top_n=top_k) + headers = {"accept": "application/json", "Content-Type": "application/json"} + texts = [] + map_candidates = {} + for idx, candidate in enumerate(candidates): + map_candidates[idx] = candidate + texts.append({"text": candidate["entity"]["text"]}) + payload = {"model": model_name, "query": {"text": query}, "passages": texts, "truncate": truncate} + response = requests.post(f"{reranker_endpoint}/v1/ranking", headers=headers, json=payload) + + rank_results = [] + for rank_vals in response.json()["rankings"]: + idx = rank_vals["index"] + rank_results.append(map_candidates[idx]) + return rank_results From 618480cae80d67b0a71cb1f9b39f208a559d4861 Mon Sep 17 00:00:00 2001 From: Julio Perez Date: Mon, 27 Jan 2025 23:40:31 -0500 Subject: [PATCH 2/4] add reranker docker container setup to compose --- docker-compose.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docker-compose.yaml b/docker-compose.yaml index 8d9c307a..dcca20fc 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -125,6 +125,26 @@ services: capabilities: [gpu] runtime: nvidia + reranker: + # NIM ON + image: ${RERANKER_IMAGE:-nvcr.io/nim/nvidia/llama-3.2-nv-rerankqa-1b-v2}:${RERANKER_TAG:-1.3.0} + shm_size: 16gb + ports: + - "8015:8000" + environment: + - NIM_HTTP_API_PORT=8000 + - NIM_TRITON_LOG_VERBOSE=1 + - NGC_API_KEY=${NIM_NGC_API_KEY:-${NGC_API_KEY:-ngcapikey}} + - CUDA_VISIBLE_DEVICES=0 + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["1"] + capabilities: [gpu] + runtime: nvidia + nv-ingest-ms-runtime: image: nvcr.io/ohlfw0olaadg/ea-participants/nv-ingest:24.10.1 build: From 315c6c3677ff654cf7cd4d1ff7164893bc6d3f99 Mon Sep 17 00:00:00 2001 From: Julio Perez Date: Tue, 28 Jan 2025 15:04:52 -0500 Subject: [PATCH 3/4] add reranker to retrieval profile --- docker-compose.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/docker-compose.yaml b/docker-compose.yaml
index dcca20fc..e82b2e74 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -144,6 +144,8 @@ services:
               device_ids: ["1"]
               capabilities: [gpu]
     runtime: nvidia
+    profiles:
+      - retrieval
 
   nv-ingest-ms-runtime:
     image: nvcr.io/ohlfw0olaadg/ea-participants/nv-ingest:24.10.1

From 6a63535d58cab0deb619308c8011caf31c32d838 Mon Sep 17 00:00:00 2001
From: Julio Perez
Date: Tue, 28 Jan 2025 16:14:01 -0500
Subject: [PATCH 4/4] allow milvus reranking via api

---
 client/src/nv_ingest_client/util/milvus.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/client/src/nv_ingest_client/util/milvus.py b/client/src/nv_ingest_client/util/milvus.py
index a1fef538..b5d12c8f 100644
--- a/client/src/nv_ingest_client/util/milvus.py
+++ b/client/src/nv_ingest_client/util/milvus.py
@@ -763,7 +763,7 @@ def nvingest_retrieval(
     output_fields: List[str] = ["text", "source", "content_metadata"],
     gpu_search: bool = True,
     nv_ranker: bool = False,
-    nv_ranker_endpoint: str = "http://localhost:8015",
+    nv_ranker_endpoint: str = "http://localhost:8015/v1/ranking",
     nv_ranker_model_name: str = "nvidia/llama-3.2-nv-rerankqa-1b-v2",
     nv_ranker_nvidia_api_key: str = "",
     nv_ranker_truncate: str = "END",
@@ -892,7 +892,7 @@ def remove_records(source_name: str, collection_name: str, milvus_uri: str = "ht
 def nv_rerank(
     query,
     candidates,
-    reranker_endpoint: str = "http://localhost:8015",
+    reranker_endpoint: str = "http://localhost:8015/v1/ranking",
     model_name: str = "nvidia/llama-3.2-nv-rerankqa-1b-v2",
     nvidia_api_key: str = "",
     truncate: str = "END",
@@ -928,14 +928,15 @@
     """
     # reranker = NVIDIARerank(base_url=reranker_endpoint, nvidia_api_key=nvidia_api_key, top_n=top_k)
     headers = {"accept": "application/json", "Content-Type": "application/json"}
+    if nvidia_api_key:
+        headers["Authorization"] = f"Bearer {nvidia_api_key}"
     texts = []
     map_candidates = {}
     for idx, candidate in enumerate(candidates):
         map_candidates[idx] = candidate
         texts.append({"text": candidate["entity"]["text"]})
     payload = {"model": model_name, "query": {"text": query}, "passages": texts, "truncate": truncate}
-    response = requests.post(f"{reranker_endpoint}/v1/ranking", headers=headers, json=payload)
-
+    response = requests.post(f"{reranker_endpoint}", headers=headers, json=payload)
    rank_results = []
     for rank_vals in response.json()["rankings"]:
         idx = 
rank_vals["index"]