From 5f9ad0d003038355a138370f02ff729d40c84b58 Mon Sep 17 00:00:00 2001
From: Alonso Guevara <alonsog@microsoft.com>
Date: Thu, 2 Jan 2025 11:44:21 -0600
Subject: [PATCH] Chore: make gleanings work with any encoding (#1569)

* Make claim and entity gleaning checks independent of the token encoding (answer Y/N instead of YES/NO)

* Add semversioner next-release entry

* Change semver release type
---
 .semversioner/next-release/patch-20241230224307150194.json  | 4 ++++
 .../index/operations/extract_covariates/claim_extractor.py  | 6 +++---
 .../index/operations/extract_entities/graph_extractor.py    | 6 +++---
 graphrag/prompts/index/claim_extraction.py                  | 2 +-
 graphrag/prompts/index/entity_extraction.py                 | 2 +-
 5 files changed, 12 insertions(+), 8 deletions(-)
 create mode 100644 .semversioner/next-release/patch-20241230224307150194.json

diff --git a/.semversioner/next-release/patch-20241230224307150194.json b/.semversioner/next-release/patch-20241230224307150194.json
new file mode 100644
index 0000000000..f11788103c
--- /dev/null
+++ b/.semversioner/next-release/patch-20241230224307150194.json
@@ -0,0 +1,4 @@
+{
+  "type": "minor",
+  "description": "Make gleanings independent of encoding"
+}
diff --git a/graphrag/index/operations/extract_covariates/claim_extractor.py b/graphrag/index/operations/extract_covariates/claim_extractor.py
index e5fb6c3b40..a8758df0bf 100644
--- a/graphrag/index/operations/extract_covariates/claim_extractor.py
+++ b/graphrag/index/operations/extract_covariates/claim_extractor.py
@@ -88,8 +88,8 @@ def __init__(
 
         # Construct the looping arguments
         encoding = tiktoken.get_encoding(encoding_model or defs.ENCODING_MODEL)
-        yes = f"{encoding.encode('YES')[0]}"
-        no = f"{encoding.encode('NO')[0]}"
+        yes = f"{encoding.encode('Y')[0]}"
+        no = f"{encoding.encode('N')[0]}"
         self._loop_args = {"logit_bias": {yes: 100, no: 100}, "max_tokens": 1}
 
     async def __call__(
@@ -195,7 +195,7 @@ async def _process_document(
                 history=response.history,
                 model_parameters=self._loop_args,
             )
-            if response.output.content != "YES":
+            if response.output.content != "Y":
                 break
 
         return self._parse_claim_tuples(results, prompt_args)
diff --git a/graphrag/index/operations/extract_entities/graph_extractor.py b/graphrag/index/operations/extract_entities/graph_extractor.py
index 1a2ce19695..f10b2c83e5 100644
--- a/graphrag/index/operations/extract_entities/graph_extractor.py
+++ b/graphrag/index/operations/extract_entities/graph_extractor.py
@@ -92,8 +92,8 @@ def __init__(
 
         # Construct the looping arguments
         encoding = tiktoken.get_encoding(encoding_model or defs.ENCODING_MODEL)
-        yes = f"{encoding.encode('YES')[0]}"
-        no = f"{encoding.encode('NO')[0]}"
+        yes = f"{encoding.encode('Y')[0]}"
+        no = f"{encoding.encode('N')[0]}"
         self._loop_args = {"logit_bias": {yes: 100, no: 100}, "max_tokens": 1}
 
     async def __call__(
@@ -180,7 +180,7 @@ async def _process_document(
                 model_parameters=self._loop_args,
             )
 
-            if response.output.content != "YES":
+            if response.output.content != "Y":
                 break
 
         return results
diff --git a/graphrag/prompts/index/claim_extraction.py b/graphrag/prompts/index/claim_extraction.py
index 05b3153c20..6ce3f0c2cc 100644
--- a/graphrag/prompts/index/claim_extraction.py
+++ b/graphrag/prompts/index/claim_extraction.py
@@ -58,4 +58,4 @@
 
 
 CONTINUE_PROMPT = "MANY entities were missed in the last extraction.  Add them below using the same format:\n"
-LOOP_PROMPT = "It appears some entities may have still been missed.  Answer YES {tuple_delimiter} NO if there are still entities that need to be added.\n"
+LOOP_PROMPT = "It appears some entities may have still been missed.  Answer Y or N if there are still entities that need to be added.\n"
diff --git a/graphrag/prompts/index/entity_extraction.py b/graphrag/prompts/index/entity_extraction.py
index cb1bcc668a..b1aaea3d3f 100644
--- a/graphrag/prompts/index/entity_extraction.py
+++ b/graphrag/prompts/index/entity_extraction.py
@@ -126,4 +126,4 @@
 Output:"""
 
 CONTINUE_PROMPT = "MANY entities and relationships were missed in the last extraction. Remember to ONLY emit entities that match any of the previously extracted types. Add them below using the same format:\n"
-LOOP_PROMPT = "It appears some entities and relationships may have still been missed.  Answer YES | NO if there are still entities or relationships that need to be added.\n"
+LOOP_PROMPT = "It appears some entities and relationships may have still been missed.  Answer Y or N if there are still entities or relationships that need to be added.\n"