Chore/gleanings any encoding (#1569)

* Make claims and entities independent of encoding
* Semver
* Change semver release type
microsoft · Jan 2, 2025 · 5f9ad0d · 5f9ad0d
1 parent 2abd6c5
commit 5f9ad0d
Show file tree

Hide file tree

Showing 5 changed files with 12 additions and 8 deletions.
diff --git a/.semversioner/next-release/patch-20241230224307150194.json b/.semversioner/next-release/patch-20241230224307150194.json
@@ -0,0 +1,4 @@
+{
+  "type": "minor",
+  "description": "Make gleanings independent of encoding"
+}
diff --git a/graphrag/index/operations/extract_covariates/claim_extractor.py b/graphrag/index/operations/extract_covariates/claim_extractor.py
@@ -88,8 +88,8 @@ def __init__(
 
         # Construct the looping arguments
         encoding = tiktoken.get_encoding(encoding_model or defs.ENCODING_MODEL)
-        yes = f"{encoding.encode('YES')[0]}"
-        no = f"{encoding.encode('NO')[0]}"
+        yes = f"{encoding.encode('Y')[0]}"
+        no = f"{encoding.encode('N')[0]}"
         self._loop_args = {"logit_bias": {yes: 100, no: 100}, "max_tokens": 1}
 
     async def __call__(
@@ -195,7 +195,7 @@ async def _process_document(
                 history=response.history,
                 model_parameters=self._loop_args,
             )
-            if response.output.content != "YES":
+            if response.output.content != "Y":
                 break
 
         return self._parse_claim_tuples(results, prompt_args)

diff --git a/graphrag/index/operations/extract_entities/graph_extractor.py b/graphrag/index/operations/extract_entities/graph_extractor.py
@@ -92,8 +92,8 @@ def __init__(
 
         # Construct the looping arguments
         encoding = tiktoken.get_encoding(encoding_model or defs.ENCODING_MODEL)
-        yes = f"{encoding.encode('YES')[0]}"
-        no = f"{encoding.encode('NO')[0]}"
+        yes = f"{encoding.encode('Y')[0]}"
+        no = f"{encoding.encode('N')[0]}"
         self._loop_args = {"logit_bias": {yes: 100, no: 100}, "max_tokens": 1}
 
     async def __call__(
@@ -180,7 +180,7 @@ async def _process_document(
                 model_parameters=self._loop_args,
             )
 
-            if response.output.content != "YES":
+            if response.output.content != "Y":
                 break
 
         return results

diff --git a/graphrag/prompts/index/claim_extraction.py b/graphrag/prompts/index/claim_extraction.py
@@ -58,4 +58,4 @@
 
 
 CONTINUE_PROMPT = "MANY entities were missed in the last extraction.  Add them below using the same format:\n"
-LOOP_PROMPT = "It appears some entities may have still been missed.  Answer YES {tuple_delimiter} NO if there are still entities that need to be added.\n"
+LOOP_PROMPT = "It appears some entities may have still been missed.  Answer Y or N if there are still entities that need to be added.\n"
diff --git a/graphrag/prompts/index/entity_extraction.py b/graphrag/prompts/index/entity_extraction.py
@@ -126,4 +126,4 @@
 Output:"""
 
 CONTINUE_PROMPT = "MANY entities and relationships were missed in the last extraction. Remember to ONLY emit entities that match any of the previously extracted types. Add them below using the same format:\n"
-LOOP_PROMPT = "It appears some entities and relationships may have still been missed.  Answer YES | NO if there are still entities or relationships that need to be added.\n"
+LOOP_PROMPT = "It appears some entities and relationships may have still been missed.  Answer Y or N if there are still entities or relationships that need to be added.\n"
Original file line number	Diff line number	Diff line change
Expand Up		@@ -58,4 +58,4 @@


		 CONTINUE_PROMPT = "MANY entities were missed in the last extraction.  Add them below using the same format:\n"
		-LOOP_PROMPT = "It appears some entities may have still been missed.  Answer YES {tuple_delimiter} NO if there are still entities that need to be added.\n"
		+LOOP_PROMPT = "It appears some entities may have still been missed.  Answer Y or N if there are still entities that need to be added.\n"