From 5f9ad0d003038355a138370f02ff729d40c84b58 Mon Sep 17 00:00:00 2001 From: Alonso Guevara Date: Thu, 2 Jan 2025 11:44:21 -0600 Subject: [PATCH] Chore/gleanings any encoding (#1569) * Make claims and entities independent of encoding * Semver * Change semver release type --- .semversioner/next-release/patch-20241230224307150194.json | 4 ++++ .../index/operations/extract_covariates/claim_extractor.py | 6 +++--- .../index/operations/extract_entities/graph_extractor.py | 6 +++--- graphrag/prompts/index/claim_extraction.py | 2 +- graphrag/prompts/index/entity_extraction.py | 2 +- 5 files changed, 12 insertions(+), 8 deletions(-) create mode 100644 .semversioner/next-release/patch-20241230224307150194.json diff --git a/.semversioner/next-release/patch-20241230224307150194.json b/.semversioner/next-release/patch-20241230224307150194.json new file mode 100644 index 0000000000..f11788103c --- /dev/null +++ b/.semversioner/next-release/patch-20241230224307150194.json @@ -0,0 +1,4 @@ +{ + "type": "minor", + "description": "Make gleanings independent of encoding" +} diff --git a/graphrag/index/operations/extract_covariates/claim_extractor.py b/graphrag/index/operations/extract_covariates/claim_extractor.py index e5fb6c3b40..a8758df0bf 100644 --- a/graphrag/index/operations/extract_covariates/claim_extractor.py +++ b/graphrag/index/operations/extract_covariates/claim_extractor.py @@ -88,8 +88,8 @@ def __init__( # Construct the looping arguments encoding = tiktoken.get_encoding(encoding_model or defs.ENCODING_MODEL) - yes = f"{encoding.encode('YES')[0]}" - no = f"{encoding.encode('NO')[0]}" + yes = f"{encoding.encode('Y')[0]}" + no = f"{encoding.encode('N')[0]}" self._loop_args = {"logit_bias": {yes: 100, no: 100}, "max_tokens": 1} async def __call__( @@ -195,7 +195,7 @@ async def _process_document( history=response.history, model_parameters=self._loop_args, ) - if response.output.content != "YES": + if response.output.content != "Y": break return self._parse_claim_tuples(results, prompt_args) diff --git a/graphrag/index/operations/extract_entities/graph_extractor.py b/graphrag/index/operations/extract_entities/graph_extractor.py index 1a2ce19695..f10b2c83e5 100644 --- a/graphrag/index/operations/extract_entities/graph_extractor.py +++ b/graphrag/index/operations/extract_entities/graph_extractor.py @@ -92,8 +92,8 @@ def __init__( # Construct the looping arguments encoding = tiktoken.get_encoding(encoding_model or defs.ENCODING_MODEL) - yes = f"{encoding.encode('YES')[0]}" - no = f"{encoding.encode('NO')[0]}" + yes = f"{encoding.encode('Y')[0]}" + no = f"{encoding.encode('N')[0]}" self._loop_args = {"logit_bias": {yes: 100, no: 100}, "max_tokens": 1} async def __call__( @@ -180,7 +180,7 @@ async def _process_document( model_parameters=self._loop_args, ) - if response.output.content != "YES": + if response.output.content != "Y": break return results diff --git a/graphrag/prompts/index/claim_extraction.py b/graphrag/prompts/index/claim_extraction.py index 05b3153c20..6ce3f0c2cc 100644 --- a/graphrag/prompts/index/claim_extraction.py +++ b/graphrag/prompts/index/claim_extraction.py @@ -58,4 +58,4 @@ CONTINUE_PROMPT = "MANY entities were missed in the last extraction. Add them below using the same format:\n" -LOOP_PROMPT = "It appears some entities may have still been missed. Answer YES {tuple_delimiter} NO if there are still entities that need to be added.\n" +LOOP_PROMPT = "It appears some entities may have still been missed. Answer Y or N if there are still entities that need to be added.\n" diff --git a/graphrag/prompts/index/entity_extraction.py b/graphrag/prompts/index/entity_extraction.py index cb1bcc668a..b1aaea3d3f 100644 --- a/graphrag/prompts/index/entity_extraction.py +++ b/graphrag/prompts/index/entity_extraction.py @@ -126,4 +126,4 @@ Output:""" CONTINUE_PROMPT = "MANY entities and relationships were missed in the last extraction. Remember to ONLY emit entities that match any of the previously extracted types. Add them below using the same format:\n" -LOOP_PROMPT = "It appears some entities and relationships may have still been missed. Answer YES | NO if there are still entities or relationships that need to be added.\n" +LOOP_PROMPT = "It appears some entities and relationships may have still been missed. Answer Y or N if there are still entities or relationships that need to be added.\n"