fix: Clean input before passing it to the llm (#238)
* fix: Clean input before passing it to the llm

* chore: Add license

* fix: typo

* chore: Bump graphiti version
paul-paliychuk authored Dec 11, 2024
1 parent 6814cf7 commit a9091b0
Showing 4 changed files with 85 additions and 1 deletion.
26 changes: 26 additions & 0 deletions graphiti_core/llm_client/client.py
@@ -56,6 +56,29 @@ def __init__(self, config: LLMConfig | None, cache: bool = False):
        self.cache_enabled = cache
        self.cache_dir = Cache(DEFAULT_CACHE_DIR)  # Create a cache directory

    def _clean_input(self, input: str) -> str:
        """Clean input string of invalid unicode and control characters.

        Args:
            input: Raw input string to be cleaned

        Returns:
            Cleaned string safe for LLM processing
        """
        # Clean any invalid Unicode
        cleaned = input.encode('utf-8', errors='ignore').decode('utf-8')

        # Remove zero-width characters and other invisible unicode
        zero_width = '\u200b\u200c\u200d\ufeff\u2060'
        for char in zero_width:
            cleaned = cleaned.replace(char, '')

        # Remove control characters except newlines, returns, and tabs
        cleaned = ''.join(char for char in cleaned if ord(char) >= 32 or char in '\n\r\t')

        return cleaned

    @retry(
        stop=stop_after_attempt(4),
        wait=wait_random_exponential(multiplier=10, min=5, max=120),
@@ -106,6 +129,9 @@ async def generate_response(
                logger.debug(f'Cache hit for {cache_key}')
                return cached_response

        for message in messages:
            message.content = self._clean_input(message.content)

        response = await self._generate_response_with_retry(messages, response_model)

        if self.cache_enabled:
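
For reference, here is a minimal standalone sketch of the same cleaning steps, written as a free function so it can be tried without constructing an LLMClient subclass. The function name clean_input and the sample string are illustrative additions, not part of the commit:

# Illustrative mirror of the _clean_input method added above (not in graphiti_core).
def clean_input(text: str) -> str:
    # An encode/decode round trip drops invalid Unicode such as lone surrogates.
    cleaned = text.encode('utf-8', errors='ignore').decode('utf-8')

    # Strip zero-width and other invisible characters.
    for char in '\u200b\u200c\u200d\ufeff\u2060':
        cleaned = cleaned.replace(char, '')

    # Drop control characters, keeping newlines, carriage returns, and tabs.
    return ''.join(c for c in cleaned if ord(c) >= 32 or c in '\n\r\t')


if __name__ == '__main__':
    print(repr(clean_input('Hello\x00\u200b\nWorld\udcde')))  # prints 'Hello\nWorld'
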
1 change: 1 addition & 0 deletions graphiti_core/llm_client/openai_client.py
@@ -88,6 +88,7 @@ async def _generate_response(
    ) -> dict[str, typing.Any]:
        openai_messages: list[ChatCompletionMessageParam] = []
        for m in messages:
            m.content = self._clean_input(m.content)
            if m.role == 'user':
                openai_messages.append({'role': 'user', 'content': m.content})
            elif m.role == 'system':
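
When requests go through the base class's generate_response, content on the OpenAI path is effectively cleaned twice: once there and once more here, before the messages are mapped to ChatCompletionMessageParam entries. That is harmless because the cleaning is idempotent, as this quick check against the standalone sketch above illustrates (illustrative only):

# Cleaning an already-cleaned string changes nothing.
sample = '{"test": "value\f\x00\x04"}\u200b'
once = clean_input(sample)
twice = clean_input(once)
assert once == twice == '{"test": "value"}'
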
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "graphiti-core"
version = "0.5.0pre4"
version = "0.5.0pre5"
description = "A temporal graph building library"
authors = [
"Paul Paliychuk <[email protected]>",
57 changes: 57 additions & 0 deletions tests/llm_client/test_client.py
@@ -0,0 +1,57 @@
"""
Copyright 2024, Zep Software, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from graphiti_core.llm_client.client import LLMClient
from graphiti_core.llm_client.config import LLMConfig


class TestLLMClient(LLMClient):
"""Concrete implementation of LLMClient for testing"""

async def _generate_response(self, messages, response_model=None):
return {'content': 'test'}


def test_clean_input():
    client = TestLLMClient(LLMConfig())

    test_cases = [
        # Basic text should remain unchanged
        ('Hello World', 'Hello World'),
        # Control characters should be removed
        ('Hello\x00World', 'HelloWorld'),
        # Newlines, tabs, returns should be preserved
        ('Hello\nWorld\tTest\r', 'Hello\nWorld\tTest\r'),
        # Invalid Unicode should be removed
        ('Hello\udcdeWorld', 'HelloWorld'),
        # Zero-width characters should be removed
        ('Hello\u200bWorld', 'HelloWorld'),
        ('Test\ufeffWord', 'TestWord'),
        # Multiple issues combined
        ('Hello\x00\u200b\nWorld\udcde', 'Hello\nWorld'),
        # Empty string should remain empty
        ('', ''),
        # Form feed and other control characters from the error case
        ('{"edges":[{"relation_typ...\f\x04Hn\\?"}]}', '{"edges":[{"relation_typ...Hn\\?"}]}'),
        # More specific control character tests
        ('Hello\x0cWorld', 'HelloWorld'),  # form feed \f
        ('Hello\x04World', 'HelloWorld'),  # end of transmission
        # Combined JSON-like string with control characters
        ('{"test": "value\f\x00\x04"}', '{"test": "value"}'),
    ]

    for input_str, expected in test_cases:
        assert client._clean_input(input_str) == expected, f'Failed for input: {repr(input_str)}'
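
The new test walks all cases inside a single function. An equivalent pytest.mark.parametrize formulation, sketched below for a few of the cases, would report each failing case individually; it is purely illustrative, not part of the commit, and assumes the TestLLMClient helper class and LLMConfig import defined in this file:

import pytest


@pytest.mark.parametrize(
    'input_str, expected',
    [
        ('Hello\x00World', 'HelloWorld'),
        ('Hello\u200bWorld', 'HelloWorld'),
        ('{"test": "value\f\x00\x04"}', '{"test": "value"}'),
    ],
)
def test_clean_input_parametrized(input_str, expected):
    # Uses the TestLLMClient subclass defined earlier in this test module.
    client = TestLLMClient(LLMConfig())
    assert client._clean_input(input_str) == expected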
