From 1432af47f6b6d190ecabb8ac40a0c5577d303341 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Thu, 19 Dec 2024 19:53:40 -0300 Subject: [PATCH 01/10] add text_splitting unit test --- .../index/text_splitting/text_splitting.py | 97 +--------- tests/unit/indexing/test_text_splitting.py | 167 ++++++++++++++++++ 2 files changed, 169 insertions(+), 95 deletions(-) create mode 100644 tests/unit/indexing/test_text_splitting.py diff --git a/graphrag/index/text_splitting/text_splitting.py b/graphrag/index/text_splitting/text_splitting.py index 32b430bb69..b186908a28 100644 --- a/graphrag/index/text_splitting/text_splitting.py +++ b/graphrag/index/text_splitting/text_splitting.py @@ -3,19 +3,15 @@ """A module containing the 'Tokenizer', 'TextSplitter', 'NoopTextSplitter' and 'TokenTextSplitter' models.""" -import json import logging from abc import ABC, abstractmethod from collections.abc import Callable, Collection, Iterable from dataclasses import dataclass -from enum import Enum from typing import Any, Literal, cast import pandas as pd import tiktoken -from graphrag.index.utils.tokens import num_tokens_from_string - EncodedText = list[int] DecodeFn = Callable[[EncodedText], str] EncodeFn = Callable[[str], EncodedText] @@ -122,10 +118,10 @@ def num_tokens(self, text: str) -> int: def split_text(self, text: str | list[str]) -> list[str]: """Split text method.""" - if cast("bool", pd.isna(text)) or text == "": - return [] if isinstance(text, list): text = " ".join(text) + elif cast("bool", pd.isna(text)) or text == "": + return [] if not isinstance(text, str): msg = f"Attempting to split a non-string value, actual is {type(text)}" raise TypeError(msg) @@ -140,95 +136,6 @@ def split_text(self, text: str | list[str]) -> list[str]: return split_text_on_tokens(text=text, tokenizer=tokenizer) -class TextListSplitterType(str, Enum): - """Enum for the type of the TextListSplitter.""" - - DELIMITED_STRING = "delimited_string" - JSON = "json" - - -class TextListSplitter(TextSplitter): - """Text list splitter class definition.""" - - def __init__( - self, - chunk_size: int, - splitter_type: TextListSplitterType = TextListSplitterType.JSON, - input_delimiter: str | None = None, - output_delimiter: str | None = None, - model_name: str | None = None, - encoding_name: str | None = None, - ): - """Initialize the TextListSplitter with a chunk size.""" - # Set the chunk overlap to 0 as we use full strings - super().__init__(chunk_size, chunk_overlap=0) - self._type = splitter_type - self._input_delimiter = input_delimiter - self._output_delimiter = output_delimiter or "\n" - self._length_function = lambda x: num_tokens_from_string( - x, model=model_name, encoding_name=encoding_name - ) - - def split_text(self, text: str | list[str]) -> Iterable[str]: - """Split a string list into a list of strings for a given chunk size.""" - if not text: - return [] - - result: list[str] = [] - current_chunk: list[str] = [] - - # Add the brackets - current_length: int = self._length_function("[]") - - # Input should be a string list joined by a delimiter - string_list = self._load_text_list(text) - - if len(string_list) == 1: - return string_list - - for item in string_list: - # Count the length of the item and add comma - item_length = self._length_function(f"{item},") - - if current_length + item_length > self._chunk_size: - if current_chunk and len(current_chunk) > 0: - # Add the current chunk to the result - self._append_to_result(result, current_chunk) - - # Start a new chunk - current_chunk = [item] - # Add 2 for the brackets - current_length = item_length - else: - # Add the item to the current chunk - current_chunk.append(item) - # Add 1 for the comma - current_length += item_length - - # Add the last chunk to the result - self._append_to_result(result, current_chunk) - - return result - - def _load_text_list(self, text: str | list[str]): - """Load the text list based on the type.""" - if isinstance(text, list): - string_list = text - elif self._type == TextListSplitterType.JSON: - string_list = json.loads(text) - else: - string_list = text.split(self._input_delimiter) - return string_list - - def _append_to_result(self, chunk_list: list[str], new_chunk: list[str]): - """Append the current chunk to the result.""" - if new_chunk and len(new_chunk) > 0: - if self._type == TextListSplitterType.JSON: - chunk_list.append(json.dumps(new_chunk, ensure_ascii=False)) - else: - chunk_list.append(self._output_delimiter.join(new_chunk)) - - def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> list[str]: """Split incoming text and return chunks using tokenizer.""" splits: list[str] = [] diff --git a/tests/unit/indexing/test_text_splitting.py b/tests/unit/indexing/test_text_splitting.py new file mode 100644 index 0000000000..c29ecdc4a4 --- /dev/null +++ b/tests/unit/indexing/test_text_splitting.py @@ -0,0 +1,167 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +from unittest import mock +from unittest.mock import MagicMock, patch + +import pytest + +from graphrag.index.text_splitting.text_splitting import ( + NoopTextSplitter, + Tokenizer, + TokenTextSplitter, + split_text_on_tokens, +) + + +def test_noop_text_splitter() -> None: + splitter = NoopTextSplitter() + + assert list(splitter.split_text("some text")) == ["some text"] + assert list(splitter.split_text(["some", "text"])) == ["some", "text"] + + +class MockTokenizer: + def encode(self, text): + return [ord(char) for char in text] + + def decode(self, token_ids): + return "".join(chr(id) for id in token_ids) + + +def test_split_text_str_empty(): + splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2) + result = splitter.split_text("") + + assert result == [] + + +def test_split_text_str_bool(): + splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2) + result = splitter.split_text(None) # type: ignore + + assert result == [] + + +def test_split_text_str_int(): + splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2) + with pytest.raises(TypeError): + splitter.split_text(123) # type: ignore + + +@mock.patch("graphrag.index.text_splitting.text_splitting.split_text_on_tokens") +def test_split_text_large_input(mock_split): + large_text = "a" * 10_000 + mock_split.return_value = ["chunk"] * 2_000 + splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2) + + result = splitter.split_text(large_text) + + assert len(result) == 2_000, "Large input was not split correctly" + mock_split.assert_called_once() + + +@mock.patch("graphrag.index.text_splitting.text_splitting.split_text_on_tokens") +@mock.patch("graphrag.index.text_splitting.text_splitting.Tokenizer") +@mock.patch("tiktoken.get_encoding") +def test_token_text_splitter(mock_get_encoding, mock_tokenizer, mock_split_text): + text = "chunk1 chunk2 chunk3" + expected_chunks = ["chunk1", "chunk2", "chunk3"] + + mocked_tokenizer = MagicMock() + mock_tokenizer.return_value = mocked_tokenizer + mock_split_text.return_value = expected_chunks + + mock_get_encoding.return_value = MagicMock(tokens_per_chunk=5, chunk_overlap=2) + + splitter = TokenTextSplitter( + encoding_name="mock_encoding", + ) + + splitter.split_text(["chunk1", "chunk2", "chunk3"]) + + mock_split_text.assert_called_once_with(text=text, tokenizer=mocked_tokenizer) + + +@mock.patch("tiktoken.get_encoding") +def test_encode_basic(mock_get_encoding): + mock_encoder = MagicMock() + mock_encoder.encode.return_value = [1, 2, 3] + mock_get_encoding.return_value = mock_encoder + + splitter = TokenTextSplitter(encoding_name="mock_encoding") + result = splitter.encode("abc def") + + assert result == [1, 2, 3], "Encoding failed to return expected tokens" + mock_encoder.encode.assert_called_once_with( + "abc def", allowed_special=set(), disallowed_special="all" + ) + + +@mock.patch("tiktoken.get_encoding") +def test_num_tokens_empty_input(mock_get_encoding): + mock_encoder = MagicMock() + mock_encoder.encode.return_value = [] + mock_get_encoding.return_value = mock_encoder + + splitter = TokenTextSplitter(encoding_name="mock_encoding") + result = splitter.num_tokens("") + + assert result == 0, "Token count for empty input should be 0" + + +@mock.patch("tiktoken.encoding_for_model") +def test_model_name(mock_get_encoding): + mock_encoder = MagicMock() + mock_encoder.encode.return_value = [1, 2, 3] + mock_get_encoding.return_value = mock_encoder + + splitter = TokenTextSplitter(model_name="mock_model") + result = splitter.encode("abc def") + + assert result == [1, 2, 3], "Encoding failed to return expected tokens" + mock_encoder.encode.assert_called_once_with( + "abc def", allowed_special=set(), disallowed_special="all" + ) + + +@mock.patch("tiktoken.encoding_for_model", side_effect=KeyError) +@mock.patch("tiktoken.get_encoding") +def test_model_name_exception(mock_get_encoding, mock_encoding_for_model): + mock_get_encoding.return_value = mock.MagicMock() + + TokenTextSplitter(model_name="mock_model", encoding_name="mock_encoding") + + mock_get_encoding.assert_called_once_with("mock_encoding") + mock_encoding_for_model.assert_called_once_with("mock_model") + + +def test_split_text_on_tokens(): + text = "This is a test text, meaning to be taken seriously by this test only." + mocked_tokenizer = MockTokenizer() + tokenizer = Tokenizer( + chunk_overlap=5, + tokens_per_chunk=10, + decode=mocked_tokenizer.decode, + encode=lambda text: mocked_tokenizer.encode(text), + ) + + expected_splits = [ + "This is a ", + "is a test ", + "test text,", + "text, mean", + " meaning t", + "ing to be ", + "o be taken", + "taken seri", + " seriously", + "ously by t", + " by this t", + "his test o", + "est only.", + "nly.", + ] + + result = split_text_on_tokens(text=text, tokenizer=tokenizer) + assert result == expected_splits From 949ddfde58dce66dbc68cca5379fbbe751cbf750 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Fri, 20 Dec 2024 11:51:02 -0300 Subject: [PATCH 02/10] change folder test text splitting --- tests/unit/indexing/text_splitting/__init__.py | 2 ++ tests/unit/indexing/{ => text_splitting}/test_text_splitting.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 tests/unit/indexing/text_splitting/__init__.py rename tests/unit/indexing/{ => text_splitting}/test_text_splitting.py (99%) diff --git a/tests/unit/indexing/text_splitting/__init__.py b/tests/unit/indexing/text_splitting/__init__.py new file mode 100644 index 0000000000..0a3e38adfb --- /dev/null +++ b/tests/unit/indexing/text_splitting/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License diff --git a/tests/unit/indexing/test_text_splitting.py b/tests/unit/indexing/text_splitting/test_text_splitting.py similarity index 99% rename from tests/unit/indexing/test_text_splitting.py rename to tests/unit/indexing/text_splitting/test_text_splitting.py index c29ecdc4a4..fd8793b919 100644 --- a/tests/unit/indexing/test_text_splitting.py +++ b/tests/unit/indexing/text_splitting/test_text_splitting.py @@ -2,7 +2,7 @@ # Licensed under the MIT License from unittest import mock -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest From ab7e0c2e0c09756d50bec26967fc7438ff652f8f Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Fri, 20 Dec 2024 14:31:56 -0300 Subject: [PATCH 03/10] fix chunk fn --- .../index/operations/chunk_text/strategies.py | 46 ++------------ .../index/text_splitting/text_splitting.py | 63 ++++++++++++++++--- .../text_splitting/test_text_splitting.py | 8 +-- 3 files changed, 65 insertions(+), 52 deletions(-) diff --git a/graphrag/index/operations/chunk_text/strategies.py b/graphrag/index/operations/chunk_text/strategies.py index 35c32585c0..d8edc2507c 100644 --- a/graphrag/index/operations/chunk_text/strategies.py +++ b/graphrag/index/operations/chunk_text/strategies.py @@ -12,7 +12,10 @@ import graphrag.config.defaults as defs from graphrag.index.operations.chunk_text.typing import TextChunk -from graphrag.index.text_splitting.text_splitting import Tokenizer +from graphrag.index.text_splitting.text_splitting import ( + Tokenizer, + split_multiple_texts_on_tokens, +) def run_tokens( @@ -32,7 +35,7 @@ def encode(text: str) -> list[int]: def decode(tokens: list[int]) -> str: return enc.decode(tokens) - return _split_text_on_tokens( + return split_multiple_texts_on_tokens( input, Tokenizer( chunk_overlap=chunk_overlap, @@ -43,45 +46,6 @@ def decode(tokens: list[int]) -> str: tick, ) - -# Adapted from - https://github.com/langchain-ai/langchain/blob/77b359edf5df0d37ef0d539f678cf64f5557cb54/libs/langchain/langchain/text_splitter.py#L471 -# So we could have better control over the chunking process -def _split_text_on_tokens( - texts: list[str], enc: Tokenizer, tick: ProgressTicker -) -> list[TextChunk]: - """Split incoming text and return chunks.""" - result = [] - mapped_ids = [] - - for source_doc_idx, text in enumerate(texts): - encoded = enc.encode(text) - tick(1) - mapped_ids.append((source_doc_idx, encoded)) - - input_ids: list[tuple[int, int]] = [ - (source_doc_idx, id) for source_doc_idx, ids in mapped_ids for id in ids - ] - - start_idx = 0 - cur_idx = min(start_idx + enc.tokens_per_chunk, len(input_ids)) - chunk_ids = input_ids[start_idx:cur_idx] - while start_idx < len(input_ids): - chunk_text = enc.decode([id for _, id in chunk_ids]) - doc_indices = list({doc_idx for doc_idx, _ in chunk_ids}) - result.append( - TextChunk( - text_chunk=chunk_text, - source_doc_indices=doc_indices, - n_tokens=len(chunk_ids), - ) - ) - start_idx += enc.tokens_per_chunk - enc.chunk_overlap - cur_idx = min(start_idx + enc.tokens_per_chunk, len(input_ids)) - chunk_ids = input_ids[start_idx:cur_idx] - - return result - - def run_sentences( input: list[str], _args: dict[str, Any], tick: ProgressTicker ) -> Iterable[TextChunk]: diff --git a/graphrag/index/text_splitting/text_splitting.py b/graphrag/index/text_splitting/text_splitting.py index b186908a28..1f67a1487f 100644 --- a/graphrag/index/text_splitting/text_splitting.py +++ b/graphrag/index/text_splitting/text_splitting.py @@ -7,11 +7,13 @@ from abc import ABC, abstractmethod from collections.abc import Callable, Collection, Iterable from dataclasses import dataclass -from typing import Any, Literal, cast +from typing import Any, Literal, Union, cast import pandas as pd import tiktoken +from graphrag.index.operations.chunk_text.typing import TextChunk + EncodedText = list[int] DecodeFn = Callable[[EncodedText], str] EncodeFn = Callable[[str], EncodedText] @@ -133,19 +135,66 @@ def split_text(self, text: str | list[str]) -> list[str]: encode=lambda text: self.encode(text), ) - return split_text_on_tokens(text=text, tokenizer=tokenizer) + return split_single_text_on_tokens(text=text, tokenizer=tokenizer) + +def split_text_on_tokens( + texts: str | list[str], tokenizer: Tokenizer, tick=None +) -> list[str] | list[TextChunk]: + """Handle both single text and list of texts.""" + if isinstance(texts, str): + return split_single_text_on_tokens(texts, tokenizer) + + return split_multiple_texts_on_tokens(texts, tokenizer, tick) -def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> list[str]: - """Split incoming text and return chunks using tokenizer.""" - splits: list[str] = [] +def split_single_text_on_tokens(text: str, tokenizer: Tokenizer) -> list[str]: + """Split a single text and return chunks using the tokenizer.""" + result = [] input_ids = tokenizer.encode(text) + + start_idx = 0 + cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) + chunk_ids = input_ids[start_idx:cur_idx] + + while start_idx < len(input_ids): + chunk_text = tokenizer.decode(list(chunk_ids)) + result.append(chunk_text) # Append chunked text as string + start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap + cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) + chunk_ids = input_ids[start_idx:cur_idx] + + return result + + +# Adapted from - https://github.com/langchain-ai/langchain/blob/77b359edf5df0d37ef0d539f678cf64f5557cb54/libs/langchain/langchain/text_splitter.py#L471 +# So we could have better control over the chunking process +def split_multiple_texts_on_tokens( + texts: list[str], tokenizer: Tokenizer, tick=None +) -> list[TextChunk]: + """Split multiple texts and return chunks with metadata using the tokenizer.""" + result = [] + mapped_ids = [] + + for source_doc_idx, text in enumerate(texts): + encoded = tokenizer.encode(text) + if tick: + tick(1) # Track progress if tick callback is provided + mapped_ids.append((source_doc_idx, encoded)) + + input_ids = [ + (source_doc_idx, id) for source_doc_idx, ids in mapped_ids for id in ids + ] + start_idx = 0 cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) chunk_ids = input_ids[start_idx:cur_idx] + while start_idx < len(input_ids): - splits.append(tokenizer.decode(chunk_ids)) + chunk_text = tokenizer.decode([id for _, id in chunk_ids]) + doc_indices = list({doc_idx for doc_idx, _ in chunk_ids}) + result.append(TextChunk(chunk_text, doc_indices, len(chunk_ids))) start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) chunk_ids = input_ids[start_idx:cur_idx] - return splits + + return result \ No newline at end of file diff --git a/tests/unit/indexing/text_splitting/test_text_splitting.py b/tests/unit/indexing/text_splitting/test_text_splitting.py index fd8793b919..a0d2e0425c 100644 --- a/tests/unit/indexing/text_splitting/test_text_splitting.py +++ b/tests/unit/indexing/text_splitting/test_text_splitting.py @@ -10,7 +10,7 @@ NoopTextSplitter, Tokenizer, TokenTextSplitter, - split_text_on_tokens, + split_single_text_on_tokens, ) @@ -49,7 +49,7 @@ def test_split_text_str_int(): splitter.split_text(123) # type: ignore -@mock.patch("graphrag.index.text_splitting.text_splitting.split_text_on_tokens") +@mock.patch("graphrag.index.text_splitting.text_splitting.split_single_text_on_tokens") def test_split_text_large_input(mock_split): large_text = "a" * 10_000 mock_split.return_value = ["chunk"] * 2_000 @@ -61,7 +61,7 @@ def test_split_text_large_input(mock_split): mock_split.assert_called_once() -@mock.patch("graphrag.index.text_splitting.text_splitting.split_text_on_tokens") +@mock.patch("graphrag.index.text_splitting.text_splitting.split_single_text_on_tokens") @mock.patch("graphrag.index.text_splitting.text_splitting.Tokenizer") @mock.patch("tiktoken.get_encoding") def test_token_text_splitter(mock_get_encoding, mock_tokenizer, mock_split_text): @@ -163,5 +163,5 @@ def test_split_text_on_tokens(): "nly.", ] - result = split_text_on_tokens(text=text, tokenizer=tokenizer) + result = split_single_text_on_tokens(text=text, tokenizer=tokenizer) assert result == expected_splits From a0b0774616b7fcdfc6c93773d9781509fe516cbe Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Fri, 20 Dec 2024 14:56:19 -0300 Subject: [PATCH 04/10] test new function --- .../index/text_splitting/text_splitting.py | 5 +- .../text_splitting/test_text_splitting.py | 108 +++++++++++++++++- 2 files changed, 110 insertions(+), 3 deletions(-) diff --git a/graphrag/index/text_splitting/text_splitting.py b/graphrag/index/text_splitting/text_splitting.py index 1f67a1487f..47ed47332f 100644 --- a/graphrag/index/text_splitting/text_splitting.py +++ b/graphrag/index/text_splitting/text_splitting.py @@ -7,10 +7,11 @@ from abc import ABC, abstractmethod from collections.abc import Callable, Collection, Iterable from dataclasses import dataclass -from typing import Any, Literal, Union, cast +from typing import Any, Literal, cast import pandas as pd import tiktoken +from datashaper import ProgressTicker from graphrag.index.operations.chunk_text.typing import TextChunk @@ -169,7 +170,7 @@ def split_single_text_on_tokens(text: str, tokenizer: Tokenizer) -> list[str]: # Adapted from - https://github.com/langchain-ai/langchain/blob/77b359edf5df0d37ef0d539f678cf64f5557cb54/libs/langchain/langchain/text_splitter.py#L471 # So we could have better control over the chunking process def split_multiple_texts_on_tokens( - texts: list[str], tokenizer: Tokenizer, tick=None + texts: list[str], tokenizer: Tokenizer, tick: ProgressTicker | None = None ) -> list[TextChunk]: """Split multiple texts and return chunks with metadata using the tokenizer.""" result = [] diff --git a/tests/unit/indexing/text_splitting/test_text_splitting.py b/tests/unit/indexing/text_splitting/test_text_splitting.py index a0d2e0425c..f71372f0e3 100644 --- a/tests/unit/indexing/text_splitting/test_text_splitting.py +++ b/tests/unit/indexing/text_splitting/test_text_splitting.py @@ -6,11 +6,14 @@ import pytest +from graphrag.index.operations.chunk_text.typing import TextChunk from graphrag.index.text_splitting.text_splitting import ( NoopTextSplitter, Tokenizer, TokenTextSplitter, + split_multiple_texts_on_tokens, split_single_text_on_tokens, + split_text_on_tokens, ) @@ -136,7 +139,43 @@ def test_model_name_exception(mock_get_encoding, mock_encoding_for_model): mock_encoding_for_model.assert_called_once_with("mock_model") -def test_split_text_on_tokens(): +@mock.patch( + "graphrag.index.text_splitting.text_splitting.split_multiple_texts_on_tokens" +) +def test_split_multiple_text_on_tokens_tick(mock_split): + text = ["This is a test text, meaning to be taken seriously by this test only."] + mock_split.return_value = ["chunk"] * 2 + tokenizer = MagicMock() + progress_ticket = MagicMock() + result = split_text_on_tokens(text, tokenizer, progress_ticket) + assert len(result) == 2, "Large input was not split correctly" + + mock_split.assert_called_once_with(text, tokenizer, progress_ticket) + + +@mock.patch( + "graphrag.index.text_splitting.text_splitting.split_multiple_texts_on_tokens" +) +def test_split_multiple_text_on_tokens_no_tick(mock_split): + text = ["This is a test text, meaning to be taken seriously by this test only."] + mock_split.return_value = ["chunk"] * 2 + tokenizer = MagicMock() + result = split_text_on_tokens(text, tokenizer) + assert len(result) == 2, "Large input was not split correctly" + mock_split.assert_called_once_with(text, tokenizer, None) + + +@mock.patch("graphrag.index.text_splitting.text_splitting.split_single_text_on_tokens") +def test_split_single_text_on_tokens_no_tick(mock_split): + text = "This is a test text, meaning to be taken seriously by this test only." + mock_split.return_value = ["chunk"] * 2 + tokenizer = MagicMock() + result = split_text_on_tokens(text, tokenizer) + assert len(result) == 2, "Large input was not split correctly" + mock_split.assert_called_once_with(text, tokenizer) + + +def test_split_single_text_on_tokens(): text = "This is a test text, meaning to be taken seriously by this test only." mocked_tokenizer = MockTokenizer() tokenizer = Tokenizer( @@ -165,3 +204,70 @@ def test_split_text_on_tokens(): result = split_single_text_on_tokens(text=text, tokenizer=tokenizer) assert result == expected_splits + +def test_split_multiple_texts_on_tokens_no_tick(): + texts = [ + "This is a test text, meaning to be taken seriously by this test only.", + "This is th second text, meaning to be taken seriously by this test only.", + ] + + mocked_tokenizer = MockTokenizer() + tokenizer = Tokenizer( + chunk_overlap=5, + tokens_per_chunk=10, + decode=mocked_tokenizer.decode, + encode=lambda text: mocked_tokenizer.encode(text), + ) + + result = split_multiple_texts_on_tokens(texts, tokenizer, tick=None) + assert result == [ + TextChunk(text_chunk="This is a ", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk="is a test ", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk="test text,", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk="text, mean", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk=" meaning t", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk="ing to be ", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk="o be taken", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk="taken seri", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk=" seriously", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk="ously by t", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk=" by this t", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk="his test o", source_doc_indices=[0], n_tokens=10), + TextChunk(text_chunk="est only.T", source_doc_indices=[0, 1], n_tokens=10), + TextChunk(text_chunk="nly.This i", source_doc_indices=[0, 1], n_tokens=10), + TextChunk(text_chunk="his is th ", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk="s th secon", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk="second tex", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk="d text, me", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk="t, meaning", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk="aning to b", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk=" to be tak", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk="e taken se", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk="en serious", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk="riously by", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk="ly by this", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk=" this test", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk=" test only", source_doc_indices=[1], n_tokens=10), + TextChunk(text_chunk=" only.", source_doc_indices=[1], n_tokens=6), + TextChunk(text_chunk=".", source_doc_indices=[1], n_tokens=1), + ] + assert len(result) == 29, "Large input was not split correctly" + + +def test_split_multiple_texts_on_tokens_tick(): + texts = [ + "This is a test text, meaning to be taken seriously by this test only.", + "This is th second text, meaning to be taken seriously by this test only.", + ] + + mocked_tokenizer = MockTokenizer() + mock_tick = MagicMock() + tokenizer = Tokenizer( + chunk_overlap=5, + tokens_per_chunk=10, + decode=mocked_tokenizer.decode, + encode=lambda text: mocked_tokenizer.encode(text), + ) + + split_multiple_texts_on_tokens(texts, tokenizer, tick=mock_tick) + mock_tick.assert_called() From ee49d200134a5bd93123e02ba32f3acb08cd7924 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Fri, 20 Dec 2024 16:08:00 -0300 Subject: [PATCH 05/10] run formatter --- graphrag/index/operations/chunk_text/strategies.py | 1 + graphrag/index/text_splitting/text_splitting.py | 3 ++- tests/unit/indexing/text_splitting/test_text_splitting.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/graphrag/index/operations/chunk_text/strategies.py b/graphrag/index/operations/chunk_text/strategies.py index d8edc2507c..b7a242f742 100644 --- a/graphrag/index/operations/chunk_text/strategies.py +++ b/graphrag/index/operations/chunk_text/strategies.py @@ -46,6 +46,7 @@ def decode(tokens: list[int]) -> str: tick, ) + def run_sentences( input: list[str], _args: dict[str, Any], tick: ProgressTicker ) -> Iterable[TextChunk]: diff --git a/graphrag/index/text_splitting/text_splitting.py b/graphrag/index/text_splitting/text_splitting.py index 47ed47332f..f4b48a36ce 100644 --- a/graphrag/index/text_splitting/text_splitting.py +++ b/graphrag/index/text_splitting/text_splitting.py @@ -138,6 +138,7 @@ def split_text(self, text: str | list[str]) -> list[str]: return split_single_text_on_tokens(text=text, tokenizer=tokenizer) + def split_text_on_tokens( texts: str | list[str], tokenizer: Tokenizer, tick=None ) -> list[str] | list[TextChunk]: @@ -198,4 +199,4 @@ def split_multiple_texts_on_tokens( cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) chunk_ids = input_ids[start_idx:cur_idx] - return result \ No newline at end of file + return result diff --git a/tests/unit/indexing/text_splitting/test_text_splitting.py b/tests/unit/indexing/text_splitting/test_text_splitting.py index f71372f0e3..c193b7ad64 100644 --- a/tests/unit/indexing/text_splitting/test_text_splitting.py +++ b/tests/unit/indexing/text_splitting/test_text_splitting.py @@ -205,6 +205,7 @@ def test_split_single_text_on_tokens(): result = split_single_text_on_tokens(text=text, tokenizer=tokenizer) assert result == expected_splits + def test_split_multiple_texts_on_tokens_no_tick(): texts = [ "This is a test text, meaning to be taken seriously by this test only.", From 6aac9b22167f1c74162c994f9e34b3ac3fa353ec Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Fri, 20 Dec 2024 16:14:23 -0300 Subject: [PATCH 06/10] run spell check --- .../text_splitting/test_text_splitting.py | 53 +++++++++++++++---- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/tests/unit/indexing/text_splitting/test_text_splitting.py b/tests/unit/indexing/text_splitting/test_text_splitting.py index c193b7ad64..c2a9751e28 100644 --- a/tests/unit/indexing/text_splitting/test_text_splitting.py +++ b/tests/unit/indexing/text_splitting/test_text_splitting.py @@ -193,9 +193,9 @@ def test_split_single_text_on_tokens(): " meaning t", "ing to be ", "o be taken", - "taken seri", + "taken seri", # cspell:disable-line " seriously", - "ously by t", + "ously by t", # cspell:disable-line " by this t", "his test o", "est only.", @@ -229,23 +229,58 @@ def test_split_multiple_texts_on_tokens_no_tick(): TextChunk(text_chunk=" meaning t", source_doc_indices=[0], n_tokens=10), TextChunk(text_chunk="ing to be ", source_doc_indices=[0], n_tokens=10), TextChunk(text_chunk="o be taken", source_doc_indices=[0], n_tokens=10), - TextChunk(text_chunk="taken seri", source_doc_indices=[0], n_tokens=10), + TextChunk( + # cspell:disable-next-line # noqa: ERA001 + text_chunk="taken seri", + source_doc_indices=[0], + n_tokens=10, + ), TextChunk(text_chunk=" seriously", source_doc_indices=[0], n_tokens=10), - TextChunk(text_chunk="ously by t", source_doc_indices=[0], n_tokens=10), + TextChunk( + # cspell:disable-next-line # noqa: ERA001 + text_chunk="ously by t", + source_doc_indices=[0], + n_tokens=10, + ), TextChunk(text_chunk=" by this t", source_doc_indices=[0], n_tokens=10), - TextChunk(text_chunk="his test o", source_doc_indices=[0], n_tokens=10), + TextChunk( + # cspell:disable-next-line # noqa: ERA001 + text_chunk="his test o", + source_doc_indices=[0], + n_tokens=10, + ), TextChunk(text_chunk="est only.T", source_doc_indices=[0, 1], n_tokens=10), TextChunk(text_chunk="nly.This i", source_doc_indices=[0, 1], n_tokens=10), TextChunk(text_chunk="his is th ", source_doc_indices=[1], n_tokens=10), - TextChunk(text_chunk="s th secon", source_doc_indices=[1], n_tokens=10), + TextChunk( + # cspell:disable-next-line # noqa: ERA001 + text_chunk="s th secon", + source_doc_indices=[1], + n_tokens=10, + ), TextChunk(text_chunk="second tex", source_doc_indices=[1], n_tokens=10), TextChunk(text_chunk="d text, me", source_doc_indices=[1], n_tokens=10), TextChunk(text_chunk="t, meaning", source_doc_indices=[1], n_tokens=10), - TextChunk(text_chunk="aning to b", source_doc_indices=[1], n_tokens=10), + TextChunk( + # cspell:disable-next-line # noqa: ERA001 + text_chunk="aning to b", + source_doc_indices=[1], + n_tokens=10, + ), TextChunk(text_chunk=" to be tak", source_doc_indices=[1], n_tokens=10), TextChunk(text_chunk="e taken se", source_doc_indices=[1], n_tokens=10), - TextChunk(text_chunk="en serious", source_doc_indices=[1], n_tokens=10), - TextChunk(text_chunk="riously by", source_doc_indices=[1], n_tokens=10), + TextChunk( + # cspell:disable-next-line # noqa: ERA001 + text_chunk="en serious", + source_doc_indices=[1], + n_tokens=10, + ), + TextChunk( + # cspell:disable-next-line # noqa: ERA001 + text_chunk="riously by", + source_doc_indices=[1], + n_tokens=10, + ), TextChunk(text_chunk="ly by this", source_doc_indices=[1], n_tokens=10), TextChunk(text_chunk=" this test", source_doc_indices=[1], n_tokens=10), TextChunk(text_chunk=" test only", source_doc_indices=[1], n_tokens=10), From e7e2807166862f4fde9a268411636efe1f2424f5 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Fri, 20 Dec 2024 16:15:39 -0300 Subject: [PATCH 07/10] run semver --- .semversioner/next-release/patch-20241220191518597340.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .semversioner/next-release/patch-20241220191518597340.json diff --git a/.semversioner/next-release/patch-20241220191518597340.json b/.semversioner/next-release/patch-20241220191518597340.json new file mode 100644 index 0000000000..d410fe7ce7 --- /dev/null +++ b/.semversioner/next-release/patch-20241220191518597340.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "unit tests for text_splitting" +} From 33d05386346150177fe971272eca3ab3bffd558b Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Mon, 30 Dec 2024 19:27:12 -0500 Subject: [PATCH 08/10] remove tiktoken mocked from tests --- .../text_splitting/test_text_splitting.py | 46 ++++--------------- 1 file changed, 10 insertions(+), 36 deletions(-) diff --git a/tests/unit/indexing/text_splitting/test_text_splitting.py b/tests/unit/indexing/text_splitting/test_text_splitting.py index c2a9751e28..d2c1102d94 100644 --- a/tests/unit/indexing/text_splitting/test_text_splitting.py +++ b/tests/unit/indexing/text_splitting/test_text_splitting.py @@ -66,8 +66,7 @@ def test_split_text_large_input(mock_split): @mock.patch("graphrag.index.text_splitting.text_splitting.split_single_text_on_tokens") @mock.patch("graphrag.index.text_splitting.text_splitting.Tokenizer") -@mock.patch("tiktoken.get_encoding") -def test_token_text_splitter(mock_get_encoding, mock_tokenizer, mock_split_text): +def test_token_text_splitter(mock_tokenizer, mock_split_text): text = "chunk1 chunk2 chunk3" expected_chunks = ["chunk1", "chunk2", "chunk3"] @@ -75,57 +74,32 @@ def test_token_text_splitter(mock_get_encoding, mock_tokenizer, mock_split_text) mock_tokenizer.return_value = mocked_tokenizer mock_split_text.return_value = expected_chunks - mock_get_encoding.return_value = MagicMock(tokens_per_chunk=5, chunk_overlap=2) - - splitter = TokenTextSplitter( - encoding_name="mock_encoding", - ) + splitter = TokenTextSplitter() splitter.split_text(["chunk1", "chunk2", "chunk3"]) mock_split_text.assert_called_once_with(text=text, tokenizer=mocked_tokenizer) -@mock.patch("tiktoken.get_encoding") -def test_encode_basic(mock_get_encoding): - mock_encoder = MagicMock() - mock_encoder.encode.return_value = [1, 2, 3] - mock_get_encoding.return_value = mock_encoder - - splitter = TokenTextSplitter(encoding_name="mock_encoding") +def test_encode_basic(): + splitter = TokenTextSplitter() result = splitter.encode("abc def") - assert result == [1, 2, 3], "Encoding failed to return expected tokens" - mock_encoder.encode.assert_called_once_with( - "abc def", allowed_special=set(), disallowed_special="all" - ) - + assert result == [13997, 711], "Encoding failed to return expected tokens" -@mock.patch("tiktoken.get_encoding") -def test_num_tokens_empty_input(mock_get_encoding): - mock_encoder = MagicMock() - mock_encoder.encode.return_value = [] - mock_get_encoding.return_value = mock_encoder - splitter = TokenTextSplitter(encoding_name="mock_encoding") +def test_num_tokens_empty_input(): + splitter = TokenTextSplitter() result = splitter.num_tokens("") assert result == 0, "Token count for empty input should be 0" -@mock.patch("tiktoken.encoding_for_model") -def test_model_name(mock_get_encoding): - mock_encoder = MagicMock() - mock_encoder.encode.return_value = [1, 2, 3] - mock_get_encoding.return_value = mock_encoder - - splitter = TokenTextSplitter(model_name="mock_model") +def test_model_name(): + splitter = TokenTextSplitter(model_name="gpt-4o") result = splitter.encode("abc def") - assert result == [1, 2, 3], "Encoding failed to return expected tokens" - mock_encoder.encode.assert_called_once_with( - "abc def", allowed_special=set(), disallowed_special="all" - ) + assert result == [26682, 1056], "Encoding failed to return expected tokens" @mock.patch("tiktoken.encoding_for_model", side_effect=KeyError) From 51a188f2756f7e8aadacd3bdc6f545f4139c4b30 Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Mon, 13 Jan 2025 18:12:41 -0300 Subject: [PATCH 09/10] change progress ticker --- .../index/text_splitting/text_splitting.py | 14 +- .../text_splitting/test_text_splitting.py | 123 +----------------- 2 files changed, 3 insertions(+), 134 deletions(-) diff --git a/graphrag/index/text_splitting/text_splitting.py b/graphrag/index/text_splitting/text_splitting.py index effb17f053..1632904637 100644 --- a/graphrag/index/text_splitting/text_splitting.py +++ b/graphrag/index/text_splitting/text_splitting.py @@ -11,10 +11,10 @@ import pandas as pd import tiktoken -from datashaper import ProgressTicker import graphrag.config.defaults as defs from graphrag.index.operations.chunk_text.typing import TextChunk +from graphrag.logger.progress import ProgressTicker EncodedText = list[int] DecodeFn = Callable[[EncodedText], str] @@ -140,16 +140,6 @@ def split_text(self, text: str | list[str]) -> list[str]: return split_single_text_on_tokens(text=text, tokenizer=tokenizer) -def split_text_on_tokens( - texts: str | list[str], tokenizer: Tokenizer, tick=None -) -> list[str] | list[TextChunk]: - """Handle both single text and list of texts.""" - if isinstance(texts, str): - return split_single_text_on_tokens(texts, tokenizer) - - return split_multiple_texts_on_tokens(texts, tokenizer, tick) - - def split_single_text_on_tokens(text: str, tokenizer: Tokenizer) -> list[str]: """Split a single text and return chunks using the tokenizer.""" result = [] @@ -172,7 +162,7 @@ def split_single_text_on_tokens(text: str, tokenizer: Tokenizer) -> list[str]: # Adapted from - https://github.com/langchain-ai/langchain/blob/77b359edf5df0d37ef0d539f678cf64f5557cb54/libs/langchain/langchain/text_splitter.py#L471 # So we could have better control over the chunking process def split_multiple_texts_on_tokens( - texts: list[str], tokenizer: Tokenizer, tick: ProgressTicker | None = None + texts: list[str], tokenizer: Tokenizer, tick: ProgressTicker ) -> list[TextChunk]: """Split multiple texts and return chunks with metadata using the tokenizer.""" result = [] diff --git a/tests/unit/indexing/text_splitting/test_text_splitting.py b/tests/unit/indexing/text_splitting/test_text_splitting.py index d2c1102d94..15253ae4eb 100644 --- a/tests/unit/indexing/text_splitting/test_text_splitting.py +++ b/tests/unit/indexing/text_splitting/test_text_splitting.py @@ -13,7 +13,6 @@ TokenTextSplitter, split_multiple_texts_on_tokens, split_single_text_on_tokens, - split_text_on_tokens, ) @@ -113,42 +112,6 @@ def test_model_name_exception(mock_get_encoding, mock_encoding_for_model): mock_encoding_for_model.assert_called_once_with("mock_model") -@mock.patch( - "graphrag.index.text_splitting.text_splitting.split_multiple_texts_on_tokens" -) -def test_split_multiple_text_on_tokens_tick(mock_split): - text = ["This is a test text, meaning to be taken seriously by this test only."] - mock_split.return_value = ["chunk"] * 2 - tokenizer = MagicMock() - progress_ticket = MagicMock() - result = split_text_on_tokens(text, tokenizer, progress_ticket) - assert len(result) == 2, "Large input was not split correctly" - - mock_split.assert_called_once_with(text, tokenizer, progress_ticket) - - -@mock.patch( - "graphrag.index.text_splitting.text_splitting.split_multiple_texts_on_tokens" -) -def test_split_multiple_text_on_tokens_no_tick(mock_split): - text = ["This is a test text, meaning to be taken seriously by this test only."] - mock_split.return_value = ["chunk"] * 2 - tokenizer = MagicMock() - result = split_text_on_tokens(text, tokenizer) - assert len(result) == 2, "Large input was not split correctly" - mock_split.assert_called_once_with(text, tokenizer, None) - - -@mock.patch("graphrag.index.text_splitting.text_splitting.split_single_text_on_tokens") -def test_split_single_text_on_tokens_no_tick(mock_split): - text = "This is a test text, meaning to be taken seriously by this test only." - mock_split.return_value = ["chunk"] * 2 - tokenizer = MagicMock() - result = split_text_on_tokens(text, tokenizer) - assert len(result) == 2, "Large input was not split correctly" - mock_split.assert_called_once_with(text, tokenizer) - - def test_split_single_text_on_tokens(): text = "This is a test text, meaning to be taken seriously by this test only." mocked_tokenizer = MockTokenizer() @@ -180,91 +143,7 @@ def test_split_single_text_on_tokens(): assert result == expected_splits -def test_split_multiple_texts_on_tokens_no_tick(): - texts = [ - "This is a test text, meaning to be taken seriously by this test only.", - "This is th second text, meaning to be taken seriously by this test only.", - ] - - mocked_tokenizer = MockTokenizer() - tokenizer = Tokenizer( - chunk_overlap=5, - tokens_per_chunk=10, - decode=mocked_tokenizer.decode, - encode=lambda text: mocked_tokenizer.encode(text), - ) - - result = split_multiple_texts_on_tokens(texts, tokenizer, tick=None) - assert result == [ - TextChunk(text_chunk="This is a ", source_doc_indices=[0], n_tokens=10), - TextChunk(text_chunk="is a test ", source_doc_indices=[0], n_tokens=10), - TextChunk(text_chunk="test text,", source_doc_indices=[0], n_tokens=10), - TextChunk(text_chunk="text, mean", source_doc_indices=[0], n_tokens=10), - TextChunk(text_chunk=" meaning t", source_doc_indices=[0], n_tokens=10), - TextChunk(text_chunk="ing to be ", source_doc_indices=[0], n_tokens=10), - TextChunk(text_chunk="o be taken", source_doc_indices=[0], n_tokens=10), - TextChunk( - # cspell:disable-next-line # noqa: ERA001 - text_chunk="taken seri", - source_doc_indices=[0], - n_tokens=10, - ), - TextChunk(text_chunk=" seriously", source_doc_indices=[0], n_tokens=10), - TextChunk( - # cspell:disable-next-line # noqa: ERA001 - text_chunk="ously by t", - source_doc_indices=[0], - n_tokens=10, - ), - TextChunk(text_chunk=" by this t", source_doc_indices=[0], n_tokens=10), - TextChunk( - # cspell:disable-next-line # noqa: ERA001 - text_chunk="his test o", - source_doc_indices=[0], - n_tokens=10, - ), - TextChunk(text_chunk="est only.T", source_doc_indices=[0, 1], n_tokens=10), - TextChunk(text_chunk="nly.This i", source_doc_indices=[0, 1], n_tokens=10), - TextChunk(text_chunk="his is th ", source_doc_indices=[1], n_tokens=10), - TextChunk( - # cspell:disable-next-line # noqa: ERA001 - text_chunk="s th secon", - source_doc_indices=[1], - n_tokens=10, - ), - TextChunk(text_chunk="second tex", source_doc_indices=[1], n_tokens=10), - TextChunk(text_chunk="d text, me", source_doc_indices=[1], n_tokens=10), - TextChunk(text_chunk="t, meaning", source_doc_indices=[1], n_tokens=10), - TextChunk( - # cspell:disable-next-line # noqa: ERA001 - text_chunk="aning to b", - source_doc_indices=[1], - n_tokens=10, - ), - TextChunk(text_chunk=" to be tak", source_doc_indices=[1], n_tokens=10), - TextChunk(text_chunk="e taken se", source_doc_indices=[1], n_tokens=10), - TextChunk( - # cspell:disable-next-line # noqa: ERA001 - text_chunk="en serious", - source_doc_indices=[1], - n_tokens=10, - ), - TextChunk( - # cspell:disable-next-line # noqa: ERA001 - text_chunk="riously by", - source_doc_indices=[1], - n_tokens=10, - ), - TextChunk(text_chunk="ly by this", source_doc_indices=[1], n_tokens=10), - TextChunk(text_chunk=" this test", source_doc_indices=[1], n_tokens=10), - TextChunk(text_chunk=" test only", source_doc_indices=[1], n_tokens=10), - TextChunk(text_chunk=" only.", source_doc_indices=[1], n_tokens=6), - TextChunk(text_chunk=".", source_doc_indices=[1], n_tokens=1), - ] - assert len(result) == 29, "Large input was not split correctly" - - -def test_split_multiple_texts_on_tokens_tick(): +def test_split_multiple_texts_on_tokens(): texts = [ "This is a test text, meaning to be taken seriously by this test only.", "This is th second text, meaning to be taken seriously by this test only.", From 3eacadf9c8f6bd0b084c9314e46eff8cf3a8d45a Mon Sep 17 00:00:00 2001 From: Dayenne Souza Date: Mon, 13 Jan 2025 18:15:59 -0300 Subject: [PATCH 10/10] fix ruff check --- tests/unit/indexing/text_splitting/test_text_splitting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/indexing/text_splitting/test_text_splitting.py b/tests/unit/indexing/text_splitting/test_text_splitting.py index 15253ae4eb..833d49fbb1 100644 --- a/tests/unit/indexing/text_splitting/test_text_splitting.py +++ b/tests/unit/indexing/text_splitting/test_text_splitting.py @@ -6,7 +6,6 @@ import pytest -from graphrag.index.operations.chunk_text.typing import TextChunk from graphrag.index.text_splitting.text_splitting import ( NoopTextSplitter, Tokenizer,