Commit 51a188f: change progress ticker
dayesouza committed Jan 13, 2025
1 parent 5e72ec2
Showing 2 changed files with 3 additions and 134 deletions.
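
Net effect for callers: ProgressTicker is now imported from graphrag's own graphrag.logger.progress module rather than from datashaper, the split_text_on_tokens dispatcher (and the tests that covered it) is removed, and split_multiple_texts_on_tokens requires a ticker instead of defaulting to None. A before/after sketch of the caller-visible surface, with signatures taken from the diff below (bodies elided):

    # Before this commit (removed):
    # from datashaper import ProgressTicker
    # split_text_on_tokens(texts, tokenizer, tick=None)            # str-or-list dispatcher
    # split_multiple_texts_on_tokens(texts, tokenizer, tick=None)  # ticker was optional

    # After this commit:
    from graphrag.index.text_splitting.text_splitting import (
        split_multiple_texts_on_tokens,  # tick: ProgressTicker is now required
        split_single_text_on_tokens,  # unchanged; takes no ticker
    )
    from graphrag.logger.progress import ProgressTicker
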
graphrag/index/text_splitting/text_splitting.py (2 additions & 12 deletions)

@@ -11,10 +11,10 @@
 
 import pandas as pd
 import tiktoken
-from datashaper import ProgressTicker
 
 import graphrag.config.defaults as defs
 from graphrag.index.operations.chunk_text.typing import TextChunk
+from graphrag.logger.progress import ProgressTicker
 
 EncodedText = list[int]
 DecodeFn = Callable[[EncodedText], str]
@@ -140,16 +140,6 @@ def split_text(self, text: str | list[str]) -> list[str]:
         return split_single_text_on_tokens(text=text, tokenizer=tokenizer)
 
 
-def split_text_on_tokens(
-    texts: str | list[str], tokenizer: Tokenizer, tick=None
-) -> list[str] | list[TextChunk]:
-    """Handle both single text and list of texts."""
-    if isinstance(texts, str):
-        return split_single_text_on_tokens(texts, tokenizer)
-
-    return split_multiple_texts_on_tokens(texts, tokenizer, tick)
-
-
 def split_single_text_on_tokens(text: str, tokenizer: Tokenizer) -> list[str]:
     """Split a single text and return chunks using the tokenizer."""
     result = []
@@ -172,7 +162,7 @@ def split_single_text_on_tokens(text: str, tokenizer: Tokenizer) -> list[str]:
 # Adapted from - https://github.com/langchain-ai/langchain/blob/77b359edf5df0d37ef0d539f678cf64f5557cb54/libs/langchain/langchain/text_splitter.py#L471
 # So we could have better control over the chunking process
 def split_multiple_texts_on_tokens(
-    texts: list[str], tokenizer: Tokenizer, tick: ProgressTicker | None = None
+    texts: list[str], tokenizer: Tokenizer, tick: ProgressTicker
 ) -> list[TextChunk]:
     """Split multiple texts and return chunks with metadata using the tokenizer."""
     result = []
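
To see how the now-required ticker flows through the chunker, here is a minimal, self-contained sketch; it is not the library's implementation. It assumes Tokenizer is importable from the same module (as the tests' usage suggests) and that split_multiple_texts_on_tokens invokes the ticker once per input text while encoding; a plain counting callable stands in for graphrag.logger.progress.ProgressTicker, and the character-level encode/decode pair mirrors the MockTokenizer used in the tests below.

    from graphrag.index.text_splitting.text_splitting import (
        Tokenizer,
        split_multiple_texts_on_tokens,
    )

    ticks: list[int] = []

    def tick(n: int = 1) -> None:
        # Record each progress tick; a real ProgressTicker would forward
        # completed/total counts to a progress callback instead.
        ticks.append(n)

    tokenizer = Tokenizer(
        chunk_overlap=5,
        tokens_per_chunk=10,
        encode=lambda text: [ord(c) for c in text],  # one token per character
        decode=lambda ids: "".join(chr(i) for i in ids),
    )

    chunks = split_multiple_texts_on_tokens(["doc one", "doc two"], tokenizer, tick)
    print(f"{len(chunks)} chunks; {len(ticks)} texts ticked")
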
tests/unit/indexing/text_splitting/test_text_splitting.py (1 addition & 122 deletions)

@@ -13,7 +13,6 @@
     TokenTextSplitter,
     split_multiple_texts_on_tokens,
     split_single_text_on_tokens,
-    split_text_on_tokens,
 )


@@ -113,42 +112,6 @@ def test_model_name_exception(mock_get_encoding, mock_encoding_for_model):
     mock_encoding_for_model.assert_called_once_with("mock_model")
 
 
-@mock.patch(
-    "graphrag.index.text_splitting.text_splitting.split_multiple_texts_on_tokens"
-)
-def test_split_multiple_text_on_tokens_tick(mock_split):
-    text = ["This is a test text, meaning to be taken seriously by this test only."]
-    mock_split.return_value = ["chunk"] * 2
-    tokenizer = MagicMock()
-    progress_ticket = MagicMock()
-    result = split_text_on_tokens(text, tokenizer, progress_ticket)
-    assert len(result) == 2, "Large input was not split correctly"
-
-    mock_split.assert_called_once_with(text, tokenizer, progress_ticket)
-
-
-@mock.patch(
-    "graphrag.index.text_splitting.text_splitting.split_multiple_texts_on_tokens"
-)
-def test_split_multiple_text_on_tokens_no_tick(mock_split):
-    text = ["This is a test text, meaning to be taken seriously by this test only."]
-    mock_split.return_value = ["chunk"] * 2
-    tokenizer = MagicMock()
-    result = split_text_on_tokens(text, tokenizer)
-    assert len(result) == 2, "Large input was not split correctly"
-    mock_split.assert_called_once_with(text, tokenizer, None)
-
-
-@mock.patch("graphrag.index.text_splitting.text_splitting.split_single_text_on_tokens")
-def test_split_single_text_on_tokens_no_tick(mock_split):
-    text = "This is a test text, meaning to be taken seriously by this test only."
-    mock_split.return_value = ["chunk"] * 2
-    tokenizer = MagicMock()
-    result = split_text_on_tokens(text, tokenizer)
-    assert len(result) == 2, "Large input was not split correctly"
-    mock_split.assert_called_once_with(text, tokenizer)
-
-
 def test_split_single_text_on_tokens():
     text = "This is a test text, meaning to be taken seriously by this test only."
     mocked_tokenizer = MockTokenizer()
@@ -180,91 +143,7 @@ def test_split_single_text_on_tokens():
     assert result == expected_splits
 
 
-def test_split_multiple_texts_on_tokens_no_tick():
-    texts = [
-        "This is a test text, meaning to be taken seriously by this test only.",
-        "This is th second text, meaning to be taken seriously by this test only.",
-    ]
-
-    mocked_tokenizer = MockTokenizer()
-    tokenizer = Tokenizer(
-        chunk_overlap=5,
-        tokens_per_chunk=10,
-        decode=mocked_tokenizer.decode,
-        encode=lambda text: mocked_tokenizer.encode(text),
-    )
-
-    result = split_multiple_texts_on_tokens(texts, tokenizer, tick=None)
-    assert result == [
-        TextChunk(text_chunk="This is a ", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk="is a test ", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk="test text,", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk="text, mean", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk=" meaning t", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk="ing to be ", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk="o be taken", source_doc_indices=[0], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="taken seri",
-            source_doc_indices=[0],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk=" seriously", source_doc_indices=[0], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="ously by t",
-            source_doc_indices=[0],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk=" by this t", source_doc_indices=[0], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="his test o",
-            source_doc_indices=[0],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk="est only.T", source_doc_indices=[0, 1], n_tokens=10),
-        TextChunk(text_chunk="nly.This i", source_doc_indices=[0, 1], n_tokens=10),
-        TextChunk(text_chunk="his is th ", source_doc_indices=[1], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="s th secon",
-            source_doc_indices=[1],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk="second tex", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk="d text, me", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk="t, meaning", source_doc_indices=[1], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="aning to b",
-            source_doc_indices=[1],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk=" to be tak", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk="e taken se", source_doc_indices=[1], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="en serious",
-            source_doc_indices=[1],
-            n_tokens=10,
-        ),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="riously by",
-            source_doc_indices=[1],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk="ly by this", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk=" this test", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk=" test only", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk=" only.", source_doc_indices=[1], n_tokens=6),
-        TextChunk(text_chunk=".", source_doc_indices=[1], n_tokens=1),
-    ]
-    assert len(result) == 29, "Large input was not split correctly"
-
-
-def test_split_multiple_texts_on_tokens_tick():
+def test_split_multiple_texts_on_tokens():
     texts = [
         "This is a test text, meaning to be taken seriously by this test only.",
         "This is th second text, meaning to be taken seriously by this test only.",
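
With the no-tick variants gone, any remaining coverage of the progress path has to pass a ticker explicitly. A sketch of that pattern as a hypothetical extra test (not one from this repository): a MagicMock stands in for ProgressTicker, under the assumption that the splitter ticks once per input text.

    from unittest.mock import MagicMock

    from graphrag.index.text_splitting.text_splitting import (
        Tokenizer,
        split_multiple_texts_on_tokens,
    )

    def test_tick_called_once_per_text():
        texts = ["first text", "second text"]
        tokenizer = Tokenizer(
            chunk_overlap=5,
            tokens_per_chunk=10,
            encode=lambda text: [ord(c) for c in text],
            decode=lambda ids: "".join(chr(i) for i in ids),
        )
        tick = MagicMock()  # stand-in for graphrag.logger.progress.ProgressTicker
        split_multiple_texts_on_tokens(texts, tokenizer, tick)
        # Assumed contract: one tick per encoded input text.
        assert tick.call_count == len(texts)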
