Commit 51a188f: change progress ticker
dayesouza committed Jan 13, 2025
1 parent 5e72ec2
Showing 2 changed files with 3 additions and 134 deletions.
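
Net effect for callers: ProgressTicker is now imported from graphrag's own graphrag.logger.progress module rather than from datashaper, the split_text_on_tokens dispatcher (and the tests that covered it) is removed, and split_multiple_texts_on_tokens requires a ticker instead of defaulting to None. A before/after sketch of the caller-visible surface, with signatures taken from the diff below (bodies elided):

    # Before this commit (removed):
    # from datashaper import ProgressTicker
    # split_text_on_tokens(texts, tokenizer, tick=None)            # str-or-list dispatcher
    # split_multiple_texts_on_tokens(texts, tokenizer, tick=None)  # ticker was optional

    # After this commit:
    from graphrag.index.text_splitting.text_splitting import (
        split_multiple_texts_on_tokens,  # tick: ProgressTicker is now required
        split_single_text_on_tokens,  # unchanged; takes no ticker
    )
    from graphrag.logger.progress import ProgressTicker
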
graphrag/index/text_splitting/text_splitting.py (2 additions & 12 deletions)

@@ -11,10 +11,10 @@
 
 import pandas as pd
 import tiktoken
-from datashaper import ProgressTicker
 
 import graphrag.config.defaults as defs
 from graphrag.index.operations.chunk_text.typing import TextChunk
+from graphrag.logger.progress import ProgressTicker
 
 EncodedText = list[int]
 DecodeFn = Callable[[EncodedText], str]
@@ -140,16 +140,6 @@ def split_text(self, text: str | list[str]) -> list[str]:
         return split_single_text_on_tokens(text=text, tokenizer=tokenizer)
 
 
-def split_text_on_tokens(
-    texts: str | list[str], tokenizer: Tokenizer, tick=None
-) -> list[str] | list[TextChunk]:
-    """Handle both single text and list of texts."""
-    if isinstance(texts, str):
-        return split_single_text_on_tokens(texts, tokenizer)
-
-    return split_multiple_texts_on_tokens(texts, tokenizer, tick)
-
-
 def split_single_text_on_tokens(text: str, tokenizer: Tokenizer) -> list[str]:
     """Split a single text and return chunks using the tokenizer."""
     result = []
@@ -172,7 +162,7 @@ def split_single_text_on_tokens(text: str, tokenizer: Tokenizer) -> list[str]:
 # Adapted from - https://github.com/langchain-ai/langchain/blob/77b359edf5df0d37ef0d539f678cf64f5557cb54/libs/langchain/langchain/text_splitter.py#L471
 # So we could have better control over the chunking process
 def split_multiple_texts_on_tokens(
-    texts: list[str], tokenizer: Tokenizer, tick: ProgressTicker | None = None
+    texts: list[str], tokenizer: Tokenizer, tick: ProgressTicker
 ) -> list[TextChunk]:
     """Split multiple texts and return chunks with metadata using the tokenizer."""
     result = []
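
To see how the now-required ticker flows through the chunker, here is a minimal, self-contained sketch; it is not the library's implementation. It assumes Tokenizer is importable from the same module (as the tests' usage suggests) and that split_multiple_texts_on_tokens invokes the ticker once per input text while encoding; a plain counting callable stands in for graphrag.logger.progress.ProgressTicker, and the character-level encode/decode pair mirrors the MockTokenizer used in the tests below.

    from graphrag.index.text_splitting.text_splitting import (
        Tokenizer,
        split_multiple_texts_on_tokens,
    )

    ticks: list[int] = []

    def tick(n: int = 1) -> None:
        # Record each progress tick; a real ProgressTicker would forward
        # completed/total counts to a progress callback instead.
        ticks.append(n)

    tokenizer = Tokenizer(
        chunk_overlap=5,
        tokens_per_chunk=10,
        encode=lambda text: [ord(c) for c in text],  # one token per character
        decode=lambda ids: "".join(chr(i) for i in ids),
    )

    chunks = split_multiple_texts_on_tokens(["doc one", "doc two"], tokenizer, tick)
    print(f"{len(chunks)} chunks; {len(ticks)} texts ticked")
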
tests/unit/indexing/text_splitting/test_text_splitting.py (1 addition & 122 deletions)

@@ -13,7 +13,6 @@
     TokenTextSplitter,
     split_multiple_texts_on_tokens,
     split_single_text_on_tokens,
-    split_text_on_tokens,
 )


@@ -113,42 +112,6 @@ def test_model_name_exception(mock_get_encoding, mock_encoding_for_model):
     mock_encoding_for_model.assert_called_once_with("mock_model")
 
 
-@mock.patch(
-    "graphrag.index.text_splitting.text_splitting.split_multiple_texts_on_tokens"
-)
-def test_split_multiple_text_on_tokens_tick(mock_split):
-    text = ["This is a test text, meaning to be taken seriously by this test only."]
-    mock_split.return_value = ["chunk"] * 2
-    tokenizer = MagicMock()
-    progress_ticket = MagicMock()
-    result = split_text_on_tokens(text, tokenizer, progress_ticket)
-    assert len(result) == 2, "Large input was not split correctly"
-
-    mock_split.assert_called_once_with(text, tokenizer, progress_ticket)
-
-
-@mock.patch(
-    "graphrag.index.text_splitting.text_splitting.split_multiple_texts_on_tokens"
-)
-def test_split_multiple_text_on_tokens_no_tick(mock_split):
-    text = ["This is a test text, meaning to be taken seriously by this test only."]
-    mock_split.return_value = ["chunk"] * 2
-    tokenizer = MagicMock()
-    result = split_text_on_tokens(text, tokenizer)
-    assert len(result) == 2, "Large input was not split correctly"
-    mock_split.assert_called_once_with(text, tokenizer, None)
-
-
-@mock.patch("graphrag.index.text_splitting.text_splitting.split_single_text_on_tokens")
-def test_split_single_text_on_tokens_no_tick(mock_split):
-    text = "This is a test text, meaning to be taken seriously by this test only."
-    mock_split.return_value = ["chunk"] * 2
-    tokenizer = MagicMock()
-    result = split_text_on_tokens(text, tokenizer)
-    assert len(result) == 2, "Large input was not split correctly"
-    mock_split.assert_called_once_with(text, tokenizer)
-
-
 def test_split_single_text_on_tokens():
     text = "This is a test text, meaning to be taken seriously by this test only."
     mocked_tokenizer = MockTokenizer()
@@ -180,91 +143,7 @@ def test_split_single_text_on_tokens():
     assert result == expected_splits
 
 
-def test_split_multiple_texts_on_tokens_no_tick():
-    texts = [
-        "This is a test text, meaning to be taken seriously by this test only.",
-        "This is th second text, meaning to be taken seriously by this test only.",
-    ]
-
-    mocked_tokenizer = MockTokenizer()
-    tokenizer = Tokenizer(
-        chunk_overlap=5,
-        tokens_per_chunk=10,
-        decode=mocked_tokenizer.decode,
-        encode=lambda text: mocked_tokenizer.encode(text),
-    )
-
-    result = split_multiple_texts_on_tokens(texts, tokenizer, tick=None)
-    assert result == [
-        TextChunk(text_chunk="This is a ", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk="is a test ", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk="test text,", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk="text, mean", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk=" meaning t", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk="ing to be ", source_doc_indices=[0], n_tokens=10),
-        TextChunk(text_chunk="o be taken", source_doc_indices=[0], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="taken seri",
-            source_doc_indices=[0],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk=" seriously", source_doc_indices=[0], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="ously by t",
-            source_doc_indices=[0],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk=" by this t", source_doc_indices=[0], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="his test o",
-            source_doc_indices=[0],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk="est only.T", source_doc_indices=[0, 1], n_tokens=10),
-        TextChunk(text_chunk="nly.This i", source_doc_indices=[0, 1], n_tokens=10),
-        TextChunk(text_chunk="his is th ", source_doc_indices=[1], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="s th secon",
-            source_doc_indices=[1],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk="second tex", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk="d text, me", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk="t, meaning", source_doc_indices=[1], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="aning to b",
-            source_doc_indices=[1],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk=" to be tak", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk="e taken se", source_doc_indices=[1], n_tokens=10),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="en serious",
-            source_doc_indices=[1],
-            n_tokens=10,
-        ),
-        TextChunk(
-            # cspell:disable-next-line # noqa: ERA001
-            text_chunk="riously by",
-            source_doc_indices=[1],
-            n_tokens=10,
-        ),
-        TextChunk(text_chunk="ly by this", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk=" this test", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk=" test only", source_doc_indices=[1], n_tokens=10),
-        TextChunk(text_chunk=" only.", source_doc_indices=[1], n_tokens=6),
-        TextChunk(text_chunk=".", source_doc_indices=[1], n_tokens=1),
-    ]
-    assert len(result) == 29, "Large input was not split correctly"
-
-
-def test_split_multiple_texts_on_tokens_tick():
+def test_split_multiple_texts_on_tokens():
     texts = [
         "This is a test text, meaning to be taken seriously by this test only.",
         "This is th second text, meaning to be taken seriously by this test only.",
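
With the no-tick variants gone, any remaining coverage of the progress path has to pass a ticker explicitly. A sketch of that pattern as a hypothetical extra test (not one from this repository): a MagicMock stands in for ProgressTicker, under the assumption that the splitter ticks once per input text.

    from unittest.mock import MagicMock

    from graphrag.index.text_splitting.text_splitting import (
        Tokenizer,
        split_multiple_texts_on_tokens,
    )

    def test_tick_called_once_per_text():
        texts = ["first text", "second text"]
        tokenizer = Tokenizer(
            chunk_overlap=5,
            tokens_per_chunk=10,
            encode=lambda text: [ord(c) for c in text],
            decode=lambda ids: "".join(chr(i) for i in ids),
        )
        tick = MagicMock()  # stand-in for graphrag.logger.progress.ProgressTicker
        split_multiple_texts_on_tokens(texts, tokenizer, tick)
        # Assumed contract: one tick per encoded input text.
        assert tick.call_count == len(texts)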
