Merge branch 'main' into benchmarking-update

chonkie-ai · Jan 29, 2025 · 8f5a9a5
2 parents 74e12b3 + 97e6365
commit 8f5a9a5
Show file tree

Hide file tree

Showing 18 changed files with 260 additions and 170 deletions.
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@
 [![Documentation](https://img.shields.io/badge/docs-chonkie.ai-blue.svg)](https://docs.chonkie.ai)
 ![Package size](https://img.shields.io/badge/size-11.2MB-blue)
 [![Downloads](https://static.pepy.tech/badge/chonkie)](https://pepy.tech/project/chonkie)
-[![Discord](https://dcbadge.limes.pink/api/server/https://discord.gg/nMYNVyuB5Y?style=flat)](https://discord.gg/rYYp6DC4cv)
+[![Discord](https://dcbadge.limes.pink/api/server/https://discord.gg/rYYp6DC4cv?style=flat)](https://discord.gg/rYYp6DC4cv)
 [![GitHub stars](https://img.shields.io/github/stars/bhavnicksm/chonkie.svg)](https://github.com/bhavnicksm/chonkie/stargazers)
 
 _The no-nonsense RAG chunking library that's lightweight, lightning-fast, and ready to CHONK your texts_

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "chonkie"
-version = "0.4.0"
+version = "0.4.1"
 description = "🦛 CHONK your texts with Chonkie ✨ - The no-nonsense RAG chunking library"
 readme = "README.md"
 requires-python = ">=3.9"

diff --git a/src/chonkie/__init__.py b/src/chonkie/__init__.py
@@ -34,9 +34,9 @@
     SentenceChunk,
 )
 
-__version__ = "0.4.0"
+__version__ = "0.4.1"
 __name__ = "chonkie"
-__author__ = "Bhavnick Minhas"
+__author__ = "Chonkie AI"
 
 # Add basic package metadata to __all__
 __all__ = [

diff --git a/src/chonkie/chunker/base.py b/src/chonkie/chunker/base.py
@@ -7,9 +7,11 @@
 from multiprocessing import Pool, cpu_count
 from typing import Any, Callable, List, Union
 
+from tqdm import tqdm
+
 from chonkie.types import Chunk
 
-from tqdm import tqdm
+
 class BaseChunker(ABC):
     """Abstract base class for all chunker implementations.
 
@@ -246,11 +248,11 @@ def _process_batch_sequential(self,
         return [
                 self.chunk(t) for t in tqdm(
                     texts,
-                    desc="🦛 CHONKING",
+                    desc="🦛",
                     disable=not show_progress_bar,
-                    unit="texts",
-                    bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱", 
-                    ascii=' >=')
+                        unit="doc",
+                    bar_format="{desc} ch{bar:20}nk {percentage:3.0f}% • {n_fmt}/{total_fmt} docs chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱", 
+                    ascii=' o')
         ]
 
     def _process_batch_multiprocessing(self,
@@ -264,12 +266,12 @@ def _process_batch_multiprocessing(self,
         with Pool(processes=num_workers) as pool:
             results = []
             with tqdm(total=total,
-                     desc="🦛 CHONKING",
+                     desc="🦛",
                      disable=not show_progress_bar,
-                     unit="texts",
-                     bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
-                     ascii=' >=') as pbar:
-                for result in pool.imap_unordered(self.chunk, texts, chunksize=chunksize):
+                     unit="doc",
+                     bar_format="{desc} ch{bar:20}nk {percentage:3.0f}% • {n_fmt}/{total_fmt} docs chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
+                     ascii=' o') as pbar:
+                for result in pool.imap(self.chunk, texts, chunksize=chunksize):
                     results.append(result)
                     pbar.update()
             return results

diff --git a/src/chonkie/chunker/recursive.py b/src/chonkie/chunker/recursive.py
@@ -2,7 +2,7 @@
 from bisect import bisect_left
 from functools import lru_cache
 from itertools import accumulate
-from typing import Any, List, Optional, Union
+from typing import Any, Callable, List, Optional, Union, Literal
 
 from chonkie.chunker.base import BaseChunker
 from chonkie.types import Chunk, RecursiveChunk, RecursiveLevel, RecursiveRules
@@ -18,21 +18,35 @@ class RecursiveChunker(BaseChunker):
     """
 
     def __init__(self,
-                 tokenizer: Union[str, Any] = "gpt2",
+                 tokenizer_or_token_counter: Union[str, Callable, Any] = "gpt2",
                  chunk_size: int = 512,
+                 min_characters_per_chunk: int = 12,
                  rules: RecursiveRules = RecursiveRules(), 
-                 min_characters_per_chunk: int = 12
+                 return_type: Literal["chunks", "texts"] = "chunks"
                  ) -> None:
         """Initialize the recursive chunker.
 
         Args:
-            tokenizer: The tokenizer to use for encoding/decoding.
+            tokenizer_or_token_counter: The tokenizer or token counter to use for encoding/decoding.
             chunk_size: The size of the chunks to return.
-            rules: The rules to use for chunking.
             min_characters_per_chunk: The minimum number of characters per chunk.
-            
+            rules: The rules to use for chunking.
+            return_type: Whether to return chunks or texts.
+        
+        Raises:
+            ValueError: If parameters are invalid.
+
         """
-        super().__init__(tokenizer)
+        super().__init__(tokenizer_or_token_counter=tokenizer_or_token_counter)
+
+        if chunk_size <= 0:
+            raise ValueError("chunk_size must be positive")
+        if min_characters_per_chunk < 1:
+            raise ValueError("min_characters_per_chunk must be at least 1")
+        if return_type not in ["chunks", "texts"]:
+            raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")
+
+        self.return_type = return_type
         self.rules = rules
         self.chunk_size = chunk_size
         self.min_characters_per_chunk = min_characters_per_chunk
@@ -194,7 +208,10 @@ def _recursive_chunk(self,
 
         # If level is out of bounds, return the text as a chunk
         if level >= len(self.rules):
-            return [self._create_chunk(text, self._get_token_count(text), level, full_text)]
+            if self.return_type == "chunks":
+                return [self._create_chunk(text, self._get_token_count(text), level, full_text)]
+            elif self.return_type == "texts":
+                return [text]
 
         # If full_text is not provided, use the text
         if full_text is None:
@@ -227,30 +244,32 @@ def _recursive_chunk(self,
             if token_count > self.chunk_size:
                 chunks.extend(self._recursive_chunk(split, level + 1, full_text))
             else:
-                if rule.delimiters is None and not rule.whitespace:
-                    # NOTE: This is a hack to get the decoded text, since merged = splits = token_splits
-                    # And we don't want to encode/decode the text again, that would be inefficient
-                    decoded_text = "".join(merged)
-                    chunks.append(self._create_chunk(split, token_count, level, decoded_text))
-                else:
-                    chunks.append(self._create_chunk(split, token_count, level, full_text))
-
+                if self.return_type == "chunks":
+                    if rule.delimiters is None and not rule.whitespace:
+                        # NOTE: This is a hack to get the decoded text, since merged = splits = token_splits
+                        # And we don't want to encode/decode the text again, that would be inefficient
+                        decoded_text = "".join(merged)
+                        chunks.append(self._create_chunk(split, token_count, level, decoded_text))
+                    else:
+                        chunks.append(self._create_chunk(split, token_count, level, full_text))
+                elif self.return_type == "texts":
+                    chunks.append(split)
         return chunks
 
-
     def chunk(self, text: str) -> List[Chunk]:
         """Chunk the text."""
         return self._recursive_chunk(text, level=0, full_text=text)
 
-
     def __repr__(self) -> str:
         """Get a string representation of the recursive chunker."""
         return (f"RecursiveChunker(rules={self.rules}, "
                 f"chunk_size={self.chunk_size}, "
-                f"min_characters_per_chunk={self.min_characters_per_chunk})")
+                f"min_characters_per_chunk={self.min_characters_per_chunk}, "
+                f"return_type={self.return_type})")
 
     def __str__(self) -> str:
         """Get a string representation of the recursive chunker."""
         return (f"RecursiveChunker(rules={self.rules}, "
                 f"chunk_size={self.chunk_size}, "
-                f"min_characters_per_chunk={self.min_characters_per_chunk})")
+                f"min_characters_per_chunk={self.min_characters_per_chunk}, "
+                f"return_type={self.return_type})")
diff --git a/src/chonkie/chunker/sdpm.py b/src/chonkie/chunker/sdpm.py
@@ -1,6 +1,6 @@
 """Semantic Double Pass Merge chunking using sentence embeddings."""
 
-from typing import Any, List, Union
+from typing import Any, List, Union, Literal
 
 from chonkie.types import SemanticChunk, Sentence
 
@@ -17,15 +17,17 @@ class SDPMChunker(SemanticChunker):
 
     Args:
         embedding_model: Sentence embedding model to use
-        similarity_threshold: Minimum similarity score to consider sentences similar
-        similarity_percentile: Minimum similarity percentile to consider sentences similar
+        mode: Mode for grouping sentences, either "cumulative" or "window"
+        threshold: Threshold for semantic similarity (0-1) or percentile (1-100), defaults to "auto"
         chunk_size: Maximum token count for a chunk
-        initial_sentences: Number of sentences to consider for initial grouping
-        skip_window: Number of chunks to skip when looking for similarities
+        similarity_window: Number of sentences to consider for similarity threshold calculation
+        min_sentences: Minimum number of sentences per chunk
         min_chunk_size: Minimum number of tokens per sentence
-
-    Methods:
-        chunk: Split text into chunks using the SDPM approach.
+        min_characters_per_sentence: Minimum number of characters per sentence
+        threshold_step: Step size for similarity threshold calculation
+        delim: Delimiters to split sentences on
+        skip_window: Number of chunks to skip when looking for similarities
+        return_type: Whether to return chunks or texts
 
     """
 
@@ -42,6 +44,7 @@ def __init__(
         threshold_step: float = 0.01,
         delim: Union[str, List[str]] = [".", "!", "?", "\n"],
         skip_window: int = 1,
+        return_type: Literal["chunks", "texts"] = "chunks",
         **kwargs
     ):
         """Initialize the SDPMChunker.
@@ -58,6 +61,7 @@ def __init__(
             threshold_step: Step size for similarity threshold calculation
             delim: Delimiters to split sentences on
             skip_window: Number of chunks to skip when looking for similarities
+            return_type: Whether to return chunks or texts
             **kwargs: Additional keyword arguments
 
         """
@@ -72,6 +76,7 @@ def __init__(
             min_characters_per_sentence=min_characters_per_sentence,
             threshold_step=threshold_step,
             delim=delim,
+            return_type=return_type,
             **kwargs
         )
         self.skip_window = skip_window

diff --git a/src/chonkie/chunker/semantic.py b/src/chonkie/chunker/semantic.py
@@ -1,7 +1,7 @@
 """Semantic chunking using sentence embeddings."""
 
 import warnings
-from typing import List, Union
+from typing import List, Union, Literal
 
 import numpy as np
 
@@ -24,7 +24,10 @@ class SemanticChunker(BaseChunker):
         min_chunk_size: Minimum number of tokens per sentence (defaults to 2)
         threshold_step: Step size for similarity threshold calculation
         delim: Delimiters to split sentences on
+        return_type: Whether to return chunks or texts
     
+    Raises:
+        ValueError: If parameters are invalid
     """
 
     def __init__(
@@ -39,6 +42,7 @@ def __init__(
         min_characters_per_sentence: int = 12,
         threshold_step: float = 0.01,
         delim: Union[str, List[str]] = [".", "!", "?", "\n"],
+        return_type: Literal["chunks", "texts"] = "chunks",
         **kwargs
     ):
         """Initialize the SemanticChunker.
@@ -56,6 +60,7 @@ def __init__(
             min_chunk_size: Minimum number of tokens per chunk (and sentence, defaults to 2)
             threshold_step: Step size for similarity threshold calculation
             delim: Delimiters to split sentences on
+            return_type: Whether to return chunks or texts
             **kwargs: Additional keyword arguments
 
         Raises:
@@ -85,6 +90,8 @@ def __init__(
             raise ValueError("threshold (float) must be between 0 and 1")
         elif type(threshold) == int and (threshold < 1 or threshold > 100):
             raise ValueError("threshold (int) must be between 1 and 100")
+        if return_type not in ["chunks", "texts"]:
+            raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")
 
         self.mode = mode
         self.chunk_size = chunk_size
@@ -96,6 +103,7 @@ def __init__(
         self.threshold_step = threshold_step
         self.delim = delim
         self.sep = "🦛"
+        self.return_type = return_type
 
         if isinstance(threshold, float):
             self.similarity_threshold = threshold
@@ -115,13 +123,13 @@ def __init__(
             self.embedding_model = AutoEmbeddings.get_embeddings(embedding_model, **kwargs)
         else:
             raise ValueError(
-                "embedding_model must be a string or BaseEmbeddings instance"
+                f"{embedding_model} is not a valid embedding model"
             )
 
         # Probably the dependency is not installed
         if self.embedding_model is None:
             raise ImportError(
-                "embedding_model is not a valid embedding model",
+                f"{embedding_model} is not a valid embedding model",
                 "Please install the `semantic` extra to use this feature",
             )
 
@@ -453,24 +461,27 @@ def _group_sentences(self, sentences: List[Sentence]) -> List[List[Sentence]]:
             return self._group_sentences_window(sentences)
 
     def _create_chunk(
-        self, sentences: List[Sentence], similarity_scores: List[float] = None
+        self, sentences: List[Sentence]
     ) -> SemanticChunk:
         """Create a chunk from a list of sentences."""
         if not sentences:
             raise ValueError("Cannot create chunk from empty sentence list")
-
-        # Compute chunk text and token count from sentences
-        text = "".join(sent.text for sent in sentences)
-        token_count = sum(sent.token_count for sent in sentences)
-
-        return SemanticChunk(
-            text=text,
-            start_index=sentences[0].start_index,
-            end_index=sentences[-1].end_index,
-            token_count=token_count,
-            sentences=sentences,
-        )
-
+        if self.return_type == "chunks":
+            # Compute chunk text and token count from sentences
+            text = "".join(sent.text for sent in sentences)
+            token_count = sum(sent.token_count for sent in sentences)
+            return SemanticChunk(
+                text=text,
+                start_index=sentences[0].start_index,
+                end_index=sentences[-1].end_index,
+                token_count=token_count,
+                sentences=sentences,
+            )
+        elif self.return_type == "texts":
+            return "".join(sent.text for sent in sentences)
+        else:
+            raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")
+
     def _split_chunks(
         self, sentence_groups: List[List[Sentence]]
     ) -> List[SemanticChunk]: