Skip to content

Commit

Permalink
Merge branch 'main' into benchmarking-update
Browse files — browse the repository at this point in the history
  • Loading branch information
bhavnicksm authored Jan 29, 2025
2 parents: 74e12b3 + 97e6365 — commit 8f5a9a5
Show file tree
Hide file tree
Showing 18 changed files with 260 additions and 170 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
[![Documentation](https://img.shields.io/badge/docs-chonkie.ai-blue.svg)](https://docs.chonkie.ai)
![Package size](https://img.shields.io/badge/size-11.2MB-blue)
[![Downloads](https://static.pepy.tech/badge/chonkie)](https://pepy.tech/project/chonkie)
[![Discord](https://dcbadge.limes.pink/api/server/https://discord.gg/nMYNVyuB5Y?style=flat)](https://discord.gg/rYYp6DC4cv)
[![Discord](https://dcbadge.limes.pink/api/server/https://discord.gg/rYYp6DC4cv?style=flat)](https://discord.gg/rYYp6DC4cv)
[![GitHub stars](https://img.shields.io/github/stars/bhavnicksm/chonkie.svg)](https://github.com/bhavnicksm/chonkie/stargazers)

_The no-nonsense RAG chunking library that's lightweight, lightning-fast, and ready to CHONK your texts_
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "chonkie"
version = "0.4.0"
version = "0.4.1"
description = "🦛 CHONK your texts with Chonkie ✨ - The no-nonsense RAG chunking library"
readme = "README.md"
requires-python = ">=3.9"
Expand Down
4 changes: 2 additions & 2 deletions src/chonkie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@
SentenceChunk,
)

__version__ = "0.4.0"
__version__ = "0.4.1"
__name__ = "chonkie"
__author__ = "Bhavnick Minhas"
__author__ = "Chonkie AI"

# Add basic package metadata to __all__
__all__ = [
Expand Down
22 changes: 12 additions & 10 deletions src/chonkie/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
from multiprocessing import Pool, cpu_count
from typing import Any, Callable, List, Union

from tqdm import tqdm

from chonkie.types import Chunk

from tqdm import tqdm

class BaseChunker(ABC):
"""Abstract base class for all chunker implementations.
Expand Down Expand Up @@ -246,11 +248,11 @@ def _process_batch_sequential(self,
return [
self.chunk(t) for t in tqdm(
texts,
desc="🦛 CHONKING",
desc="🦛",
disable=not show_progress_bar,
unit="texts",
bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' >=')
unit="doc",
bar_format="{desc} ch{bar:20}nk {percentage:3.0f}% • {n_fmt}/{total_fmt} docs chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' o')
]

def _process_batch_multiprocessing(self,
Expand All @@ -264,12 +266,12 @@ def _process_batch_multiprocessing(self,
with Pool(processes=num_workers) as pool:
results = []
with tqdm(total=total,
desc="🦛 CHONKING",
desc="🦛",
disable=not show_progress_bar,
unit="texts",
bar_format="{desc}: [{bar:20}] {percentage:3.0f}% • {n_fmt}/{total_fmt} texts chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' >=') as pbar:
for result in pool.imap_unordered(self.chunk, texts, chunksize=chunksize):
unit="doc",
bar_format="{desc} ch{bar:20}nk {percentage:3.0f}% • {n_fmt}/{total_fmt} docs chunked [{elapsed}<{remaining}, {rate_fmt}] 🌱",
ascii=' o') as pbar:
for result in pool.imap(self.chunk, texts, chunksize=chunksize):
results.append(result)
pbar.update()
return results
Expand Down
59 changes: 39 additions & 20 deletions src/chonkie/chunker/recursive.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from bisect import bisect_left
from functools import lru_cache
from itertools import accumulate
from typing import Any, List, Optional, Union
from typing import Any, Callable, List, Optional, Union, Literal

from chonkie.chunker.base import BaseChunker
from chonkie.types import Chunk, RecursiveChunk, RecursiveLevel, RecursiveRules
Expand All @@ -18,21 +18,35 @@ class RecursiveChunker(BaseChunker):
"""

def __init__(self,
tokenizer: Union[str, Any] = "gpt2",
tokenizer_or_token_counter: Union[str, Callable, Any] = "gpt2",
chunk_size: int = 512,
min_characters_per_chunk: int = 12,
rules: RecursiveRules = RecursiveRules(),
min_characters_per_chunk: int = 12
return_type: Literal["chunks", "texts"] = "chunks"
) -> None:
"""Initialize the recursive chunker.
Args:
tokenizer: The tokenizer to use for encoding/decoding.
tokenizer_or_token_counter: The tokenizer or token counter to use for encoding/decoding.
chunk_size: The size of the chunks to return.
rules: The rules to use for chunking.
min_characters_per_chunk: The minimum number of characters per chunk.
rules: The rules to use for chunking.
return_type: Whether to return chunks or texts.
Raises:
ValueError: If parameters are invalid.
"""
super().__init__(tokenizer)
super().__init__(tokenizer_or_token_counter=tokenizer_or_token_counter)

if chunk_size <= 0:
raise ValueError("chunk_size must be positive")
if min_characters_per_chunk < 1:
raise ValueError("min_characters_per_chunk must be at least 1")
if return_type not in ["chunks", "texts"]:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

self.return_type = return_type
self.rules = rules
self.chunk_size = chunk_size
self.min_characters_per_chunk = min_characters_per_chunk
Expand Down Expand Up @@ -194,7 +208,10 @@ def _recursive_chunk(self,

# If level is out of bounds, return the text as a chunk
if level >= len(self.rules):
return [self._create_chunk(text, self._get_token_count(text), level, full_text)]
if self.return_type == "chunks":
return [self._create_chunk(text, self._get_token_count(text), level, full_text)]
elif self.return_type == "texts":
return [text]

# If full_text is not provided, use the text
if full_text is None:
Expand Down Expand Up @@ -227,30 +244,32 @@ def _recursive_chunk(self,
if token_count > self.chunk_size:
chunks.extend(self._recursive_chunk(split, level + 1, full_text))
else:
if rule.delimiters is None and not rule.whitespace:
# NOTE: This is a hack to get the decoded text, since merged = splits = token_splits
# And we don't want to encode/decode the text again, that would be inefficient
decoded_text = "".join(merged)
chunks.append(self._create_chunk(split, token_count, level, decoded_text))
else:
chunks.append(self._create_chunk(split, token_count, level, full_text))

if self.return_type == "chunks":
if rule.delimiters is None and not rule.whitespace:
# NOTE: This is a hack to get the decoded text, since merged = splits = token_splits
# And we don't want to encode/decode the text again, that would be inefficient
decoded_text = "".join(merged)
chunks.append(self._create_chunk(split, token_count, level, decoded_text))
else:
chunks.append(self._create_chunk(split, token_count, level, full_text))
elif self.return_type == "texts":
chunks.append(split)
return chunks


def chunk(self, text: str) -> List[Chunk]:
"""Chunk the text."""
return self._recursive_chunk(text, level=0, full_text=text)


def __repr__(self) -> str:
"""Get a string representation of the recursive chunker."""
return (f"RecursiveChunker(rules={self.rules}, "
f"chunk_size={self.chunk_size}, "
f"min_characters_per_chunk={self.min_characters_per_chunk})")
f"min_characters_per_chunk={self.min_characters_per_chunk}, "
f"return_type={self.return_type})")

def __str__(self) -> str:
"""Get a string representation of the recursive chunker."""
return (f"RecursiveChunker(rules={self.rules}, "
f"chunk_size={self.chunk_size}, "
f"min_characters_per_chunk={self.min_characters_per_chunk})")
f"min_characters_per_chunk={self.min_characters_per_chunk}, "
f"return_type={self.return_type})")
21 changes: 13 additions & 8 deletions src/chonkie/chunker/sdpm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Semantic Double Pass Merge chunking using sentence embeddings."""

from typing import Any, List, Union
from typing import Any, List, Union, Literal

from chonkie.types import SemanticChunk, Sentence

Expand All @@ -17,15 +17,17 @@ class SDPMChunker(SemanticChunker):
Args:
embedding_model: Sentence embedding model to use
similarity_threshold: Minimum similarity score to consider sentences similar
similarity_percentile: Minimum similarity percentile to consider sentences similar
mode: Mode for grouping sentences, either "cumulative" or "window"
threshold: Threshold for semantic similarity (0-1) or percentile (1-100), defaults to "auto"
chunk_size: Maximum token count for a chunk
initial_sentences: Number of sentences to consider for initial grouping
skip_window: Number of chunks to skip when looking for similarities
similarity_window: Number of sentences to consider for similarity threshold calculation
min_sentences: Minimum number of sentences per chunk
min_chunk_size: Minimum number of tokens per sentence
Methods:
chunk: Split text into chunks using the SDPM approach.
min_characters_per_sentence: Minimum number of characters per sentence
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
skip_window: Number of chunks to skip when looking for similarities
return_type: Whether to return chunks or texts
"""

Expand All @@ -42,6 +44,7 @@ def __init__(
threshold_step: float = 0.01,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
skip_window: int = 1,
return_type: Literal["chunks", "texts"] = "chunks",
**kwargs
):
"""Initialize the SDPMChunker.
Expand All @@ -58,6 +61,7 @@ def __init__(
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
skip_window: Number of chunks to skip when looking for similarities
return_type: Whether to return chunks or texts
**kwargs: Additional keyword arguments
"""
Expand All @@ -72,6 +76,7 @@ def __init__(
min_characters_per_sentence=min_characters_per_sentence,
threshold_step=threshold_step,
delim=delim,
return_type=return_type,
**kwargs
)
self.skip_window = skip_window
Expand Down
45 changes: 28 additions & 17 deletions src/chonkie/chunker/semantic.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Semantic chunking using sentence embeddings."""

import warnings
from typing import List, Union
from typing import List, Union, Literal

import numpy as np

Expand All @@ -24,7 +24,10 @@ class SemanticChunker(BaseChunker):
min_chunk_size: Minimum number of tokens per sentence (defaults to 2)
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
return_type: Whether to return chunks or texts
Raises:
ValueError: If parameters are invalid
"""

def __init__(
Expand All @@ -39,6 +42,7 @@ def __init__(
min_characters_per_sentence: int = 12,
threshold_step: float = 0.01,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
return_type: Literal["chunks", "texts"] = "chunks",
**kwargs
):
"""Initialize the SemanticChunker.
Expand All @@ -56,6 +60,7 @@ def __init__(
min_chunk_size: Minimum number of tokens per chunk (and sentence, defaults to 2)
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
return_type: Whether to return chunks or texts
**kwargs: Additional keyword arguments
Raises:
Expand Down Expand Up @@ -85,6 +90,8 @@ def __init__(
raise ValueError("threshold (float) must be between 0 and 1")
elif type(threshold) == int and (threshold < 1 or threshold > 100):
raise ValueError("threshold (int) must be between 1 and 100")
if return_type not in ["chunks", "texts"]:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

self.mode = mode
self.chunk_size = chunk_size
Expand All @@ -96,6 +103,7 @@ def __init__(
self.threshold_step = threshold_step
self.delim = delim
self.sep = "🦛"
self.return_type = return_type

if isinstance(threshold, float):
self.similarity_threshold = threshold
Expand All @@ -115,13 +123,13 @@ def __init__(
self.embedding_model = AutoEmbeddings.get_embeddings(embedding_model, **kwargs)
else:
raise ValueError(
"embedding_model must be a string or BaseEmbeddings instance"
f"{embedding_model} is not a valid embedding model"
)

# Probably the dependency is not installed
if self.embedding_model is None:
raise ImportError(
"embedding_model is not a valid embedding model",
f"{embedding_model} is not a valid embedding model",
"Please install the `semantic` extra to use this feature",
)

Expand Down Expand Up @@ -453,24 +461,27 @@ def _group_sentences(self, sentences: List[Sentence]) -> List[List[Sentence]]:
return self._group_sentences_window(sentences)

def _create_chunk(
self, sentences: List[Sentence], similarity_scores: List[float] = None
self, sentences: List[Sentence]
) -> SemanticChunk:
"""Create a chunk from a list of sentences."""
if not sentences:
raise ValueError("Cannot create chunk from empty sentence list")

# Compute chunk text and token count from sentences
text = "".join(sent.text for sent in sentences)
token_count = sum(sent.token_count for sent in sentences)

return SemanticChunk(
text=text,
start_index=sentences[0].start_index,
end_index=sentences[-1].end_index,
token_count=token_count,
sentences=sentences,
)

if self.return_type == "chunks":
# Compute chunk text and token count from sentences
text = "".join(sent.text for sent in sentences)
token_count = sum(sent.token_count for sent in sentences)
return SemanticChunk(
text=text,
start_index=sentences[0].start_index,
end_index=sentences[-1].end_index,
token_count=token_count,
sentences=sentences,
)
elif self.return_type == "texts":
return "".join(sent.text for sent in sentences)
else:
raise ValueError("Invalid return_type. Must be either 'chunks' or 'texts'.")

def _split_chunks(
self, sentence_groups: List[List[Sentence]]
) -> List[SemanticChunk]:
Expand Down
Loading

0 comments on commit 8f5a9a5

Please sign in to comment.