Commit
Merge pull request #79 from bhavnicksm/development
[FEAT] Add "auto" threshold configuration via Statistical analysis in SemanticChunker + minor fixes
bhavnicksm authored Dec 6, 2024
2 parents 27458b3 + 2bbb54d commit 1e784c2
Showing 6 changed files with 297 additions and 123 deletions.
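For context, a minimal usage sketch of the feature this commit introduces. The import path and the `chunk`/`.text` API are assumptions based on chonkie's public interface; the parameter names and defaults come from the diff below.

```python
from chonkie import SDPMChunker  # assumption: exported at package level

# threshold="auto" (the new default) picks a similarity threshold
# statistically from the text instead of requiring a hand-tuned value;
# a float in (0, 1] or a percentile int in (1, 100) also works.
chunker = SDPMChunker(
    embedding_model="minishlab/potion-base-8M",
    mode="window",
    threshold="auto",
    chunk_size=512,
    skip_window=1,
)

chunks = chunker.chunk("Some long document text. It has many sentences...")
for chunk in chunks:
    print(len(chunk.text), repr(chunk.text[:40]))
```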
4 changes: 2 additions & 2 deletions src/chonkie/chunker/base.py
@@ -172,7 +172,7 @@ def _get_tokenizer_counter(self) -> Callable[[str], int]:
         else:
             raise ValueError("Tokenizer backend not supported for token counting")

-    def _encode(self, text: str):
+    def _encode(self, text: str) -> List[int]:
         """Encode text using the backend tokenizer."""
         if self._tokenizer_backend == "transformers":
             return self.tokenizer.encode(text, add_special_tokens=False)
@@ -185,7 +185,7 @@ def _encode(self, text: str):
f"Tokenizer backend {self._tokenizer_backend} not supported."
)

def _encode_batch(self, texts: List[str]):
def _encode_batch(self, texts: List[str]) -> List[List[int]]:
"""Encode a batch of texts using the backend tokenizer."""
if self._tokenizer_backend == "transformers":
return self.tokenizer.batch_encode_plus(texts, add_special_tokens=False)[
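The base.py change only adds return annotations; behavior is unchanged. A short sketch of how the now-typed methods are consumed; the `TokenChunker` setup is an assumption for the sake of a runnable snippet, and `_encode`/`_encode_batch` are internal helpers called here purely for illustration:

```python
from chonkie import TokenChunker  # assumption: a concrete chunker exposing the base API

chunker = TokenChunker(tokenizer="gpt2")  # assumption: default tokenizer choice

token_ids = chunker._encode("Hello, world!")           # -> List[int]
batch_ids = chunker._encode_batch(["Hello.", "Bye."])  # -> List[List[int]]

# The annotations guarantee plain lists of ints, so token counting is len().
print(len(token_ids), [len(ids) for ids in batch_ids])
```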
62 changes: 40 additions & 22 deletions src/chonkie/chunker/sdpm.py
@@ -1,3 +1,4 @@
"""Semantic Double Pass Merge chunking using sentence embeddings."""
from typing import Any, List, Union

from .semantic import SemanticChunk, SemanticChunker, Sentence
@@ -20,42 +21,54 @@ class SDPMChunker(SemanticChunker):
         skip_window: Number of chunks to skip when looking for similarities
         min_chunk_size: Minimum number of tokens per sentence

     Methods:
         chunk: Split text into chunks using the SDPM approach.
     """

     def __init__(
         self,
         embedding_model: Union[str, Any] = "minishlab/potion-base-8M",
-        similarity_threshold: float = None,
-        similarity_percentile: float = None,
+        mode: str = "window",
+        threshold: Union[str, float, int] = "auto",
         chunk_size: int = 512,
-        initial_sentences: int = 1,
-        skip_window: int = 1,  # How many chunks to skip when looking for similarities
-        min_chunk_size: int = 2,  # Minimum number of tokens per sentence
+        similarity_window: int = 1,
+        min_sentences: int = 1,
+        min_chunk_size: int = 2,
+        min_characters_per_sentence: int = 12,
+        threshold_step: float = 0.01,
+        skip_window: int = 1,
     ):
"""Initialize the SDPMChunker.
Args:
embedding_model: Sentence embedding model to use
similarity_threshold: Minimum similarity score to consider sentences similar
similarity_percentile: Minimum similarity percentile to consider sentences similar
mode: Mode for grouping sentences, either "cumulative" or "window"
threshold: Threshold for semantic similarity (0-1) or percentile (1-100), defaults to "auto"
chunk_size: Maximum token count for a chunk
initial_sentences: Number of sentences to consider for initial grouping
skip_window: Number of chunks to skip when looking for similarities
similarity_window: Number of sentences to consider for similarity threshold calculation
min_sentences: Minimum number of sentences per chunk
min_chunk_size: Minimum number of tokens per sentence
min_characters_per_sentence: Minimum number of characters per sentence
threshold_step: Step size for similarity threshold calculation
skip_window: Number of chunks to skip when looking for similarities
"""
         super().__init__(
             embedding_model=embedding_model,
             chunk_size=chunk_size,
-            similarity_threshold=similarity_threshold,
-            similarity_percentile=similarity_percentile,
-            initial_sentences=initial_sentences,
+            mode=mode,
+            threshold=threshold,
+            similarity_window=similarity_window,
+            min_sentences=min_sentences,
+            min_chunk_size=min_chunk_size,
+            min_characters_per_sentence=min_characters_per_sentence,
+            threshold_step=threshold_step,
         )
         self.skip_window = skip_window

     def _merge_groups(self, groups: List[List[Sentence]]) -> List[Sentence]:
-        """Merge the groups together"""
+        """Merge the groups together."""
         merged_group = []
         for group in groups:
             merged_group.extend(group)
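For orientation, the second ("skip") pass that gives SDPM its name compares each group not just with its immediate neighbor but with groups up to skip_window positions ahead, merging across a dissimilar gap when the far side is similar enough. That logic lives in a helper not touched by this hunk; the following is an illustrative sketch of the idea, not chonkie's implementation:

```python
from typing import Callable, List

def skip_and_merge(
    groups: List[List[str]],
    similarity: Callable[[List[str], List[str]], float],
    threshold: float,
    skip_window: int = 1,
) -> List[List[str]]:
    """Illustrative SDPM second pass: bridge up to `skip_window` dissimilar
    groups when a later group is similar to the current one."""
    merged: List[List[str]] = []
    i = 0
    while i < len(groups):
        current = list(groups[i])
        merged_until = i
        # Look ahead at the neighbor plus up to skip_window skipped groups.
        for j in range(i + 1, min(i + 2 + skip_window, len(groups))):
            if similarity(current, groups[j]) >= threshold:
                merged_until = j
        # Absorb everything up to the furthest similar group, gap included.
        for group in groups[i + 1 : merged_until + 1]:
            current.extend(group)
        merged.append(current)
        i = merged_until + 1
    return merged

# Toy demo with a word-overlap "similarity" (purely illustrative):
sim = lambda a, b: len(set(" ".join(a).split()) & set(" ".join(b).split()))
print(skip_and_merge([["cats purr"], ["stocks fell"], ["cats sleep"]], sim, 1))
# -> [['cats purr', 'stocks fell', 'cats sleep']] (the gap is bridged)
```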
@@ -118,8 +131,11 @@ def chunk(self, text: str) -> List[SemanticChunk]:

         # Prepare sentences with precomputed information
         sentences = self._prepare_sentences(text)
-        if len(sentences) < self.initial_sentences:
+        if len(sentences) <= self.min_sentences:
             return [self._create_chunk(sentences)]

+        # Calculate similarity threshold
+        self.similarity_threshold = self._calculate_similarity_threshold(sentences)
+
         # First pass: Group sentences by semantic similarity
         initial_groups = self._group_sentences(sentences)
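The headline feature is the added `_calculate_similarity_threshold` call: with threshold="auto", the threshold is derived from the statistics of the sentence similarities before the first grouping pass. The helper itself is defined in semantic.py and is not part of this diff, so the following is only a rough sketch of how such a statistical search can work, reusing threshold_step from the new signature as the stopping resolution; every other detail is an assumption:

```python
import numpy as np

def calculate_auto_threshold(
    similarities: np.ndarray,       # cosine similarities of adjacent sentences
    low_percentile: float = 10.0,   # assumed lower search bound
    high_percentile: float = 90.0,  # assumed upper search bound
    threshold_step: float = 0.01,
) -> float:
    """Illustrative binary search for a data-driven similarity threshold,
    bounded by percentiles of the observed similarity distribution."""
    lo = float(np.percentile(similarities, low_percentile))
    hi = float(np.percentile(similarities, high_percentile))
    while hi - lo > threshold_step:
        mid = (lo + hi) / 2
        # Fraction of adjacent pairs that would stay grouped at `mid`.
        kept = float((similarities >= mid).mean())
        if kept > 0.5:   # too permissive: raise the threshold
            lo = mid
        else:            # too strict: lower it
            hi = mid
    return (lo + hi) / 2
```

In the diff above, the result is stored on self.similarity_threshold before `_group_sentences` runs, so both the grouping pass and the skip-window merge pass work against the same data-driven value.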
@@ -133,14 +149,16 @@ def chunk(self, text: str) -> List[SemanticChunk]:
         return chunks

     def __repr__(self) -> str:
-        threshold_info = (
-            f"similarity_threshold={self.similarity_threshold}"
-            if self.similarity_threshold is not None
-            else f"similarity_percentile={self.similarity_percentile}"
-        )
+        """Return a string representation of the SDPMChunker."""
         return (
-            f"SPDMChunker(chunk_size={self.chunk_size}, "
-            f"{threshold_info}, "
-            f"initial_sentences={self.initial_sentences}, "
+            f"SPDMChunker(embedding_model={self.embedding_model}, "
+            f"mode={self.mode}, "
+            f"threshold={self.threshold}, "
+            f"chunk_size={self.chunk_size}, "
+            f"similarity_window={self.similarity_window}, "
+            f"min_sentences={self.min_sentences}, "
+            f"min_chunk_size={self.min_chunk_size}, "
+            f"min_characters_per_sentence={self.min_characters_per_sentence}, "
+            f"threshold_step={self.threshold_step}, "
             f"skip_window={self.skip_window})"
         )
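For downstream users, the net API change is a set of renames plus the new "auto" default. Reconstructed from the removed lines above (the old call is shown as a comment; a float threshold plays the role of the old similarity_threshold, and an int the old similarity_percentile):

```python
from chonkie import SDPMChunker  # assumption: exported at package level

# Before this commit:
#   SDPMChunker(similarity_threshold=0.7, initial_sentences=1, min_chunk_size=2)
# After:
chunker = SDPMChunker(threshold=0.7, min_sentences=1, min_chunk_size=2)

print(chunker)  # __repr__ now reports every configuration knob
```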