Merge pull request #81 from bhavnicksm/development
Expose the separation delim for simple multilingual chunking
bhavnicksm authored Dec 6, 2024
2 parents 98af028 + 9a7a458 commit 809d3a2
Showing 3 changed files with 23 additions and 19 deletions.
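
The change itself is small: instead of hard-coding the English-centric sentence delimiters, each chunker now accepts a delim argument, so text in other languages can be split on its own punctuation. A minimal usage sketch of what this enables, assuming a package-level SentenceChunker export and a .text attribute on the returned chunks (neither is shown in this diff):

from chonkie import SentenceChunker

# Hypothetical multilingual usage: Japanese sentence-final punctuation
# instead of the default [".", "!", "?", "\n"]. All other constructor
# arguments are assumed to have usable defaults.
chunker = SentenceChunker(delim=["。", "！", "？", "\n"])

for chunk in chunker.chunk("これはペンです。これは本です。"):
    print(chunk.text)
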
3 changes: 3 additions & 0 deletions src/chonkie/chunker/sdpm.py
@@ -40,6 +40,7 @@ def __init__(
min_chunk_size: int = 2,
min_characters_per_sentence: int = 12,
threshold_step: float = 0.01,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
skip_window: int = 1,
):
"""Initialize the SDPMChunker.
@@ -54,6 +55,7 @@ def __init__(
min_chunk_size: Minimum number of tokens per sentence
min_characters_per_sentence: Minimum number of characters per sentence
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
skip_window: Number of chunks to skip when looking for similarities
"""
@@ -67,6 +69,7 @@ def __init__(
min_chunk_size=min_chunk_size,
min_characters_per_sentence=min_characters_per_sentence,
threshold_step=threshold_step,
delim=delim,
)
self.skip_window = skip_window

19 changes: 12 additions & 7 deletions src/chonkie/chunker/semantic.py
@@ -23,7 +23,8 @@ class SemanticChunker(BaseChunker):
min_characters_per_sentence: Minimum number of characters per sentence
min_chunk_size: Minimum number of tokens per sentence (defaults to 2)
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
"""

def __init__(
@@ -37,6 +38,7 @@ def __init__(
min_chunk_size: int = 2,
min_characters_per_sentence: int = 12,
threshold_step: float = 0.01,
delim: Union[str, List[str]] = [".", "!", "?", "\n"]
):
"""Initialize the SemanticChunker.
@@ -52,6 +54,7 @@ def __init__(
min_characters_per_sentence: Minimum number of characters per sentence
min_chunk_size: Minimum number of tokens per chunk (and sentence, defaults to 2)
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
Raises:
ValueError: If parameters are invalid
@@ -72,6 +75,8 @@ def __init__(
raise ValueError("mode must be 'cumulative' or 'window'")
if type(threshold) not in [str, float, int]:
raise ValueError("threshold must be a string, float, or int")
if type(delim) not in [str, list]:
raise ValueError("delim must be a string or list of strings")
elif type(threshold) == str and threshold not in ["auto"]:
raise ValueError("threshold must be 'auto', 'smart', or 'percentile'")
elif type(threshold) == float and (threshold < 0 or threshold > 1):
@@ -87,7 +92,9 @@ def __init__(
self.min_chunk_size = min_chunk_size
self.min_characters_per_sentence = min_characters_per_sentence
self.threshold_step = threshold_step

self.delim = delim
self.sep = "🦛"

if isinstance(threshold, float):
self.similarity_threshold = threshold
self.similarity_percentile = None
@@ -123,8 +130,6 @@ def __init__(
def _split_sentences(
self,
text: str,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
sep: str = "🦛",
) -> List[str]:
"""Fast sentence splitting while maintaining accuracy.
@@ -140,11 +145,11 @@ def _split_sentences(
"""
t = text
for c in delim:
t = t.replace(c, c + sep)
for c in self.delim:
t = t.replace(c, c + self.sep)

# Initial split
splits = [s for s in t.split(sep) if s != ""]
splits = [s for s in t.split(self.sep) if s != ""]
# print(splits)

# Combine short splits with previous sentence
20 changes: 8 additions & 12 deletions src/chonkie/chunker/sentence.py
@@ -31,6 +31,7 @@ def __init__(
min_sentences_per_chunk: int = 1,
min_chunk_size: int = 2,
use_approximate: bool = True,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
):
"""Initialize the SentenceChunker with configuration parameters.
@@ -64,6 +65,8 @@ def __init__(
self.min_sentences_per_chunk = min_sentences_per_chunk
self.min_chunk_size = min_chunk_size
self.use_approximate = use_approximate
self.delim = delim
self.sep = "🦛"

# TODO: This is a older method of sentence splitting that uses Regex
# but since Regex in python via re is super slooooow we use a different method
@@ -144,31 +147,24 @@

# return sentences

def _split_sentences(
self,
text: str,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
sep: str = "🦛",
) -> List[str]:
def _split_sentences(self, text: str) -> List[str]:
"""Fast sentence splitting while maintaining accuracy.
This method is faster than using regex for sentence splitting and is more accurate than using the spaCy sentence tokenizer.
Args:
text: Input text to be split into sentences
delim: Delimiters to split sentences on
sep: Separator to use when splitting sentences
Returns:
List of sentences
"""
t = text
for c in delim:
t = t.replace(c, c + sep)
for c in self.delim:
t = t.replace(c, c + self.sep)

# Initial split
splits = [s for s in t.split(sep) if s != ""]
splits = [s for s in t.split(self.sep) if s != ""]
# print(splits)

# Combine short splits with previous sentence
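
For reference, the splitting trick shared by all three chunkers is simple: append a rare sentinel (the hippo "🦛") after every delimiter, then split on the sentinel so each delimiter stays attached to the sentence it ends. A standalone sketch of that logic, mirroring the diff above:

from typing import List, Union

def split_sentences(
    text: str,
    delim: Union[str, List[str]] = [".", "!", "?", "\n"],
    sep: str = "🦛",
) -> List[str]:
    """Split text into sentences, keeping delimiters attached."""
    t = text
    # Mark every sentence boundary with the sentinel...
    for c in delim:
        t = t.replace(c, c + sep)
    # ...then split on the sentinel and drop empty pieces.
    return [s for s in t.split(sep) if s != ""]

print(split_sentences("Hello! How are you? Fine."))
# -> ['Hello!', ' How are you?', ' Fine.']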
