From 8da8a364ddd0f7d3dfd99106c3f352f9565173eb Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Sat, 7 Dec 2024 03:06:38 +0530
Subject: [PATCH] Expose the separation delim for simple multilingual chunking

---
 src/chonkie/chunker/sdpm.py     |  3 +++
 src/chonkie/chunker/semantic.py | 19 ++++++++++++-------
 src/chonkie/chunker/sentence.py | 20 ++++++++------------
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/src/chonkie/chunker/sdpm.py b/src/chonkie/chunker/sdpm.py
index 4337dc3..7101c26 100644
--- a/src/chonkie/chunker/sdpm.py
+++ b/src/chonkie/chunker/sdpm.py
@@ -40,6 +40,7 @@ def __init__(
         min_chunk_size: int = 2,
         min_characters_per_sentence: int = 12,
         threshold_step: float = 0.01,
+        delim: Union[str, List[str]] = [".", "!", "?", "\n"],
         skip_window: int = 1,
     ):
         """Initialize the SDPMChunker.
@@ -54,6 +55,7 @@ def __init__(
             min_chunk_size: Minimum number of tokens per sentence
             min_characters_per_sentence: Minimum number of characters per sentence
             threshold_step: Step size for similarity threshold calculation
+            delim: Delimiters to split sentences on
             skip_window: Number of chunks to skip when looking for similarities
 
         """
@@ -67,6 +69,7 @@ def __init__(
             min_chunk_size=min_chunk_size,
             min_characters_per_sentence=min_characters_per_sentence,
             threshold_step=threshold_step,
+            delim=delim,
         )
         self.skip_window = skip_window
 
diff --git a/src/chonkie/chunker/semantic.py b/src/chonkie/chunker/semantic.py
index 7ab1d11..7e3ecee 100644
--- a/src/chonkie/chunker/semantic.py
+++ b/src/chonkie/chunker/semantic.py
@@ -23,7 +23,8 @@ class SemanticChunker(BaseChunker):
         min_characters_per_sentence: Minimum number of characters per sentence
         min_chunk_size: Minimum number of tokens per sentence (defaults to 2)
         threshold_step: Step size for similarity threshold calculation
-
+        delim: Delimiters to split sentences on
+
     """
 
     def __init__(
@@ -37,6 +38,7 @@ def __init__(
         min_chunk_size: int = 2,
         min_characters_per_sentence: int = 12,
         threshold_step: float = 0.01,
+        delim: Union[str, List[str]] = [".", "!", "?", "\n"]
     ):
         """Initialize the SemanticChunker.
 
@@ -52,6 +54,7 @@ def __init__(
             min_characters_per_sentence: Minimum number of characters per sentence
             min_chunk_size: Minimum number of tokens per chunk (and sentence, defaults to 2)
             threshold_step: Step size for similarity threshold calculation
+            delim: Delimiters to split sentences on
 
         Raises:
             ValueError: If parameters are invalid
@@ -72,6 +75,8 @@ def __init__(
             raise ValueError("mode must be 'cumulative' or 'window'")
         if type(threshold) not in [str, float, int]:
             raise ValueError("threshold must be a string, float, or int")
+        if type(delim) not in [str, list]:
+            raise ValueError("delim must be a string or list of strings")
         elif type(threshold) == str and threshold not in ["auto"]:
             raise ValueError("threshold must be 'auto', 'smart', or 'percentile'")
         elif type(threshold) == float and (threshold < 0 or threshold > 1):
@@ -87,7 +92,9 @@ def __init__(
         self.min_chunk_size = min_chunk_size
         self.min_characters_per_sentence = min_characters_per_sentence
         self.threshold_step = threshold_step
-
+        self.delim = delim
+        self.sep = "🦛"
+
         if isinstance(threshold, float):
             self.similarity_threshold = threshold
             self.similarity_percentile = None
@@ -123,8 +130,6 @@ def __init__(
     def _split_sentences(
         self,
         text: str,
-        delim: Union[str, List[str]] = [".", "!", "?", "\n"],
-        sep: str = "🦛",
     ) -> List[str]:
         """Fast sentence splitting while maintaining accuracy.
 
@@ -140,11 +145,11 @@ def _split_sentences(
 
         """
         t = text
-        for c in delim:
-            t = t.replace(c, c + sep)
+        for c in self.delim:
+            t = t.replace(c, c + self.sep)
 
         # Initial split
-        splits = [s for s in t.split(sep) if s != ""]
+        splits = [s for s in t.split(self.sep) if s != ""]
         # print(splits)
 
         # Combine short splits with previous sentence
diff --git a/src/chonkie/chunker/sentence.py b/src/chonkie/chunker/sentence.py
index ed72cff..7883ca3 100644
--- a/src/chonkie/chunker/sentence.py
+++ b/src/chonkie/chunker/sentence.py
@@ -31,6 +31,7 @@ def __init__(
         min_sentences_per_chunk: int = 1,
         min_chunk_size: int = 2,
         use_approximate: bool = True,
+        delim: Union[str, List[str]] = [".", "!", "?", "\n"],
     ):
         """Initialize the SentenceChunker with configuration parameters.
 
@@ -64,6 +65,8 @@ def __init__(
         self.min_sentences_per_chunk = min_sentences_per_chunk
         self.min_chunk_size = min_chunk_size
         self.use_approximate = use_approximate
+        self.delim = delim
+        self.sep = "🦛"
 
         # TODO: This is a older method of sentence splitting that uses Regex
         # but since Regex in python via re is super slooooow we use a different method
@@ -144,31 +147,24 @@ def __init__(
 
     #     return sentences
 
-    def _split_sentences(
-        self,
-        text: str,
-        delim: Union[str, List[str]] = [".", "!", "?", "\n"],
-        sep: str = "🦛",
-    ) -> List[str]:
+    def _split_sentences(self, text: str) -> List[str]:
         """Fast sentence splitting while maintaining accuracy.
 
         This method is faster than using regex for sentence splitting and is more accurate than using the spaCy sentence tokenizer.
 
         Args:
             text: Input text to be split into sentences
-            delim: Delimiters to split sentences on
-            sep: Separator to use when splitting sentences
-
+
         Returns:
             List of sentences
 
         """
         t = text
-        for c in delim:
-            t = t.replace(c, c + sep)
+        for c in self.delim:
+            t = t.replace(c, c + self.sep)
 
         # Initial split
-        splits = [s for s in t.split(sep) if s != ""]
+        splits = [s for s in t.split(self.sep) if s != ""]
         # print(splits)
 
         # Combine short splits with previous sentence
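
Note for reviewers: the snippet below is a minimal standalone sketch of what the newly exposed delim parameter controls. It mirrors only the initial-split step of _split_sentences from this patch (suffix every delimiter with the "🦛" separator, split on it, and drop empty pieces) and does not reproduce the later merging of short splits. The multilingual delimiter list, the sample text, and the commented SentenceChunker call are illustrative assumptions, not tested usage.

from typing import List, Union

def split_sentences(text: str,
                    delim: Union[str, List[str]] = [".", "!", "?", "\n"],
                    sep: str = "🦛") -> List[str]:
    # Same trick as the patched _split_sentences: append a rare separator
    # token after every delimiter, then split on that token.
    t = text
    for c in delim:
        t = t.replace(c, c + sep)
    return [s for s in t.split(sep) if s != ""]

hindi = "यह पहला वाक्य है। यह दूसरा वाक्य है।"

# Default (English-oriented) delimiters miss the Devanagari danda, so the
# text comes back as a single piece.
print(split_sentences(hindi))
# -> ['यह पहला वाक्य है। यह दूसरा वाक्य है।']

# Passing a custom delim list splits it into two sentences.
print(split_sentences(hindi, delim=["।", ".", "!", "?", "\n"]))
# -> ['यह पहला वाक्य है।', ' यह दूसरा वाक्य है।']

# With this patch the same list can be handed straight to the chunkers,
# e.g. (other constructor arguments left at their defaults, illustrative only):
# chunker = SentenceChunker(delim=["।", "。", ".", "!", "?", "\n"])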