Merge pull request #81 from bhavnicksm/development
Expose the separation delim for simple multilingual chunking
bhavnicksm authored Dec 6, 2024
2 parents 98af028 + 9a7a458 commit 809d3a2
Showing 3 changed files with 23 additions and 19 deletions.
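
The change itself is small: instead of hard-coding the English-centric sentence delimiters, each chunker now accepts a delim argument, so text in other languages can be split on its own punctuation. A minimal usage sketch of what this enables, assuming a package-level SentenceChunker export and a .text attribute on the returned chunks (neither is shown in this diff):

from chonkie import SentenceChunker

# Hypothetical multilingual usage: Japanese sentence-final punctuation
# instead of the default [".", "!", "?", "\n"]. All other constructor
# arguments are assumed to have usable defaults.
chunker = SentenceChunker(delim=["。", "！", "？", "\n"])

for chunk in chunker.chunk("これはペンです。これは本です。"):
    print(chunk.text)
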
3 changes: 3 additions & 0 deletions src/chonkie/chunker/sdpm.py
@@ -40,6 +40,7 @@ def __init__(
min_chunk_size: int = 2,
min_characters_per_sentence: int = 12,
threshold_step: float = 0.01,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
skip_window: int = 1,
):
"""Initialize the SDPMChunker.
@@ -54,6 +55,7 @@ def __init__(
min_chunk_size: Minimum number of tokens per sentence
min_characters_per_sentence: Minimum number of characters per sentence
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
skip_window: Number of chunks to skip when looking for similarities
"""
@@ -67,6 +69,7 @@ def __init__(
min_chunk_size=min_chunk_size,
min_characters_per_sentence=min_characters_per_sentence,
threshold_step=threshold_step,
delim=delim,
)
self.skip_window = skip_window

19 changes: 12 additions & 7 deletions src/chonkie/chunker/semantic.py
@@ -23,7 +23,8 @@ class SemanticChunker(BaseChunker):
min_characters_per_sentence: Minimum number of characters per sentence
min_chunk_size: Minimum number of tokens per sentence (defaults to 2)
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
"""

def __init__(
@@ -37,6 +38,7 @@ def __init__(
min_chunk_size: int = 2,
min_characters_per_sentence: int = 12,
threshold_step: float = 0.01,
delim: Union[str, List[str]] = [".", "!", "?", "\n"]
):
"""Initialize the SemanticChunker.
@@ -52,6 +54,7 @@ def __init__(
min_characters_per_sentence: Minimum number of characters per sentence
min_chunk_size: Minimum number of tokens per chunk (and sentence, defaults to 2)
threshold_step: Step size for similarity threshold calculation
delim: Delimiters to split sentences on
Raises:
ValueError: If parameters are invalid
@@ -72,6 +75,8 @@ def __init__(
raise ValueError("mode must be 'cumulative' or 'window'")
if type(threshold) not in [str, float, int]:
raise ValueError("threshold must be a string, float, or int")
if type(delim) not in [str, list]:
raise ValueError("delim must be a string or list of strings")
elif type(threshold) == str and threshold not in ["auto"]:
raise ValueError("threshold must be 'auto', 'smart', or 'percentile'")
elif type(threshold) == float and (threshold < 0 or threshold > 1):
@@ -87,7 +92,9 @@ def __init__(
self.min_chunk_size = min_chunk_size
self.min_characters_per_sentence = min_characters_per_sentence
self.threshold_step = threshold_step

self.delim = delim
self.sep = "🦛"

if isinstance(threshold, float):
self.similarity_threshold = threshold
self.similarity_percentile = None
@@ -123,8 +130,6 @@ def __init__(
def _split_sentences(
self,
text: str,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
sep: str = "🦛",
) -> List[str]:
"""Fast sentence splitting while maintaining accuracy.
@@ -140,11 +145,11 @@ def _split_sentences(
"""
t = text
for c in delim:
t = t.replace(c, c + sep)
for c in self.delim:
t = t.replace(c, c + self.sep)

# Initial split
splits = [s for s in t.split(sep) if s != ""]
splits = [s for s in t.split(self.sep) if s != ""]
# print(splits)

# Combine short splits with previous sentence
20 changes: 8 additions & 12 deletions src/chonkie/chunker/sentence.py
@@ -31,6 +31,7 @@ def __init__(
min_sentences_per_chunk: int = 1,
min_chunk_size: int = 2,
use_approximate: bool = True,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
):
"""Initialize the SentenceChunker with configuration parameters.
@@ -64,6 +65,8 @@ def __init__(
self.min_sentences_per_chunk = min_sentences_per_chunk
self.min_chunk_size = min_chunk_size
self.use_approximate = use_approximate
self.delim = delim
self.sep = "🦛"

# TODO: This is a older method of sentence splitting that uses Regex
# but since Regex in python via re is super slooooow we use a different method
@@ -144,31 +147,24 @@

# return sentences

def _split_sentences(
self,
text: str,
delim: Union[str, List[str]] = [".", "!", "?", "\n"],
sep: str = "🦛",
) -> List[str]:
def _split_sentences(self, text: str) -> List[str]:
"""Fast sentence splitting while maintaining accuracy.
This method is faster than using regex for sentence splitting and is more accurate than using the spaCy sentence tokenizer.
Args:
text: Input text to be split into sentences
delim: Delimiters to split sentences on
sep: Separator to use when splitting sentences
Returns:
List of sentences
"""
t = text
for c in delim:
t = t.replace(c, c + sep)
for c in self.delim:
t = t.replace(c, c + self.sep)

# Initial split
splits = [s for s in t.split(sep) if s != ""]
splits = [s for s in t.split(self.sep) if s != ""]
# print(splits)

# Combine short splits with previous sentence
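
For reference, the splitting trick shared by all three chunkers is simple: append a rare sentinel (the hippo "🦛") after every delimiter, then split on the sentinel so each delimiter stays attached to the sentence it ends. A standalone sketch of that logic, mirroring the diff above:

from typing import List, Union

def split_sentences(
    text: str,
    delim: Union[str, List[str]] = [".", "!", "?", "\n"],
    sep: str = "🦛",
) -> List[str]:
    """Split text into sentences, keeping delimiters attached."""
    t = text
    # Mark every sentence boundary with the sentinel...
    for c in delim:
        t = t.replace(c, c + sep)
    # ...then split on the sentinel and drop empty pieces.
    return [s for s in t.split(sep) if s != ""]

print(split_sentences("Hello! How are you? Fine."))
# -> ['Hello!', ' How are you?', ' Fine.']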
