Add patterns and unicode replacements

chonkie-ai · Jan 28, 2025 · ea8c82f · ea8c82f
1 parent 14d78c6
commit ea8c82f
Show file tree

Hide file tree

Showing 2 changed files with 191 additions and 32 deletions.
diff --git a/src/chonkie/chef/patterns.py b/src/chonkie/chef/patterns.py
@@ -0,0 +1,84 @@
+"""Common text patterns and abbreviations used across chefs."""
+
+from dataclasses import dataclass
+from typing import ClassVar, Dict, Set
+
+@dataclass
+class Abbreviations:
+    """Common abbreviations grouped by category.
+
+    Every upper-case attribute is a class-level constant holding the
+    abbreviations of one category. They are annotated as ``ClassVar`` because
+    ``@dataclass`` rejects a bare ``set`` default on a regular field (it raises
+    ``ValueError`` while the class is being created), and because these values
+    are shared constants rather than per-instance state.
+    """
+
+    # Titles and honorifics
+    TITLES: ClassVar[Set[str]] = {
+        'Mr.', 'Mrs.', 'Ms.', 'Dr.', 'Prof.',
+        'Sr.', 'Jr.', 'Rev.', 'Hon.',
+    }
+
+    # Academic and professional
+    ACADEMIC: ClassVar[Set[str]] = {
+        'Ph.D.', 'M.D.', 'B.A.', 'M.A.', 'B.Sc.',
+        'M.Sc.', 'D.Phil.', 'LL.B.', 'LL.M.',
+    }
+
+    # Latin abbreviations
+    LATIN: ClassVar[Set[str]] = {
+        'etc.', 'e.g.', 'i.e.', 'viz.',
+        'vs.', 'al.', 'et al.', 'cf.',
+    }
+
+    # Military and government
+    MILITARY: ClassVar[Set[str]] = {
+        'Gen.', 'Col.', 'Lt.', 'Sgt.', 'Capt.',
+        'Maj.', 'Adm.', 'Gov.', 'Sen.', 'Rep.',
+    }
+
+    # Common measurements
+    MEASUREMENTS: ClassVar[Set[str]] = {
+        'cm.', 'mm.', 'km.', 'kg.', 'lb.',
+        'ft.', 'in.', 'hr.', 'min.', 'sec.',
+    }
+
+    # Business and organization
+    BUSINESS: ClassVar[Set[str]] = {
+        'Inc.', 'Ltd.', 'Corp.', 'Co.', 'LLC.',
+        'dept.', 'div.', 'est.', 'avg.', 'approx.',
+    }
+
+    # Temporal abbreviations
+    TEMPORAL: ClassVar[Set[str]] = {
+        'Jan.', 'Feb.', 'Mar.', 'Apr.', 'Jun.',
+        'Jul.', 'Aug.', 'Sep.', 'Sept.', 'Oct.',
+        'Nov.', 'Dec.', 'Mon.', 'Tue.', 'Wed.',
+        'Thu.', 'Fri.', 'Sat.', 'Sun.',
+    }
+
+    # Geographical abbreviations
+    GEOGRAPHICAL: ClassVar[Set[str]] = {
+        'U.S.', 'U.S.A.', 'U.K.', 'E.U.',
+        'Ave.', 'Blvd.', 'Rd.', 'St.', 'Mt.',
+    }
+
+    @classmethod
+    def all(cls) -> Set[str]:
+        """Return all abbreviations as a single set.
+
+        Returns:
+            A new set holding the union of every category
+
+        """
+        return set().union(*cls.by_category().values())
+
+    @classmethod
+    def by_category(cls) -> Dict[str, Set[str]]:
+        """Return abbreviations grouped by category.
+
+        Returns:
+            A dict mapping the lower-cased category name (e.g. ``'titles'``)
+            to a copy of that category's set, so callers cannot mutate the
+            class-level constants by accident
+
+        """
+        # Categories are discovered through the class annotations; only
+        # upper-case names count as categories.
+        return {
+            name.lower(): set(getattr(cls, name))
+            for name in cls.__annotations__
+            if name.isupper()
+        }
+
+
+@dataclass(frozen=True, slots=True)
+class UnicodeReplacements:
+    """Unicode stand-ins substituted for ASCII punctuation.
+
+    Both values are plain class attributes: they carry no annotation, so the
+    dataclass machinery does not turn them into fields.
+    """
+
+    # U+2024 ONE DOT LEADER
+    DOT_LEADER = '․'
+    # U+2026 HORIZONTAL ELLIPSIS
+    ELLIPSIS = '…'
diff --git a/src/chonkie/chef/text.py b/src/chonkie/chef/text.py
@@ -1,46 +1,114 @@
 from typing import Optional, Union, List
+import re
 
 from chonkie.chef.base import BaseChef
-
+from chonkie.chef.patterns import Abbreviations, UnicodeReplacements
 class TextChef(BaseChef):
     """A chef that handles basic text processing.
     
     This chef handles basic text files and performs common text cleaning operations
     like removing extra whitespace, normalizing line endings, etc.
     """
 
-    def __init__(self) -> None:
+    def __init__(self, 
+                 whitespace: bool = True,
+                 newlines: bool = True,
+                 abbreviations: bool = True,
+                 ellipsis: bool = True, 
+                 sentence_endings: str = '.!?;:') -> None:
         """Initialize the TextChef with common text file extensions."""
-        super().__init__(extensions=['.txt', '.md', '.rst', '.text'])
+        # whitespace, newlines, abbreviations and ellipsis each switch one
+        # step of clean() on or off. sentence_endings holds the characters
+        # that _normalize_newlines treats as ending a line: a line that ends
+        # with one of them is not merged with the line that follows it.
+        extensions = ['.txt', '.md', '.rst', '.text']
+        super().__init__(extensions=extensions)
 
-    def _normalize_spaces(self, text: str) -> str:
-        """Normalize spaces in the text."""
-        return ' '.join(text.split())
-
-    def _remove_empty_lines(self, text: str) -> str:
-        """Remove empty lines from the text and preserve paragraph breaks."""
-        # Split the text into lines
-        lines = text.split('\n')
+        # Initialize the flags
+        self._enable_whitespace = whitespace
+        self._enable_newlines = newlines
+        self._enable_abbreviations = abbreviations
+        self._enable_ellipsis = ellipsis
+
+        # Initialize the sentence endings
+        self._sentence_endings = sentence_endings
+
+        # Initialize the patterns
+        # The abbreviation set stays empty when abbreviation handling is off.
+        self._abbreviations = Abbreviations.all() if abbreviations else set()
+        self._unicode_replacements = UnicodeReplacements()
+
+        # Compiling the regex patterns
+        # Three or more consecutive dots; used by _replace_ellipsis.
+        self._ellipsis_pattern = re.compile(r'\.{3,}')
+        # One or more consecutive newlines; used by _normalize_newlines.
+        self._newline_pattern = re.compile(r'\n+')
+
+    def _handle_abbreviations(self, text: str) -> str:
+        """Replace the fullstop in abbreviations with a dot leader.
+
+        Abbreviations are matched literally: their dots are escaped, so
+        ``'in.'`` does not match ``'ink'``. A match must not continue a
+        preceding word, so ``'begin.'`` is left alone. Longer abbreviations
+        are tried first so that ``'U.S.A.'`` is rewritten as a whole instead
+        of as ``'U.S.'`` followed by a dangling ``'A.'``.
+
+        Args:
+            text: Input text
+
+        Returns:
+            Text in which every dot of a known abbreviation has been replaced
+            by the dot leader character
+
+        """
+        if not self._abbreviations:
+            return text
+
+        dot_leader = self._unicode_replacements.DOT_LEADER
+        # Longest first: regex alternation takes the first branch that matches.
+        ordered = sorted(self._abbreviations, key=lambda abbr: (-len(abbr), abbr))
+        pattern = re.compile(
+            r'(?<!\w)(?:' + '|'.join(re.escape(abbr) for abbr in ordered) + ')'
+        )
+        return pattern.sub(lambda m: m.group(0).replace('.', dot_leader), text)
+
+    def _replace_ellipsis(self, text: str) -> str:
+        """Swap each run of dots for the Unicode ellipsis character.
+
+        Args:
+            text: Input text
+
+        Returns:
+            Text in which every match of the ellipsis pattern (three or more
+            consecutive dots) has been replaced by one ellipsis character
+
+        """
+        ellipsis_char = self._unicode_replacements.ELLIPSIS
+        collapsed = self._ellipsis_pattern.sub(ellipsis_char, text)
+        return collapsed
+
+    def _normalize_whitespace(self, text: str) -> str:
+        """Collapse runs of spaces and tabs while keeping line breaks.
+
+        Line breaks (LF and CR) are deliberately left alone so that
+        ``_normalize_newlines``, which ``clean`` runs after this step, still
+        has line structure to work with. Splitting on all whitespace would
+        flatten the text to a single line first.
+
+        Args:
+            text: Input text
+
+        Returns:
+            Text with each run of non-newline whitespace reduced to one space,
+            no spaces next to a line break, and both ends trimmed
 
-        # Remove empty lines
-        lines = [line for line in lines if line.strip()]
+        """
+        # Collapse each run of non-newline whitespace into a single space
+        text = re.sub(r'[^\S\r\n]+', ' ', text)
+        # Drop the space that may remain on either side of a line break
+        text = re.sub(r' ?([\r\n]+) ?', r'\1', text)
+        # Trim leading and trailing whitespace
+        return text.strip()
+
+    def _normalize_newlines(self, text: str) -> str:
+        """Normalize line endings and merge lines that stop mid-sentence.
+
+        CRLF and CR line endings become LF, every line is stripped, and blank
+        lines are dropped. A line is appended to the previously kept line
+        (joined by a single space) unless that line ends with one of the
+        configured sentence-ending characters; in that case it starts a new
+        line of the result.
+
+        Args:
+            text: Input text
+
+        Returns:
+            Text with normalized newlines
 
-        # Join the lines back together
-        return '\n'.join(lines)
+        """
+        # Normalize line endings
+        text = text.replace('\r\n', '\n').replace('\r', '\n')
 
-    def _normalize_line_endings(self, text: str) -> str:
-        r"""Normalize line endings to \n."""
-        return text.replace('\r\n', '\n').replace('\r', '\n')
+        result = []
+        for raw_line in text.split('\n'):
+            line = raw_line.strip()
+            if not line:
+                # Blank lines are dropped here, so runs of newlines need no
+                # separate collapsing step.
+                continue
+
+            if result and result[-1][-1] not in self._sentence_endings:
+                # The previous line stopped mid-sentence: continue it
+                result[-1] = result[-1] + ' ' + line
+            else:
+                result.append(line)
+
+        return '\n'.join(result)
 
     def clean(self, text: str) -> str:
         r"""Clean the text by performing basic text processing operations.
         
-        Operations performed:
-        - Normalize line endings to \n
-        - Remove redundant empty lines
-        - Strip whitespace from start/end
-        - Replace multiple spaces with single space
-        
+        A common function where one can enable/disable the operations supported by the chef.
+
         Args:
             text: The text to clean.
         
@@ -50,14 +118,21 @@ def clean(self, text: str) -> str:
         """
         if not text:
             return text
-
-        # Normalize line endings
-        text = self._normalize_line_endings(text)
 
-        # Remove redundant empty lines
-        text = self._remove_empty_lines(text)
+        # Normalize whitespace
+        if self._enable_whitespace:
+            text = self._normalize_whitespace(text)
+
+        # Normalize newlines
+        if self._enable_newlines:
+            text = self._normalize_newlines(text)
+
+        # Replace ellipsis
+        if self._enable_ellipsis:
+            text = self._replace_ellipsis(text)
+
+        # Replace abbreviations
+        if self._enable_abbreviations:
+            text = self._handle_abbreviations(text)
 
-        # Replace multiple spaces with single space
-        text = self._normalize_spaces(text)
-
         return text