Skip to content

Commit

Permalink
Add patterns and unicode replacements
Browse files Browse the repository at this point in the history
  • Loading branch information
bhavnicksm committed Jan 28, 2025
1 parent 14d78c6 commit ea8c82f
Show file tree
Hide file tree
Showing 2 changed files with 191 additions and 32 deletions.
84 changes: 84 additions & 0 deletions src/chonkie/chef/patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Common text patterns and abbreviations used across chefs."""

from dataclasses import dataclass
from typing import Set, Dict

class Abbreviations:
    """Common abbreviations grouped by category.

    Each upper-case, annotated class attribute below is one category of
    abbreviations. The class is a plain namespace of constants and is
    deliberately *not* a ``@dataclass``: the dataclass machinery rejects
    mutable defaults such as ``set`` with a ``ValueError`` while the class is
    being created, which would make this module impossible to import.
    """

    # Titles and honorifics
    TITLES: Set[str] = {
        'Mr.', 'Mrs.', 'Ms.', 'Dr.', 'Prof.',
        'Sr.', 'Jr.', 'Rev.', 'Hon.',
    }

    # Academic and professional
    ACADEMIC: Set[str] = {
        'Ph.D.', 'M.D.', 'B.A.', 'M.A.', 'B.Sc.',
        'M.Sc.', 'D.Phil.', 'LL.B.', 'LL.M.',
    }

    # Latin abbreviations
    LATIN: Set[str] = {
        'etc.', 'e.g.', 'i.e.', 'viz.',
        'vs.', 'al.', 'et al.', 'cf.',
    }

    # Military and government
    MILITARY: Set[str] = {
        'Gen.', 'Col.', 'Lt.', 'Sgt.', 'Capt.',
        'Maj.', 'Adm.', 'Gov.', 'Sen.', 'Rep.',
    }

    # Common measurements
    MEASUREMENTS: Set[str] = {
        'cm.', 'mm.', 'km.', 'kg.', 'lb.',
        'ft.', 'in.', 'hr.', 'min.', 'sec.',
    }

    # Business and organization
    BUSINESS: Set[str] = {
        'Inc.', 'Ltd.', 'Corp.', 'Co.', 'LLC.',
        'dept.', 'div.', 'est.', 'avg.', 'approx.',
    }

    # Temporal abbreviations
    TEMPORAL: Set[str] = {
        'Jan.', 'Feb.', 'Mar.', 'Apr.', 'Jun.',
        'Jul.', 'Aug.', 'Sep.', 'Sept.', 'Oct.',
        'Nov.', 'Dec.', 'Mon.', 'Tue.', 'Wed.',
        'Thu.', 'Fri.', 'Sat.', 'Sun.',
    }

    # Geographical abbreviations
    GEOGRAPHICAL: Set[str] = {
        'U.S.', 'U.S.A.', 'U.K.', 'E.U.',
        'Ave.', 'Blvd.', 'Rd.', 'St.', 'Mt.',
    }

    @classmethod
    def all(cls) -> Set[str]:
        """Return all abbreviations as a single set.

        Returns:
            A new set holding the union of every category; mutating it does
            not affect the class-level constants.
        """
        return set().union(*cls.by_category().values())

    @classmethod
    def by_category(cls) -> Dict[str, Set[str]]:
        """Return abbreviations grouped by category.

        Returns:
            A dict mapping the lower-cased attribute name (e.g. ``'titles'``)
            to a copy of that category's set.
        """
        # Categories are discovered through the class annotations, so adding
        # a new upper-case annotated attribute is enough to register it.
        return {
            name.lower(): set(getattr(cls, name))
            for name in cls.__annotations__
            if name.isupper()
        }


@dataclass(frozen=True, slots=True)
class UnicodeReplacements:
"""Common Unicode characters used for replacements."""

DOT_LEADER = '․' # U+2024 ONE DOT LEADER
ELLIPSIS = '…' # U+2026 HORIZONTAL ELLIPSIS
139 changes: 107 additions & 32 deletions src/chonkie/chef/text.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,114 @@
from typing import Optional, Union, List
import re

from chonkie.chef.base import BaseChef

from chonkie.chef.patterns import Abbreviations, UnicodeReplacements
class TextChef(BaseChef):
    """A chef that handles basic text processing.

    This chef handles basic text files and performs common text cleaning
    operations: whitespace normalization, newline normalization, ellipsis
    replacement and protection of the periods inside known abbreviations.
    Every operation can be switched off through the constructor.
    """

    def __init__(self,
                 whitespace: bool = True,
                 newlines: bool = True,
                 abbreviations: bool = True,
                 ellipsis: bool = True,
                 sentence_endings: str = '.!?;:') -> None:
        """Initialize the TextChef with common text file extensions.

        Args:
            whitespace: Collapse runs of whitespace inside each line.
            newlines: Drop blank lines and join lines that do not end with a
                sentence-ending character.
            abbreviations: Replace the periods of known abbreviations with a
                one-dot-leader character.
            ellipsis: Replace runs of three or more periods with the Unicode
                ellipsis character.
            sentence_endings: Characters that mark the end of a sentence when
                deciding whether two lines should be joined.
        """
        extensions = ['.txt', '.md', '.rst', '.text']
        super().__init__(extensions=extensions)

        # Initialize the flags
        self._enable_whitespace = whitespace
        self._enable_newlines = newlines
        self._enable_abbreviations = abbreviations
        self._enable_ellipsis = ellipsis

        # Initialize the sentence endings
        self._sentence_endings = sentence_endings

        # Initialize the patterns
        self._abbreviations = Abbreviations.all() if abbreviations else set()
        self._unicode_replacements = UnicodeReplacements()

        # Compiling the regex patterns
        self._ellipsis_pattern = re.compile(r'\.{3,}')
        self._newline_pattern = re.compile(r'\n+')
        self._abbreviation_pattern = self._compile_abbreviation_pattern()

    def _compile_abbreviation_pattern(self) -> "Optional[re.Pattern]":
        """Build one regex that matches any configured abbreviation.

        Returns:
            The compiled pattern, or None when no abbreviations are configured
            (an empty alternation would match the empty string everywhere).
        """
        if not self._abbreviations:
            return None

        # Longest first so 'U.S.A.' wins over 'U.S.' and 'et al.' over 'al.';
        # the secondary alphabetical key keeps the pattern deterministic even
        # though the abbreviations come from an unordered set.
        ordered = sorted(self._abbreviations, key=lambda item: (-len(item), item))
        alternation = '|'.join(re.escape(item) for item in ordered)

        # The lookbehind rejects matches that start in the middle of a word
        # ('in.' inside 'begin.') while still allowing a preceding digit
        # ('5cm.').
        return re.compile(r'(?<![^\W\d_])(?:' + alternation + r')')

    def _handle_abbreviations(self, text: str) -> str:
        """Replace the fullstop in abbreviations with a dot leader.

        Abbreviations are matched literally (their periods are not regex
        wildcards) and case-sensitively.

        Args:
            text: Input text

        Returns:
            Text in which every period inside a matched abbreviation has been
            replaced by the dot-leader character.
        """
        pattern = self._abbreviation_pattern
        if pattern is None:
            return text

        dot_leader = self._unicode_replacements.DOT_LEADER
        return pattern.sub(
            lambda match: match.group(0).replace('.', dot_leader),
            text,
        )

    def _replace_ellipsis(self, text: str) -> str:
        """Replace ellipsis with Unicode ellipsis character.

        Args:
            text: Input text

        Returns:
            Text with ellipsis replaced
        """
        # Replace any sequence of 3 or more dots with ellipsis character
        return self._ellipsis_pattern.sub(self._unicode_replacements.ELLIPSIS, text)

    def _normalize_whitespace(self, text: str) -> str:
        r"""Normalize whitespace in text.

        Whitespace is collapsed line by line so that line breaks survive for
        the newline normalization that runs afterwards in ``clean``; collapsing
        the whole text at once would erase every newline.

        Args:
            text: Input text

        Returns:
            Text with ``\r\n``/``\r`` converted to ``\n``, each line stripped
            and its internal whitespace runs reduced to a single space, and
            leading/trailing blank lines removed.
        """
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        collapsed = (' '.join(line.split()) for line in text.split('\n'))
        return '\n'.join(collapsed).strip()

    def _normalize_newlines(self, text: str) -> str:
        r"""Normalize newlines in text.

        Blank lines are dropped, and a line is joined to the following one
        with a single space unless it ends with one of the configured
        sentence-ending characters.

        Args:
            text: Input text

        Returns:
            Text with normalized newlines, using ``\n`` as the only line
            separator.
        """
        # Normalize newlines
        text = text.replace('\r\n', '\n').replace('\r', '\n')

        merged: List[str] = []
        pending: List[str] = []

        # Splitting on runs of newlines skips consecutive blank lines; lines
        # that hold only whitespace are filtered out after stripping.
        for raw_line in self._newline_pattern.split(text):
            line = raw_line.strip()
            if not line:
                continue

            pending.append(line)

            # A sentence-ending character closes the current output line.
            if line[-1] in self._sentence_endings:
                merged.append(' '.join(pending))
                pending = []

        if pending:
            merged.append(' '.join(pending))

        return '\n'.join(merged)

    def clean(self, text: str) -> str:
        r"""Clean the text by performing basic text processing operations.

        A common function where one can enable/disable the operations
        supported by the chef. The enabled operations run in this order:
        whitespace, newlines, ellipsis, abbreviations.

        Args:
            text: The text to clean.

        Returns:
            The cleaned text; empty input is returned unchanged.
        """
        if not text:
            return text

        # Normalize whitespace
        if self._enable_whitespace:
            text = self._normalize_whitespace(text)

        # Normalize newlines
        if self._enable_newlines:
            text = self._normalize_newlines(text)

        # Replace ellipsis
        if self._enable_ellipsis:
            text = self._replace_ellipsis(text)

        # Replace abbreviations
        if self._enable_abbreviations:
            text = self._handle_abbreviations(text)

        return text

0 comments on commit ea8c82f

Please sign in to comment.