Skip to content

Commit

Permalink
Update clean_text parameters to be more general (punc -> char)
Browse files Browse the repository at this point in the history
  • Loading branch information
harrykeightley committed Oct 16, 2023
1 parent e05981c commit a097692
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 12 deletions.
2 changes: 1 addition & 1 deletion elpis/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from elpis.datasets.dataset import CleaningOptions, Dataset, ProcessingBatch
from elpis.datasets.preprocessing import process_batch
- from elpis.datasets.processing import create_dataset, prepare_dataset
+ from elpis.datasets.processing import prepare_dataset, create_dataset

__all__ = [
"CleaningOptions",
Expand Down
24 changes: 15 additions & 9 deletions elpis/datasets/clean_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,31 +5,37 @@
def clean_text(
text: str,
words_to_remove: Optional[List[str]] = None,
- punctuation_to_explode: str = "",
- punctuation_to_remove: str = "",
+ characters_to_explode: str = "",
+ characters_to_remove: str = "",
+ to_lower=True,
) -> str:
"""Cleans the text based on the supplied options.
Parameters:
text: The text to clean.
- options: The cleaning options.
+ words_to_remove: Words to remove from the text.
+ characters_to_remove: A string of chars to remove from the text.
+ characters_to_explode: A string of chars to replace with spaces in the text.
+ to_lower: True iff the resulting text should be converted to lower case.
+ Converts to uppercase if False.
Returns:
The cleaned text
"""
- words = text.upper().split()
+ words = text.split()

if words_to_remove is not None:
words = filter(lambda word: word not in words_to_remove, words)

- if punctuation_to_explode != "":
- words = map(lambda word: explode(word, punctuation_to_explode), words)
+ if characters_to_explode != "":
+ words = map(lambda word: explode(word, characters_to_explode), words)

- if punctuation_to_remove != "":
- words = map(lambda word: collapse(word, punctuation_to_remove), words)
+ if characters_to_remove != "":
+ words = map(lambda word: collapse(word, characters_to_remove), words)

result = " ".join(words).strip()
- return remove_consecutive_spaces(result)
+ result = remove_consecutive_spaces(result)
+ return result.lower() if to_lower else result.upper()


def explode(text: str, pattern: str) -> str:
Expand Down
4 changes: 2 additions & 2 deletions elpis/datasets/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ def clean_annotation(
transcript = clean_text(
text=annotation.transcript,
words_to_remove=cleaning_options.words_to_remove,
- punctuation_to_explode=cleaning_options.punctuation_to_explode,
- punctuation_to_remove=cleaning_options.punctuation_to_remove,
+ characters_to_explode=cleaning_options.punctuation_to_explode,
+ characters_to_remove=cleaning_options.punctuation_to_remove,
)
result = copy(annotation)
result.transcript = transcript
Expand Down

0 comments on commit a097692

Please sign in to comment.