Update clean_text parameters to be more general (punctuation_to_* -> characters_to_*) and add a to_lower option

CoEDL · Oct 16, 2023 · a097692
1 parent e05981c
commit a097692
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 12 deletions.
diff --git a/elpis/datasets/__init__.py b/elpis/datasets/__init__.py
@@ -1,6 +1,6 @@
 from elpis.datasets.dataset import CleaningOptions, Dataset, ProcessingBatch
 from elpis.datasets.preprocessing import process_batch
-from elpis.datasets.processing import create_dataset, prepare_dataset
+from elpis.datasets.processing import prepare_dataset, create_dataset
 
 __all__ = [
     "CleaningOptions",

diff --git a/elpis/datasets/clean_text.py b/elpis/datasets/clean_text.py
@@ -5,31 +5,37 @@
 def clean_text(
     text: str,
     words_to_remove: Optional[List[str]] = None,
-    punctuation_to_explode: str = "",
-    punctuation_to_remove: str = "",
+    characters_to_explode: str = "",
+    characters_to_remove: str = "",
+    to_lower=True,
 ) -> str:
     """Cleans the text based on the supplied options.
 
     Parameters:
         text: The text to clean.
-        options: The cleaning options.
+        words_to_remove: Words to remove from the text.
+        characters_to_remove: A string of chars to remove from the text.
+        characters_to_explode: A string of chars to replace with spaces in the text.
+        to_lower: True iff the resulting text should be converted to lower case.
+            Converts to uppercase if False.
 
     Returns:
         The cleaned text
     """
-    words = text.upper().split()
+    words = text.split()
 
     if words_to_remove is not None:
         words = filter(lambda word: word not in words_to_remove, words)
 
-    if punctuation_to_explode != "":
-        words = map(lambda word: explode(word, punctuation_to_explode), words)
+    if characters_to_explode != "":
+        words = map(lambda word: explode(word, characters_to_explode), words)
 
-    if punctuation_to_remove != "":
-        words = map(lambda word: collapse(word, punctuation_to_remove), words)
+    if characters_to_remove != "":
+        words = map(lambda word: collapse(word, characters_to_remove), words)
 
     result = " ".join(words).strip()
-    return remove_consecutive_spaces(result)
+    result = remove_consecutive_spaces(result)
+    return result.lower() if to_lower else result.upper()
 
 
 def explode(text: str, pattern: str) -> str:

diff --git a/elpis/datasets/preprocessing.py b/elpis/datasets/preprocessing.py
@@ -63,8 +63,8 @@ def clean_annotation(
     transcript = clean_text(
         text=annotation.transcript,
         words_to_remove=cleaning_options.words_to_remove,
-        punctuation_to_explode=cleaning_options.punctuation_to_explode,
-        punctuation_to_remove=cleaning_options.punctuation_to_remove,
+        characters_to_explode=cleaning_options.punctuation_to_explode,
+        characters_to_remove=cleaning_options.punctuation_to_remove,
     )
     result = copy(annotation)
     result.transcript = transcript