From 9db9114a405e94c56012318eb8c1b031edeb0f41 Mon Sep 17 00:00:00 2001 From: Kristian-Knudsen Date: Wed, 6 Dec 2023 21:35:35 +0100 Subject: [PATCH] added support for parentheses --- src/text_extraction/postprocessing.py | 4 ++-- src/text_extraction/test/test_postprocessing.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/text_extraction/postprocessing.py b/src/text_extraction/postprocessing.py index 451a5f8..0f8b752 100644 --- a/src/text_extraction/postprocessing.py +++ b/src/text_extraction/postprocessing.py @@ -17,7 +17,7 @@ def clean_sentence(sentence: str) -> str: def clean_word(word: str) -> str: """ Cleans a word making it as clean as possible for positive detection in the spell_checking module """ - invalid_characters = ['.', '-', ';', ','] + invalid_characters = ['.', '-', ';', ',', '(', ')'] invalid_numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] # Pre cleaning for common start point @@ -36,6 +36,6 @@ def clean_word(word: str) -> str: # Check if any char of the word is in the invalid_characters if any(char in word for char in invalid_characters): - word = re.sub(r"[\.\,\-\;]*", "", word) + word = re.sub(r"[\.\,\-\;\(\)]*", "", word) return word diff --git a/src/text_extraction/test/test_postprocessing.py b/src/text_extraction/test/test_postprocessing.py index 3f6d641..5e5b662 100644 --- a/src/text_extraction/test/test_postprocessing.py +++ b/src/text_extraction/test/test_postprocessing.py @@ -28,7 +28,7 @@ def test_spaces_removed(self): def test_invalid_characters_removed(self): """ Check if invalid characters are correctly removed """ # Arrange - to_test = "te.-,;st;" + to_test = "te.-,;s())t;" expected = "test" # Act actual = clean_word(to_test) @@ -38,7 +38,7 @@ def test_invalid_characters_removed(self): def test_clean_sentence(self): """ Test to check if a sentence is properly cleaned """ # Arrange - to_test = "this is a ; long se123ntence \n\n " + to_test = "this is a ; long se123ntence () \n\n " expected = 
"this is a long sentence" # Act