Skip to content

Commit

Permalink
added support for parentheses
Browse files Browse the repository at this point in the history
  • Loading branch information
Kristian-Knudsen committed Dec 6, 2023
1 parent 054fe0b commit 9db9114
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 4 deletions.
4 changes: 2 additions & 2 deletions src/text_extraction/postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def clean_sentence(sentence: str) -> str:
def clean_word(word: str) -> str:
""" Cleans a word making it as clean as possible for
positive detection in the spell_checking module """
invalid_characters = ['.', '-', ';', ',']
invalid_characters = ['.', '-', ';', ',', '(', ')']
invalid_numbers = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

# Pre cleaning for common start point
Expand All @@ -36,6 +36,6 @@ def clean_word(word: str) -> str:

# Check if any char of the word is in the invalid_characters
if any(char in word for char in invalid_characters):
word = re.sub(r"[\.\,\-\;]*", "", word)
word = re.sub(r"[\.\,\-\;\(\)]*", "", word)

return word
4 changes: 2 additions & 2 deletions src/text_extraction/test/test_postprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_spaces_removed(self):
def test_invalid_characters_removed(self):
""" Check if invalid characters are correctly removed """
# Arrange
to_test = "te.-,;st;"
to_test = "te.-,;s())t;"
expected = "test"
# Act
actual = clean_word(to_test)
Expand All @@ -38,7 +38,7 @@ def test_invalid_characters_removed(self):
def test_clean_sentence(self):
""" Test to check if a sentence is properly cleaned """
# Arrange
to_test = "this is a ; long se123ntence \n\n "
to_test = "this is a ; long se123ntence () \n\n "
expected = "this is a long sentence"

# Act
Expand Down

0 comments on commit 9db9114

Please sign in to comment.