Allow words with dashes/apostrophe returned from tokenizer
miso-belica committed Mar 7, 2020
1 parent 2a1d63b commit 612e071
Showing 2 changed files with 7 additions and 2 deletions.
4 changes: 2 additions & 2 deletions sumy/nlp/tokenizers.py
@@ -59,7 +59,7 @@ def tokenize(self, text):
 class Tokenizer(object):
     """Language dependent tokenizer of text document."""
 
-    _WORD_PATTERN = re.compile(r"^[^\W\d_]+$", re.UNICODE)
+    _WORD_PATTERN = re.compile(r"^[^\W\d_](?:[^\W\d_]|['-])*$", re.UNICODE)
     # feel free to contribute if you have better tokenizer for any of these languages :)
     LANGUAGE_ALIASES = {
         "slovak": "czech",
@@ -127,4 +127,4 @@ def to_words(self, sentence):
 
     @staticmethod
     def _is_word(word):
-        return bool(Tokenizer._WORD_PATTERN.search(word))
+        return bool(Tokenizer._WORD_PATTERN.match(word))
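The pattern change is the core of the commit: a word must now start with a letter ([^\W\d_] means "word character, minus digits and underscore") and may continue with letters, dashes, or apostrophes. The switch from search to match is behavior-neutral here, since the pattern is anchored with ^ and $. A minimal standalone sketch of the new behavior:

import re

# Updated pattern: leading letter, then any mix of letters, dashes,
# or apostrophes.
WORD_PATTERN = re.compile(r"^[^\W\d_](?:[^\W\d_]|['-])*$", re.UNICODE)

for token in ("sugar-free", "Peter's", "word", "'s", "-", "x2"):
    print(token, bool(WORD_PATTERN.match(token)))
# sugar-free True
# Peter's True
# word True
# 's False   (must start with a letter)
# - False
# x2 False   (digits are still rejected)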
5 changes: 5 additions & 0 deletions tests/test_tokenizers.py
@@ -40,6 +40,11 @@ def test_language_getter():
         "I am a very nice sentence with comma, but..",
         ("I", "am", "a", "very", "nice", "sentence", "with", "comma"),
     ),
+    (
+        "english",
+        "I am doing sugar-free data-mining for Peter's study - vega punk.",
+        ("I", "am", "doing", "sugar-free", "data-mining", "for", "Peter", "study", "vega", "punk"),
+    ),
     (
         "japanese",
         "この文章を、正しくトークン化したい。",
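For reference, a quick way to reproduce the new test case by hand, assuming sumy is installed along with NLTK's "punkt" tokenizer data:

from sumy.nlp.tokenizers import Tokenizer

tokenizer = Tokenizer("english")
print(tokenizer.to_words(
    "I am doing sugar-free data-mining for Peter's study - vega punk."))
# Expected, per the test above:
# ('I', 'am', 'doing', 'sugar-free', 'data-mining', 'for', 'Peter', 'study', 'vega', 'punk')
# NLTK's word tokenizer splits "Peter's" into "Peter" + "'s"; the "'s"
# piece fails _WORD_PATTERN (it does not start with a letter), so only
# "Peter" survives, and the lone "-" and trailing "." are dropped.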
