Allow words with dashes/apostrophe returned from tokenizer
miso-belica committed Mar 7, 2020
1 parent 2a1d63b commit 612e071
Showing 2 changed files with 7 additions and 2 deletions.
4 changes: 2 additions & 2 deletions sumy/nlp/tokenizers.py
@@ -59,7 +59,7 @@ def tokenize(self, text):
 class Tokenizer(object):
     """Language dependent tokenizer of text document."""
 
-    _WORD_PATTERN = re.compile(r"^[^\W\d_]+$", re.UNICODE)
+    _WORD_PATTERN = re.compile(r"^[^\W\d_](?:[^\W\d_]|['-])*$", re.UNICODE)
     # feel free to contribute if you have better tokenizer for any of these languages :)
     LANGUAGE_ALIASES = {
         "slovak": "czech",
@@ -127,4 +127,4 @@ def to_words(self, sentence):
 
     @staticmethod
     def _is_word(word):
-        return bool(Tokenizer._WORD_PATTERN.search(word))
+        return bool(Tokenizer._WORD_PATTERN.match(word))
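The pattern change is the core of the commit: a word must now start with a letter ([^\W\d_] means "word character, minus digits and underscore") and may continue with letters, dashes, or apostrophes. The switch from search to match is behavior-neutral here, since the pattern is anchored with ^ and $. A minimal standalone sketch of the new behavior:

import re

# Updated pattern: leading letter, then any mix of letters, dashes,
# or apostrophes.
WORD_PATTERN = re.compile(r"^[^\W\d_](?:[^\W\d_]|['-])*$", re.UNICODE)

for token in ("sugar-free", "Peter's", "word", "'s", "-", "x2"):
    print(token, bool(WORD_PATTERN.match(token)))
# sugar-free True
# Peter's True
# word True
# 's False   (must start with a letter)
# - False
# x2 False   (digits are still rejected)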
5 changes: 5 additions & 0 deletions tests/test_tokenizers.py
@@ -40,6 +40,11 @@ def test_language_getter():
         "I am a very nice sentence with comma, but..",
         ("I", "am", "a", "very", "nice", "sentence", "with", "comma"),
     ),
+    (
+        "english",
+        "I am doing sugar-free data-mining for Peter's study - vega punk.",
+        ("I", "am", "doing", "sugar-free", "data-mining", "for", "Peter", "study", "vega", "punk"),
+    ),
     (
         "japanese",
         "この文章を、正しくトークン化したい。",
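For reference, a quick way to reproduce the new test case by hand, assuming sumy is installed along with NLTK's "punkt" tokenizer data:

from sumy.nlp.tokenizers import Tokenizer

tokenizer = Tokenizer("english")
print(tokenizer.to_words(
    "I am doing sugar-free data-mining for Peter's study - vega punk."))
# Expected, per the test above:
# ('I', 'am', 'doing', 'sugar-free', 'data-mining', 'for', 'Peter', 'study', 'vega', 'punk')
# NLTK's word tokenizer splits "Peter's" into "Peter" + "'s"; the "'s"
# piece fails _WORD_PATTERN (it does not start with a letter), so only
# "Peter" survives, and the lone "-" and trailing "." are dropped.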
