
Commit

modified: EduNLP/SIF/tokenization/text/tokenization.py
	modified:   tests/test_tokenizer/test_tokenizer.py
KINGNEWBLUSH committed Mar 12, 2024
1 parent 1476f8a commit f02ccce
Showing 2 changed files with 24 additions and 10 deletions.
2 changes: 1 addition & 1 deletion EduNLP/SIF/tokenization/text/tokenization.py
@@ -101,7 +101,7 @@ def tokenize(text,
     elif (tokenizer == 'bpe'):
         try:
             tokenizer = HGTokenizer.from_file('bpeTokenizer.json')
-        except OSError:
+        except :
             tokenizer = huggingface_tokenizer.Tokenizer(
                 huggingface_tokenizer.models.BPE())
             if (bpe_trainfile is None):
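
For context, the 'bpe' branch above follows a load-or-rebuild pattern: deserialize a tokenizer from bpeTokenizer.json if possible, otherwise construct (and later train) a fresh BPE model. A minimal standalone sketch of that pattern against the HuggingFace tokenizers package; the corpus.txt training file and [UNK] token are illustrative placeholders rather than part of the diff (EduNLP's own wrapper takes a bpe_trainfile argument instead):

# Load-or-rebuild sketch, assuming the HuggingFace `tokenizers` package;
# "corpus.txt" and "[UNK]" are placeholders.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

try:
    # Reuse a previously serialized tokenizer if one exists on disk.
    tokenizer = Tokenizer.from_file("bpeTokenizer.json")
except Exception:
    # Tokenizer.from_file can raise a generic exception (not only OSError)
    # when the file is missing or malformed, which is presumably why the
    # diff widens the handler from `except OSError:` to a bare `except:`.
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(special_tokens=["[UNK]"])
    tokenizer.train(files=["corpus.txt"], trainer=trainer)  # placeholder corpus
    tokenizer.save("bpeTokenizer.json")

print(tokenizer.encode("The stationery store has 600 exercise books").tokens)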
32 changes: 23 additions & 9 deletions tests/test_tokenizer/test_tokenizer.py
@@ -51,20 +51,34 @@ def test_CharTokenizer():
     assert ret == ans
 
 
-def test_Tokenizer():
-    items = ["""The stationery store has 600 exercise books, and after selling
-    some, there are still 4 packs left, 25 each, how many are sold?"""]
+def test_TokenizerNLTK():
+    items = ["The stationery store has 600 exercise books, and after selling\
+    some, there are still 4 packs left, 25 each, how many are sold?"]
     ans = [
         'The', 'stationery', 'store', 'has', '600', 'exercise',
         'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still',
         '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
     ]
-    for tok in ['nltk', 'spacy']:
-        tokenizer = get_tokenizer("pure_text",
-                                  text_params={"tokenizer": tok, "stopwords": set(",?")})
-        tokens = tokenizer(items)
-        ret = next(tokens)
-        assert ret == ans
+    tokenizer = get_tokenizer("pure_text",
+                              text_params={"tokenizer": 'nltk', "stopwords": set(",?")})
+    tokens = tokenizer(items)
+    ret = next(tokens)
+    assert ret == ans
+
+
+def test_TokenizerSpacy():
+    items = ["The stationery store has 600 exercise books, and after selling\
+    some, there are still 4 packs left, 25 each, how many are sold?"]
+    ans = [
+        'The', 'stationery', 'store', 'has', '600', 'exercise',
+        'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still',
+        '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
+    ]
+    tokenizer = get_tokenizer("pure_text",
+                              text_params={"tokenizer": 'spacy', "stopwords": set(",?")})
+    tokens = tokenizer(items)
+    ret = next(tokens)
+    assert ret == ans
 
 
 def test_TokenizerBPE():
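
The split into per-backend tests reflects a real difference between the two tokenizers: the backslash line continuation in items leaves a run of spaces inside the sentence, and spaCy preserves that extra whitespace as a token of its own (the extra ' ' in the spaCy ans), while NLTK drops it. A small sketch of the contrast, assuming nltk (with its punkt tokenizer data) and spacy are installed; spacy.blank is used here purely for illustration:

# Contrast NLTK and spaCy on a sentence with a double space, mirroring
# the run of spaces the line continuation leaves inside `items`.
import nltk
import spacy

text = "and after selling  some, there are still 4 packs left"

nltk.download("punkt", quiet=True)
print(nltk.word_tokenize(text))
# NLTK drops the extra space:
# ['and', 'after', 'selling', 'some', ',', 'there', ...]

nlp = spacy.blank("en")  # tokenizer-only English pipeline
print([t.text for t in nlp(text)])
# spaCy keeps it as a whitespace token:
# ['and', 'after', 'selling', ' ', 'some', ',', 'there', ...]

The stopwords=set(",?") filter in the tests removes the punctuation in both cases, but nothing filters the whitespace token, so only the spaCy expectation carries the extra ' '.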
