From f02ccce028a005a12ebeb26f26940cd3615f289e Mon Sep 17 00:00:00 2001
From: KINGNEWBLUSH <102594899+KINGNEWBLUSH@users.noreply.github.com>
Date: Tue, 12 Mar 2024 03:12:56 +0000
Subject: [PATCH] modified:   EduNLP/SIF/tokenization/text/tokenization.py
	modified:   tests/test_tokenizer/test_tokenizer.py

---
 EduNLP/SIF/tokenization/text/tokenization.py |  2 +-
 tests/test_tokenizer/test_tokenizer.py       | 32 ++++++++++++++------
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/EduNLP/SIF/tokenization/text/tokenization.py b/EduNLP/SIF/tokenization/text/tokenization.py
index 34dfcf24..8563fe49 100644
--- a/EduNLP/SIF/tokenization/text/tokenization.py
+++ b/EduNLP/SIF/tokenization/text/tokenization.py
@@ -101,7 +101,7 @@ def tokenize(text,
     elif (tokenizer == 'bpe'):
         try:
             tokenizer = HGTokenizer.from_file('bpeTokenizer.json')
-        except OSError:
+        except Exception:
             tokenizer = huggingface_tokenizer.Tokenizer(
                 huggingface_tokenizer.models.BPE())
             if (bpe_trainfile is None):
diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py
index f3e2dca3..5202fa96 100644
--- a/tests/test_tokenizer/test_tokenizer.py
+++ b/tests/test_tokenizer/test_tokenizer.py
@@ -51,20 +51,34 @@ def test_CharTokenizer():
     assert ret == ans
 
 
-def test_Tokenizer():
-    items = ["""The stationery store has 600 exercise books, and after selling
-    some, there are still 4 packs left, 25 each, how many are sold?"""]
+def test_TokenizerNLTK():
+    items = ["The stationery store has 600 exercise books, and after selling\
+        some, there are still 4 packs left, 25 each, how many are sold?"]
     ans = [
         'The', 'stationery', 'store', 'has', '600', 'exercise',
         'books', 'and', 'after', 'selling', 'some', 'there', 'are', 'still',
         '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
     ]
-    for tok in ['nltk', 'spacy']:
-        tokenizer = get_tokenizer("pure_text",
-                                  text_params={"tokenizer": tok, "stopwords": set(",?")})
-        tokens = tokenizer(items)
-        ret = next(tokens)
-        assert ret == ans
+    tokenizer = get_tokenizer("pure_text",
+                              text_params={"tokenizer": 'nltk', "stopwords": set(",?")})
+    tokens = tokenizer(items)
+    ret = next(tokens)
+    assert ret == ans
+
+
+def test_TokenizerSpacy():
+    items = ["The stationery store has 600 exercise books, and after selling\
+        some, there are still 4 packs left, 25 each, how many are sold?"]
+    ans = [
+        'The', 'stationery', 'store', 'has', '600', 'exercise',
+        'books', 'and', 'after', 'selling', ' ', 'some', 'there', 'are', 'still',
+        '4', 'packs', 'left', '25', 'each', 'how', 'many', 'are', 'sold'
+    ]
+    tokenizer = get_tokenizer("pure_text",
+                              text_params={"tokenizer": 'spacy', "stopwords": set(",?")})
+    tokens = tokenizer(items)
+    ret = next(tokens)
+    assert ret == ans
 
 
 def test_TokenizerBPE():