From 7560fff935824c58128327573819bf8f67c8692c Mon Sep 17 00:00:00 2001 From: juhoinkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:39:36 +0200 Subject: [PATCH] Catch OperationFailed by _normalize_word() and log text being tokenized --- annif/analyzer/analyzer.py | 17 ++++++++++++----- tests/test_analyzer.py | 21 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py index 5ba876f9d..652b3bdb9 100644 --- a/annif/analyzer/analyzer.py +++ b/annif/analyzer/analyzer.py @@ -5,6 +5,8 @@ import functools import unicodedata +from annif.exception import OperationFailedException + _KEY_TOKEN_MIN_LENGTH = "token_min_length" @@ -44,11 +46,16 @@ def tokenize_words(self, text: str, filter: bool = True) -> list[str]: import nltk.tokenize - return [ - self._normalize_word(word) - for word in nltk.tokenize.word_tokenize(text) - if (not filter or self.is_valid_token(word)) - ] + try: + return [ + self._normalize_word(word) + for word in nltk.tokenize.word_tokenize(text) + if (not filter or self.is_valid_token(word)) + ] + except OperationFailedException as err: + raise OperationFailedException( + f"Error in tokenization of text '{text}'" + ) from err def _normalize_word(self, word): """Normalize (stem or lemmatize) a word form into a normal form.""" diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py index ecddfbb37..f6376bb55 100644 --- a/tests/test_analyzer.py +++ b/tests/test_analyzer.py @@ -1,8 +1,11 @@ """Unit tests for analyzers in Annif""" +import importlib.util +from unittest import mock import pytest import annif.analyzer +from annif.exception import OperationFailedException def test_get_analyzer_nonexistent(): @@ -60,6 +63,24 @@ def test_english_tokenize_words_no_filter(): assert len(words) == 23 +@pytest.mark.skipif( + importlib.util.find_spec("voikko") is None, + reason="test requires that Voikko is installed", +) +def test_tokenize_words_operationfailed(): + analyzer = annif.analyzer.get_analyzer("voikko(fi)") + text = "An error producing sentence." + with mock.patch( + "voikko.libvoikko.Voikko.analyze", + side_effect=ValueError, + ): + with pytest.raises( + OperationFailedException, + match="Error in tokenization of text 'An error producing sentence.'", + ): + analyzer.tokenize_words(text) + + def test_english_filter_words_min_token(): analyzer = annif.analyzer.get_analyzer("snowball(english,token_min_length=2)") text = """Since 2000, a 3D printer can be used to print