From 5ca814482a8a2af1f4fa499144cd17d5a170f523 Mon Sep 17 00:00:00 2001 From: juhoinkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 2 Nov 2023 08:58:09 +0200 Subject: [PATCH 1/2] Catch ValueError by Voikko.analyze --- annif/analyzer/voikko.py | 10 +++++++++- tests/test_analyzer_voikko.py | 15 +++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/annif/analyzer/voikko.py b/annif/analyzer/voikko.py index e6e693d65..82e2fdcbf 100644 --- a/annif/analyzer/voikko.py +++ b/annif/analyzer/voikko.py @@ -5,6 +5,8 @@ import voikko.libvoikko +from annif.exception import OperationFailedException + from . import analyzer @@ -27,7 +29,13 @@ def __getstate__(self) -> dict[str, str | None]: def _normalize_word(self, word: str) -> str: if self.voikko is None: self.voikko = voikko.libvoikko.Voikko(self.param) - result = self.voikko.analyze(word) + try: + result = self.voikko.analyze(word) + except ValueError as err: + raise OperationFailedException( + f"Voikko error in analysis of word '{word}'" + ) from err + if len(result) > 0 and "BASEFORM" in result[0]: return result[0]["BASEFORM"] return word diff --git a/tests/test_analyzer_voikko.py b/tests/test_analyzer_voikko.py index 6cba156f7..74bf0a971 100644 --- a/tests/test_analyzer_voikko.py +++ b/tests/test_analyzer_voikko.py @@ -1,8 +1,11 @@ """Unit tests for voikko analyzer in Annif""" +from unittest import mock + import pytest import annif.analyzer +from annif.exception import OperationFailedException voikko = pytest.importorskip("annif.analyzer.voikko") @@ -18,3 +21,15 @@ def test_voikko_finnish_analyzer_normalize_word(): assert analyzer._normalize_word("xyzzy") == "xyzzy" assert analyzer._normalize_word("vanhat") == "vanha" assert analyzer._normalize_word("koirien") == "koira" + + +def test_voikko_analyze_valueerror(): + analyzer = annif.analyzer.get_analyzer("voikko(fi)") + with mock.patch( + "voikko.libvoikko.Voikko.analyze", + side_effect=ValueError, + ): + with pytest.raises( + OperationFailedException, match="Voikko error in analysis of word 'kissa'" + ): + assert analyzer._normalize_word("kissa") From 7560fff935824c58128327573819bf8f67c8692c Mon Sep 17 00:00:00 2001 From: juhoinkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:39:36 +0200 Subject: [PATCH 2/2] Catch OperationFailed by _normalize_word() and log text being tokenized --- annif/analyzer/analyzer.py | 17 ++++++++++++----- tests/test_analyzer.py | 21 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py index 5ba876f9d..652b3bdb9 100644 --- a/annif/analyzer/analyzer.py +++ b/annif/analyzer/analyzer.py @@ -5,6 +5,8 @@ import functools import unicodedata +from annif.exception import OperationFailedException + _KEY_TOKEN_MIN_LENGTH = "token_min_length" @@ -44,11 +46,16 @@ def tokenize_words(self, text: str, filter: bool = True) -> list[str]: import nltk.tokenize - return [ - self._normalize_word(word) - for word in nltk.tokenize.word_tokenize(text) - if (not filter or self.is_valid_token(word)) - ] + try: + return [ + self._normalize_word(word) + for word in nltk.tokenize.word_tokenize(text) + if (not filter or self.is_valid_token(word)) + ] + except OperationFailedException as err: + raise OperationFailedException( + f"Error in tokenization of text '{text}'" + ) from err def _normalize_word(self, word): """Normalize (stem or lemmatize) a word form into a normal form.""" diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py index ecddfbb37..f6376bb55 100644 --- a/tests/test_analyzer.py +++ b/tests/test_analyzer.py @@ -1,8 +1,11 @@ """Unit tests for analyzers in Annif""" +import importlib.util +from unittest import mock import pytest import annif.analyzer +from annif.exception import OperationFailedException def test_get_analyzer_nonexistent(): @@ -60,6 +63,24 @@ def test_english_tokenize_words_no_filter(): assert len(words) == 23 +@pytest.mark.skipif( + importlib.util.find_spec("voikko") is None, + reason="test requires that Voikko is installed", +) +def test_tokenize_words_operationfailed(): + analyzer = annif.analyzer.get_analyzer("voikko(fi)") + text = "An error producing sentence." + with mock.patch( + "voikko.libvoikko.Voikko.analyze", + side_effect=ValueError, + ): + with pytest.raises( + OperationFailedException, + match="Error in tokenization of text 'An error producing sentence.'", + ): + analyzer.tokenize_words(text) + + def test_english_filter_words_min_token(): analyzer = annif.analyzer.get_analyzer("snowball(english,token_min_length=2)") text = """Since 2000, a 3D printer can be used to print