From 5ca814482a8a2af1f4fa499144cd17d5a170f523 Mon Sep 17 00:00:00 2001
From: juhoinkinen <34240031+juhoinkinen@users.noreply.github.com>
Date: Thu, 2 Nov 2023 08:58:09 +0200
Subject: [PATCH 1/2] Catch ValueError raised by Voikko.analyze

---
 annif/analyzer/voikko.py      | 10 +++++++++-
 tests/test_analyzer_voikko.py | 15 +++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/annif/analyzer/voikko.py b/annif/analyzer/voikko.py
index e6e693d65..82e2fdcbf 100644
--- a/annif/analyzer/voikko.py
+++ b/annif/analyzer/voikko.py
@@ -5,6 +5,8 @@
 
 import voikko.libvoikko
 
+from annif.exception import OperationFailedException
+
 from . import analyzer
 
 
@@ -27,7 +29,13 @@ def __getstate__(self) -> dict[str, str | None]:
     def _normalize_word(self, word: str) -> str:
         if self.voikko is None:
             self.voikko = voikko.libvoikko.Voikko(self.param)
-        result = self.voikko.analyze(word)
+        try:  # guard only the libvoikko call
+            result = self.voikko.analyze(word)
+        except ValueError as err:  # keep the Voikko error as the chained cause
+            raise OperationFailedException(
+                f"Voikko error in analysis of word '{word}'"
+            ) from err
+
         if len(result) > 0 and "BASEFORM" in result[0]:
             return result[0]["BASEFORM"]
         return word
diff --git a/tests/test_analyzer_voikko.py b/tests/test_analyzer_voikko.py
index 6cba156f7..74bf0a971 100644
--- a/tests/test_analyzer_voikko.py
+++ b/tests/test_analyzer_voikko.py
@@ -1,8 +1,11 @@
 """Unit tests for voikko analyzer in Annif"""
 
+from unittest import mock
+
 import pytest
 
 import annif.analyzer
+from annif.exception import OperationFailedException
 
 voikko = pytest.importorskip("annif.analyzer.voikko")
 
@@ -18,3 +21,15 @@ def test_voikko_finnish_analyzer_normalize_word():
     assert analyzer._normalize_word("xyzzy") == "xyzzy"
     assert analyzer._normalize_word("vanhat") == "vanha"
     assert analyzer._normalize_word("koirien") == "koira"
+
+
+def test_voikko_analyze_valueerror():
+    analyzer = annif.analyzer.get_analyzer("voikko(fi)")
+    with mock.patch(  # every Voikko.analyze call now raises ValueError
+        "voikko.libvoikko.Voikko.analyze",
+        side_effect=ValueError,
+    ):
+        with pytest.raises(
+            OperationFailedException, match="Voikko error in analysis of word 'kissa'"
+        ):
+            analyzer._normalize_word("kissa")  # called only for the exception

From 7560fff935824c58128327573819bf8f67c8692c Mon Sep 17 00:00:00 2001
From: juhoinkinen <34240031+juhoinkinen@users.noreply.github.com>
Date: Thu, 2 Nov 2023 11:39:36 +0200
Subject: [PATCH 2/2] Catch OperationFailedException from _normalize_word()
 and report the text being tokenized

---
 annif/analyzer/analyzer.py | 17 ++++++++++++-----
 tests/test_analyzer.py     | 21 +++++++++++++++++++++
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py
index 5ba876f9d..652b3bdb9 100644
--- a/annif/analyzer/analyzer.py
+++ b/annif/analyzer/analyzer.py
@@ -5,6 +5,8 @@
 import functools
 import unicodedata
 
+from annif.exception import OperationFailedException
+
 _KEY_TOKEN_MIN_LENGTH = "token_min_length"
 
 
@@ -44,11 +46,16 @@ def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
 
         import nltk.tokenize
 
-        return [
-            self._normalize_word(word)
-            for word in nltk.tokenize.word_tokenize(text)
-            if (not filter or self.is_valid_token(word))
-        ]
+        try:
+            return [
+                self._normalize_word(word)
+                for word in nltk.tokenize.word_tokenize(text)
+                if (not filter or self.is_valid_token(word))
+            ]
+        except OperationFailedException as err:
+            raise OperationFailedException(
+                f"Error in tokenization of text '{text}'"
+            ) from err
 
     def _normalize_word(self, word):
         """Normalize (stem or lemmatize) a word form into a normal form."""
diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py
index ecddfbb37..f6376bb55 100644
--- a/tests/test_analyzer.py
+++ b/tests/test_analyzer.py
@@ -1,8 +1,11 @@
 """Unit tests for analyzers in Annif"""
+import importlib.util
+from unittest import mock
 
 import pytest
 
 import annif.analyzer
+from annif.exception import OperationFailedException
 
 
 def test_get_analyzer_nonexistent():
@@ -60,6 +63,24 @@ def test_english_tokenize_words_no_filter():
     assert len(words) == 23
 
 
+@pytest.mark.skipif(
+    importlib.util.find_spec("voikko") is None,
+    reason="test requires that Voikko is installed",
+)
+def test_tokenize_words_operationfailed():
+    analyzer = annif.analyzer.get_analyzer("voikko(fi)")
+    text = "An error producing sentence."
+    with mock.patch(  # every Voikko.analyze call now raises ValueError
+        "voikko.libvoikko.Voikko.analyze",
+        side_effect=ValueError,
+    ):
+        with pytest.raises(  # match is a regex search, hence the escaped dot
+            OperationFailedException,
+            match=r"Error in tokenization of text 'An error producing sentence\.'",
+        ):
+            analyzer.tokenize_words(text)
+
+
 def test_english_filter_words_min_token():
     analyzer = annif.analyzer.get_analyzer("snowball(english,token_min_length=2)")
     text = """Since 2000, a 3D printer can be used to print