Debug ValueError by Voikko analyzer #738

Open · wants to merge 2 commits into base: main
17 changes: 12 additions & 5 deletions annif/analyzer/analyzer.py
@@ -5,6 +5,8 @@
 import functools
 import unicodedata
 
+from annif.exception import OperationFailedException
+
 _KEY_TOKEN_MIN_LENGTH = "token_min_length"
 
 
@@ -44,11 +46,16 @@ def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
 
         import nltk.tokenize
 
-        return [
-            self._normalize_word(word)
-            for word in nltk.tokenize.word_tokenize(text)
-            if (not filter or self.is_valid_token(word))
-        ]
+        try:
+            return [
+                self._normalize_word(word)
+                for word in nltk.tokenize.word_tokenize(text)
+                if (not filter or self.is_valid_token(word))
+            ]
+        except OperationFailedException as err:
+            raise OperationFailedException(
+                f"Error in tokenization of text '{text}'"
+            ) from err
 
     def _normalize_word(self, word):
         """Normalize (stem or lemmatize) a word form into a normal form."""
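
For orientation (not part of the diff): the new except clause in tokenize_words relies on Python's exception chaining, where "raise ... from err" records the original error as __cause__. A minimal, self-contained sketch of that pattern; LowLevelError and HighLevelError are placeholder names for illustration, not annif classes.

# Sketch of the "raise ... from err" chaining used above; the exception
# classes here are placeholders, not part of annif.


class LowLevelError(Exception):
    pass


class HighLevelError(Exception):
    pass


def analyze(word):
    raise LowLevelError(f"cannot analyze '{word}'")


def tokenize(text):
    try:
        return [analyze(word) for word in text.split()]
    except LowLevelError as err:
        # "from err" stores the original exception in __cause__, so both the
        # word-level and the text-level failure appear in the traceback.
        raise HighLevelError(f"error tokenizing '{text}'") from err


try:
    tokenize("some text")
except HighLevelError as err:
    print(err)            # error tokenizing 'some text'
    print(err.__cause__)  # cannot analyze 'some'
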
10 changes: 9 additions & 1 deletion annif/analyzer/voikko.py
@@ -5,6 +5,8 @@
 
 import voikko.libvoikko
 
+from annif.exception import OperationFailedException
+
 from . import analyzer
 
 
@@ -27,7 +29,13 @@ def __getstate__(self) -> dict[str, str | None]:
     def _normalize_word(self, word: str) -> str:
         if self.voikko is None:
             self.voikko = voikko.libvoikko.Voikko(self.param)
-        result = self.voikko.analyze(word)
+        try:
+            result = self.voikko.analyze(word)
+        except ValueError as err:
+            raise OperationFailedException(
+                f"Voikko error in analysis of word '{word}'"
+            ) from err
+
         if len(result) > 0 and "BASEFORM" in result[0]:
             return result[0]["BASEFORM"]
         return word
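
Taken together with the analyzer.py hunk above, a ValueError raised inside libvoikko now reaches callers as an OperationFailedException that names the text being processed. A rough sketch of how this looks from calling code, assuming annif, its NLTK data, and Voikko support (including a Finnish dictionary) are installed; the mock stands in for a real libvoikko failure, mirroring the tests below.

# Sketch only; assumes annif and its Voikko dependencies are available.
from unittest import mock

import annif.analyzer
from annif.exception import OperationFailedException

analyzer = annif.analyzer.get_analyzer("voikko(fi)")

# Simulate libvoikko failing during analysis of some word.
with mock.patch("voikko.libvoikko.Voikko.analyze", side_effect=ValueError):
    try:
        analyzer.tokenize_words("An error producing sentence.")
    except OperationFailedException as err:
        # The outer exception comes from tokenize_words; the Voikko-level
        # exception and the original ValueError stay reachable via __cause__.
        assert isinstance(err.__cause__, OperationFailedException)
        assert isinstance(err.__cause__.__cause__, ValueError)
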
21 changes: 21 additions & 0 deletions tests/test_analyzer.py
@@ -1,8 +1,11 @@
 """Unit tests for analyzers in Annif"""
+import importlib.util
+from unittest import mock
 
 import pytest
 
 import annif.analyzer
+from annif.exception import OperationFailedException
 
 
 def test_get_analyzer_nonexistent():
@@ -60,6 +63,24 @@ def test_english_tokenize_words_no_filter():
     assert len(words) == 23
 
 
+@pytest.mark.skipif(
+    importlib.util.find_spec("voikko") is None,
+    reason="test requires that Voikko is installed",
+)
+def test_tokenize_words_operationfailed():
+    analyzer = annif.analyzer.get_analyzer("voikko(fi)")
+    text = "An error producing sentence."
+    with mock.patch(
+        "voikko.libvoikko.Voikko.analyze",
+        side_effect=ValueError,
+    ):
+        with pytest.raises(
+            OperationFailedException,
+            match="Error in tokenization of text 'An error producing sentence.'",
+        ):
+            analyzer.tokenize_words(text)
+
+
 def test_english_filter_words_min_token():
     analyzer = annif.analyzer.get_analyzer("snowball(english,token_min_length=2)")
     text = """Since 2000, a 3D printer can be used to print
15 changes: 15 additions & 0 deletions tests/test_analyzer_voikko.py
@@ -1,8 +1,11 @@
 """Unit tests for voikko analyzer in Annif"""
 
+from unittest import mock
+
 import pytest
 
 import annif.analyzer
+from annif.exception import OperationFailedException
 
 voikko = pytest.importorskip("annif.analyzer.voikko")
 
@@ -18,3 +21,15 @@ def test_voikko_finnish_analyzer_normalize_word():
     assert analyzer._normalize_word("xyzzy") == "xyzzy"
     assert analyzer._normalize_word("vanhat") == "vanha"
     assert analyzer._normalize_word("koirien") == "koira"
+
+
+def test_voikko_analyze_valueerror():
+    analyzer = annif.analyzer.get_analyzer("voikko(fi)")
+    with mock.patch(
+        "voikko.libvoikko.Voikko.analyze",
+        side_effect=ValueError,
+    ):
+        with pytest.raises(
+            OperationFailedException, match="Voikko error in analysis of word 'kissa'"
+        ):
+            assert analyzer._normalize_word("kissa")