From ce73322a39773488e57f6b151f4d22f0fcc83bd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Mon, 11 Mar 2024 09:23:25 -0700 Subject: [PATCH 01/12] Add support for Japanese and basic support for languages without spaces --- pyproject.toml | 8 ++++- src/alt_eval/tokenizer.py | 68 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 71c3341..7325753 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "jiwer>=3.0.3", "python-iso639>=2023.6.15", "regex>=2023.8.8", - "sacremoses>=0.0.53", + "sacremoses==0.0.53", ] classifiers = [ "License :: OSI Approved :: MIT License", @@ -26,6 +26,12 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", ] +[project.optional-dependencies] +ja = [ + "fugashi==1.3.0", + "unidic-lite==1.0.8", +] + [build-system] requires = ["setuptools"] diff --git a/src/alt_eval/tokenizer.py b/src/alt_eval/tokenizer.py index 67c93a2..7600dd9 100644 --- a/src/alt_eval/tokenizer.py +++ b/src/alt_eval/tokenizer.py @@ -1,5 +1,6 @@ import copy from dataclasses import dataclass, field +import functools import unicodedata import regex as re @@ -59,6 +60,30 @@ def tokens_as_words(tokens: list[Token]) -> list[Token]: return result +# fmt: off +UNICODE_SCRIPTS = [ + "Adlm", "Aghb", "Ahom", "Arab", "Armi", "Armn", "Avst", "Bali", "Bamu", "Bass", "Batk", "Beng", + "Bhks", "Bopo", "Brah", "Brai", "Bugi", "Buhd", "Cakm", "Cans", "Cari", "Cham", "Cher", "Chrs", + "Copt", "Cpmn", "Cprt", "Cyrl", "Deva", "Diak", "Dogr", "Dsrt", "Dupl", "Egyp", "Elba", "Elym", + "Ethi", "Geor", "Glag", "Gong", "Gonm", "Goth", "Gran", "Grek", "Gujr", "Guru", "Hang", "Hani", + "Hano", "Hatr", "Hebr", "Hira", "Hluw", "Hmng", "Hmnp", "Hung", "Ital", "Java", "Kali", "Kana", + "Kawi", "Khar", "Khmr", "Khoj", "Kits", "Knda", "Kthi", "Lana", "Laoo", "Latn", "Lepc", "Limb", + "Lina", "Linb", "Lisu", "Lyci", "Lydi", "Mahj", "Maka", "Mand", "Mani", "Marc", "Medf", "Mend", + "Merc", "Mero", "Mlym", "Modi", "Mong", "Mroo", "Mtei", "Mult", "Mymr", "Nagm", "Nand", "Narb", + "Nbat", "Newa", "Nkoo", "Nshu", "Ogam", "Olck", "Orkh", "Orya", "Osge", "Osma", "Ougr", "Palm", + "Pauc", "Perm", "Phag", "Phli", "Phlp", "Phnx", "Plrd", "Prti", "Rjng", "Rohg", "Runr", "Samr", + "Sarb", "Saur", "Sgnw", "Shaw", "Shrd", "Sidd", "Sind", "Sinh", "Sogd", "Sogo", "Sora", "Soyo", + "Sund", "Sylo", "Syrc", "Tagb", "Takr", "Tale", "Talu", "Taml", "Tang", "Tavt", "Telu", "Tfng", + "Tglg", "Thaa", "Thai", "Tibt", "Tirh", "Tnsa", "Toto", "Ugar", "Vaii", "Vith", "Wara", "Wcho", + "Xpeo", "Xsux", "Yezi", "Yiii", "Zanb", +] +UNICODE_SCRIPTS_NO_SPACES = [ + "Egyp", "Hani", "Hira", "Hluw", "Lina", "Linb", "Xsux", "Kana", "Khmr", "Laoo", "Mymr", "Phag", + "Lana", "Thai", "Tibt", +] +# fmt: on + + class LyricsTokenizer: """A Moses-based tokenizer for lyrics. @@ -80,14 +105,42 @@ def __init__(self) -> None: r"(?P)(?P's)\b|\b(?Pwie|für)(?P'n)\b", flags=re.IGNORECASE ) + # A regex to match the boundary between two letters from two different scripts, or between a + # number and a letter from a script that does not use spaces between words. 
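As a minimal, standalone illustration of the pattern constructed just below (a sketch, not taken from the patch itself), the same kind of boundary regex can be built for a small three-script subset; only the `regex` package already listed in `pyproject.toml` is assumed:

```python
# Standalone sketch of the script-boundary regex defined below, using a small
# subset of scripts purely for illustration (the patch uses the full
# UNICODE_SCRIPTS list).
import regex as re

SCRIPTS = ["Latn", "Hira", "Hani"]
boundary_re = re.compile(
    r"|".join(
        rf"(?<=[\p{{L}}&&\p{{{s}}}])(?=[\p{{L}}--\p{{{s}}}])" for s in SCRIPTS
    ),
    # VERSION1 enables set operations (&& and --) inside character classes
    flags=re.VERSION1,
)

# Each alternative is a zero-width match at a position where the script changes,
# so substituting a space splits runs of different scripts:
print(boundary_re.sub(" ", "Tokyo東京とtokyo"))  # -> "Tokyo 東京 と tokyo"
```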
+ self._different_scripts_re = re.compile( + r"|".join( + [rf"(?<=[\p{{L}}&&\p{{{s}}}])(?=[\p{{L}}--\p{{{s}}}])" for s in UNICODE_SCRIPTS] + + [rf"(?<=[\p{{L}}&&\p{{{s}}}])(?=[0-9])" for s in UNICODE_SCRIPTS_NO_SPACES] + + [rf"(?<=[0-9])(?=[\p{{L}}&&\p{{{s}}}])" for s in UNICODE_SCRIPTS_NO_SPACES] + ), + flags=re.VERSION1, + ) + + # A regex to match a character in a script that does not use spaces between words. + self._no_spaces_re = re.compile( + r"(" + r"|".join([rf"\p{{{s}}}" for s in UNICODE_SCRIPTS_NO_SPACES]) + r")", + flags=re.VERSION1, + ) + + @functools.cached_property + def _fugashi_tagger(self): + try: + import fugashi + + return fugashi.Tagger() + except (ImportError, RuntimeError) as e: + raise RuntimeError( + "Failed to initialize the tagger for Japanese. Please make sure to install the " + "required dependencies via `pip install 'jam-alt[ja]'." + ) from e + def __call__(self, text: str, language: str = "en") -> list[Token]: """ Tokenize the given text. Args: text: A string to tokenize. - language: A language code supported by `sacremoses`: either an ISO 639-1 language code, - or "cjk" for Chinese, Japanese and Korean. + language: An ISO 639-1 language code. Returns: A list of `Token` objects. @@ -147,6 +200,17 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: # Split contractions line = self._contraction_de_re.sub(r"\g \g", line) + if language == "ja": + # Tokenize Japanese + line = " ".join(w.surface.strip() for w in self._fugashi_tagger(line)) + else: + # In other languages that do not use spaces to separate words, treat each + # character as a separate word + text = self._no_spaces_re.sub(r" \1 ", text) + + # Insert spaces between characters from different scripts + text = self._different_scripts_re.sub(" ", text) + result.extend(line.strip().split()) return to_rich_tokens(result) From d1926f3c26fd6ff7e211d2d398331f9ce875a6bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Mon, 11 Mar 2024 09:23:38 -0700 Subject: [PATCH 02/12] Return MER and WIL --- src/alt_eval/metrics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/alt_eval/metrics.py b/src/alt_eval/metrics.py index 68a3eb4..fb90f09 100644 --- a/src/alt_eval/metrics.py +++ b/src/alt_eval/metrics.py @@ -123,6 +123,8 @@ def compute_word_metrics( results = { "WER": wo.wer, + "MER": wo.mer, + "WIL": wo.wil, "ER_case": error_counts["case"] / total_len, } if visualize_errors: From 8a2e9e39ee8bbc1dda14c79f878201d59fc06020 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 02:57:47 -0700 Subject: [PATCH 03/12] Fix tokenizer bug --- src/alt_eval/tokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/alt_eval/tokenizer.py b/src/alt_eval/tokenizer.py index 7600dd9..52b56b5 100644 --- a/src/alt_eval/tokenizer.py +++ b/src/alt_eval/tokenizer.py @@ -206,10 +206,10 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: else: # In other languages that do not use spaces to separate words, treat each # character as a separate word - text = self._no_spaces_re.sub(r" \1 ", text) + line = self._no_spaces_re.sub(r" \1 ", line) - # Insert spaces between characters from different scripts - text = self._different_scripts_re.sub(" ", text) + # Insert spaces between characters from different scripts + line = self._different_scripts_re.sub(" ", line) result.extend(line.strip().split()) From dcff187675215a0381e4db549210920b8c8f5c86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= 
Date: Tue, 12 Mar 2024 02:59:38 -0700 Subject: [PATCH 04/12] Add tokenizer test --- tests/__init__.py | 0 tests/test_tokenizer.py | 51 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/test_tokenizer.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py new file mode 100644 index 0000000..32d29d8 --- /dev/null +++ b/tests/test_tokenizer.py @@ -0,0 +1,51 @@ +import pytest + +from alt_eval.tokenizer import LyricsTokenizer + + +# fmt: off +@pytest.mark.parametrize( + "language, text, expected_tokens", + [ + ( + "en", + "I ain't got nothin' but the blues", + ["I", "ain", "'t", "got", "nothin'", "but", "the", "blues"], + ), + ( + "en", + "It 'll be fun (ha!)", + ["It", "'ll", "be", "fun", "(", "ha", "!", ")"] + ), + ( + "de", + "Sei's Melancholie", + ["Sei", "'s", "Melancholie"] + ), + ( + "de", + "Könnt' ich dir Schmerz erspar'n", + ["Könnt'", "ich", "dir", "Schmerz", "erspar'n"], + ), + ( + "fr", + "T'avais fait l'amour deux fois sans penser qu'avec cette fille-là", + ["T'", "avais", "fait", "l'", "amour", "deux", "fois", "sans", "penser", "qu'", "avec", "cette", "fille", "-", "là"], + ), + ( + "ja", + "私は日本語を話せません(ラララ)", + ["私", "は", "日本", "語", "を", "話せ", "ませ", "ん", "(", "ラララ", ")"], + ), + ( + "zh", + "我不会说中文。(哈哈)", + ["我", "不", "会", "说", "中", "文", "。", "(", "哈", "哈", ")"], + ) + ], +) +# fmt: on +def test_lyrics_tokenizer(language, text, expected_tokens): + tokenizer = LyricsTokenizer() + tokens = [t.text for t in tokenizer(text, language=language)] + assert tokens == expected_tokens From 34add1f0ecfd11fddf0a972a61ba63ef896b3409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 03:00:50 -0700 Subject: [PATCH 05/12] Update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7a3f85f..e52658d 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The package implements metrics designed to work well with lyrics formatted accor - Section breaks (i.e. double line breaks) ## Usage -Install the package with `pip install alt-eval`. +Install the package with `pip install alt-eval`. (Use the `ja` extra if you need to evaluate Japanese: `pip install alt-eval[ja]`.) 
To compute the metrics: ```python From ec23d24590de202a6483e7050702272d3b209347 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 03:07:44 -0700 Subject: [PATCH 06/12] Run tests on gha --- .github/workflows/test.yml | 27 +++++++++++++++++++++++++++ pyproject.toml | 3 +++ 2 files changed, 30 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..87506f6 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,27 @@ +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + push: + branches: [main] + + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout github repo + uses: actions/checkout@v4 + with: + lfs: true + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.9 + + - name: Install package + run: pip install '.[test,ja]' + + - name: Run tests + run: pytest tests diff --git a/pyproject.toml b/pyproject.toml index 7325753..ea0a43c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,9 @@ ja = [ "fugashi==1.3.0", "unidic-lite==1.0.8", ] +test = [ + "pytest>=7.3.1", +] [build-system] From 42ecf4c90ee4878e0de8f7c7554b966ded3a0dbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 03:20:35 -0700 Subject: [PATCH 07/12] Remove 'cjk' tag support from compute_metrics --- src/alt_eval/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alt_eval/metrics.py b/src/alt_eval/metrics.py index fb90f09..1edf6de 100644 --- a/src/alt_eval/metrics.py +++ b/src/alt_eval/metrics.py @@ -196,7 +196,7 @@ def compute_metrics( """ if isinstance(languages, str): languages = [languages] * len(references) - languages = [lg if lg == "cjk" else iso639.Language.match(lg).part1 for lg in languages] + languages = [iso639.Language.match(lg).part1 for lg in languages] tokenizer = LyricsTokenizer() tokens_ref, tokens_hyp = [], [] From 9f43f1dbd3fd6cce07b612544cfdc87e6de3442d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 03:35:08 -0700 Subject: [PATCH 08/12] Add test case --- tests/test_tokenizer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 32d29d8..c132d8a 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -17,6 +17,11 @@ "It 'll be fun (ha!)", ["It", "'ll", "be", "fun", "(", "ha", "!", ")"] ), + ( + "en", + "Just like 2Pac", + ["Just", "like", "2Pac"], + ), ( "de", "Sei's Melancholie", From a43be0a15893fb4474389e2af5ca7a00fdce0e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 05:27:21 -0700 Subject: [PATCH 09/12] Better separation between languages --- src/alt_eval/tokenizer.py | 124 ++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 57 deletions(-) diff --git a/src/alt_eval/tokenizer.py b/src/alt_eval/tokenizer.py index 52b56b5..a46ae2f 100644 --- a/src/alt_eval/tokenizer.py +++ b/src/alt_eval/tokenizer.py @@ -122,18 +122,6 @@ def __init__(self) -> None: flags=re.VERSION1, ) - @functools.cached_property - def _fugashi_tagger(self): - try: - import fugashi - - return fugashi.Tagger() - except (ImportError, RuntimeError) as e: - raise RuntimeError( - "Failed to initialize the tagger for Japanese. Please make sure to install the " - "required dependencies via `pip install 'jam-alt[ja]'." 
- ) from e - def __call__(self, text: str, language: str = "en") -> list[Token]: """ Tokenize the given text. @@ -145,12 +133,6 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: Returns: A list of `Token` objects. """ - if language not in self._tokenizers: - self._tokenizers[language] = MosesTokenizer(lang=language) - self._punct_normalizers[language] = MosesPunctNormalizer(lang=language) - tokenizer = self._tokenizers[language] - punct_normalizer = self._punct_normalizers[language] - text = self._non_text_re.sub(" ", text) text = unicodedata.normalize("NFC", text) text = text.rstrip("\n") @@ -164,47 +146,13 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: if line.count("\n") >= 2: result.append("\n\n") elif line.strip(): - # Ensure the line ends with punctuation to make the tokenizer treat it as - # a sentence - remove_last = False - if not self._end_punctuation_re.search(line): - remove_last = True - line += " ." - - line = punct_normalizer.normalize(line) - - if language in ["en", "fr", "it"]: - # Protect apostrophes at word boundaries to prevent the tokenizer from - # interpreting them as quotes - line = self._word_boundary_apos_re.sub("@@apos@@", line) - else: - # For languages where the tokenizer doesn't handle apostrophes within words, - # protect all apostrophes - line = line.replace("'", "@@apos@@") - - line = tokenizer.tokenize( - line.strip(), - return_str=True, - escape=False, - aggressive_dash_splits=True, - protected_patterns=[r"\*+", r"@@apos@@"], - ) - - if remove_last: - assert line.endswith(" ."), line - line = line[:-2] - - # Post-process apostrophes - line = line.replace("@@apos@@", "'") - if language == "de": - # Split contractions - line = self._contraction_de_re.sub(r"\g \g", line) - + # Tokenize depending on the language if language == "ja": - # Tokenize Japanese - line = " ".join(w.surface.strip() for w in self._fugashi_tagger(line)) + line = self._tokenize_japanese(line) else: - # In other languages that do not use spaces to separate words, treat each + line = self._tokenize_moses(line, language) + + # In languages that do not use spaces to separate words, treat each # character as a separate word line = self._no_spaces_re.sub(r" \1 ", line) @@ -214,3 +162,65 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: result.extend(line.strip().split()) return to_rich_tokens(result) + + @functools.lru_cache(maxsize=200) + def _get_moses_tokenizer(self, language: str) -> MosesTokenizer: + return MosesTokenizer(lang=language) + + @functools.lru_cache(maxsize=200) + def _get_moses_punct_normalizer(self, language: str) -> MosesPunctNormalizer: + return MosesPunctNormalizer(lang=language) + + @functools.cached_property + def _fugashi_tagger(self): + try: + import fugashi + + return fugashi.Tagger() + except (ImportError, RuntimeError) as e: + raise RuntimeError( + "Failed to initialize the tagger for Japanese. Please make sure to install the " + "required dependencies via `pip install 'jam-alt[ja]'." + ) from e + + def _tokenize_moses(self, line: str, language: str) -> str: + # Ensure the line ends with punctuation to make the tokenizer treat it as + # a sentence + remove_last = False + if not self._end_punctuation_re.search(line): + remove_last = True + line += " ." 
+ + line = self._get_moses_punct_normalizer(language).normalize(line) + + if language in ["en", "fr", "it"]: + # Protect apostrophes at word boundaries to prevent the tokenizer from + # interpreting them as quotes + line = self._word_boundary_apos_re.sub("@@apos@@", line) + else: + # For languages where the tokenizer doesn't handle apostrophes within words, + # protect all apostrophes + line = line.replace("'", "@@apos@@") + + line = self._get_moses_tokenizer(language).tokenize( + line.strip(), + return_str=True, + escape=False, + aggressive_dash_splits=True, + protected_patterns=[r"\*+", r"@@apos@@"], + ) + + if remove_last: + assert line.endswith(" ."), line + line = line[:-2] + + # Post-process apostrophes + line = line.replace("@@apos@@", "'") + if language == "de": + # Split contractions + line = self._contraction_de_re.sub(r"\g \g", line) + + return line + + def _tokenize_japanese(self, line: str) -> str: + return " ".join(w.surface.strip() for w in self._fugashi_tagger(line)) From 7c79c52b03569031311b305d1ea5384f9e974074 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 05:56:10 -0700 Subject: [PATCH 10/12] Fix test --- tests/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index c132d8a..a00a2c6 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -14,7 +14,7 @@ ), ( "en", - "It 'll be fun (ha!)", + "It'll be fun (ha!)", ["It", "'ll", "be", "fun", "(", "ha", "!", ")"] ), ( From f46f0a8cc7213f2cc8fbe25c8c29d489e53a0493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 10:27:24 -0700 Subject: [PATCH 11/12] Revert back from fugashi, update readme --- README.md | 7 ++++++- pyproject.toml | 4 ---- src/alt_eval/tokenizer.py | 32 +++++++------------------------- tests/test_tokenizer.py | 2 +- 4 files changed, 14 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index e52658d..3bf3e28 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,13 @@ The package implements metrics designed to work well with lyrics formatted accor - Line breaks - Section breaks (i.e. double line breaks) +Under the hood, the text is pre-processed using the [`sacremoses`](https://github.com/hplt-project/sacremoses) tokenizer and punctuation normalizer. +Note that apostrophes and single quotes are never treated as quotation marks, but as part of a word, marking an elision or a contraction. +For writing systems that do not use spaces to separate words (Chinese, Japanese, Thai, Lao, Burmese, …), each character is considered as a separate word, as per [Radford et al. (2022)](https://arxiv.org/abs/2212.04356). +See the [test cases](./tests/test_tokenizer.py) for examples of how different languages are tokenized. + ## Usage -Install the package with `pip install alt-eval`. (Use the `ja` extra if you need to evaluate Japanese: `pip install alt-eval[ja]`.) +Install the package with `pip install alt-eval`. 
To compute the metrics: ```python diff --git a/pyproject.toml b/pyproject.toml index ea0a43c..e225e86 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,10 +27,6 @@ classifiers = [ ] [project.optional-dependencies] -ja = [ - "fugashi==1.3.0", - "unidic-lite==1.0.8", -] test = [ "pytest>=7.3.1", ] diff --git a/src/alt_eval/tokenizer.py b/src/alt_eval/tokenizer.py index a46ae2f..4c82364 100644 --- a/src/alt_eval/tokenizer.py +++ b/src/alt_eval/tokenizer.py @@ -146,18 +146,15 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: if line.count("\n") >= 2: result.append("\n\n") elif line.strip(): - # Tokenize depending on the language - if language == "ja": - line = self._tokenize_japanese(line) - else: - line = self._tokenize_moses(line, language) + # Tokenize using sacremoses + line = self._tokenize_moses(line, language) - # In languages that do not use spaces to separate words, treat each - # character as a separate word - line = self._no_spaces_re.sub(r" \1 ", line) + # In languages that do not use spaces to separate words, treat each + # character as a separate word + line = self._no_spaces_re.sub(r" \1 ", line) - # Insert spaces between characters from different scripts - line = self._different_scripts_re.sub(" ", line) + # Insert spaces between characters from different scripts + line = self._different_scripts_re.sub(" ", line) result.extend(line.strip().split()) @@ -171,18 +168,6 @@ def _get_moses_tokenizer(self, language: str) -> MosesTokenizer: def _get_moses_punct_normalizer(self, language: str) -> MosesPunctNormalizer: return MosesPunctNormalizer(lang=language) - @functools.cached_property - def _fugashi_tagger(self): - try: - import fugashi - - return fugashi.Tagger() - except (ImportError, RuntimeError) as e: - raise RuntimeError( - "Failed to initialize the tagger for Japanese. Please make sure to install the " - "required dependencies via `pip install 'jam-alt[ja]'." - ) from e - def _tokenize_moses(self, line: str, language: str) -> str: # Ensure the line ends with punctuation to make the tokenizer treat it as # a sentence @@ -221,6 +206,3 @@ def _tokenize_moses(self, line: str, language: str) -> str: line = self._contraction_de_re.sub(r"\g \g", line) return line - - def _tokenize_japanese(self, line: str) -> str: - return " ".join(w.surface.strip() for w in self._fugashi_tagger(line)) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index a00a2c6..c39eb61 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -40,7 +40,7 @@ ( "ja", "私は日本語を話せません(ラララ)", - ["私", "は", "日本", "語", "を", "話せ", "ませ", "ん", "(", "ラララ", ")"], + ["私", "は", "日", "本", "語", "を", "話", "せ", "ま", "せ", "ん", "(", "ラ", "ラ", "ラ", ")"], ), ( "zh", From afe0d98e554ebb5950b0b2c68e484dbd6f6da51a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Wed, 13 Mar 2024 09:27:02 +0100 Subject: [PATCH 12/12] Update test.yml --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 87506f6..7c94708 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,7 +21,7 @@ jobs: python-version: 3.9 - name: Install package - run: pip install '.[test,ja]' + run: pip install '.[test]' - name: Run tests run: pytest tests
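For a quick sanity check of the end state of this series, the tokenizer can be exercised exactly as in `tests/test_tokenizer.py`; the sketch below assumes the package is installed (`pip install alt-eval`) and simply mirrors two of the test cases:

```python
# Usage sketch mirroring tests/test_tokenizer.py (assumes `pip install alt-eval`).
from alt_eval.tokenizer import LyricsTokenizer

tokenizer = LyricsTokenizer()

# English: apostrophes are treated as part of a word (contraction/elision), not as quotes.
print([t.text for t in tokenizer("It'll be fun (ha!)", language="en")])
# per the test: ["It", "'ll", "be", "fun", "(", "ha", "!", ")"]

# Japanese: with the final per-character handling, scripts written without spaces
# are split into single characters rather than segmented morphologically.
print([t.text for t in tokenizer("私は日本語を話せません(ラララ)", language="ja")])
# per the updated test:
# ["私", "は", "日", "本", "語", "を", "話", "せ", "ま", "せ", "ん", "(", "ラ", "ラ", "ラ", ")"]
```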