From ce73322a39773488e57f6b151f4d22f0fcc83bd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Mon, 11 Mar 2024 09:23:25 -0700 Subject: [PATCH 01/12] Add support for Japanese and basic support for languages without spaces --- pyproject.toml | 8 ++++- src/alt_eval/tokenizer.py | 68 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 71c3341..7325753 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "jiwer>=3.0.3", "python-iso639>=2023.6.15", "regex>=2023.8.8", - "sacremoses>=0.0.53", + "sacremoses==0.0.53", ] classifiers = [ "License :: OSI Approved :: MIT License", @@ -26,6 +26,12 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", ] +[project.optional-dependencies] +ja = [ + "fugashi==1.3.0", + "unidic-lite==1.0.8", +] + [build-system] requires = ["setuptools"] diff --git a/src/alt_eval/tokenizer.py b/src/alt_eval/tokenizer.py index 67c93a2..7600dd9 100644 --- a/src/alt_eval/tokenizer.py +++ b/src/alt_eval/tokenizer.py @@ -1,5 +1,6 @@ import copy from dataclasses import dataclass, field +import functools import unicodedata import regex as re @@ -59,6 +60,30 @@ def tokens_as_words(tokens: list[Token]) -> list[Token]: return result +# fmt: off +UNICODE_SCRIPTS = [ + "Adlm", "Aghb", "Ahom", "Arab", "Armi", "Armn", "Avst", "Bali", "Bamu", "Bass", "Batk", "Beng", + "Bhks", "Bopo", "Brah", "Brai", "Bugi", "Buhd", "Cakm", "Cans", "Cari", "Cham", "Cher", "Chrs", + "Copt", "Cpmn", "Cprt", "Cyrl", "Deva", "Diak", "Dogr", "Dsrt", "Dupl", "Egyp", "Elba", "Elym", + "Ethi", "Geor", "Glag", "Gong", "Gonm", "Goth", "Gran", "Grek", "Gujr", "Guru", "Hang", "Hani", + "Hano", "Hatr", "Hebr", "Hira", "Hluw", "Hmng", "Hmnp", "Hung", "Ital", "Java", "Kali", "Kana", + "Kawi", "Khar", "Khmr", "Khoj", "Kits", "Knda", "Kthi", "Lana", "Laoo", "Latn", "Lepc", "Limb", + "Lina", "Linb", "Lisu", "Lyci", "Lydi", "Mahj", "Maka", "Mand", "Mani", "Marc", "Medf", "Mend", + "Merc", "Mero", "Mlym", "Modi", "Mong", "Mroo", "Mtei", "Mult", "Mymr", "Nagm", "Nand", "Narb", + "Nbat", "Newa", "Nkoo", "Nshu", "Ogam", "Olck", "Orkh", "Orya", "Osge", "Osma", "Ougr", "Palm", + "Pauc", "Perm", "Phag", "Phli", "Phlp", "Phnx", "Plrd", "Prti", "Rjng", "Rohg", "Runr", "Samr", + "Sarb", "Saur", "Sgnw", "Shaw", "Shrd", "Sidd", "Sind", "Sinh", "Sogd", "Sogo", "Sora", "Soyo", + "Sund", "Sylo", "Syrc", "Tagb", "Takr", "Tale", "Talu", "Taml", "Tang", "Tavt", "Telu", "Tfng", + "Tglg", "Thaa", "Thai", "Tibt", "Tirh", "Tnsa", "Toto", "Ugar", "Vaii", "Vith", "Wara", "Wcho", + "Xpeo", "Xsux", "Yezi", "Yiii", "Zanb", +] +UNICODE_SCRIPTS_NO_SPACES = [ + "Egyp", "Hani", "Hira", "Hluw", "Lina", "Linb", "Xsux", "Kana", "Khmr", "Laoo", "Mymr", "Phag", + "Lana", "Thai", "Tibt", +] +# fmt: on + + class LyricsTokenizer: """A Moses-based tokenizer for lyrics. @@ -80,14 +105,42 @@ def __init__(self) -> None: r"(?P)(?P's)\b|\b(?Pwie|für)(?P'n)\b", flags=re.IGNORECASE ) + # A regex to match the boundary between two letters from two different scripts, or between a + # number and a letter from a script that does not use spaces between words. 
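As a minimal, standalone illustration of the pattern constructed just below (a sketch, not taken from the patch itself), the same kind of boundary regex can be built for a small three-script subset; only the `regex` package already listed in `pyproject.toml` is assumed:

```python
# Standalone sketch of the script-boundary regex defined below, using a small
# subset of scripts purely for illustration (the patch uses the full
# UNICODE_SCRIPTS list).
import regex as re

SCRIPTS = ["Latn", "Hira", "Hani"]
boundary_re = re.compile(
    r"|".join(
        rf"(?<=[\p{{L}}&&\p{{{s}}}])(?=[\p{{L}}--\p{{{s}}}])" for s in SCRIPTS
    ),
    # VERSION1 enables set operations (&& and --) inside character classes
    flags=re.VERSION1,
)

# Each alternative is a zero-width match at a position where the script changes,
# so substituting a space splits runs of different scripts:
print(boundary_re.sub(" ", "Tokyo東京とtokyo"))  # -> "Tokyo 東京 と tokyo"
```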
+ self._different_scripts_re = re.compile( + r"|".join( + [rf"(?<=[\p{{L}}&&\p{{{s}}}])(?=[\p{{L}}--\p{{{s}}}])" for s in UNICODE_SCRIPTS] + + [rf"(?<=[\p{{L}}&&\p{{{s}}}])(?=[0-9])" for s in UNICODE_SCRIPTS_NO_SPACES] + + [rf"(?<=[0-9])(?=[\p{{L}}&&\p{{{s}}}])" for s in UNICODE_SCRIPTS_NO_SPACES] + ), + flags=re.VERSION1, + ) + + # A regex to match a character in a script that does not use spaces between words. + self._no_spaces_re = re.compile( + r"(" + r"|".join([rf"\p{{{s}}}" for s in UNICODE_SCRIPTS_NO_SPACES]) + r")", + flags=re.VERSION1, + ) + + @functools.cached_property + def _fugashi_tagger(self): + try: + import fugashi + + return fugashi.Tagger() + except (ImportError, RuntimeError) as e: + raise RuntimeError( + "Failed to initialize the tagger for Japanese. Please make sure to install the " + "required dependencies via `pip install 'jam-alt[ja]'." + ) from e + def __call__(self, text: str, language: str = "en") -> list[Token]: """ Tokenize the given text. Args: text: A string to tokenize. - language: A language code supported by `sacremoses`: either an ISO 639-1 language code, - or "cjk" for Chinese, Japanese and Korean. + language: An ISO 639-1 language code. Returns: A list of `Token` objects. @@ -147,6 +200,17 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: # Split contractions line = self._contraction_de_re.sub(r"\g \g", line) + if language == "ja": + # Tokenize Japanese + line = " ".join(w.surface.strip() for w in self._fugashi_tagger(line)) + else: + # In other languages that do not use spaces to separate words, treat each + # character as a separate word + text = self._no_spaces_re.sub(r" \1 ", text) + + # Insert spaces between characters from different scripts + text = self._different_scripts_re.sub(" ", text) + result.extend(line.strip().split()) return to_rich_tokens(result) From d1926f3c26fd6ff7e211d2d398331f9ce875a6bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Mon, 11 Mar 2024 09:23:38 -0700 Subject: [PATCH 02/12] Return MER and WIL --- src/alt_eval/metrics.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/alt_eval/metrics.py b/src/alt_eval/metrics.py index 68a3eb4..fb90f09 100644 --- a/src/alt_eval/metrics.py +++ b/src/alt_eval/metrics.py @@ -123,6 +123,8 @@ def compute_word_metrics( results = { "WER": wo.wer, + "MER": wo.mer, + "WIL": wo.wil, "ER_case": error_counts["case"] / total_len, } if visualize_errors: From 8a2e9e39ee8bbc1dda14c79f878201d59fc06020 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 02:57:47 -0700 Subject: [PATCH 03/12] Fix tokenizer bug --- src/alt_eval/tokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/alt_eval/tokenizer.py b/src/alt_eval/tokenizer.py index 7600dd9..52b56b5 100644 --- a/src/alt_eval/tokenizer.py +++ b/src/alt_eval/tokenizer.py @@ -206,10 +206,10 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: else: # In other languages that do not use spaces to separate words, treat each # character as a separate word - text = self._no_spaces_re.sub(r" \1 ", text) + line = self._no_spaces_re.sub(r" \1 ", line) - # Insert spaces between characters from different scripts - text = self._different_scripts_re.sub(" ", text) + # Insert spaces between characters from different scripts + line = self._different_scripts_re.sub(" ", line) result.extend(line.strip().split()) From dcff187675215a0381e4db549210920b8c8f5c86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= 
Date: Tue, 12 Mar 2024 02:59:38 -0700 Subject: [PATCH 04/12] Add tokenizer test --- tests/__init__.py | 0 tests/test_tokenizer.py | 51 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/test_tokenizer.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py new file mode 100644 index 0000000..32d29d8 --- /dev/null +++ b/tests/test_tokenizer.py @@ -0,0 +1,51 @@ +import pytest + +from alt_eval.tokenizer import LyricsTokenizer + + +# fmt: off +@pytest.mark.parametrize( + "language, text, expected_tokens", + [ + ( + "en", + "I ain't got nothin' but the blues", + ["I", "ain", "'t", "got", "nothin'", "but", "the", "blues"], + ), + ( + "en", + "It 'll be fun (ha!)", + ["It", "'ll", "be", "fun", "(", "ha", "!", ")"] + ), + ( + "de", + "Sei's Melancholie", + ["Sei", "'s", "Melancholie"] + ), + ( + "de", + "Könnt' ich dir Schmerz erspar'n", + ["Könnt'", "ich", "dir", "Schmerz", "erspar'n"], + ), + ( + "fr", + "T'avais fait l'amour deux fois sans penser qu'avec cette fille-là", + ["T'", "avais", "fait", "l'", "amour", "deux", "fois", "sans", "penser", "qu'", "avec", "cette", "fille", "-", "là"], + ), + ( + "ja", + "私は日本語を話せません(ラララ)", + ["私", "は", "日本", "語", "を", "話せ", "ませ", "ん", "(", "ラララ", ")"], + ), + ( + "zh", + "我不会说中文。(哈哈)", + ["我", "不", "会", "说", "中", "文", "。", "(", "哈", "哈", ")"], + ) + ], +) +# fmt: on +def test_lyrics_tokenizer(language, text, expected_tokens): + tokenizer = LyricsTokenizer() + tokens = [t.text for t in tokenizer(text, language=language)] + assert tokens == expected_tokens From 34add1f0ecfd11fddf0a972a61ba63ef896b3409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 03:00:50 -0700 Subject: [PATCH 05/12] Update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7a3f85f..e52658d 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The package implements metrics designed to work well with lyrics formatted accor - Section breaks (i.e. double line breaks) ## Usage -Install the package with `pip install alt-eval`. +Install the package with `pip install alt-eval`. (Use the `ja` extra if you need to evaluate Japanese: `pip install alt-eval[ja]`.) 
To compute the metrics: ```python From ec23d24590de202a6483e7050702272d3b209347 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 03:07:44 -0700 Subject: [PATCH 06/12] Run tests on gha --- .github/workflows/test.yml | 27 +++++++++++++++++++++++++++ pyproject.toml | 3 +++ 2 files changed, 30 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..87506f6 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,27 @@ +on: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + push: + branches: [main] + + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout github repo + uses: actions/checkout@v4 + with: + lfs: true + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.9 + + - name: Install package + run: pip install '.[test,ja]' + + - name: Run tests + run: pytest tests diff --git a/pyproject.toml b/pyproject.toml index 7325753..ea0a43c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,9 @@ ja = [ "fugashi==1.3.0", "unidic-lite==1.0.8", ] +test = [ + "pytest>=7.3.1", +] [build-system] From 42ecf4c90ee4878e0de8f7c7554b966ded3a0dbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 03:20:35 -0700 Subject: [PATCH 07/12] Remove 'cjk' tag support from compute_metrics --- src/alt_eval/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alt_eval/metrics.py b/src/alt_eval/metrics.py index fb90f09..1edf6de 100644 --- a/src/alt_eval/metrics.py +++ b/src/alt_eval/metrics.py @@ -196,7 +196,7 @@ def compute_metrics( """ if isinstance(languages, str): languages = [languages] * len(references) - languages = [lg if lg == "cjk" else iso639.Language.match(lg).part1 for lg in languages] + languages = [iso639.Language.match(lg).part1 for lg in languages] tokenizer = LyricsTokenizer() tokens_ref, tokens_hyp = [], [] From 9f43f1dbd3fd6cce07b612544cfdc87e6de3442d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 03:35:08 -0700 Subject: [PATCH 08/12] Add test case --- tests/test_tokenizer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 32d29d8..c132d8a 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -17,6 +17,11 @@ "It 'll be fun (ha!)", ["It", "'ll", "be", "fun", "(", "ha", "!", ")"] ), + ( + "en", + "Just like 2Pac", + ["Just", "like", "2Pac"], + ), ( "de", "Sei's Melancholie", From a43be0a15893fb4474389e2af5ca7a00fdce0e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 05:27:21 -0700 Subject: [PATCH 09/12] Better separation between languages --- src/alt_eval/tokenizer.py | 124 ++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 57 deletions(-) diff --git a/src/alt_eval/tokenizer.py b/src/alt_eval/tokenizer.py index 52b56b5..a46ae2f 100644 --- a/src/alt_eval/tokenizer.py +++ b/src/alt_eval/tokenizer.py @@ -122,18 +122,6 @@ def __init__(self) -> None: flags=re.VERSION1, ) - @functools.cached_property - def _fugashi_tagger(self): - try: - import fugashi - - return fugashi.Tagger() - except (ImportError, RuntimeError) as e: - raise RuntimeError( - "Failed to initialize the tagger for Japanese. Please make sure to install the " - "required dependencies via `pip install 'jam-alt[ja]'." 
- ) from e - def __call__(self, text: str, language: str = "en") -> list[Token]: """ Tokenize the given text. @@ -145,12 +133,6 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: Returns: A list of `Token` objects. """ - if language not in self._tokenizers: - self._tokenizers[language] = MosesTokenizer(lang=language) - self._punct_normalizers[language] = MosesPunctNormalizer(lang=language) - tokenizer = self._tokenizers[language] - punct_normalizer = self._punct_normalizers[language] - text = self._non_text_re.sub(" ", text) text = unicodedata.normalize("NFC", text) text = text.rstrip("\n") @@ -164,47 +146,13 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: if line.count("\n") >= 2: result.append("\n\n") elif line.strip(): - # Ensure the line ends with punctuation to make the tokenizer treat it as - # a sentence - remove_last = False - if not self._end_punctuation_re.search(line): - remove_last = True - line += " ." - - line = punct_normalizer.normalize(line) - - if language in ["en", "fr", "it"]: - # Protect apostrophes at word boundaries to prevent the tokenizer from - # interpreting them as quotes - line = self._word_boundary_apos_re.sub("@@apos@@", line) - else: - # For languages where the tokenizer doesn't handle apostrophes within words, - # protect all apostrophes - line = line.replace("'", "@@apos@@") - - line = tokenizer.tokenize( - line.strip(), - return_str=True, - escape=False, - aggressive_dash_splits=True, - protected_patterns=[r"\*+", r"@@apos@@"], - ) - - if remove_last: - assert line.endswith(" ."), line - line = line[:-2] - - # Post-process apostrophes - line = line.replace("@@apos@@", "'") - if language == "de": - # Split contractions - line = self._contraction_de_re.sub(r"\g \g", line) - + # Tokenize depending on the language if language == "ja": - # Tokenize Japanese - line = " ".join(w.surface.strip() for w in self._fugashi_tagger(line)) + line = self._tokenize_japanese(line) else: - # In other languages that do not use spaces to separate words, treat each + line = self._tokenize_moses(line, language) + + # In languages that do not use spaces to separate words, treat each # character as a separate word line = self._no_spaces_re.sub(r" \1 ", line) @@ -214,3 +162,65 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: result.extend(line.strip().split()) return to_rich_tokens(result) + + @functools.lru_cache(maxsize=200) + def _get_moses_tokenizer(self, language: str) -> MosesTokenizer: + return MosesTokenizer(lang=language) + + @functools.lru_cache(maxsize=200) + def _get_moses_punct_normalizer(self, language: str) -> MosesPunctNormalizer: + return MosesPunctNormalizer(lang=language) + + @functools.cached_property + def _fugashi_tagger(self): + try: + import fugashi + + return fugashi.Tagger() + except (ImportError, RuntimeError) as e: + raise RuntimeError( + "Failed to initialize the tagger for Japanese. Please make sure to install the " + "required dependencies via `pip install 'jam-alt[ja]'." + ) from e + + def _tokenize_moses(self, line: str, language: str) -> str: + # Ensure the line ends with punctuation to make the tokenizer treat it as + # a sentence + remove_last = False + if not self._end_punctuation_re.search(line): + remove_last = True + line += " ." 
+ + line = self._get_moses_punct_normalizer(language).normalize(line) + + if language in ["en", "fr", "it"]: + # Protect apostrophes at word boundaries to prevent the tokenizer from + # interpreting them as quotes + line = self._word_boundary_apos_re.sub("@@apos@@", line) + else: + # For languages where the tokenizer doesn't handle apostrophes within words, + # protect all apostrophes + line = line.replace("'", "@@apos@@") + + line = self._get_moses_tokenizer(language).tokenize( + line.strip(), + return_str=True, + escape=False, + aggressive_dash_splits=True, + protected_patterns=[r"\*+", r"@@apos@@"], + ) + + if remove_last: + assert line.endswith(" ."), line + line = line[:-2] + + # Post-process apostrophes + line = line.replace("@@apos@@", "'") + if language == "de": + # Split contractions + line = self._contraction_de_re.sub(r"\g \g", line) + + return line + + def _tokenize_japanese(self, line: str) -> str: + return " ".join(w.surface.strip() for w in self._fugashi_tagger(line)) From 7c79c52b03569031311b305d1ea5384f9e974074 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 05:56:10 -0700 Subject: [PATCH 10/12] Fix test --- tests/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index c132d8a..a00a2c6 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -14,7 +14,7 @@ ), ( "en", - "It 'll be fun (ha!)", + "It'll be fun (ha!)", ["It", "'ll", "be", "fun", "(", "ha", "!", ")"] ), ( From f46f0a8cc7213f2cc8fbe25c8c29d489e53a0493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Tue, 12 Mar 2024 10:27:24 -0700 Subject: [PATCH 11/12] Revert back from fugashi, update readme --- README.md | 7 ++++++- pyproject.toml | 4 ---- src/alt_eval/tokenizer.py | 32 +++++++------------------------- tests/test_tokenizer.py | 2 +- 4 files changed, 14 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index e52658d..3bf3e28 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,13 @@ The package implements metrics designed to work well with lyrics formatted accor - Line breaks - Section breaks (i.e. double line breaks) +Under the hood, the text is pre-processed using the [`sacremoses`](https://github.com/hplt-project/sacremoses) tokenizer and punctuation normalizer. +Note that apostrophes and single quotes are never treated as quotation marks, but as part of a word, marking an elision or a contraction. +For writing systems that do not use spaces to separate words (Chinese, Japanese, Thai, Lao, Burmese, …), each character is considered as a separate word, as per [Radford et al. (2022)](https://arxiv.org/abs/2212.04356). +See the [test cases](./tests/test_tokenizer.py) for examples of how different languages are tokenized. + ## Usage -Install the package with `pip install alt-eval`. (Use the `ja` extra if you need to evaluate Japanese: `pip install alt-eval[ja]`.) +Install the package with `pip install alt-eval`. 
To compute the metrics: ```python diff --git a/pyproject.toml b/pyproject.toml index ea0a43c..e225e86 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,10 +27,6 @@ classifiers = [ ] [project.optional-dependencies] -ja = [ - "fugashi==1.3.0", - "unidic-lite==1.0.8", -] test = [ "pytest>=7.3.1", ] diff --git a/src/alt_eval/tokenizer.py b/src/alt_eval/tokenizer.py index a46ae2f..4c82364 100644 --- a/src/alt_eval/tokenizer.py +++ b/src/alt_eval/tokenizer.py @@ -146,18 +146,15 @@ def __call__(self, text: str, language: str = "en") -> list[Token]: if line.count("\n") >= 2: result.append("\n\n") elif line.strip(): - # Tokenize depending on the language - if language == "ja": - line = self._tokenize_japanese(line) - else: - line = self._tokenize_moses(line, language) + # Tokenize using sacremoses + line = self._tokenize_moses(line, language) - # In languages that do not use spaces to separate words, treat each - # character as a separate word - line = self._no_spaces_re.sub(r" \1 ", line) + # In languages that do not use spaces to separate words, treat each + # character as a separate word + line = self._no_spaces_re.sub(r" \1 ", line) - # Insert spaces between characters from different scripts - line = self._different_scripts_re.sub(" ", line) + # Insert spaces between characters from different scripts + line = self._different_scripts_re.sub(" ", line) result.extend(line.strip().split()) @@ -171,18 +168,6 @@ def _get_moses_tokenizer(self, language: str) -> MosesTokenizer: def _get_moses_punct_normalizer(self, language: str) -> MosesPunctNormalizer: return MosesPunctNormalizer(lang=language) - @functools.cached_property - def _fugashi_tagger(self): - try: - import fugashi - - return fugashi.Tagger() - except (ImportError, RuntimeError) as e: - raise RuntimeError( - "Failed to initialize the tagger for Japanese. Please make sure to install the " - "required dependencies via `pip install 'jam-alt[ja]'." - ) from e - def _tokenize_moses(self, line: str, language: str) -> str: # Ensure the line ends with punctuation to make the tokenizer treat it as # a sentence @@ -221,6 +206,3 @@ def _tokenize_moses(self, line: str, language: str) -> str: line = self._contraction_de_re.sub(r"\g \g", line) return line - - def _tokenize_japanese(self, line: str) -> str: - return " ".join(w.surface.strip() for w in self._fugashi_tagger(line)) diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index a00a2c6..c39eb61 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -40,7 +40,7 @@ ( "ja", "私は日本語を話せません(ラララ)", - ["私", "は", "日本", "語", "を", "話せ", "ませ", "ん", "(", "ラララ", ")"], + ["私", "は", "日", "本", "語", "を", "話", "せ", "ま", "せ", "ん", "(", "ラ", "ラ", "ラ", ")"], ), ( "zh", From afe0d98e554ebb5950b0b2c68e484dbd6f6da51a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Wed, 13 Mar 2024 09:27:02 +0100 Subject: [PATCH 12/12] Update test.yml --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 87506f6..7c94708 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -21,7 +21,7 @@ jobs: python-version: 3.9 - name: Install package - run: pip install '.[test,ja]' + run: pip install '.[test]' - name: Run tests run: pytest tests
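For a quick sanity check of the end state of this series, the tokenizer can be exercised exactly as in `tests/test_tokenizer.py`; the sketch below assumes the package is installed (`pip install alt-eval`) and simply mirrors two of the test cases:

```python
# Usage sketch mirroring tests/test_tokenizer.py (assumes `pip install alt-eval`).
from alt_eval.tokenizer import LyricsTokenizer

tokenizer = LyricsTokenizer()

# English: apostrophes are treated as part of a word (contraction/elision), not as quotes.
print([t.text for t in tokenizer("It'll be fun (ha!)", language="en")])
# per the test: ["It", "'ll", "be", "fun", "(", "ha", "!", ")"]

# Japanese: with the final per-character handling, scripts written without spaces
# are split into single characters rather than segmented morphologically.
print([t.text for t in tokenizer("私は日本語を話せません(ラララ)", language="ja")])
# per the updated test:
# ["私", "は", "日", "本", "語", "を", "話", "せ", "ま", "せ", "ん", "(", "ラ", "ラ", "ラ", ")"]
```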