From 051041fecf7ed3a09b21711d7377d561b75e3861 Mon Sep 17 00:00:00 2001 From: "Logi E." Date: Thu, 1 Feb 2024 13:35:52 +0100 Subject: [PATCH 1/2] Adds exceptions for abbrev. expansion. --- src/icespeak/transcribe/__init__.py | 4 ++++ tests/test_transcribe.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/src/icespeak/transcribe/__init__.py b/src/icespeak/transcribe/__init__.py index 743353c..362e476 100644 --- a/src/icespeak/transcribe/__init__.py +++ b/src/icespeak/transcribe/__init__.py @@ -532,6 +532,9 @@ def _split_substring_types(t: str) -> Iterable[str]: _IGNORED_TOKENS = frozenset( (TOK.WORD, TOK.PERSON, TOK.ENTITY, TOK.TIMESTAMP, TOK.UNKNOWN) ) +# These should not be interpreted as abbreviations +# if they aren't followed by a period +_IGNORED_ABBREVS = frozenset(("mið", "fim", "bandar", "mao", "próf", "tom", "mar")) _HYPHEN_SYMBOLS = frozenset(HYPHENS) _StrBool = Union[str, bool] @@ -1379,6 +1382,7 @@ def token_transcribe( token.kind == TOK.WORD and (meanings := Abbreviations.get_meaning(token.txt)) and meanings[0].fl != "erl" + and token.txt not in _IGNORED_ABBREVS ): # Expand abbreviation token.txt = meanings[0].stofn diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index dda4461..51c2d10 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -464,6 +464,15 @@ def test_dt_token_transcribe_basic() -> None: t = _fix_ws("Hvað er 0,61cm í tommum?") n = DT.token_transcribe(t) assert "núll komma sextíu og einn sentimetri í tommum" in n + t = "En ef við tökum mið af því hve fim hún er í fimleikum?" + n = DT.token_transcribe(t) + assert n == t + t = "Hann bandar frá sér höndum þegar minnst er á mao zedong." + n = DT.token_transcribe(t) + assert n == t + t = "maðurinn tom fékk mar eftir strembið próf í síðustu viku" + n = DT.token_transcribe(t) + assert n == t def test_dt_token_transcribe_experimental(): @@ -543,7 +552,13 @@ def test_dt_token_transcribe_experimental(): t = "Í 1., 2., 3. og 4. lagi. Í 31. lagi" n = DT.token_transcribe(t, options=t_opts) assert "Í fyrsta" in n + # TODO: Figure out a way to quickly put expanded word in correct case # assert "öðru" in n assert "þriðja" in n assert "fjórða" in n assert "þrítugasta og fyrsta" in n + t = "Á mið. eða fim. verður fundur hjá okkur." + n = DT.token_transcribe(t) + # TODO: ditto the point above + # assert "miðvikudag " in n + # assert "fimmtudag " in n From 6bcf195ea7c8065f554cea9a39ff35820704815e Mon Sep 17 00:00:00 2001 From: "Logi E." Date: Thu, 1 Feb 2024 13:46:01 +0100 Subject: [PATCH 2/2] =?UTF-8?q?Adds=20tests=20for=20'pr=C3=B3f.'=20and=20'?= =?UTF-8?q?bandar.'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/icespeak/transcribe/__init__.py | 2 +- tests/test_transcribe.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/icespeak/transcribe/__init__.py b/src/icespeak/transcribe/__init__.py index 362e476..ed846e7 100644 --- a/src/icespeak/transcribe/__init__.py +++ b/src/icespeak/transcribe/__init__.py @@ -533,7 +533,7 @@ def _split_substring_types(t: str) -> Iterable[str]: (TOK.WORD, TOK.PERSON, TOK.ENTITY, TOK.TIMESTAMP, TOK.UNKNOWN) ) # These should not be interpreted as abbreviations -# if they aren't followed by a period +# unless they include a period _IGNORED_ABBREVS = frozenset(("mið", "fim", "bandar", "mao", "próf", "tom", "mar")) _HYPHEN_SYMBOLS = frozenset(HYPHENS) diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index 51c2d10..21ad778 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -473,6 +473,12 @@ def test_dt_token_transcribe_basic() -> None: t = "maðurinn tom fékk mar eftir strembið próf í síðustu viku" n = DT.token_transcribe(t) assert n == t + t = "Undirritað, próf. Jónína" + n = DT.token_transcribe(t) + assert "prófessor" in n + t = "Hann er bandar. ríkisborgari" + n = DT.token_transcribe(t) + assert "bandarískur" in n def test_dt_token_transcribe_experimental():