diff --git a/src/icespeak/transcribe/__init__.py b/src/icespeak/transcribe/__init__.py index 743353c..ed846e7 100644 --- a/src/icespeak/transcribe/__init__.py +++ b/src/icespeak/transcribe/__init__.py @@ -532,6 +532,9 @@ def _split_substring_types(t: str) -> Iterable[str]: _IGNORED_TOKENS = frozenset( (TOK.WORD, TOK.PERSON, TOK.ENTITY, TOK.TIMESTAMP, TOK.UNKNOWN) ) +# These should not be interpreted as abbreviations +# unless they include a period +_IGNORED_ABBREVS = frozenset(("mið", "fim", "bandar", "mao", "próf", "tom", "mar")) _HYPHEN_SYMBOLS = frozenset(HYPHENS) _StrBool = Union[str, bool] @@ -1379,6 +1382,7 @@ def token_transcribe( token.kind == TOK.WORD and (meanings := Abbreviations.get_meaning(token.txt)) and meanings[0].fl != "erl" + and token.txt not in _IGNORED_ABBREVS ): # Expand abbreviation token.txt = meanings[0].stofn diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index dda4461..21ad778 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -464,6 +464,21 @@ def test_dt_token_transcribe_basic() -> None: t = _fix_ws("Hvað er 0,61cm í tommum?") n = DT.token_transcribe(t) assert "núll komma sextíu og einn sentimetri í tommum" in n + t = "En ef við tökum mið af því hve fim hún er í fimleikum?" + n = DT.token_transcribe(t) + assert n == t + t = "Hann bandar frá sér höndum þegar minnst er á mao zedong." + n = DT.token_transcribe(t) + assert n == t + t = "maðurinn tom fékk mar eftir strembið próf í síðustu viku" + n = DT.token_transcribe(t) + assert n == t + t = "Undirritað, próf. Jónína" + n = DT.token_transcribe(t) + assert "prófessor" in n + t = "Hann er bandar. ríkisborgari" + n = DT.token_transcribe(t) + assert "bandarískur" in n def test_dt_token_transcribe_experimental(): @@ -543,7 +558,13 @@ def test_dt_token_transcribe_experimental(): t = "Í 1., 2., 3. og 4. lagi. Í 31. lagi" n = DT.token_transcribe(t, options=t_opts) assert "Í fyrsta" in n + # TODO: Figure out a way to quickly put expanded word in correct case # assert "öðru" in n assert "þriðja" in n assert "fjórða" in n assert "þrítugasta og fyrsta" in n + t = "Á mið. eða fim. verður fundur hjá okkur." + n = DT.token_transcribe(t) + # TODO: ditto the point above + # assert "miðvikudag " in n + # assert "fimmtudag " in n