Skip to content

Commit

Permalink
Merge pull request #6 from mideind/abbrev-fix
Browse files Browse the repository at this point in the history
Abbrev fix
  • Loading branch information
sveinbjornt authored Feb 6, 2024
2 parents dd420f4 + 6bcf195 commit f8e0e01
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/icespeak/transcribe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,9 @@ def _split_substring_types(t: str) -> Iterable[str]:
_IGNORED_TOKENS = frozenset(
(TOK.WORD, TOK.PERSON, TOK.ENTITY, TOK.TIMESTAMP, TOK.UNKNOWN)
)
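# These strings are also ordinary words or names (e.g. "mið", "fim", "tom"),
# so they are only expanded as abbreviations when written with a
# trailing period (e.g. "próf." -> "prófessor"); the bare form is left as-is.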
_IGNORED_ABBREVS = frozenset(("mið", "fim", "bandar", "mao", "próf", "tom", "mar"))
_HYPHEN_SYMBOLS = frozenset(HYPHENS)

_StrBool = Union[str, bool]
Expand Down Expand Up @@ -1379,6 +1382,7 @@ def token_transcribe(
token.kind == TOK.WORD
and (meanings := Abbreviations.get_meaning(token.txt))
and meanings[0].fl != "erl"
and token.txt not in _IGNORED_ABBREVS
):
# Expand abbreviation
token.txt = meanings[0].stofn
Expand Down
21 changes: 21 additions & 0 deletions tests/test_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,21 @@ def test_dt_token_transcribe_basic() -> None:
t = _fix_ws("Hvað er 0,61cm í tommum?")
n = DT.token_transcribe(t)
assert "núll komma sextíu og einn sentimetri í tommum" in n
t = "En ef við tökum mið af því hve fim hún er í fimleikum?"
n = DT.token_transcribe(t)
assert n == t
t = "Hann bandar frá sér höndum þegar minnst er á mao zedong."
n = DT.token_transcribe(t)
assert n == t
t = "maðurinn tom fékk mar eftir strembið próf í síðustu viku"
n = DT.token_transcribe(t)
assert n == t
t = "Undirritað, próf. Jónína"
n = DT.token_transcribe(t)
assert "prófessor" in n
t = "Hann er bandar. ríkisborgari"
n = DT.token_transcribe(t)
assert "bandarískur" in n


def test_dt_token_transcribe_experimental():
Expand Down Expand Up @@ -543,7 +558,13 @@ def test_dt_token_transcribe_experimental():
t = "Í 1., 2., 3. og 4. lagi. Í 31. lagi"
n = DT.token_transcribe(t, options=t_opts)
assert "Í fyrsta" in n
# TODO: Figure out a way to quickly inflect the expanded word into the correct grammatical case
# assert "öðru" in n
assert "þriðja" in n
assert "fjórða" in n
assert "þrítugasta og fyrsta" in n
t = "Á mið. eða fim. verður fundur hjá okkur."
n = DT.token_transcribe(t)
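# TODO: Same problem as with the ordinals above: the expanded word is not yet
# inflected into the correct grammatical case, so these assertions stay disabled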
# assert "miðvikudag " in n
# assert "fimmtudag " in n

0 comments on commit f8e0e01

Please sign in to comment.