From 051041fecf7ed3a09b21711d7377d561b75e3861 Mon Sep 17 00:00:00 2001
From: "Logi E." <logi.eyjolfsson@gmail.com>
Date: Thu, 1 Feb 2024 13:35:52 +0100
Subject: [PATCH 1/2] Adds exceptions for abbrev. expansion.

---
 src/icespeak/transcribe/__init__.py |  4 ++++
 tests/test_transcribe.py            | 15 +++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/src/icespeak/transcribe/__init__.py b/src/icespeak/transcribe/__init__.py
index 743353c..362e476 100644
--- a/src/icespeak/transcribe/__init__.py
+++ b/src/icespeak/transcribe/__init__.py
@@ -532,6 +532,9 @@ def _split_substring_types(t: str) -> Iterable[str]:
 _IGNORED_TOKENS = frozenset(
     (TOK.WORD, TOK.PERSON, TOK.ENTITY, TOK.TIMESTAMP, TOK.UNKNOWN)
 )
+# These should not be interpreted as abbreviations
+# if they aren't followed by a period
+_IGNORED_ABBREVS = frozenset(("mið", "fim", "bandar", "mao", "próf", "tom", "mar"))
 _HYPHEN_SYMBOLS = frozenset(HYPHENS)
 
 _StrBool = Union[str, bool]
@@ -1379,6 +1382,7 @@ def token_transcribe(
                 token.kind == TOK.WORD
                 and (meanings := Abbreviations.get_meaning(token.txt))
                 and meanings[0].fl != "erl"
+                and token.txt not in _IGNORED_ABBREVS
             ):
                 # Expand abbreviation
                 token.txt = meanings[0].stofn
diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py
index dda4461..51c2d10 100644
--- a/tests/test_transcribe.py
+++ b/tests/test_transcribe.py
@@ -464,6 +464,15 @@ def test_dt_token_transcribe_basic() -> None:
     t = _fix_ws("Hvað er 0,61cm í tommum?")
     n = DT.token_transcribe(t)
     assert "núll komma sextíu og einn sentimetri í tommum" in n
+    t = "En ef við tökum mið af því hve fim hún er í fimleikum?"
+    n = DT.token_transcribe(t)
+    assert n == t
+    t = "Hann bandar frá sér höndum þegar minnst er á mao zedong."
+    n = DT.token_transcribe(t)
+    assert n == t
+    t = "maðurinn tom fékk mar eftir strembið próf í síðustu viku"
+    n = DT.token_transcribe(t)
+    assert n == t
 
 
 def test_dt_token_transcribe_experimental():
@@ -543,7 +552,13 @@ def test_dt_token_transcribe_experimental():
     t = "Í 1., 2., 3. og 4. lagi. Í 31. lagi"
     n = DT.token_transcribe(t, options=t_opts)
     assert "Í fyrsta" in n
+    # TODO: Figure out a way to quickly put expanded word in correct grammatical case
     # assert "öðru" in n
     assert "þriðja" in n
     assert "fjórða" in n
     assert "þrítugasta og fyrsta" in n
+    t = "Á mið. eða fim. verður fundur hjá okkur."
+    n = DT.token_transcribe(t)
+    # TODO: enable once expansions are put in the correct grammatical case
+    # assert "miðvikudag " in n
+    # assert "fimmtudag " in n

From 6bcf195ea7c8065f554cea9a39ff35820704815e Mon Sep 17 00:00:00 2001
From: "Logi E." <logi.eyjolfsson@gmail.com>
Date: Thu, 1 Feb 2024 13:46:01 +0100
Subject: [PATCH 2/2] =?UTF-8?q?Adds=20tests=20for=20'pr=C3=B3f.'=20and=20'?=
 =?UTF-8?q?bandar.'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/icespeak/transcribe/__init__.py | 2 +-
 tests/test_transcribe.py            | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/icespeak/transcribe/__init__.py b/src/icespeak/transcribe/__init__.py
index 362e476..ed846e7 100644
--- a/src/icespeak/transcribe/__init__.py
+++ b/src/icespeak/transcribe/__init__.py
@@ -533,7 +533,7 @@ def _split_substring_types(t: str) -> Iterable[str]:
     (TOK.WORD, TOK.PERSON, TOK.ENTITY, TOK.TIMESTAMP, TOK.UNKNOWN)
 )
 # These should not be interpreted as abbreviations
-# if they aren't followed by a period
+# unless they include a period
 _IGNORED_ABBREVS = frozenset(("mið", "fim", "bandar", "mao", "próf", "tom", "mar"))
 _HYPHEN_SYMBOLS = frozenset(HYPHENS)
 
diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py
index 51c2d10..21ad778 100644
--- a/tests/test_transcribe.py
+++ b/tests/test_transcribe.py
@@ -473,6 +473,12 @@ def test_dt_token_transcribe_basic() -> None:
     t = "maðurinn tom fékk mar eftir strembið próf í síðustu viku"
     n = DT.token_transcribe(t)
     assert n == t
+    t = "Undirritað, próf. Jónína"
+    n = DT.token_transcribe(t)
+    assert "prófessor" in n
+    t = "Hann er bandar. ríkisborgari"
+    n = DT.token_transcribe(t)
+    assert "bandarískur" in n
 
 
 def test_dt_token_transcribe_experimental():