From e751708bf35ade1bd4eb1467273e6104e0550e73 Mon Sep 17 00:00:00 2001
From: Michael McAuliffe <michael.e.mcauliffe@gmail.com>
Date: Mon, 2 Dec 2024 11:44:55 -0700
Subject: [PATCH] Set default to composed unicode (#854)

---
 docs/source/changelog/changelog_3.2.rst              | 5 +++++
 montreal_forced_aligner/dictionary/multispeaker.py   | 2 +-
 montreal_forced_aligner/g2p/phonetisaurus_trainer.py | 4 ++++
 montreal_forced_aligner/g2p/trainer.py               | 4 +++-
 4 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/docs/source/changelog/changelog_3.2.rst b/docs/source/changelog/changelog_3.2.rst
index 4e416ac2..4949536c 100644
--- a/docs/source/changelog/changelog_3.2.rst
+++ b/docs/source/changelog/changelog_3.2.rst
@@ -5,6 +5,11 @@
 3.2 Changelog
 *************
 
+3.2.1
+-----
+
+- Changed Unicode normalization to default to composed forms (NFKC) unless overridden by :code:`--unicode_decomposition true`, which selects decomposed forms (NFKD)
+
 3.2.0
 -----
 
diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py
index 0eac9db8..755ffe20 100644
--- a/montreal_forced_aligner/dictionary/multispeaker.py
+++ b/montreal_forced_aligner/dictionary/multispeaker.py
@@ -581,7 +581,7 @@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]:
                             if getattr(self, "unicode_decomposition", False):
                                 characters = unicodedata.normalize("NFKD", word)
                             else:
-                                characters = word
+                                characters = unicodedata.normalize("NFKC", word)
                             graphemes.update(characters)
                         if pretrained:
                             difference = set(pron) - self.non_silence_phones - self.silence_phones
diff --git a/montreal_forced_aligner/g2p/phonetisaurus_trainer.py b/montreal_forced_aligner/g2p/phonetisaurus_trainer.py
index 08ea6639..45ab2371 100644
--- a/montreal_forced_aligner/g2p/phonetisaurus_trainer.py
+++ b/montreal_forced_aligner/g2p/phonetisaurus_trainer.py
@@ -1744,6 +1744,8 @@ def initialize_training(self) -> None:
                 for pronunciation, word in query:
                     if self.unicode_decomposition:
                         word = unicodedata.normalize("NFKD", word)
+                    else:
+                        word = unicodedata.normalize("NFKC", word)
                     self.g2p_training_graphemes.update(word)
                     self.g2p_training_phones.update(pronunciation.split())
 
@@ -1814,6 +1816,8 @@ def initialize_training(self) -> None:
                 for pronunciation, word in query:
                     if self.unicode_decomposition:
                         word = unicodedata.normalize("NFKD", word)
+                    else:
+                        word = unicodedata.normalize("NFKC", word)
                     grapheme_count += len(word)
                     self.g2p_training_graphemes.update(word)
                     self.g2p_num_training_pronunciations += 1
diff --git a/montreal_forced_aligner/g2p/trainer.py b/montreal_forced_aligner/g2p/trainer.py
index 413db3f0..52cc09a8 100644
--- a/montreal_forced_aligner/g2p/trainer.py
+++ b/montreal_forced_aligner/g2p/trainer.py
@@ -205,7 +205,7 @@ def __init__(
         validation_proportion: float = 0.1,
         num_pronunciations: int = 0,
         evaluation_mode: bool = False,
-        unicode_decomposition: bool = True,
+        unicode_decomposition: bool = False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -782,6 +782,8 @@ def initialize_training(self) -> None:
                         continue
                     if self.unicode_decomposition:
                         word = unicodedata.normalize("NFKD", word)
+                    else:
+                        word = unicodedata.normalize("NFKC", word)
                     self.g2p_training_graphemes.update(word)
                     for p in pronunciations:
                         self.g2p_training_phones.update(p.split())