From e751708bf35ade1bd4eb1467273e6104e0550e73 Mon Sep 17 00:00:00 2001 From: Michael McAuliffe Date: Mon, 2 Dec 2024 11:44:55 -0700 Subject: [PATCH] Set default to composed unicode (#854) --- docs/source/changelog/changelog_3.2.rst | 5 +++++ montreal_forced_aligner/dictionary/multispeaker.py | 2 +- montreal_forced_aligner/g2p/phonetisaurus_trainer.py | 4 ++++ montreal_forced_aligner/g2p/trainer.py | 4 +++- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/source/changelog/changelog_3.2.rst b/docs/source/changelog/changelog_3.2.rst index 4e416ac2..4949536c 100644 --- a/docs/source/changelog/changelog_3.2.rst +++ b/docs/source/changelog/changelog_3.2.rst @@ -5,6 +5,11 @@ 3.2 Changelog ************* +3.2.1 +----- + +- Changed unicode normalization to default to composed forms unless overridden by :code:`--unicode_decomposition true` + 3.2.0 ----- diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py index 0eac9db8..755ffe20 100644 --- a/montreal_forced_aligner/dictionary/multispeaker.py +++ b/montreal_forced_aligner/dictionary/multispeaker.py @@ -581,7 +581,7 @@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]: if getattr(self, "unicode_decomposition", False): characters = unicodedata.normalize("NFKD", word) else: - characters = word + characters = unicodedata.normalize("NFKC", word) graphemes.update(characters) if pretrained: difference = set(pron) - self.non_silence_phones - self.silence_phones diff --git a/montreal_forced_aligner/g2p/phonetisaurus_trainer.py b/montreal_forced_aligner/g2p/phonetisaurus_trainer.py index 08ea6639..45ab2371 100644 --- a/montreal_forced_aligner/g2p/phonetisaurus_trainer.py +++ b/montreal_forced_aligner/g2p/phonetisaurus_trainer.py @@ -1744,6 +1744,8 @@ def initialize_training(self) -> None: for pronunciation, word in query: if self.unicode_decomposition: word = unicodedata.normalize("NFKD", word) + else: + word = unicodedata.normalize("NFKC", word) self.g2p_training_graphemes.update(word) self.g2p_training_phones.update(pronunciation.split()) @@ -1814,6 +1816,8 @@ def initialize_training(self) -> None: for pronunciation, word in query: if self.unicode_decomposition: word = unicodedata.normalize("NFKD", word) + else: + word = unicodedata.normalize("NFKC", word) grapheme_count += len(word) self.g2p_training_graphemes.update(word) self.g2p_num_training_pronunciations += 1 diff --git a/montreal_forced_aligner/g2p/trainer.py b/montreal_forced_aligner/g2p/trainer.py index 413db3f0..52cc09a8 100644 --- a/montreal_forced_aligner/g2p/trainer.py +++ b/montreal_forced_aligner/g2p/trainer.py @@ -205,7 +205,7 @@ def __init__( validation_proportion: float = 0.1, num_pronunciations: int = 0, evaluation_mode: bool = False, - unicode_decomposition: bool = True, + unicode_decomposition: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -782,6 +782,8 @@ def initialize_training(self) -> None: continue if self.unicode_decomposition: word = unicodedata.normalize("NFKD", word) + else: + word = unicodedata.normalize("NFKC", word) self.g2p_training_graphemes.update(word) for p in pronunciations: self.g2p_training_phones.update(p.split())