Commit: Bug fixes (#712)
mmcauliffe authored Nov 2, 2023
1 parent f64192d commit 8ef1aba
Showing 7 changed files with 29 additions and 65 deletions.
7 changes: 7 additions & 0 deletions docs/source/changelog/changelog_3.0.rst
@@ -5,8 +5,15 @@
 3.0 Changelog
 *************
 
+3.0.0a8
+=======
+
+- Fixed an issue where utterance and speaker xvectors from speechbrain were not length-normalized
+- Bug fixes for integration with Anchor
+
 3.0.0a7
 =======
 
 - Fixed an issue where using relative paths could delete all MFA temporary files with :code:`--clean`
 - Fixed an issue where "<eps>" in a transcript to force silence was inserting phones for OOVs rather than silence

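Note on the :code:`--clean` fix above: a relative root path resolves against whatever the current working directory happens to be, so a cleanup pass can end up targeting the wrong tree. A minimal sketch of the kind of guard that bullet implies, with hypothetical names (`safe_clean` and `expected_root` are illustrative, not MFA's actual API):

```python
import shutil
from pathlib import Path


def safe_clean(temp_dir: Path, expected_root: Path) -> None:
    # Resolve to absolute paths first; a relative path depends on the
    # current working directory and may point outside the intended root.
    temp_dir = temp_dir.resolve()
    expected_root = expected_root.resolve()
    if temp_dir != expected_root and expected_root not in temp_dir.parents:
        raise ValueError(f"Refusing to delete {temp_dir}: not under {expected_root}")
    shutil.rmtree(temp_dir, ignore_errors=True)
```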
2 changes: 1 addition & 1 deletion docs/source/first_steps/tutorials.rst
@@ -1,7 +1,7 @@
 
 .. _`filing an issue`: https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/issues
 
-.. _`Montreal Forced Aligner v2 Corpus Phonetics Tutorial`: https://www.eleanorchodroff.com/tutorial/montreal-forced-aligner-v2.html
+.. _`Montreal Forced Aligner v2 Corpus Phonetics Tutorial`: https://eleanorchodroff.com/tutorial/montreal-forced-aligner.html
 
 .. _`Phonetic forced alignment with the Montreal Forced Aligner`: https://www.youtube.com/watch?v=Zhj-ccMDj_w
 
14 changes: 8 additions & 6 deletions montreal_forced_aligner/corpus/base.py
@@ -540,7 +540,6 @@ def initialize_jobs(self) -> None:
             bulk_update(session, Utterance, update_mappings, id_field="speaker_id")
             session.commit()
             if session.query(Dictionary2Job).count() == 0:
-
                 dict_job_mappings = []
                 for job_id, dict_id in (
                     session.query(Utterance.job_id, Dictionary.id)
@@ -675,11 +674,11 @@ def normalize_text(self) -> None:
             has_words = (
                 session.query(Dictionary).filter(Dictionary.name == "unknown").first() is None
             )
+            existing_oovs = set()
             words = session.query(
-                Word.id, Word.mapping_id, Word.dictionary_id, Word.word
+                Word.id, Word.mapping_id, Word.dictionary_id, Word.word, Word.word_type
             ).order_by(Word.mapping_id)
             if not has_words or getattr(self, "use_g2p", False):
-
                 word_insert_mappings["<eps>"] = {
                     "id": word_key,
                     "word": "<eps>",
@@ -690,7 +689,10 @@ def normalize_text(self) -> None:
                 }
                 word_key += 1
                 max_mapping_ids[1] = word_key - 1
-            for w_id, m_id, d_id, w in words:
+            for w_id, m_id, d_id, w, wt in words:
+                if wt is WordType.oov:
+                    existing_oovs.add(w)
+                    continue
                 word_indexes[(d_id, w)] = w_id
                 word_mapping_ids[(d_id, w)] = m_id
                 max_mapping_ids[d_id] = m_id
@@ -818,6 +820,8 @@ def normalize_text(self) -> None:
                         word_key += 1
                 else:
                     for word in to_g2p:
+                        if word in existing_oovs:
+                            continue
                         if word not in word_insert_mappings:
                             word_insert_mappings[word] = {
                                 "id": word_key,
@@ -1151,7 +1155,6 @@ def create_subset(self, subset: int) -> None:
                 logger.debug(f"For {dict_id}, total number of utterances is {num_utts}")
                 larger_subset_num = int(subset_per_dictionary * 10)
                 if num_utts > larger_subset_num:
-
                     larger_subset_query = (
                         session.query(Utterance.id)
                         .join(Utterance.speaker)
@@ -1182,7 +1185,6 @@ def create_subset(self, subset: int) -> None:
                     session.execute(query)
                 logger.debug(f"For {dict_id}, subset is {subset_per_dictionary}")
                 elif num_utts > subset_per_dictionary:
-
                     larger_subset_query = (
                         session.query(Utterance.id)
                         .join(Utterance.speaker)
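The normalize_text changes above add bookkeeping so that words already stored with an OOV word type are neither re-indexed for lookup nor re-queued for insertion. A toy illustration of that pattern, with plain tuples and strings standing in for the SQLAlchemy rows and the `WordType` enum used in the real code:

```python
# Rows standing in for session.query(Word.id, Word.mapping_id,
# Word.dictionary_id, Word.word, Word.word_type).
rows = [
    (1, 1, 1, "the", "speech"),
    (2, 2, 1, "zzyzx", "oov"),  # already registered as an OOV
]

existing_oovs = set()
word_indexes = {}
for w_id, m_id, d_id, w, wt in rows:
    if wt == "oov":
        existing_oovs.add(w)  # remember it, but keep it out of the lookup tables
        continue
    word_indexes[(d_id, w)] = w_id

# Later, only genuinely new words are queued for insertion.
to_g2p = {"zzyzx", "flibbertigibbet"}
pending = sorted(w for w in to_g2p if w not in existing_oovs)
print(pending)  # ['flibbertigibbet']
```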
1 change: 1 addition & 0 deletions montreal_forced_aligner/corpus/ivector_corpus.py
@@ -248,6 +248,7 @@ def compute_plda(self, minimum_utterance_count: int = 1) -> None:
                     pbar.update(1)
                     vector = DoubleVector()
                     vector.from_numpy(ivector)
+                    ivector_normalize_length(vector)
                     ivector_mat.Row(i).CopyFromVec(vector)
                     num_utt_done += 1
                 update_mapping.append({"id": speaker_id, "num_utterances": num_utterances})
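The added `ivector_normalize_length(vector)` call applies Kaldi-style length normalization before the vector enters the PLDA accumulation. As I understand Kaldi's convention, this rescales a vector so its L2 norm equals the square root of its dimension; a rough numpy sketch under that assumption:

```python
import numpy as np


def normalize_length(vec: np.ndarray) -> np.ndarray:
    # Rescale so that ||vec|| == sqrt(dim), the scale PLDA training expects
    # (assumed equivalent of _kalpy.ivector.ivector_normalize_length).
    return vec * (np.sqrt(vec.shape[0]) / np.linalg.norm(vec))


x = np.random.default_rng(0).normal(size=192)  # dimension 192 is illustrative
y = normalize_length(x)
print(np.isclose(np.linalg.norm(y), np.sqrt(192)))  # True
```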
7 changes: 5 additions & 2 deletions montreal_forced_aligner/db.py
@@ -1276,7 +1276,11 @@ def aligned_phone_intervals(self) -> typing.List[CtmInterval]:
         """
         Phone intervals from :attr:`montreal_forced_aligner.data.WorkflowType.alignment`
         """
-        return [x.as_ctm() for x in self.phone_intervals]
+        return [
+            x.as_ctm()
+            for x in self.phone_intervals
+            if x.workflow.workflow_type in [WorkflowType.alignment, WorkflowType.online_alignment]
+        ]
 
     @property
     def aligned_word_intervals(self) -> typing.List[CtmInterval]:
@@ -1760,7 +1764,6 @@ def dictionary_ids(self) -> typing.List[int]:
     def construct_feature_archive(
         self, working_directory: Path, dictionary_id: typing.Optional[int] = None, **kwargs
     ):
-
         fmllr_path = self.construct_path(
             self.corpus.current_subset_directory, "trans", "scp", dictionary_id
         )
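The `aligned_phone_intervals` change filters intervals by the workflow that produced them, so phones from, say, a transcription workflow no longer leak into alignment output. A self-contained sketch of that filtering, with toy classes in place of the ORM models (the real code reaches the workflow type through `x.workflow.workflow_type`):

```python
from dataclasses import dataclass
from enum import Enum, auto


class WorkflowType(Enum):
    alignment = auto()
    online_alignment = auto()
    transcription = auto()


@dataclass
class PhoneInterval:
    label: str
    workflow_type: WorkflowType


intervals = [
    PhoneInterval("AH", WorkflowType.alignment),
    PhoneInterval("AH", WorkflowType.transcription),  # must not appear in alignment output
]
aligned = [
    x
    for x in intervals
    if x.workflow_type in (WorkflowType.alignment, WorkflowType.online_alignment)
]
print([x.workflow_type.name for x in aligned])  # ['alignment']
```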
61 changes: 6 additions & 55 deletions montreal_forced_aligner/diarization/speaker_diarizer.py
@@ -22,7 +22,7 @@
 import sqlalchemy
 import yaml
 from _kalpy.ivector import Plda, ivector_normalize_length
-from _kalpy.matrix import FloatVector
+from _kalpy.matrix import DoubleVector, FloatVector
 from kalpy.utils import read_kaldi_object
 from sklearn import metrics
 from sqlalchemy.orm import joinedload, selectinload
@@ -411,7 +411,6 @@ def classify_speakers(self):
 
     def map_speakers_to_ground_truth(self):
         with self.session() as session:
-
             utterances = session.query(Utterance.id, Utterance.speaker_id)
             labels = []
             utterance_ids = []
@@ -441,7 +440,6 @@ def evaluate_clustering(self) -> None:
         with self.session() as session, mfa_open(
             self.working_directory.joinpath("diarization_evaluation_results.csv"), "w"
         ) as f:
-
             writer = csv.DictWriter(
                 f,
                 fieldnames=[
@@ -805,7 +803,6 @@ def classify_iteration(self, iteration=None) -> None:
         with self.session() as session, tqdm(
             total=self.num_utterances, disable=config.QUIET
         ) as pbar:
-
             unknown_speaker_id = (
                 session.query(Speaker.id).filter(Speaker.name == "MFA_UNKNOWN").first()[0]
             )
@@ -1317,7 +1314,6 @@ def load_embeddings(self) -> None:
         begin = time.time()
         update_mapping = {}
         arguments = [SpeechbrainArguments(1, self.session, None, self.cuda, self.cluster)]
-        embeddings = []
         utterance_ids = []
         original_use_mp = config.USE_MP
         if self.cuda:
@@ -1327,8 +1323,11 @@ def load_embeddings(self) -> None:
                 SpeechbrainEmbeddingFunction, arguments, pbar.update
             ):
                 utterance_ids.append(u_id)
-                embeddings.append(emb)
-                update_mapping[u_id] = {"id": u_id, "xvector": emb}
+
+                kaldi_ivector = DoubleVector()
+                kaldi_ivector.from_numpy(emb)
+                ivector_normalize_length(kaldi_ivector)
+                update_mapping[u_id] = {"id": u_id, "xvector": kaldi_ivector.numpy()}
                 if len(update_mapping) >= batch_size:
                     bulk_update(session, Utterance, list(update_mapping.values()))
                     session.commit()
@@ -1404,53 +1403,6 @@ def refresh_plda_vectors(self):
             session.execute(sqlalchemy.text("VACUUM ANALYZE Utterance"))
         logger.debug(f"Refreshing utterance PLDA vectors took {time.time() - begin:.3f} seconds.")
 
-    def refresh_speaker_plda_vectors(self):
-
-        logger.info("Refreshing speaker PLDA vectors...")
-        begin = time.time()
-        self.create_new_current_workflow(WorkflowType.speaker_diarization)
-        self.plda = read_kaldi_object(Plda, self.plda_path)
-        with self.session() as session:
-            session.execute(sqlalchemy.text("DROP INDEX IF EXISTS speaker_plda_vector_index;"))
-            session.commit()
-            c = session.query(Corpus).first()
-            c.plda_calculated = False
-            update_mapping = []
-            speakers = (
-                session.query(Speaker.id, c.speaker_ivector_column, sqlalchemy.func.count())
-                .join(Utterance.speaker)
-                .filter(c.speaker_ivector_column != None)  # noqa
-                .group_by(Speaker.id)
-            )
-            with tqdm(total=self.num_speakers, disable=config.QUIET) as pbar:
-                for s_id, ivector, utt_count in speakers:
-                    kaldi_ivector = FloatVector()
-                    kaldi_ivector.from_numpy(ivector)
-                    pbar.update(1)
-                    update_mapping.append(
-                        {
-                            "id": s_id,
-                            "plda_vector": self.plda.transform_ivector(
-                                kaldi_ivector, utt_count
-                            ).numpy(),
-                        }
-                    )
-
-            bulk_update(session, Speaker, update_mapping)
-            c.plda_calculated = True
-            session.flush()
-            n_lists = int(math.sqrt(self.num_speakers))
-            session.execute(
-                sqlalchemy.text(
-                    f"CREATE INDEX IF NOT EXISTS speaker_plda_vector_index ON speaker USING ivfflat (plda_vector vector_cosine_ops) WITH (lists = {n_lists});"
-                )
-            )
-            session.commit()
-        autocommit_engine = self.db_engine.execution_options(isolation_level="AUTOCOMMIT")
-        with sqlalchemy.orm.Session(autocommit_engine) as session:
-            session.execute(sqlalchemy.text("VACUUM ANALYZE Speaker"))
-        logger.debug(f"Refreshing speaker PLDA vectors took {time.time() - begin:.3f} seconds.")
-
     def refresh_speaker_vectors(self) -> None:
         """Refresh speaker vectors following clustering or classification"""
         begin = time.time()
@@ -1585,7 +1537,6 @@ def export_files(self, output_directory: str) -> None:
                 Dumper=yaml.Dumper,
             )
         with self.session() as session:
-
             logger.info("Writing output files...")
             files = session.query(File).options(
                 selectinload(File.utterances),
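Besides normalizing each speechbrain embedding before it is stored, the `load_embeddings` loop flushes its accumulated rows to the database in batches: rows collect in `update_mapping` and are written with `bulk_update` once `batch_size` is reached. A generic sketch of that flush pattern, with a stand-in `flush` function rather than MFA's actual `bulk_update` call:

```python
from typing import Dict, List

BATCH_SIZE = 3  # illustrative; the real batch size comes from configuration


def flush(rows: List[dict]) -> None:
    # Stand-in for bulk_update(session, Utterance, rows) followed by a commit.
    print(f"flushed {len(rows)} rows")


update_mapping: Dict[int, dict] = {}
for u_id in range(7):
    update_mapping[u_id] = {"id": u_id, "xvector": [0.0]}
    if len(update_mapping) >= BATCH_SIZE:
        flush(list(update_mapping.values()))
        update_mapping = {}
if update_mapping:  # final partial batch
    flush(list(update_mapping.values()))
```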
2 changes: 1 addition & 1 deletion montreal_forced_aligner/utils.py
@@ -259,7 +259,7 @@ def get_mfa_version() -> str:
     try:
         from ._version import version as __version__  # noqa
     except ImportError:
-        __version__ = "2.0.0"
+        __version__ = "3.0.0"
     return __version__
 
 
