From ce0f5c1c80fe271e75de0a720bfec09ca27c19bc Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 20 Apr 2023 11:26:40 -0400 Subject: [PATCH 01/16] add transform attribute for MixedCut --- lhotse/cut/mixed.py | 120 ++++++++++++++++++++++++++---- test/cut/test_cut_augmentation.py | 41 +++++++++- 2 files changed, 145 insertions(+), 16 deletions(-) diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py index b0ec1026d..005a953d6 100644 --- a/lhotse/cut/mixed.py +++ b/lhotse/cut/mixed.py @@ -10,7 +10,12 @@ from intervaltree import IntervalTree from lhotse.audio import AudioMixer, Recording, audio_energy, torchaudio_save_flac_safe -from lhotse.augmentation import AugmentFn +from lhotse.augmentation import ( + AudioTransform, + AugmentFn, + LoudnessNormalization, + ReverbWithImpulseResponse, +) from lhotse.cut.base import Cut from lhotse.cut.data import DataCut from lhotse.cut.padding import PaddingCut @@ -93,6 +98,9 @@ class MixedCut(Cut): .. note:: Each track in a MixedCut can be either a MonoCut, MultiCut, or PaddingCut. + .. note:: The ``transforms`` field is a list of dictionaries that describe the transformations + that should be applied to the track after mixing. + See also: - :class:`lhotse.cut.Cut` @@ -103,6 +111,7 @@ class MixedCut(Cut): id: str tracks: List[MixTrack] + transforms: Optional[List[Dict]] = None @property def supervisions(self) -> List[SupervisionSegment]: @@ -713,6 +722,35 @@ def perturb_volume(self, factor: float, affix_id: bool = True) -> "MixedCut": ], ) + def normalize_loudness(self, target: float, affix_id: bool = False) -> "DataCut": + """ + Return a new ``MixedCut`` that will lazily apply loudness normalization. + + :param target: The target loudness in dBFS. + :param affix_id: When true, we will modify the ``DataCut.id`` field + by affixing it with "_ln{target}". + :return: a modified copy of the current ``DataCut``. + """ + # Pre-conditions + assert ( + self.has_recording + ), "Cannot apply loudness normalization on a MixedCut without Recording." + if self.has_features: + logging.warning( + "Attempting to normalize loudness on a MixedCut that references pre-computed features. " + "The feature manifest will be detached, as we do not support feature-domain " + "loudness normalization." + ) + self.features = None + + transforms = self.transforms.copy() if self.transforms is not None else [] + transforms.append(LoudnessNormalization(target=target).to_dict()) + return fastcopy( + self, + id=f"{self.id}_ln{target}" if affix_id else self.id, + transforms=transforms, + ) + def reverb_rir( self, rir_recording: Optional["Recording"] = None, @@ -722,6 +760,7 @@ def reverb_rir( rir_channels: List[int] = [0], room_rng_seed: Optional[int] = None, source_rng_seed: Optional[int] = None, + mix_first: bool = True, ) -> "MixedCut": """ Return a new ``MixedCut`` that will convolve the audio with the provided impulse response. @@ -739,6 +778,9 @@ def reverb_rir( be convolved with one of the specified channels. :param room_rng_seed: Seed for the room configuration. :param source_rng_seed: Seed for the source position. + :param mix_first: When true, the mixing will be done first before convolving with the RIR. + This effectively means that all tracks will be convolved with the same RIR. If you + are simulating multi-speaker mixtures, you should set this to False. :return: a modified copy of the current ``MixedCut``. """ # Pre-conditions @@ -760,23 +802,64 @@ def reverb_rir( self.tracks ), "Invalid number of channels in `rir_channels`, must be either 1 or equal to the number of tracks." 
+        # There are 2 ways to apply RIRs:
+        # 1. Mix the tracks first, then apply RIRs. This is the same as applying the same RIR
+        #   to all tracks. It does not make sense if all tracks belong to different speakers,
+        #   but it is useful for cases when we have a mixture of MonoCut and PaddingCut,
+        #   and we want to apply the same RIR to all of them.
+        # 2. Apply RIRs to each track separately. This is useful when we want to simulate
+        #   different speakers in the same room.
+
+        # First simulate the room config (will only be used if RIR is not provided)
+        uuid4_str = str(uuid4())
+        # The room RNG seed is based on the cut ID. This ensures that all tracks in the
+        # mixed cut will have the same room configuration.
+        if room_rng_seed is None:
+            room_rng_seed = hash_str_to_int(uuid4_str + self.id)
+        # The source RNG seed is based on the track ID. This ensures that each track
+        # will have a different source position.
+        source_rng_seeds = [source_rng_seed] * len(self.tracks)
+        if source_rng_seed is None:
+            source_rng_seeds = [
+                hash_str_to_int(uuid4_str + track.cut.id) for track in self.tracks
+            ]
+            source_rng_seed = source_rng_seeds[0]
+
+        # Apply the same RIR to all tracks after mixing (default)
+        if mix_first:
+            if rir_recording is None:
+                from lhotse.augmentation.utils import FastRandomRIRGenerator
+
+                rir_generator = FastRandomRIRGenerator(
+                    sr=self.sampling_rate,
+                    room_seed=room_rng_seed,
+                    source_seed=source_rng_seed,
+                )
+            else:
+                rir_generator = None
+
+            transforms = self.transforms.copy() if self.transforms is not None else []
+            transforms.append(
+                ReverbWithImpulseResponse(
+                    rir=rir_recording,
+                    normalize_output=normalize_output,
+                    early_only=early_only,
+                    rir_channels=rir_channels if rir_channels is not None else [0],
+                    rir_generator=rir_generator,
+                ).to_dict()
+            )
+            return fastcopy(
+                self,
+                id=f"{self.id}_rvb" if affix_id else self.id,
+                transforms=transforms,
+            )
+
+        # Apply RIRs to each track separately. Note that we do not pass a `mix_first`
+        # argument below since it is True by default.
+
         if len(rir_channels) == 1:
             rir_channels = rir_channels * len(self.tracks)
 
-        source_rng_seeds = [source_rng_seed] * len(self.tracks)
-        if rir_recording is None:
-            uuid4_str = str(uuid4())
-            # The room RNG seed is based on the cut ID. This ensures that all tracks in the
-            # mixed cut will have the same room configuration.
-            if room_rng_seed is None:
-                room_rng_seed = hash_str_to_int(uuid4_str + self.id)
-            # The source RNG seed is based on the track ID. This ensures that each track
-            # will have a different source position.
-            if source_rng_seed is None:
-                source_rng_seeds = [
-                    hash_str_to_int(uuid4_str + track.cut.id) for track in self.tracks
-                ]
-
         return MixedCut(
             id=f"{self.id}_rvb" if affix_id else self.id,
             tracks=[
@@ -977,6 +1060,13 @@ def load_audio(
                 f"this issue at https://github.com/lhotse-speech/lhotse/issues "
                 f"showing the cut below. MixedCut:\n{self}"
             )
+
+            # We'll apply the transforms now (if any).
+            transforms = [
+                AudioTransform.from_dict(params) for params in self.transforms or []
+            ]
+            for tfn in transforms:
+                audio = tfn(audio, self.sampling_rate)
         else:
             audio = mixer.unmixed_audio
 
diff --git a/test/cut/test_cut_augmentation.py b/test/cut/test_cut_augmentation.py
index 7525ed33b..f98f21083 100644
--- a/test/cut/test_cut_augmentation.py
+++ b/test/cut/test_cut_augmentation.py
@@ -349,7 +349,7 @@ def test_mixed_cut_start01_perturb_volume(cut_with_supervision_start01):
 def test_mixed_cut_start01_reverb_rir(cut_with_supervision_start01, rir):
     mixed_rvb = cut_with_supervision_start01.append(
         cut_with_supervision_start01
-    ).reverb_rir(rir_recording=rir)
+    ).reverb_rir(rir_recording=rir, mix_first=False)
     assert mixed_rvb.start == 0  # MixedCut always starts at 0
     assert mixed_rvb.duration == cut_with_supervision_start01.duration * 2
     assert mixed_rvb.end == cut_with_supervision_start01.duration * 2
@@ -396,6 +396,24 @@ def test_mixed_cut_start01_reverb_rir(cut_with_supervision_start01, rir):
     )
 
 
+def test_mixed_cut_start01_reverb_rir_mix_first(cut_with_supervision_start01, rir):
+    mixed_rvb = cut_with_supervision_start01.pad(duration=0.5).reverb_rir(
+        rir_recording=rir, mix_first=True
+    )
+    assert mixed_rvb.start == 0  # MixedCut always starts at 0
+    assert mixed_rvb.duration == 0.5
+    assert mixed_rvb.end == 0.5
+    assert mixed_rvb.num_samples == 4000
+
+    # Check that the padding part is not all zeros after reverberation
+    np.testing.assert_raises(
+        AssertionError,
+        np.testing.assert_array_almost_equal,
+        mixed_rvb.load_audio()[:, 3200:],
+        np.zeros((1, 800)),
+    )
+
+
 def test_mixed_cut_start01_reverb_rir_with_fast_random(
     cut_with_supervision_start01, rir
 ):
@@ -449,6 +467,23 @@ def test_mixed_cut_start01_reverb_rir_multi_channel(
         mixed_cut.reverb_rir(multi_channel_rir, rir_channels=rir_channels)
 
 
+@pytest.mark.skipif(
+    not is_module_available("pyloudnorm"),
+    reason="This test requires pyloudnorm to be installed.",
+)
+@pytest.mark.parametrize("target", [-15.0, -20.0, -25.0])
+def test_mixed_cut_normalize_loudness(cut_with_supervision_start01, target):
+    mixed_cut = cut_with_supervision_start01.append(cut_with_supervision_start01)
+    mixed_cut_ln = mixed_cut.normalize_loudness(target)
+
+    import pyloudnorm as pyln
+
+    # check if loudness is correct
+    meter = pyln.Meter(mixed_cut_ln.sampling_rate)  # create BS.1770 meter
+    loudness = meter.integrated_loudness(mixed_cut_ln.load_audio().T)
+    assert loudness == pytest.approx(target, abs=0.5)
+
+
 @pytest.mark.skipif(
     not is_module_available("nara_wpe"),
     reason="This test requires nara_wpe to be installed.",
@@ -587,6 +622,10 @@ def test_cut_perturb_volume(cut_set, cut_id, scale):
     )
 
 
+@pytest.mark.skipif(
+    not is_module_available("pyloudnorm"),
+    reason="This test requires pyloudnorm to be installed.",
+)
 @pytest.mark.parametrize("target", [-15.0, -20.0, -25.0])
 def test_cut_normalize_loudness(libri_cut_set, target):
     cut_set_ln = libri_cut_set.normalize_loudness(target)

From ab18682257962740ee0825ba170689c9cc877e28 Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Thu, 20 Apr 2023 11:36:54 -0400
Subject: [PATCH 02/16] add mix_first option in normalize_loudness

---
 lhotse/cut/mixed.py               | 35 ++++++++++++++++++++++++-------
 test/cut/test_cut_augmentation.py | 11 ++++++----
 2 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py
index 005a953d6..5cfd4a3d2 100644
--- a/lhotse/cut/mixed.py
+++ b/lhotse/cut/mixed.py
@@ -722,11 +722,16 @@ def perturb_volume(self, factor: float, affix_id: bool = True) -> "MixedCut":
], ) - def normalize_loudness(self, target: float, affix_id: bool = False) -> "DataCut": + def normalize_loudness( + self, target: float, mix_first: bool = True, affix_id: bool = False + ) -> "DataCut": """ Return a new ``MixedCut`` that will lazily apply loudness normalization. :param target: The target loudness in dBFS. + :param mix_first: If true, we will mix the underlying cuts before applying + loudness normalization. If false, we cannot guarantee that the resulting + cut will have the target loudness. :param affix_id: When true, we will modify the ``DataCut.id`` field by affixing it with "_ln{target}". :return: a modified copy of the current ``DataCut``. @@ -743,13 +748,27 @@ def normalize_loudness(self, target: float, affix_id: bool = False) -> "DataCut" ) self.features = None - transforms = self.transforms.copy() if self.transforms is not None else [] - transforms.append(LoudnessNormalization(target=target).to_dict()) - return fastcopy( - self, - id=f"{self.id}_ln{target}" if affix_id else self.id, - transforms=transforms, - ) + if mix_first: + transforms = self.transforms.copy() if self.transforms is not None else [] + transforms.append(LoudnessNormalization(target=target).to_dict()) + return fastcopy( + self, + id=f"{self.id}_ln{target}" if affix_id else self.id, + transforms=transforms, + ) + else: + return MixedCut( + id=f"{self.id}_ln{target}" if affix_id else self.id, + tracks=[ + fastcopy( + track, + cut=track.cut.normalize_loudness( + target=target, affix_id=affix_id + ), + ) + for track in self.tracks + ], + ) def reverb_rir( self, diff --git a/test/cut/test_cut_augmentation.py b/test/cut/test_cut_augmentation.py index f98f21083..bc60a075a 100644 --- a/test/cut/test_cut_augmentation.py +++ b/test/cut/test_cut_augmentation.py @@ -471,17 +471,20 @@ def test_mixed_cut_start01_reverb_rir_multi_channel( not is_module_available("pyloudnorm"), reason="This test requires pyloudnorm to be installed.", ) -@pytest.mark.parametrize("target", [-15.0, -20.0, -25.0]) -def test_mixed_cut_normalize_loudness(cut_with_supervision_start01, target): +@pytest.mark.parametrize( + "target, mix_first", [(-15.0, True), (-20.0, True), (-25.0, False)] +) +def test_mixed_cut_normalize_loudness(cut_with_supervision_start01, target, mix_first): mixed_cut = cut_with_supervision_start01.append(cut_with_supervision_start01) - mixed_cut_ln = mixed_cut.normalize_loudness(target) + mixed_cut_ln = mixed_cut.normalize_loudness(target, mix_first=mix_first) import pyloudnorm as pyln # check if loudness is correct meter = pyln.Meter(mixed_cut_ln.sampling_rate) # create BS.1770 meter loudness = meter.integrated_loudness(mixed_cut_ln.load_audio().T) - assert loudness == pytest.approx(target, abs=0.5) + if mix_first: + assert loudness == pytest.approx(target, abs=0.5) @pytest.mark.skipif( From e4bca7421a7a4506fc3da82d639935233689b36a Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 20 Apr 2023 15:40:46 -0400 Subject: [PATCH 03/16] handle the case when mix is called on MixedCut with existing transforms --- lhotse/cut/set.py | 57 +++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py index 1f5ad02b3..88532a1ee 100644 --- a/lhotse/cut/set.py +++ b/lhotse/cut/set.py @@ -2614,10 +2614,14 @@ def mix( if offset > reference_cut.duration: reference_cut = reference_cut.pad(duration=offset) - # When the left_cut is a MixedCut, take its existing tracks, otherwise create a new track. 
- if isinstance(reference_cut, MixedCut): + # When the left_cut is a MixedCut and it does not have existing transforms, + # take its existing tracks, otherwise create a new track. + if ( + isinstance(reference_cut, MixedCut) + and len(ifnone(reference_cut.transforms, [])) == 0 + ): old_tracks = reference_cut.tracks - elif isinstance(reference_cut, (DataCut, PaddingCut)): + elif isinstance(reference_cut, (DataCut, PaddingCut, MixedCut)): old_tracks = [MixTrack(cut=reference_cut)] else: raise ValueError(f"Unsupported type of cut in mix(): {type(reference_cut)}") @@ -2625,27 +2629,32 @@ def mix( # When the right_cut is a MixedCut, adapt its existing tracks with the new offset and snr, # otherwise create a new track. if isinstance(mixed_in_cut, MixedCut): - new_tracks = [ - MixTrack( - cut=track.cut, - offset=round(track.offset + offset, ndigits=8), - snr=( - # When no new SNR is specified, retain whatever was there in the first place. - track.snr - if snr is None - # When new SNR is specified but none was specified before, assign the new SNR value. - else snr - if track.snr is None - # When both new and previous SNR were specified, assign their sum, - # as the SNR for each track is defined with regard to the first track energy. - else track.snr + snr - if snr is not None and track is not None - # When no SNR was specified whatsoever, use none. - else None - ), - ) - for track in mixed_in_cut.tracks - ] + # Similarly for mixed_in_cut, if it is a MixedCut and it does not have existing transforms, + # take its existing tracks, otherwise create a new track. + if len(ifnone(mixed_in_cut.transforms, [])) > 0: + new_tracks = [MixTrack(cut=mixed_in_cut, offset=offset, snr=snr)] + else: + new_tracks = [ + MixTrack( + cut=track.cut, + offset=round(track.offset + offset, ndigits=8), + snr=( + # When no new SNR is specified, retain whatever was there in the first place. + track.snr + if snr is None + # When new SNR is specified but none was specified before, assign the new SNR value. + else snr + if track.snr is None + # When both new and previous SNR were specified, assign their sum, + # as the SNR for each track is defined with regard to the first track energy. + else track.snr + snr + if snr is not None and track is not None + # When no SNR was specified whatsoever, use none. 
+ else None + ), + ) + for track in mixed_in_cut.tracks + ] elif isinstance(mixed_in_cut, (DataCut, PaddingCut)): new_tracks = [MixTrack(cut=mixed_in_cut, offset=offset, snr=snr)] else: From 71a92367f02da57655e7b3f633f1e7990b60fa6e Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 20 Apr 2023 16:08:03 -0400 Subject: [PATCH 04/16] add test for mixing with transformed MixedCut --- test/cut/test_cut_mixing.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/cut/test_cut_mixing.py b/test/cut/test_cut_mixing.py index c38753ecb..3404d4edf 100644 --- a/test/cut/test_cut_mixing.py +++ b/test/cut/test_cut_mixing.py @@ -345,6 +345,28 @@ def test_mix_cut_snr_pad_both(libri_cut): assert E(feats_nosnr) > E(feats_snr) +@pytest.mark.parametrize("mix_first", [True, False]) +def test_mix_cut_with_transform(libri_cut, mix_first): + # Create original mixed cut + padded = libri_cut.pad(duration=20, direction="right") + # Create transformed mixed cut + padded = padded.reverb_rir(mix_first=mix_first) + # Mix another cut + mixed1 = padded.mix(libri_cut) + mixed2 = libri_cut.mix(padded) + + assert isinstance(padded, MixedCut) + assert len(padded.tracks) == 2 + assert isinstance(mixed1, MixedCut) + assert isinstance(mixed2, MixedCut) + if mix_first: + assert len(mixed1.tracks) == 2 + assert len(mixed2.tracks) == 2 + else: + assert len(mixed1.tracks) == 3 + assert len(mixed2.tracks) == 3 + + def test_cut_set_mix_snr_is_deterministic(): cuts = DummyManifest(CutSet, begin_id=0, end_id=2) From 2e54646b5e1219a5ef001814f4305540cd0ff414 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 15 May 2023 20:25:43 -0400 Subject: [PATCH 05/16] enhancements and bug fixes --- lhotse/bin/modes/recipes/ami.py | 9 ++++++ lhotse/bin/modes/recipes/icsi.py | 9 +++--- lhotse/recipes/ami.py | 49 ++++++++++++++++++++++++++------ lhotse/recipes/icsi.py | 16 ++++++----- lhotse/recipes/utils.py | 1 + 5 files changed, 64 insertions(+), 20 deletions(-) diff --git a/lhotse/bin/modes/recipes/ami.py b/lhotse/bin/modes/recipes/ami.py index 992e20474..b129de572 100644 --- a/lhotse/bin/modes/recipes/ami.py +++ b/lhotse/bin/modes/recipes/ami.py @@ -53,6 +53,13 @@ " segmentation). If None, no segmentation is performed." 
), ) +@click.option( + "--merge-consecutive", + type=bool, + is_flag=True, + default=False, + help="Merge consecutive segments from the same speaker.", +) def ami( corpus_dir: Pathlike, output_dir: Pathlike, @@ -61,6 +68,7 @@ def ami( partition: str, normalize_text: bool, max_words_per_segment: int, + merge_consecutive: bool, ): """AMI data preparation.""" prepare_ami( @@ -71,6 +79,7 @@ def ami( partition=partition, normalize_text=normalize_text, max_words_per_segment=max_words_per_segment, + merge_consecutive=merge_consecutive, ) diff --git a/lhotse/bin/modes/recipes/icsi.py b/lhotse/bin/modes/recipes/icsi.py index 7d5a033bc..29c7f4edf 100644 --- a/lhotse/bin/modes/recipes/icsi.py +++ b/lhotse/bin/modes/recipes/icsi.py @@ -64,12 +64,13 @@ def icsi( ) @click.option( "--normalize-text", - is_flag=True, - help="If set, convert all text annotations to upper case (similar to Kaldi)", + type=click.Choice(["none", "upper", "kaldi"], case_sensitive=False), + default="kaldi", + help="Type of text normalization to apply (kaldi style, by default)", ) def icsi( audio_dir: Pathlike, - transcript_dir: Pathlike, + transcripts_dir: Pathlike, output_dir: Pathlike, mic: str, normalize_text: bool, @@ -77,7 +78,7 @@ def icsi( """AMI data preparation.""" prepare_icsi( audio_dir, - transcript_dir, + transcripts_dir, output_dir=output_dir, mic=mic, normalize_text=normalize_text, diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py index df6105dee..d5ff85361 100644 --- a/lhotse/recipes/ami.py +++ b/lhotse/recipes/ami.py @@ -276,7 +276,8 @@ class AmiSegmentAnnotation(NamedTuple): def parse_ami_annotations( annotations_dir: Pathlike, normalize: str = "upper", - max_words_per_segment: int = None, + max_words_per_segment: Optional[int] = None, + merge_consecutive: bool = False, ) -> Dict[str, List[SupervisionSegment]]: # Extract if zipped file @@ -355,7 +356,9 @@ def parse_ami_annotations( seg_words = list( filter(lambda w: w[0] >= seg_start and w[1] <= seg_end, spk_words) ) - subsegments = split_segment(seg_words, max_words_per_segment) + subsegments = split_segment( + seg_words, max_words_per_segment, merge_consecutive + ) for subseg in subsegments: start, end, text = subseg annotations[key].append( @@ -372,26 +375,49 @@ def parse_ami_annotations( def split_segment( - words: List[Tuple[float, float, str]], max_words_per_segment: Optional[int] + words: List[Tuple[float, float, str]], + max_words_per_segment: Optional[int] = None, + merge_consecutive: bool = False, ): def split_(sequence, sep): chunk = [] for val in sequence: if val[-1] == sep: - yield chunk + if len(chunk) > 0: + yield chunk chunk = [] else: chunk.append(val) - yield chunk + if len(chunk) > 0: + yield chunk def split_on_fullstop_(sequence): - return split_(sequence, ".") - - def split_on_comma_(segment, max_words_per_segment): + subsegs = list(split_(sequence, ".")) + if len(subsegs) < 2: + return subsegs + # Set a large default value for max_words_per_segment if not provided + max_segment_length = max_words_per_segment if max_words_per_segment else 100000 + if merge_consecutive: + # Merge consecutive subsegments if their length is less than max_words_per_segment + merged_subsegs = [subsegs[0]] + for subseg in subsegs[1:]: + if ( + merged_subsegs[-1][-1][1] == subseg[0][0] + and len(merged_subsegs[-1]) + len(subseg) <= max_segment_length + ): + merged_subsegs[-1].extend(subseg) + else: + merged_subsegs.append(subseg) + subsegs = merged_subsegs + return subsegs + + def split_on_comma_(segment): # This function smartly splits a segment on 
commas such that the number of words # in each subsegment is as close to max_words_per_segment as possible. # First we create subsegments by splitting on commas subsegs = list(split_(segment, ",")) + if len(subsegs) < 2: + return subsegs # Now we merge subsegments while ensuring that the number of words in each # subsegment is less than max_words_per_segment merged_subsegs = [subsegs[0]] @@ -409,7 +435,7 @@ def split_on_comma_(segment, max_words_per_segment): # Now we split each subsegment based on commas to get at most max_words_per_segment # words per subsegment. subsegments = [ - list(split_on_comma_(subseg, max_words_per_segment)) + list(split_on_comma_(subseg)) if len(subseg) > max_words_per_segment else [subseg] for subseg in subsegments @@ -614,6 +640,7 @@ def prepare_ami( partition: Optional[str] = "full-corpus", normalize_text: str = "kaldi", max_words_per_segment: Optional[int] = None, + merge_consecutive: bool = False, ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions @@ -625,6 +652,9 @@ def prepare_ami( :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text :param max_words_per_segment: int, maximum number of words per segment. If not None, we will split longer segments similar to Kaldi's data prep scripts, i.e., split on full-stop and comma. + :param merge_consecutive: bool, if True, merge consecutive segments split on full-stop. + We will only merge segments if the number of words in the merged segment is less than + max_words_per_segment. :return: a Dict whose key is ('train', 'dev', 'eval'), and the values are dicts of manifests under keys 'recordings' and 'supervisions'. @@ -662,6 +692,7 @@ def prepare_ami( annotations_dir, normalize=normalize_text, max_words_per_segment=max_words_per_segment, + merge_consecutive=merge_consecutive, ) # Audio diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py index 29734e84b..604d7852e 100644 --- a/lhotse/recipes/icsi.py +++ b/lhotse/recipes/icsi.py @@ -234,7 +234,6 @@ def download_icsi( def parse_icsi_annotations( transcripts_dir: Pathlike, normalize: str = "upper" ) -> Tuple[Dict[str, List[SupervisionSegment]], Dict[str, Dict[str, int]]]: - annotations = defaultdict(list) # In Lhotse, channels are integers, so we map channel ids to integers for each session channel_to_idx_map = defaultdict(dict) @@ -299,7 +298,6 @@ def prepare_audio_grouped( audio_paths: List[Pathlike], channel_to_idx_map: Dict[str, Dict[str, int]] = None, ) -> RecordingSet: - # Group together multiple channels from the same session. # We will use that to create a Recording with multiple sources (channels). from cytoolz import groupby @@ -474,7 +472,7 @@ def prepare_supervision_other( def prepare_icsi( audio_dir: Pathlike, - transcripts_dir: Pathlike, + transcripts_dir: Optional[Pathlike] = None, output_dir: Optional[Pathlike] = None, mic: Optional[str] = "ihm", normalize_text: str = "kaldi", @@ -490,7 +488,11 @@ def prepare_icsi( 'recordings' and 'supervisions'. 
""" audio_dir = Path(audio_dir) - transcripts_dir = Path(transcripts_dir) + transcripts_dir = ( + Path(transcripts_dir) + if transcripts_dir is not None + else audio_dir / "transcripts" + ) assert audio_dir.is_dir(), f"No such directory: {audio_dir}" assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}" @@ -539,6 +541,9 @@ def prepare_icsi( lambda x: x.recording_id in PARTITIONS[part] ) + audio_part, supervision_part = fix_manifests(audio_part, supervision_part) + validate_recordings_and_supervisions(audio_part, supervision_part) + # Write to output directory if a path is provided if output_dir is not None: audio_part.to_file(output_dir / f"icsi-{mic}_recordings_{part}.jsonl.gz") @@ -546,9 +551,6 @@ def prepare_icsi( output_dir / f"icsi-{mic}_supervisions_{part}.jsonl.gz" ) - audio_part, supervision_part = fix_manifests(audio_part, supervision_part) - validate_recordings_and_supervisions(audio_part, supervision_part) - # Combine all manifests into one dictionary manifests[part] = {"recordings": audio_part, "supervisions": supervision_part} diff --git a/lhotse/recipes/utils.py b/lhotse/recipes/utils.py index ab3d7b2d5..37fc00a99 100644 --- a/lhotse/recipes/utils.py +++ b/lhotse/recipes/utils.py @@ -146,6 +146,7 @@ def normalize_text_ami(text: str, normalize: str = "upper") -> str: text = re.sub(r"MM HMM", "MM-HMM", text) text = re.sub(r"UH HUH", "UH-HUH", text) text = re.sub(r"(\b)O K(\b)", r"\g<1>OK\g<2>", text) + text = re.sub(r"(\b)O_K(\b)", r"\g<1>OK\g<2>", text) return text From db37a752466c722997cfade9d5a5192f289ab7b0 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 15 May 2023 20:38:08 -0400 Subject: [PATCH 06/16] small changes in some cutset methods --- lhotse/cut/base.py | 8 +++++--- lhotse/cut/set.py | 9 ++++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py index 4df14a017..c9a989cf1 100644 --- a/lhotse/cut/base.py +++ b/lhotse/cut/base.py @@ -652,6 +652,7 @@ def trim_to_supervision_groups( supervision_group = [supervisions[0]] cur_end = supervisions[0].end new_cuts = [] + group_idx = 0 for sup in supervisions[1:]: if sup.start - cur_end <= max_pause: supervision_group.append(sup) @@ -666,8 +667,9 @@ def trim_to_supervision_groups( offset=offset, duration=duration, keep_excessive_supervisions=False, - ) + ).with_id(f"{self.id}-{max_pause}-{group_idx}") ) + group_idx += 1 supervision_group = [sup] cur_end = sup.end @@ -680,7 +682,7 @@ def trim_to_supervision_groups( offset=offset, duration=duration, keep_excessive_supervisions=False, - ) + ).with_id(f"{self.id}-{max_pause}-{group_idx}") ) # The total number of supervisions should be the same. 
assert sum(len(c.supervisions) for c in new_cuts) == len(self.supervisions), ( @@ -724,7 +726,7 @@ def cut_into_windows( offset=hop * i, duration=duration, keep_excessive_supervisions=keep_excessive_supervisions, - ) + ).with_id(f"{self.id}-{i}") ) return CutSet.from_cuts(new_cuts) diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py index 8b474e0fa..81851794c 100644 --- a/lhotse/cut/set.py +++ b/lhotse/cut/set.py @@ -590,6 +590,7 @@ def to_shar( warn_unused_fields=warn_unused_fields, include_cuts=include_cuts, shard_suffix=None, + verbose=verbose, ) progbar = partial(tqdm, desc="Shard progress") if verbose else lambda x: x @@ -610,6 +611,7 @@ def to_shar( warn_unused_fields=warn_unused_fields, include_cuts=True, shard_suffix=f".{idx:06d}", + verbose=False, ) ) for f in progbar(as_completed(futures)): @@ -2706,7 +2708,7 @@ def mix( elif isinstance(mixed_in_cut, (DataCut, PaddingCut)): new_tracks = [MixTrack(cut=mixed_in_cut, offset=offset, snr=snr)] else: - raise ValueError(f"Unsupported type of cut in mix(): {type(reference_cut)}") + raise ValueError(f"Unsupported type of cut in mix(): {type(mixed_in_cut)}") return MixedCut(id=mixed_cut_id, tracks=old_tracks + new_tracks) @@ -3386,9 +3388,12 @@ def _export_to_shar_single( warn_unused_fields: bool, include_cuts: bool, shard_suffix: Optional[str], + verbose: bool, ) -> Dict[str, List[str]]: from lhotse.shar import SharWriter + pbar = tqdm(desc="Exporting to SHAR", disable=not verbose) + with SharWriter( output_dir=output_dir, fields=fields, @@ -3399,5 +3404,7 @@ def _export_to_shar_single( ) as writer: for cut in cuts: writer.write(cut) + pbar.update() + # Finally, return the list of output files. return writer.output_paths From 7b59ecdfb25118dbc9e5841bf414d8ab631d27d0 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 15 May 2023 20:42:17 -0400 Subject: [PATCH 07/16] small fix in error message --- lhotse/cut/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py index c9a989cf1..8c2519687 100644 --- a/lhotse/cut/base.py +++ b/lhotse/cut/base.py @@ -490,7 +490,7 @@ def trim_to_supervisions( len(set(to_hashable(s.channel) for s in trimmed.supervisions)) == 1 ), ( "Trimmed cut has supervisions with different channels. Either set " - "`ignore_channel=True` to keep original channels or `keep_overlapping=False` " + "`keep_all_channels=True` to keep original channels or `keep_overlapping=False` " "to retain only 1 supervision per trimmed cut." 
) trimmed.channel = trimmed.supervisions[0].channel From a64727a2b5aad58e908cb5028813a09afe8e79a4 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Tue, 16 May 2023 20:26:32 -0400 Subject: [PATCH 08/16] return word alignments from ami recipe --- lhotse/recipes/ami.py | 47 +++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py index d5ff85361..73b47cc54 100644 --- a/lhotse/recipes/ami.py +++ b/lhotse/recipes/ami.py @@ -38,8 +38,8 @@ from lhotse.audio import AudioSource, Recording, RecordingSet from lhotse.qa import fix_manifests from lhotse.recipes.utils import normalize_text_ami -from lhotse.supervision import SupervisionSegment, SupervisionSet -from lhotse.utils import Pathlike, Seconds, resumable_download +from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet +from lhotse.utils import Pathlike, Seconds, add_durations, resumable_download # fmt: off MEETINGS = { @@ -271,6 +271,7 @@ class AmiSegmentAnnotation(NamedTuple): gender: str start_time: Seconds end_time: Seconds + words: List[AlignmentItem] def parse_ami_annotations( @@ -279,7 +280,6 @@ def parse_ami_annotations( max_words_per_segment: Optional[int] = None, merge_consecutive: bool = False, ) -> Dict[str, List[SupervisionSegment]]: - # Extract if zipped file if str(annotations_dir).endswith(".zip"): import zipfile @@ -360,14 +360,27 @@ def parse_ami_annotations( seg_words, max_words_per_segment, merge_consecutive ) for subseg in subsegments: - start, end, text = subseg + start = subseg[0][0] + end = subseg[-1][1] + word_alignments = [ + AlignmentItem( + start=round(w[0], ndigits=4), + duration=add_durations(w[1], -w[0], sampling_rate=16000), + symbol=normalize_text_ami(w[2], normalize=normalize), + ) + for w in subseg + ] + # Filter out empty words + word_alignments = [w for w in word_alignments if w.symbol] + text = " ".join(w.symbol for w in word_alignments) annotations[key].append( AmiSegmentAnnotation( - text=normalize_text_ami(text, normalize=normalize), + text=text, speaker=key[1], gender=key[1][0], start_time=start, end_time=end, + words=word_alignments, ) ) @@ -378,7 +391,14 @@ def split_segment( words: List[Tuple[float, float, str]], max_words_per_segment: Optional[int] = None, merge_consecutive: bool = False, -): +) -> List[List[Tuple[float, float, str]]]: + """ + Given a list of words, return a list of segments (each segment is a list of words) + where each segment has at most max_words_per_segment words. If merge_consecutive + is True, then consecutive segments with less than max_words_per_segment words + will be merged together. 
+ """ + def split_(sequence, sep): chunk = [] for val in sequence: @@ -443,11 +463,8 @@ def split_on_comma_(segment): # flatten the list of lists subsegments = [item for sublist in subsegments for item in sublist] - # For each subsegment, we create a tuple of (start_time, end_time, text) - subsegments = [ - (subseg[0][0], subseg[-1][1], " ".join([w[2] for w in subseg])) - for subseg in filter(lambda s: len(s) > 0, subsegments) - ] + # Filter out empty subsegments + subsegments = list(filter(lambda s: len(s) > 0, subsegments)) return subsegments @@ -563,7 +580,9 @@ def prepare_supervision_ihm( continue for seg_idx, seg_info in enumerate(annotation): - duration = seg_info.end_time - seg_info.start_time + duration = add_durations( + seg_info.end_time, -seg_info.start_time, sampling_rate=16000 + ) # Some annotations in IHM setting exceed audio duration, so we # ignore such segments if seg_info.end_time > recording.duration: @@ -577,13 +596,14 @@ def prepare_supervision_ihm( SupervisionSegment( id=f"{recording.id}-{channel}-{seg_idx}", recording_id=recording.id, - start=seg_info.start_time, + start=round(seg_info.start_time, ndigits=4), duration=duration, channel=channel, language="English", speaker=seg_info.speaker, gender=seg_info.gender, text=seg_info.text, + alignment={"word": seg_info.words}, ) ) @@ -627,6 +647,7 @@ def prepare_supervision_other( speaker=seg_info.speaker, gender=seg_info.gender, text=seg_info.text, + alignment={"word": seg_info.words}, ) ) return SupervisionSet.from_segments(segments) From 850ce2c5fd9b511b95137dc7f1f59a4577828abb Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 18 May 2023 09:23:46 -0400 Subject: [PATCH 09/16] add word alignments for ICSI --- lhotse/recipes/icsi.py | 204 ++++++++++++++++++++++++++++------------- 1 file changed, 142 insertions(+), 62 deletions(-) diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py index 604d7852e..757461ac5 100644 --- a/lhotse/recipes/icsi.py +++ b/lhotse/recipes/icsi.py @@ -106,8 +106,8 @@ from lhotse.audio import AudioSource, Recording, RecordingSet, read_sph from lhotse.qa import fix_manifests from lhotse.recipes.utils import normalize_text_ami -from lhotse.supervision import SupervisionSegment, SupervisionSet -from lhotse.utils import Pathlike, Seconds, resumable_download +from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet +from lhotse.utils import Pathlike, Seconds, add_durations, resumable_download # fmt:off PARTITIONS = { @@ -135,15 +135,6 @@ # fmt:on -class IcsiSegmentAnnotation(NamedTuple): - text: str - speaker: str - channel: str - gender: str - start_time: Seconds - end_time: Seconds - - def download_audio( target_dir: Path, force_download: Optional[bool] = False, @@ -205,22 +196,30 @@ def download_icsi( download_audio(audio_dir, force_download, url, mic) # Annotations - logging.info("Downloading AMI annotations") + logging.info("Downloading ICSI annotations") if transcripts_dir.exists() and not force_download: logging.info( f"Skip downloading transcripts as they exist in: {transcripts_dir}" ) return target_dir - annotations_url = f"{url}/ICSICorpusAnnotations/ICSI_original_transcripts.zip" + + # We need the MRT transcripts for the speaker-to-channel mapping. 
The NXT transcripts
+    # are used for the actual annotations (since they contain word alignments)
+    annotations_url_mrt = f"{url}/ICSICorpusAnnotations/ICSI_original_transcripts.zip"
+    annotations_url_nxt = f"{url}/ICSICorpusAnnotations/ICSI_core_NXT.zip"
     resumable_download(
-        annotations_url,
+        annotations_url_mrt,
         filename=target_dir / "ICSI_original_transcripts.zip",
         force_download=force_download,
     )
+    resumable_download(
+        annotations_url_nxt,
+        filename=target_dir / "ICSI_core_NXT.zip",
+        force_download=force_download,
+    )
 
-    # Unzip annotations zip file
-    with zipfile.ZipFile(target_dir / "ICSI_original_transcripts.zip") as z:
+    with zipfile.ZipFile(target_dir / "ICSI_core_NXT.zip") as z:
         # Unzips transcripts to <target_dir>/'transcripts'
         # zip file also contains some documentation which will be unzipped to <target_dir>
         z.extractall(target_dir)
@@ -228,9 +227,22 @@ def download_icsi(
         if transcripts_dir:
             Path(target_dir / "transcripts").rename(transcripts_dir)
 
+    # From the MRT transcripts, we only need the transcripts/preambles.mrt file
+    with zipfile.ZipFile(target_dir / "ICSI_original_transcripts.zip") as z:
+        z.extract("transcripts/preambles.mrt", transcripts_dir)
+
     return target_dir
 
 
+class IcsiSegmentAnnotation(NamedTuple):
+    text: str
+    speaker: str
+    gender: str
+    start_time: Seconds
+    end_time: Seconds
+    words: List[AlignmentItem]
+
+
 def parse_icsi_annotations(
     transcripts_dir: Pathlike, normalize: str = "upper"
 ) -> Tuple[Dict[str, List[SupervisionSegment]], Dict[str, Dict[str, int]]]:
@@ -240,53 +252,119 @@ def parse_icsi_annotations(
 
     spk_to_channel_map = defaultdict(dict)
 
     # First we get global speaker ids and channels
-    for meeting_file in tqdm(
-        transcripts_dir.rglob("./*.mrt"), desc="Parsing ICSI mrt files"
-    ):
-        if meeting_file.stem == "preambles":
+    with open(transcripts_dir / "preambles.mrt") as f:
+        root = ET.parse(f).getroot()  # <Meetings>
+        for child in root:
+            if child.tag == "Meeting":
+                meeting_id = child.attrib["Session"]
+                for grandchild in child:
+                    if grandchild.tag == "Preamble":
+                        for greatgrandchild in grandchild:
+                            if greatgrandchild.tag == "Channels":
+                                channel_to_idx_map[meeting_id] = {
+                                    channel.attrib["Name"]: idx
+                                    for idx, channel in enumerate(greatgrandchild)
+                                }
+                            elif greatgrandchild.tag == "Participants":
+                                for speaker in greatgrandchild:
+                                    # some speakers may not have an associated channel in some meetings, so we
+                                    # assign them the SDM channel
+                                    spk_to_channel_map[meeting_id][
+                                        speaker.attrib["Name"]
+                                    ] = (
+                                        speaker.attrib["Channel"]
+                                        if "Channel" in speaker.attrib
+                                        else "chan6"
+                                    )
+
+    # Get the speaker segment times from the segments file
+    segments = {}
+    for file in (transcripts_dir / "Segments").glob("*.xml"):
+        meet_id, local_id, _ = file.stem.split(".")
+        spk_segments = []
+        spk_id = None
+        with open(file) as f:
+            tree = ET.parse(f)
+            for seg in tree.getroot():
+                if seg.tag != "segment":
+                    continue
+                if spk_id is None and "participant" in seg.attrib:
+                    spk_id = seg.attrib["participant"]
+                start_time = float(seg.attrib["starttime"])
+                end_time = float(seg.attrib["endtime"])
+                spk_segments.append((start_time, end_time))
+        if spk_id is None or len(spk_segments) == 0:
             continue
-        with open(meeting_file) as f:
-            meeting_id = meeting_file.stem
-            root = ET.parse(f).getroot()  # <Meeting>
-            for child in root:
-                if child.tag == "Preamble":
-                    for grandchild in child:
-                        if grandchild.tag == "Channels":
-                            channel_to_idx_map[meeting_id] = {
-                                channel.attrib["Name"]: idx
-                                for idx, channel in enumerate(grandchild)
-                            }
-                        elif grandchild.tag == "Participants":
-                            for speaker in grandchild:
-                                # some speakers may not have an associated channel in some meetings, so we
-                                # assign them the SDM channel
-                                spk_to_channel_map[meeting_id][
-                                    speaker.attrib["Name"]
-                                ] = (
-                                    speaker.attrib["Channel"]
-                                    if "Channel" in speaker.attrib
-                                    else "chan6"
-                                )
-                elif child.tag == "Transcript":
-                    for segment in child:
-                        if len(list(segment)) == 0 and "Participant" in segment.attrib:
-                            start_time = float(segment.attrib["StartTime"])
-                            end_time = float(segment.attrib["EndTime"])
-                            speaker = segment.attrib["Participant"]
-                            channel = spk_to_channel_map[meeting_id][speaker]
-                            text = normalize_text_ami(
-                                segment.text.strip(), normalize=normalize
-                            )
-                            annotations[(meeting_id, speaker, channel)].append(
-                                IcsiSegmentAnnotation(
-                                    text,
-                                    speaker,
-                                    channel,
-                                    speaker[0],
-                                    start_time,
-                                    end_time,
-                                )
-                            )
+        key = (meet_id, local_id)
+        channel = spk_to_channel_map[meet_id][spk_id]
+        segments[key] = (spk_id, channel, spk_segments)
+
+    # Now we go through each speaker's word-level annotations and store them
+    words = {}
+    for file in (transcripts_dir / "Words").glob("*.xml"):
+        meet_id, local_id, _ = file.stem.split(".")
+        key = (meet_id, local_id)
+        if key not in segments:
+            continue
+        else:
+            spk_id, channel, spk_segments = segments[key]
+
+        seg_words = []
+        combine_with_next = False
+        with open(file) as f:
+            tree = ET.parse(f)
+            for i, word in enumerate(tree.getroot()):
+                if (
+                    word.tag != "w"
+                    or "starttime" not in word.attrib
+                    or word.attrib["starttime"] == ""
+                    or "endtime" not in word.attrib
+                    or word.attrib["endtime"] == ""
+                ):
+                    continue
+                start_time = float(word.attrib["starttime"])
+                end_time = float(word.attrib["endtime"])
+                seg_words.append((start_time, end_time, word.text))
+        words[key] = (spk_id, channel, seg_words)
+
+    # Now we create segment-level annotations by combining the word-level
+    # annotations with the speaker segment times. We also normalize the text
+    # (if requested).
+
+    annotations = defaultdict(list)
+
+    for key, (spk_id, channel, spk_segments) in segments.items():
+        # Get the words for this speaker
+        _, _, spk_words = words[key]
+        # Now iterate over the speaker segments and create segment annotations
+        for seg_start, seg_end in spk_segments:
+            seg_words = list(
+                filter(lambda w: w[0] >= seg_start and w[1] <= seg_end, spk_words)
+            )
+            if len(seg_words) == 0:
+                continue
+            start = seg_words[0][0]
+            end = seg_words[-1][1]
+            word_alignments = [
+                AlignmentItem(
+                    start=round(w[0], ndigits=4),
+                    duration=add_durations(w[1], -w[0], sampling_rate=16000),
+                    symbol=normalize_text_ami(w[2], normalize=normalize),
+                )
+                for w in seg_words
+            ]
+            # Filter out empty words
+            word_alignments = [w for w in word_alignments if len(w.symbol) > 0]
+            text = " ".join(w.symbol for w in word_alignments)
+            annotations[key].append(
+                IcsiSegmentAnnotation(
+                    text=text,
+                    speaker=spk_id,
+                    gender=spk_id[0],
+                    start_time=start,
+                    end_time=end,
+                    words=word_alignments,
+                )
+            )
 
     return annotations, channel_to_idx_map
 
@@ -422,6 +500,7 @@ def prepare_supervision_ihm(
             speaker=seg_info.speaker,
             gender=seg_info.gender,
             text=seg_info.text,
+            alignment={"word": seg_info.words},
         )
     )
 
@@ -465,6 +544,7 @@ def prepare_supervision_other(
             speaker=seg_info.speaker,
             gender=seg_info.gender,
             text=seg_info.text,
+            alignment={"word": seg_info.words},
         )
     )
     return SupervisionSet.from_segments(segments)

From 4b39c6fe9b27188b0f97c980eef7b6e3d0dc97b0 Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Thu, 18 May 2023 09:26:05 -0400
Subject: [PATCH 10/16] remove unwanted whitespace

---
 lhotse/recipes/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lhotse/recipes/utils.py b/lhotse/recipes/utils.py
index 37fc00a99..b63ad0c66 100644
--- a/lhotse/recipes/utils.py
+++ b/lhotse/recipes/utils.py
@@ -147,7 +147,7 @@ def normalize_text_ami(text: str, normalize: str = "upper") -> str:
     text = re.sub(r"UH HUH", "UH-HUH", text)
     text = re.sub(r"(\b)O K(\b)", r"\g<1>OK\g<2>", text)
     text = re.sub(r"(\b)O_K(\b)", r"\g<1>OK\g<2>", text)
-    return text
+    return text.strip()

From 3c16b906d7d7ff7a22541e04f1f118509005d544 Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Thu, 18 May 2023 10:22:09 -0400
Subject: [PATCH 11/16] fix IHM preparation

---
 lhotse/recipes/icsi.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py
index 757461ac5..6f42896e6 100644
--- a/lhotse/recipes/icsi.py
+++ b/lhotse/recipes/icsi.py
@@ -245,7 +245,9 @@ class IcsiSegmentAnnotation(NamedTuple):
 
 def parse_icsi_annotations(
     transcripts_dir: Pathlike, normalize: str = "upper"
-) -> Tuple[Dict[str, List[SupervisionSegment]], Dict[str, Dict[str, int]]]:
+) -> Tuple[
+    Dict[Tuple[str, str, str], List[SupervisionSegment]], Dict[str, Dict[str, int]]
+]:
     annotations = defaultdict(list)
     # In Lhotse, channels are integers, so we map channel ids to integers for each session
     channel_to_idx_map = defaultdict(dict)
@@ -329,12 +331,13 @@ def parse_icsi_annotations(
 
     # Now we create segment-level annotations by combining the word-level
     # annotations with the speaker segment times. We also normalize the text
-    # (if requested).
+    # (if requested). The annotations are stored in a dict indexed by (meeting_id, spk_id, channel).
annotations = defaultdict(list) for key, (spk_id, channel, spk_segments) in segments.items(): # Get the words for this speaker _, _, spk_words = words[key] + new_key = (key[0], spk_id, channel) # Now iterate over the speaker segments and create segment annotations for seg_start, seg_end in spk_segments: seg_words = list( @@ -355,7 +358,7 @@ def parse_icsi_annotations( # Filter out empty words word_alignments = [w for w in word_alignments if len(w.symbol) > 0] text = " ".join(w.symbol for w in word_alignments) - annotations[key].append( + annotations[new_key].append( IcsiSegmentAnnotation( text=text, speaker=spk_id, From 9921575d85b6eaa5f58a2b98c69fd4bc81e7afef Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 18 May 2023 11:36:32 -0400 Subject: [PATCH 12/16] remove words with zero or negative duration --- lhotse/recipes/ami.py | 11 +++++++++-- lhotse/recipes/icsi.py | 8 ++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py index 73b47cc54..f668cb0b5 100644 --- a/lhotse/recipes/ami.py +++ b/lhotse/recipes/ami.py @@ -370,8 +370,15 @@ def parse_ami_annotations( ) for w in subseg ] - # Filter out empty words - word_alignments = [w for w in word_alignments if w.symbol] + word_alignments = [w for w in word_alignments if len(w.symbol) > 0] + if any(w.duration <= 0 for w in word_alignments): + logging.warning( + f"Segment {key[0]}.{key[1]}.{key[2]} at time {start}-{end} " + f"has a word with zero or negative duration." + ) + word_alignments = [ + w for w in word_alignments if w.duration > 0 + ] # type: ignore text = " ".join(w.symbol for w in word_alignments) annotations[key].append( AmiSegmentAnnotation( diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py index 6f42896e6..18699b0dc 100644 --- a/lhotse/recipes/icsi.py +++ b/lhotse/recipes/icsi.py @@ -357,6 +357,14 @@ def parse_icsi_annotations( ] # Filter out empty words word_alignments = [w for w in word_alignments if len(w.symbol) > 0] + if any(w.duration <= 0 for w in word_alignments): + logging.warning( + f"Segment {key[0]}.{spk_id}.{channel} at time {start}-{end} " + f"has a word with zero or negative duration." + ) + word_alignments = [ + w for w in word_alignments if w.duration > 0 + ] # type: ignore text = " ".join(w.symbol for w in word_alignments) annotations[new_key].append( IcsiSegmentAnnotation( From dba413f3bf844acb58f5bdab349334094d32de5b Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 18 May 2023 13:14:14 -0400 Subject: [PATCH 13/16] ensure word alignments respect segment boundary --- lhotse/recipes/ami.py | 32 ++++++++++++++++---------------- lhotse/recipes/icsi.py | 33 ++++++++++++++++----------------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py index f668cb0b5..c71db9e4a 100644 --- a/lhotse/recipes/ami.py +++ b/lhotse/recipes/ami.py @@ -362,23 +362,23 @@ def parse_ami_annotations( for subseg in subsegments: start = subseg[0][0] end = subseg[-1][1] - word_alignments = [ - AlignmentItem( - start=round(w[0], ndigits=4), - duration=add_durations(w[1], -w[0], sampling_rate=16000), - symbol=normalize_text_ami(w[2], normalize=normalize), - ) - for w in subseg - ] - word_alignments = [w for w in word_alignments if len(w.symbol) > 0] - if any(w.duration <= 0 for w in word_alignments): - logging.warning( - f"Segment {key[0]}.{key[1]}.{key[2]} at time {start}-{end} " - f"has a word with zero or negative duration." 
+ word_alignments = [] + for w in subseg: + w_start = max(start, round(w[0], ndigits=4)) + w_end = min(end, round(w[1], ndigits=4)) + w_dur = add_durations(w_end, -w_start, sampling_rate=16000) + w_symbol = normalize_text_ami(w[2], normalize=normalize) + if len(w_symbol) == 0: + continue + if w_dur <= 0: + logging.warning( + f"Segment {key[0]}.{key[1]}.{key[2]} at time {start}-{end} " + f"has a word with zero or negative duration. Skipping." + ) + continue + word_alignments.append( + AlignmentItem(start=w_start, duration=w_dur, symbol=w_symbol) ) - word_alignments = [ - w for w in word_alignments if w.duration > 0 - ] # type: ignore text = " ".join(w.symbol for w in word_alignments) annotations[key].append( AmiSegmentAnnotation( diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py index 18699b0dc..72b92a407 100644 --- a/lhotse/recipes/icsi.py +++ b/lhotse/recipes/icsi.py @@ -347,24 +347,23 @@ def parse_icsi_annotations( continue start = seg_words[0][0] end = seg_words[-1][1] - word_alignments = [ - AlignmentItem( - start=round(w[0], ndigits=4), - duration=add_durations(w[1], -w[0], sampling_rate=16000), - symbol=normalize_text_ami(w[2], normalize=normalize), - ) - for w in seg_words - ] - # Filter out empty words - word_alignments = [w for w in word_alignments if len(w.symbol) > 0] - if any(w.duration <= 0 for w in word_alignments): - logging.warning( - f"Segment {key[0]}.{spk_id}.{channel} at time {start}-{end} " - f"has a word with zero or negative duration." + word_alignments = [] + for w in seg_words: + w_start = max(start, round(w[0], ndigits=4)) + w_end = min(end, round(w[1], ndigits=4)) + w_dur = add_durations(w_end, -w_start, sampling_rate=16000) + w_symbol = normalize_text_ami(w[2], normalize=normalize) + if len(w_symbol) == 0: + continue + if w_dur <= 0: + logging.warning( + f"Segment {key[0]}.{spk_id}.{channel} at time {start}-{end} " + f"has a word with zero or negative duration. Skipping." 
+ ) + continue + word_alignments.append( + AlignmentItem(start=w_start, duration=w_dur, symbol=w_symbol) ) - word_alignments = [ - w for w in word_alignments if w.duration > 0 - ] # type: ignore text = " ".join(w.symbol for w in word_alignments) annotations[new_key].append( IcsiSegmentAnnotation( From 12be4242c7b28233dfcd9a569b52c3c2104015ab Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 22 May 2023 10:16:42 -0400 Subject: [PATCH 14/16] add save-to-wav option for icsi --- lhotse/bin/modes/recipes/icsi.py | 10 +++++++++- lhotse/recipes/icsi.py | 27 ++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/lhotse/bin/modes/recipes/icsi.py b/lhotse/bin/modes/recipes/icsi.py index 29c7f4edf..78a18473c 100644 --- a/lhotse/bin/modes/recipes/icsi.py +++ b/lhotse/bin/modes/recipes/icsi.py @@ -68,18 +68,26 @@ def icsi( default="kaldi", help="Type of text normalization to apply (kaldi style, by default)", ) +@click.option( + "--save-to-wav", + is_flag=True, + default=False, + help="If True and `mic` is sdm/ihm/mdm, save the recordings as WAV for faster processing.", +) def icsi( audio_dir: Pathlike, transcripts_dir: Pathlike, output_dir: Pathlike, mic: str, normalize_text: bool, + save_to_wav: bool, ): - """AMI data preparation.""" + """ICSI data preparation.""" prepare_icsi( audio_dir, transcripts_dir, output_dir=output_dir, mic=mic, normalize_text=normalize_text, + save_to_wav=save_to_wav, ) diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py index 72b92a407..124064009 100644 --- a/lhotse/recipes/icsi.py +++ b/lhotse/recipes/icsi.py @@ -100,6 +100,7 @@ from pathlib import Path from typing import Dict, List, NamedTuple, Optional, Tuple, Union +import soundfile as sf from tqdm.auto import tqdm from lhotse import validate_recordings_and_supervisions @@ -385,6 +386,8 @@ def parse_icsi_annotations( def prepare_audio_grouped( audio_paths: List[Pathlike], channel_to_idx_map: Dict[str, Dict[str, int]] = None, + save_to_wav: bool = False, + output_dir: Pathlike = None, ) -> RecordingSet: # Group together multiple channels from the same session. # We will use that to create a Recording with multiple sources (channels). @@ -404,6 +407,16 @@ def prepare_audio_grouped( } audio_sf, samplerate = read_sph(channel_paths[0]) + if save_to_wav: + session_dir = Path(output_dir) / "wavs" / session_name + session_dir.mkdir(parents=True, exist_ok=True) + for i, audio_path in enumerate(channel_paths): + audio, _ = read_sph(audio_path) + wav_path = session_dir / f"{audio_path.stem}.wav" + sf.write(wav_path, audio.T, samplerate) + # Replace the sph path with the wav path + channel_paths[i] = wav_path + recordings.append( Recording( id=session_name, @@ -436,7 +449,7 @@ def prepare_audio_single( for audio_path in tqdm(audio_paths, desc="Preparing audio"): session_name = audio_path.parts[-2] if audio_path.suffix == ".wav": - audio_sf = sf.SoundFile(str(audio_path)) + audio_sf = sf.SoundFile(audio_path) num_frames = audio_sf.frames num_channels = audio_sf.channels samplerate = audio_sf.samplerate @@ -566,6 +579,7 @@ def prepare_icsi( output_dir: Optional[Pathlike] = None, mic: Optional[str] = "ihm", normalize_text: str = "kaldi", + save_to_wav: bool = False, ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions @@ -574,6 +588,7 @@ def prepare_icsi( :param output_dir: Pathlike, the path where to write the manifests - `None` means manifests aren't stored on disk. 
:param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use. :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text + :param save_to_wav: bool, whether to save the sph audio to wav format :return: a Dict whose key is ('train', 'dev', 'test'), and the values are dicts of manifests under keys 'recordings' and 'supervisions'. """ @@ -588,6 +603,9 @@ def prepare_icsi( assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}" assert mic in MIC_TO_CHANNELS.keys(), f"Mic {mic} not supported" + if save_to_wav: + assert output_dir is not None, "output_dir must be specified when saving to wav" + if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -604,7 +622,10 @@ def prepare_icsi( if mic == "ihm" or mic == "mdm": audio_paths = audio_dir.rglob(f"chan[{channels}].sph") audio = prepare_audio_grouped( - list(audio_paths), channel_to_idx_map if mic == "ihm" else None + list(audio_paths), + channel_to_idx_map if mic == "ihm" else None, + save_to_wav, + output_dir, ) elif mic == "sdm" or mic == "ihm-mix": audio_paths = ( @@ -612,7 +633,7 @@ def prepare_icsi( if len(channels) else audio_dir.rglob("*.wav") ) - audio = prepare_audio_single(list(audio_paths)) + audio = prepare_audio_single(list(audio_paths), save_to_wav, output_dir) # Supervisions logging.info("Preparing supervision manifests") From c4b957df6387d7c13ba75a73998b0d54709f49a9 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 22 May 2023 10:37:07 -0400 Subject: [PATCH 15/16] add test for mixing cut with recording --- lhotse/cut/set.py | 2 +- test/cut/test_cut_mixing.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py index 81851794c..ad5b3e1a2 100644 --- a/lhotse/cut/set.py +++ b/lhotse/cut/set.py @@ -2610,7 +2610,7 @@ def mix( ) snr = None - if reference_cut.num_features is not None: + if reference_cut.num_features is not None and mixed_in_cut.num_features is not None: assert ( reference_cut.num_features == mixed_in_cut.num_features ), "Cannot mix cuts with different feature dimensions." diff --git a/test/cut/test_cut_mixing.py b/test/cut/test_cut_mixing.py index 8cac69d48..2f0b06ab1 100644 --- a/test/cut/test_cut_mixing.py +++ b/test/cut/test_cut_mixing.py @@ -324,6 +324,12 @@ def test_mix_cut_snr(libri_cut): assert E(feats) > E(feats_snr) +def test_mix_cut_with_other_raises_error(libri_cut): + libri_cut = libri_cut.drop_features() + with pytest.raises(ValueError): + _ = libri_cut.mix(libri_cut.recording) + + def test_mix_cut_snr_truncate_snr_reference(libri_cut): mixed = libri_cut.pad(duration=20).mix(libri_cut, offset_other_by=10) mixed_snr = libri_cut.pad(duration=20).mix(libri_cut, offset_other_by=10, snr=10) From a253cf419a538939658a1747c612902eb054ad27 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 22 May 2023 14:15:41 -0400 Subject: [PATCH 16/16] add read_sph_torchaudio --- lhotse/audio.py | 39 ++++++++++++++++++++++++++++--- test/cut/test_cut_augmentation.py | 2 +- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/lhotse/audio.py b/lhotse/audio.py index f6267850c..46045ca6a 100644 --- a/lhotse/audio.py +++ b/lhotse/audio.py @@ -1767,7 +1767,6 @@ def info( force_opus_sampling_rate: Optional[int] = None, force_read_audio: bool = False, ) -> LibsndfileCompatibleAudioInfo: - if force_read_audio: # This is a reliable fallback for situations when the user knows that audio files do not # have duration metadata in their headers. 
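The version check fixed in the next hunk trips over PEP 440 local version labels: CUDA builds of torchaudio report versions like "2.0.0+cu117", and the "+cu117" local segment makes a plain equality comparison fail, so the dispatcher check never matched on such builds. A minimal sketch of the pitfall and the `base_version` workaround, assuming the `packaging` library is installed (it is already imported by this code):

    from packaging import version

    v = version.parse("2.0.0+cu117")
    # Local version labels participate in equality comparisons per PEP 440:
    print(v == version.parse("2.0.0"))  # False
    # base_version drops the local label, restoring the intended comparison:
    print(version.parse(v.base_version) == version.parse("2.0.0"))  # True
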
@@ -1824,7 +1823,8 @@ def torchaudio_2_0_ffmpeg_enabled() -> bool: import torchaudio from packaging import version - ver = version.parse(torchaudio.__version__) + # Handle cases like '2.0.0+cu117' + ver = version.parse(version.parse(torchaudio.__version__).base_version) if ver == version.parse("2.0.0"): return os.environ.get("TORCHAUDIO_USE_BACKEND_DISPATCHER", "0") == "1" if ver >= version.parse("2.1.0"): @@ -2374,6 +2374,40 @@ def sph_info(path: Pathlike) -> LibsndfileCompatibleAudioInfo: def read_sph( sph_path: Pathlike, offset: Seconds = 0.0, duration: Optional[Seconds] = None +) -> Tuple[np.ndarray, int]: + """ + Reads SPH files either using torchaudio or using sph2pipe in a shell subprocess. + + :return: a tuple of audio samples and the sampling rate. + """ + try: + return read_sph_torchaudio(sph_path=sph_path, offset=offset, duration=duration) + except: + return read_sph_sph2pipe(sph_path=sph_path, offset=offset, duration=duration) + + +def read_sph_torchaudio( + sph_path: Pathlike, offset: Seconds = 0.0, duration: Optional[Seconds] = None +) -> Tuple[np.ndarray, int]: + """ + Reads SPH files using torchaudio. + + :return: a tuple of audio samples and the sampling rate. + """ + # Actual audio reading. + sph_path = str(sph_path) + try: + samples, sampling_rate = torchaudio_2_ffmpeg_load(sph_path, offset, duration) + except RuntimeError as e: + raise AudioLoadingError( + f"{e}\nThe torchaudio command for which the program failed is: " + f"torchaudio.load({sph_path}, frame_offset={int(offset * 100)}, num_frames={int(duration * 100)})" + ) + return samples, sampling_rate + + +def read_sph_sph2pipe( + sph_path: Pathlike, offset: Seconds = 0.0, duration: Optional[Seconds] = None ) -> Tuple[np.ndarray, int]: """ Reads SPH files using sph2pipe in a shell subprocess. @@ -2381,7 +2415,6 @@ def read_sph( :return: a tuple of audio samples and the sampling rate. """ - sph_path = Path(sph_path) # Construct the sph2pipe command depending on the arguments passed. diff --git a/test/cut/test_cut_augmentation.py b/test/cut/test_cut_augmentation.py index 7e9d04a7b..968c6e12d 100644 --- a/test/cut/test_cut_augmentation.py +++ b/test/cut/test_cut_augmentation.py @@ -420,7 +420,7 @@ def test_mixed_cut_start01_reverb_rir_with_fast_random( ): mixed_rvb = cut_with_supervision_start01.append( cut_with_supervision_start01 - ).reverb_rir() + ).reverb_rir(mix_first=False) assert mixed_rvb.start == 0 # MixedCut always starts at 0 assert mixed_rvb.duration == cut_with_supervision_start01.duration * 2 assert mixed_rvb.end == cut_with_supervision_start01.duration * 2
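
Taken together, the series makes ``MixedCut`` carry a list of lazy transforms that are applied after mixing inside ``load_audio``. A usage sketch of the new options (the manifest path and choice of cut are illustrative; on-the-fly RIR generation and loudness normalization assume the optional fast RIR generator and pyloudnorm dependencies are available):

    from lhotse import CutSet

    cuts = CutSet.from_file("cuts.jsonl.gz")  # hypothetical manifest path
    cut = next(iter(cuts))  # any cut with a recording

    # mix_first=True (the default) stores a single RIR transform on the MixedCut,
    # i.e. the same room response is applied to the mixed waveform:
    mixed = cut.append(cut).reverb_rir(mix_first=True)

    # mix_first=False reverberates each track separately instead, which is the
    # right choice when simulating distinct speakers in one room:
    mixed_multi = cut.append(cut).reverb_rir(mix_first=False)

    # Post-mix loudness normalization is stored as another lazy transform:
    mixed = mixed.normalize_loudness(target=-23.0, mix_first=True)

    audio = mixed.load_audio()  # transforms run here, after the tracks are mixed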