From ce0f5c1c80fe271e75de0a720bfec09ca27c19bc Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 20 Apr 2023 11:26:40 -0400 Subject: [PATCH 01/16] add transform attribute for MixedCut --- lhotse/cut/mixed.py | 120 ++++++++++++++++++++++++++---- test/cut/test_cut_augmentation.py | 41 +++++++++- 2 files changed, 145 insertions(+), 16 deletions(-) diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py index b0ec1026d..005a953d6 100644 --- a/lhotse/cut/mixed.py +++ b/lhotse/cut/mixed.py @@ -10,7 +10,12 @@ from intervaltree import IntervalTree from lhotse.audio import AudioMixer, Recording, audio_energy, torchaudio_save_flac_safe -from lhotse.augmentation import AugmentFn +from lhotse.augmentation import ( + AudioTransform, + AugmentFn, + LoudnessNormalization, + ReverbWithImpulseResponse, +) from lhotse.cut.base import Cut from lhotse.cut.data import DataCut from lhotse.cut.padding import PaddingCut @@ -93,6 +98,9 @@ class MixedCut(Cut): .. note:: Each track in a MixedCut can be either a MonoCut, MultiCut, or PaddingCut. + .. note:: The ``transforms`` field is a list of dictionaries that describe the transformations + that should be applied to the track after mixing. + See also: - :class:`lhotse.cut.Cut` @@ -103,6 +111,7 @@ class MixedCut(Cut): id: str tracks: List[MixTrack] + transforms: Optional[List[Dict]] = None @property def supervisions(self) -> List[SupervisionSegment]: @@ -713,6 +722,35 @@ def perturb_volume(self, factor: float, affix_id: bool = True) -> "MixedCut": ], ) + def normalize_loudness(self, target: float, affix_id: bool = False) -> "DataCut": + """ + Return a new ``MixedCut`` that will lazily apply loudness normalization. + + :param target: The target loudness in dBFS. + :param affix_id: When true, we will modify the ``DataCut.id`` field + by affixing it with "_ln{target}". + :return: a modified copy of the current ``DataCut``. + """ + # Pre-conditions + assert ( + self.has_recording + ), "Cannot apply loudness normalization on a MixedCut without Recording." + if self.has_features: + logging.warning( + "Attempting to normalize loudness on a MixedCut that references pre-computed features. " + "The feature manifest will be detached, as we do not support feature-domain " + "loudness normalization." + ) + self.features = None + + transforms = self.transforms.copy() if self.transforms is not None else [] + transforms.append(LoudnessNormalization(target=target).to_dict()) + return fastcopy( + self, + id=f"{self.id}_ln{target}" if affix_id else self.id, + transforms=transforms, + ) + def reverb_rir( self, rir_recording: Optional["Recording"] = None, @@ -722,6 +760,7 @@ def reverb_rir( rir_channels: List[int] = [0], room_rng_seed: Optional[int] = None, source_rng_seed: Optional[int] = None, + mix_first: bool = True, ) -> "MixedCut": """ Return a new ``MixedCut`` that will convolve the audio with the provided impulse response. @@ -739,6 +778,9 @@ def reverb_rir( be convolved with one of the specified channels. :param room_rng_seed: Seed for the room configuration. :param source_rng_seed: Seed for the source position. + :param mix_first: When true, the mixing will be done first before convolving with the RIR. + This effectively means that all tracks will be convolved with the same RIR. If you + are simulating multi-speaker mixtures, you should set this to False. :return: a modified copy of the current ``MixedCut``. """ # Pre-conditions @@ -760,23 +802,64 @@ def reverb_rir( self.tracks ), "Invalid number of channels in `rir_channels`, must be either 1 or equal to the number of tracks." 
+        # There are 2 ways to apply RIRs:
+        # 1. Mix the tracks first, then apply RIRs. This is the same as applying the same RIR
+        #   to all tracks. It does not make sense if all tracks belong to different speakers,
+        #   but it is useful for cases when we have a mixture of MonoCut and PaddingCut,
+        #   and we want to apply the same RIR to all of them.
+        # 2. Apply RIRs to each track separately. This is useful when we want to simulate
+        #   different speakers in the same room.
+
+        # First simulate the room config (will only be used if RIR is not provided)
+        uuid4_str = str(uuid4())
+        # The room RNG seed is based on the cut ID. This ensures that all tracks in the
+        # mixed cut will have the same room configuration.
+        if room_rng_seed is None:
+            room_rng_seed = hash_str_to_int(uuid4_str + self.id)
+        # The source RNG seed is based on the track ID. This ensures that each track
+        # will have a different source position.
+        source_rng_seeds = [source_rng_seed] * len(self.tracks)
+        if source_rng_seed is None:
+            source_rng_seeds = [
+                hash_str_to_int(uuid4_str + track.cut.id) for track in self.tracks
+            ]
+            source_rng_seed = source_rng_seeds[0]
+
+        # Apply the same RIR to all tracks after mixing (default)
+        if mix_first:
+            if rir_recording is None:
+                from lhotse.augmentation.utils import FastRandomRIRGenerator
+
+                rir_generator = FastRandomRIRGenerator(
+                    sr=self.sampling_rate,
+                    room_seed=room_rng_seed,
+                    source_seed=source_rng_seed,
+                )
+            else:
+                rir_generator = None
+
+            transforms = self.transforms.copy() if self.transforms is not None else []
+            transforms.append(
+                ReverbWithImpulseResponse(
+                    rir=rir_recording,
+                    normalize_output=normalize_output,
+                    early_only=early_only,
+                    rir_channels=rir_channels if rir_channels is not None else [0],
+                    rir_generator=rir_generator,
+                ).to_dict()
+            )
+            return fastcopy(
+                self,
+                id=f"{self.id}_rvb" if affix_id else self.id,
+                transforms=transforms,
+            )
+
+        # Apply RIRs to each track separately. Note that we do not pass a `mix_first`
+        # argument below since it is True by default.
+
         if len(rir_channels) == 1:
             rir_channels = rir_channels * len(self.tracks)
 
-        source_rng_seeds = [source_rng_seed] * len(self.tracks)
-        if rir_recording is None:
-            uuid4_str = str(uuid4())
-            # The room RNG seed is based on the cut ID. This ensures that all tracks in the
-            # mixed cut will have the same room configuration.
-            if room_rng_seed is None:
-                room_rng_seed = hash_str_to_int(uuid4_str + self.id)
-            # The source RNG seed is based on the track ID. This ensures that each track
-            # will have a different source position.
-            if source_rng_seed is None:
-                source_rng_seeds = [
-                    hash_str_to_int(uuid4_str + track.cut.id) for track in self.tracks
-                ]
-
         return MixedCut(
             id=f"{self.id}_rvb" if affix_id else self.id,
             tracks=[
@@ -977,6 +1060,13 @@ def load_audio(
                 f"this issue at https://github.com/lhotse-speech/lhotse/issues "
                 f"showing the cut below. MixedCut:\n{self}"
             )
+
+            # We'll apply the transforms now (if any).
+            transforms = [
+                AudioTransform.from_dict(params) for params in self.transforms or []
+            ]
+            for tfn in transforms:
+                audio = tfn(audio, self.sampling_rate)
         else:
             audio = mixer.unmixed_audio
 
diff --git a/test/cut/test_cut_augmentation.py b/test/cut/test_cut_augmentation.py
index 7525ed33b..f98f21083 100644
--- a/test/cut/test_cut_augmentation.py
+++ b/test/cut/test_cut_augmentation.py
@@ -349,7 +349,7 @@ def test_mixed_cut_start01_perturb_volume(cut_with_supervision_start01):
 def test_mixed_cut_start01_reverb_rir(cut_with_supervision_start01, rir):
     mixed_rvb = cut_with_supervision_start01.append(
         cut_with_supervision_start01
-    ).reverb_rir(rir_recording=rir)
+    ).reverb_rir(rir_recording=rir, mix_first=False)
     assert mixed_rvb.start == 0  # MixedCut always starts at 0
     assert mixed_rvb.duration == cut_with_supervision_start01.duration * 2
     assert mixed_rvb.end == cut_with_supervision_start01.duration * 2
@@ -396,6 +396,24 @@ def test_mixed_cut_start01_reverb_rir(cut_with_supervision_start01, rir):
     )
 
 
+def test_mixed_cut_start01_reverb_rir_mix_first(cut_with_supervision_start01, rir):
+    mixed_rvb = cut_with_supervision_start01.pad(duration=0.5).reverb_rir(
+        rir_recording=rir, mix_first=True
+    )
+    assert mixed_rvb.start == 0  # MixedCut always starts at 0
+    assert mixed_rvb.duration == 0.5
+    assert mixed_rvb.end == 0.5
+    assert mixed_rvb.num_samples == 4000
+
+    # Check that the padding part is not all zeros after reverberation
+    np.testing.assert_raises(
+        AssertionError,
+        np.testing.assert_array_almost_equal,
+        mixed_rvb.load_audio()[:, 3200:],
+        np.zeros((1, 800)),
+    )
+
+
 def test_mixed_cut_start01_reverb_rir_with_fast_random(
     cut_with_supervision_start01, rir
 ):
@@ -449,6 +467,23 @@ def test_mixed_cut_start01_reverb_rir_multi_channel(
         mixed_cut.reverb_rir(multi_channel_rir, rir_channels=rir_channels)
 
 
+@pytest.mark.skipif(
+    not is_module_available("pyloudnorm"),
+    reason="This test requires pyloudnorm to be installed.",
+)
+@pytest.mark.parametrize("target", [-15.0, -20.0, -25.0])
+def test_mixed_cut_normalize_loudness(cut_with_supervision_start01, target):
+    mixed_cut = cut_with_supervision_start01.append(cut_with_supervision_start01)
+    mixed_cut_ln = mixed_cut.normalize_loudness(target)
+
+    import pyloudnorm as pyln
+
+    # check if loudness is correct
+    meter = pyln.Meter(mixed_cut_ln.sampling_rate)  # create BS.1770 meter
+    loudness = meter.integrated_loudness(mixed_cut_ln.load_audio().T)
+    assert loudness == pytest.approx(target, abs=0.5)
+
+
 @pytest.mark.skipif(
     not is_module_available("nara_wpe"),
     reason="This test requires nara_wpe to be installed.",
@@ -587,6 +622,10 @@ def test_cut_perturb_volume(cut_set, cut_id, scale):
     )
 
 
+@pytest.mark.skipif(
+    not is_module_available("pyloudnorm"),
+    reason="This test requires pyloudnorm to be installed.",
+)
 @pytest.mark.parametrize("target", [-15.0, -20.0, -25.0])
 def test_cut_normalize_loudness(libri_cut_set, target):
     cut_set_ln = libri_cut_set.normalize_loudness(target)

From ab18682257962740ee0825ba170689c9cc877e28 Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Thu, 20 Apr 2023 11:36:54 -0400
Subject: [PATCH 02/16] add mix_first option in normalize_loudness

---
 lhotse/cut/mixed.py               | 35 ++++++++++++++++++++++++-------
 test/cut/test_cut_augmentation.py | 11 ++++++----
 2 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py
index 005a953d6..5cfd4a3d2 100644
--- a/lhotse/cut/mixed.py
+++ b/lhotse/cut/mixed.py
@@ -722,11 +722,16 @@ def perturb_volume(self, factor: float, affix_id: bool = True) -> "MixedCut":
], ) - def normalize_loudness(self, target: float, affix_id: bool = False) -> "DataCut": + def normalize_loudness( + self, target: float, mix_first: bool = True, affix_id: bool = False + ) -> "DataCut": """ Return a new ``MixedCut`` that will lazily apply loudness normalization. :param target: The target loudness in dBFS. + :param mix_first: If true, we will mix the underlying cuts before applying + loudness normalization. If false, we cannot guarantee that the resulting + cut will have the target loudness. :param affix_id: When true, we will modify the ``DataCut.id`` field by affixing it with "_ln{target}". :return: a modified copy of the current ``DataCut``. @@ -743,13 +748,27 @@ def normalize_loudness(self, target: float, affix_id: bool = False) -> "DataCut" ) self.features = None - transforms = self.transforms.copy() if self.transforms is not None else [] - transforms.append(LoudnessNormalization(target=target).to_dict()) - return fastcopy( - self, - id=f"{self.id}_ln{target}" if affix_id else self.id, - transforms=transforms, - ) + if mix_first: + transforms = self.transforms.copy() if self.transforms is not None else [] + transforms.append(LoudnessNormalization(target=target).to_dict()) + return fastcopy( + self, + id=f"{self.id}_ln{target}" if affix_id else self.id, + transforms=transforms, + ) + else: + return MixedCut( + id=f"{self.id}_ln{target}" if affix_id else self.id, + tracks=[ + fastcopy( + track, + cut=track.cut.normalize_loudness( + target=target, affix_id=affix_id + ), + ) + for track in self.tracks + ], + ) def reverb_rir( self, diff --git a/test/cut/test_cut_augmentation.py b/test/cut/test_cut_augmentation.py index f98f21083..bc60a075a 100644 --- a/test/cut/test_cut_augmentation.py +++ b/test/cut/test_cut_augmentation.py @@ -471,17 +471,20 @@ def test_mixed_cut_start01_reverb_rir_multi_channel( not is_module_available("pyloudnorm"), reason="This test requires pyloudnorm to be installed.", ) -@pytest.mark.parametrize("target", [-15.0, -20.0, -25.0]) -def test_mixed_cut_normalize_loudness(cut_with_supervision_start01, target): +@pytest.mark.parametrize( + "target, mix_first", [(-15.0, True), (-20.0, True), (-25.0, False)] +) +def test_mixed_cut_normalize_loudness(cut_with_supervision_start01, target, mix_first): mixed_cut = cut_with_supervision_start01.append(cut_with_supervision_start01) - mixed_cut_ln = mixed_cut.normalize_loudness(target) + mixed_cut_ln = mixed_cut.normalize_loudness(target, mix_first=mix_first) import pyloudnorm as pyln # check if loudness is correct meter = pyln.Meter(mixed_cut_ln.sampling_rate) # create BS.1770 meter loudness = meter.integrated_loudness(mixed_cut_ln.load_audio().T) - assert loudness == pytest.approx(target, abs=0.5) + if mix_first: + assert loudness == pytest.approx(target, abs=0.5) @pytest.mark.skipif( From e4bca7421a7a4506fc3da82d639935233689b36a Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 20 Apr 2023 15:40:46 -0400 Subject: [PATCH 03/16] handle the case when mix is called on MixedCut with existing transforms --- lhotse/cut/set.py | 57 +++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py index 1f5ad02b3..88532a1ee 100644 --- a/lhotse/cut/set.py +++ b/lhotse/cut/set.py @@ -2614,10 +2614,14 @@ def mix( if offset > reference_cut.duration: reference_cut = reference_cut.pad(duration=offset) - # When the left_cut is a MixedCut, take its existing tracks, otherwise create a new track. 
- if isinstance(reference_cut, MixedCut): + # When the left_cut is a MixedCut and it does not have existing transforms, + # take its existing tracks, otherwise create a new track. + if ( + isinstance(reference_cut, MixedCut) + and len(ifnone(reference_cut.transforms, [])) == 0 + ): old_tracks = reference_cut.tracks - elif isinstance(reference_cut, (DataCut, PaddingCut)): + elif isinstance(reference_cut, (DataCut, PaddingCut, MixedCut)): old_tracks = [MixTrack(cut=reference_cut)] else: raise ValueError(f"Unsupported type of cut in mix(): {type(reference_cut)}") @@ -2625,27 +2629,32 @@ def mix( # When the right_cut is a MixedCut, adapt its existing tracks with the new offset and snr, # otherwise create a new track. if isinstance(mixed_in_cut, MixedCut): - new_tracks = [ - MixTrack( - cut=track.cut, - offset=round(track.offset + offset, ndigits=8), - snr=( - # When no new SNR is specified, retain whatever was there in the first place. - track.snr - if snr is None - # When new SNR is specified but none was specified before, assign the new SNR value. - else snr - if track.snr is None - # When both new and previous SNR were specified, assign their sum, - # as the SNR for each track is defined with regard to the first track energy. - else track.snr + snr - if snr is not None and track is not None - # When no SNR was specified whatsoever, use none. - else None - ), - ) - for track in mixed_in_cut.tracks - ] + # Similarly for mixed_in_cut, if it is a MixedCut and it does not have existing transforms, + # take its existing tracks, otherwise create a new track. + if len(ifnone(mixed_in_cut.transforms, [])) > 0: + new_tracks = [MixTrack(cut=mixed_in_cut, offset=offset, snr=snr)] + else: + new_tracks = [ + MixTrack( + cut=track.cut, + offset=round(track.offset + offset, ndigits=8), + snr=( + # When no new SNR is specified, retain whatever was there in the first place. + track.snr + if snr is None + # When new SNR is specified but none was specified before, assign the new SNR value. + else snr + if track.snr is None + # When both new and previous SNR were specified, assign their sum, + # as the SNR for each track is defined with regard to the first track energy. + else track.snr + snr + if snr is not None and track is not None + # When no SNR was specified whatsoever, use none. 
+ else None + ), + ) + for track in mixed_in_cut.tracks + ] elif isinstance(mixed_in_cut, (DataCut, PaddingCut)): new_tracks = [MixTrack(cut=mixed_in_cut, offset=offset, snr=snr)] else: From 71a92367f02da57655e7b3f633f1e7990b60fa6e Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 20 Apr 2023 16:08:03 -0400 Subject: [PATCH 04/16] add test for mixing with transformed MixedCut --- test/cut/test_cut_mixing.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/cut/test_cut_mixing.py b/test/cut/test_cut_mixing.py index c38753ecb..3404d4edf 100644 --- a/test/cut/test_cut_mixing.py +++ b/test/cut/test_cut_mixing.py @@ -345,6 +345,28 @@ def test_mix_cut_snr_pad_both(libri_cut): assert E(feats_nosnr) > E(feats_snr) +@pytest.mark.parametrize("mix_first", [True, False]) +def test_mix_cut_with_transform(libri_cut, mix_first): + # Create original mixed cut + padded = libri_cut.pad(duration=20, direction="right") + # Create transformed mixed cut + padded = padded.reverb_rir(mix_first=mix_first) + # Mix another cut + mixed1 = padded.mix(libri_cut) + mixed2 = libri_cut.mix(padded) + + assert isinstance(padded, MixedCut) + assert len(padded.tracks) == 2 + assert isinstance(mixed1, MixedCut) + assert isinstance(mixed2, MixedCut) + if mix_first: + assert len(mixed1.tracks) == 2 + assert len(mixed2.tracks) == 2 + else: + assert len(mixed1.tracks) == 3 + assert len(mixed2.tracks) == 3 + + def test_cut_set_mix_snr_is_deterministic(): cuts = DummyManifest(CutSet, begin_id=0, end_id=2) From 2e54646b5e1219a5ef001814f4305540cd0ff414 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 15 May 2023 20:25:43 -0400 Subject: [PATCH 05/16] enhancements and bug fixes --- lhotse/bin/modes/recipes/ami.py | 9 ++++++ lhotse/bin/modes/recipes/icsi.py | 9 +++--- lhotse/recipes/ami.py | 49 ++++++++++++++++++++++++++------ lhotse/recipes/icsi.py | 16 ++++++----- lhotse/recipes/utils.py | 1 + 5 files changed, 64 insertions(+), 20 deletions(-) diff --git a/lhotse/bin/modes/recipes/ami.py b/lhotse/bin/modes/recipes/ami.py index 992e20474..b129de572 100644 --- a/lhotse/bin/modes/recipes/ami.py +++ b/lhotse/bin/modes/recipes/ami.py @@ -53,6 +53,13 @@ " segmentation). If None, no segmentation is performed." 
), ) +@click.option( + "--merge-consecutive", + type=bool, + is_flag=True, + default=False, + help="Merge consecutive segments from the same speaker.", +) def ami( corpus_dir: Pathlike, output_dir: Pathlike, @@ -61,6 +68,7 @@ def ami( partition: str, normalize_text: bool, max_words_per_segment: int, + merge_consecutive: bool, ): """AMI data preparation.""" prepare_ami( @@ -71,6 +79,7 @@ def ami( partition=partition, normalize_text=normalize_text, max_words_per_segment=max_words_per_segment, + merge_consecutive=merge_consecutive, ) diff --git a/lhotse/bin/modes/recipes/icsi.py b/lhotse/bin/modes/recipes/icsi.py index 7d5a033bc..29c7f4edf 100644 --- a/lhotse/bin/modes/recipes/icsi.py +++ b/lhotse/bin/modes/recipes/icsi.py @@ -64,12 +64,13 @@ def icsi( ) @click.option( "--normalize-text", - is_flag=True, - help="If set, convert all text annotations to upper case (similar to Kaldi)", + type=click.Choice(["none", "upper", "kaldi"], case_sensitive=False), + default="kaldi", + help="Type of text normalization to apply (kaldi style, by default)", ) def icsi( audio_dir: Pathlike, - transcript_dir: Pathlike, + transcripts_dir: Pathlike, output_dir: Pathlike, mic: str, normalize_text: bool, @@ -77,7 +78,7 @@ def icsi( """AMI data preparation.""" prepare_icsi( audio_dir, - transcript_dir, + transcripts_dir, output_dir=output_dir, mic=mic, normalize_text=normalize_text, diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py index df6105dee..d5ff85361 100644 --- a/lhotse/recipes/ami.py +++ b/lhotse/recipes/ami.py @@ -276,7 +276,8 @@ class AmiSegmentAnnotation(NamedTuple): def parse_ami_annotations( annotations_dir: Pathlike, normalize: str = "upper", - max_words_per_segment: int = None, + max_words_per_segment: Optional[int] = None, + merge_consecutive: bool = False, ) -> Dict[str, List[SupervisionSegment]]: # Extract if zipped file @@ -355,7 +356,9 @@ def parse_ami_annotations( seg_words = list( filter(lambda w: w[0] >= seg_start and w[1] <= seg_end, spk_words) ) - subsegments = split_segment(seg_words, max_words_per_segment) + subsegments = split_segment( + seg_words, max_words_per_segment, merge_consecutive + ) for subseg in subsegments: start, end, text = subseg annotations[key].append( @@ -372,26 +375,49 @@ def parse_ami_annotations( def split_segment( - words: List[Tuple[float, float, str]], max_words_per_segment: Optional[int] + words: List[Tuple[float, float, str]], + max_words_per_segment: Optional[int] = None, + merge_consecutive: bool = False, ): def split_(sequence, sep): chunk = [] for val in sequence: if val[-1] == sep: - yield chunk + if len(chunk) > 0: + yield chunk chunk = [] else: chunk.append(val) - yield chunk + if len(chunk) > 0: + yield chunk def split_on_fullstop_(sequence): - return split_(sequence, ".") - - def split_on_comma_(segment, max_words_per_segment): + subsegs = list(split_(sequence, ".")) + if len(subsegs) < 2: + return subsegs + # Set a large default value for max_words_per_segment if not provided + max_segment_length = max_words_per_segment if max_words_per_segment else 100000 + if merge_consecutive: + # Merge consecutive subsegments if their length is less than max_words_per_segment + merged_subsegs = [subsegs[0]] + for subseg in subsegs[1:]: + if ( + merged_subsegs[-1][-1][1] == subseg[0][0] + and len(merged_subsegs[-1]) + len(subseg) <= max_segment_length + ): + merged_subsegs[-1].extend(subseg) + else: + merged_subsegs.append(subseg) + subsegs = merged_subsegs + return subsegs + + def split_on_comma_(segment): # This function smartly splits a segment on 
commas such that the number of words # in each subsegment is as close to max_words_per_segment as possible. # First we create subsegments by splitting on commas subsegs = list(split_(segment, ",")) + if len(subsegs) < 2: + return subsegs # Now we merge subsegments while ensuring that the number of words in each # subsegment is less than max_words_per_segment merged_subsegs = [subsegs[0]] @@ -409,7 +435,7 @@ def split_on_comma_(segment, max_words_per_segment): # Now we split each subsegment based on commas to get at most max_words_per_segment # words per subsegment. subsegments = [ - list(split_on_comma_(subseg, max_words_per_segment)) + list(split_on_comma_(subseg)) if len(subseg) > max_words_per_segment else [subseg] for subseg in subsegments @@ -614,6 +640,7 @@ def prepare_ami( partition: Optional[str] = "full-corpus", normalize_text: str = "kaldi", max_words_per_segment: Optional[int] = None, + merge_consecutive: bool = False, ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions @@ -625,6 +652,9 @@ def prepare_ami( :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text :param max_words_per_segment: int, maximum number of words per segment. If not None, we will split longer segments similar to Kaldi's data prep scripts, i.e., split on full-stop and comma. + :param merge_consecutive: bool, if True, merge consecutive segments split on full-stop. + We will only merge segments if the number of words in the merged segment is less than + max_words_per_segment. :return: a Dict whose key is ('train', 'dev', 'eval'), and the values are dicts of manifests under keys 'recordings' and 'supervisions'. @@ -662,6 +692,7 @@ def prepare_ami( annotations_dir, normalize=normalize_text, max_words_per_segment=max_words_per_segment, + merge_consecutive=merge_consecutive, ) # Audio diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py index 29734e84b..604d7852e 100644 --- a/lhotse/recipes/icsi.py +++ b/lhotse/recipes/icsi.py @@ -234,7 +234,6 @@ def download_icsi( def parse_icsi_annotations( transcripts_dir: Pathlike, normalize: str = "upper" ) -> Tuple[Dict[str, List[SupervisionSegment]], Dict[str, Dict[str, int]]]: - annotations = defaultdict(list) # In Lhotse, channels are integers, so we map channel ids to integers for each session channel_to_idx_map = defaultdict(dict) @@ -299,7 +298,6 @@ def prepare_audio_grouped( audio_paths: List[Pathlike], channel_to_idx_map: Dict[str, Dict[str, int]] = None, ) -> RecordingSet: - # Group together multiple channels from the same session. # We will use that to create a Recording with multiple sources (channels). from cytoolz import groupby @@ -474,7 +472,7 @@ def prepare_supervision_other( def prepare_icsi( audio_dir: Pathlike, - transcripts_dir: Pathlike, + transcripts_dir: Optional[Pathlike] = None, output_dir: Optional[Pathlike] = None, mic: Optional[str] = "ihm", normalize_text: str = "kaldi", @@ -490,7 +488,11 @@ def prepare_icsi( 'recordings' and 'supervisions'. 
""" audio_dir = Path(audio_dir) - transcripts_dir = Path(transcripts_dir) + transcripts_dir = ( + Path(transcripts_dir) + if transcripts_dir is not None + else audio_dir / "transcripts" + ) assert audio_dir.is_dir(), f"No such directory: {audio_dir}" assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}" @@ -539,6 +541,9 @@ def prepare_icsi( lambda x: x.recording_id in PARTITIONS[part] ) + audio_part, supervision_part = fix_manifests(audio_part, supervision_part) + validate_recordings_and_supervisions(audio_part, supervision_part) + # Write to output directory if a path is provided if output_dir is not None: audio_part.to_file(output_dir / f"icsi-{mic}_recordings_{part}.jsonl.gz") @@ -546,9 +551,6 @@ def prepare_icsi( output_dir / f"icsi-{mic}_supervisions_{part}.jsonl.gz" ) - audio_part, supervision_part = fix_manifests(audio_part, supervision_part) - validate_recordings_and_supervisions(audio_part, supervision_part) - # Combine all manifests into one dictionary manifests[part] = {"recordings": audio_part, "supervisions": supervision_part} diff --git a/lhotse/recipes/utils.py b/lhotse/recipes/utils.py index ab3d7b2d5..37fc00a99 100644 --- a/lhotse/recipes/utils.py +++ b/lhotse/recipes/utils.py @@ -146,6 +146,7 @@ def normalize_text_ami(text: str, normalize: str = "upper") -> str: text = re.sub(r"MM HMM", "MM-HMM", text) text = re.sub(r"UH HUH", "UH-HUH", text) text = re.sub(r"(\b)O K(\b)", r"\g<1>OK\g<2>", text) + text = re.sub(r"(\b)O_K(\b)", r"\g<1>OK\g<2>", text) return text From db37a752466c722997cfade9d5a5192f289ab7b0 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 15 May 2023 20:38:08 -0400 Subject: [PATCH 06/16] small changes in some cutset methods --- lhotse/cut/base.py | 8 +++++--- lhotse/cut/set.py | 9 ++++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py index 4df14a017..c9a989cf1 100644 --- a/lhotse/cut/base.py +++ b/lhotse/cut/base.py @@ -652,6 +652,7 @@ def trim_to_supervision_groups( supervision_group = [supervisions[0]] cur_end = supervisions[0].end new_cuts = [] + group_idx = 0 for sup in supervisions[1:]: if sup.start - cur_end <= max_pause: supervision_group.append(sup) @@ -666,8 +667,9 @@ def trim_to_supervision_groups( offset=offset, duration=duration, keep_excessive_supervisions=False, - ) + ).with_id(f"{self.id}-{max_pause}-{group_idx}") ) + group_idx += 1 supervision_group = [sup] cur_end = sup.end @@ -680,7 +682,7 @@ def trim_to_supervision_groups( offset=offset, duration=duration, keep_excessive_supervisions=False, - ) + ).with_id(f"{self.id}-{max_pause}-{group_idx}") ) # The total number of supervisions should be the same. 
assert sum(len(c.supervisions) for c in new_cuts) == len(self.supervisions), ( @@ -724,7 +726,7 @@ def cut_into_windows( offset=hop * i, duration=duration, keep_excessive_supervisions=keep_excessive_supervisions, - ) + ).with_id(f"{self.id}-{i}") ) return CutSet.from_cuts(new_cuts) diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py index 8b474e0fa..81851794c 100644 --- a/lhotse/cut/set.py +++ b/lhotse/cut/set.py @@ -590,6 +590,7 @@ def to_shar( warn_unused_fields=warn_unused_fields, include_cuts=include_cuts, shard_suffix=None, + verbose=verbose, ) progbar = partial(tqdm, desc="Shard progress") if verbose else lambda x: x @@ -610,6 +611,7 @@ def to_shar( warn_unused_fields=warn_unused_fields, include_cuts=True, shard_suffix=f".{idx:06d}", + verbose=False, ) ) for f in progbar(as_completed(futures)): @@ -2706,7 +2708,7 @@ def mix( elif isinstance(mixed_in_cut, (DataCut, PaddingCut)): new_tracks = [MixTrack(cut=mixed_in_cut, offset=offset, snr=snr)] else: - raise ValueError(f"Unsupported type of cut in mix(): {type(reference_cut)}") + raise ValueError(f"Unsupported type of cut in mix(): {type(mixed_in_cut)}") return MixedCut(id=mixed_cut_id, tracks=old_tracks + new_tracks) @@ -3386,9 +3388,12 @@ def _export_to_shar_single( warn_unused_fields: bool, include_cuts: bool, shard_suffix: Optional[str], + verbose: bool, ) -> Dict[str, List[str]]: from lhotse.shar import SharWriter + pbar = tqdm(desc="Exporting to SHAR", disable=not verbose) + with SharWriter( output_dir=output_dir, fields=fields, @@ -3399,5 +3404,7 @@ def _export_to_shar_single( ) as writer: for cut in cuts: writer.write(cut) + pbar.update() + # Finally, return the list of output files. return writer.output_paths From 7b59ecdfb25118dbc9e5841bf414d8ab631d27d0 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 15 May 2023 20:42:17 -0400 Subject: [PATCH 07/16] small fix in error message --- lhotse/cut/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py index c9a989cf1..8c2519687 100644 --- a/lhotse/cut/base.py +++ b/lhotse/cut/base.py @@ -490,7 +490,7 @@ def trim_to_supervisions( len(set(to_hashable(s.channel) for s in trimmed.supervisions)) == 1 ), ( "Trimmed cut has supervisions with different channels. Either set " - "`ignore_channel=True` to keep original channels or `keep_overlapping=False` " + "`keep_all_channels=True` to keep original channels or `keep_overlapping=False` " "to retain only 1 supervision per trimmed cut." 
) trimmed.channel = trimmed.supervisions[0].channel From a64727a2b5aad58e908cb5028813a09afe8e79a4 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Tue, 16 May 2023 20:26:32 -0400 Subject: [PATCH 08/16] return word alignments from ami recipe --- lhotse/recipes/ami.py | 47 +++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py index d5ff85361..73b47cc54 100644 --- a/lhotse/recipes/ami.py +++ b/lhotse/recipes/ami.py @@ -38,8 +38,8 @@ from lhotse.audio import AudioSource, Recording, RecordingSet from lhotse.qa import fix_manifests from lhotse.recipes.utils import normalize_text_ami -from lhotse.supervision import SupervisionSegment, SupervisionSet -from lhotse.utils import Pathlike, Seconds, resumable_download +from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet +from lhotse.utils import Pathlike, Seconds, add_durations, resumable_download # fmt: off MEETINGS = { @@ -271,6 +271,7 @@ class AmiSegmentAnnotation(NamedTuple): gender: str start_time: Seconds end_time: Seconds + words: List[AlignmentItem] def parse_ami_annotations( @@ -279,7 +280,6 @@ def parse_ami_annotations( max_words_per_segment: Optional[int] = None, merge_consecutive: bool = False, ) -> Dict[str, List[SupervisionSegment]]: - # Extract if zipped file if str(annotations_dir).endswith(".zip"): import zipfile @@ -360,14 +360,27 @@ def parse_ami_annotations( seg_words, max_words_per_segment, merge_consecutive ) for subseg in subsegments: - start, end, text = subseg + start = subseg[0][0] + end = subseg[-1][1] + word_alignments = [ + AlignmentItem( + start=round(w[0], ndigits=4), + duration=add_durations(w[1], -w[0], sampling_rate=16000), + symbol=normalize_text_ami(w[2], normalize=normalize), + ) + for w in subseg + ] + # Filter out empty words + word_alignments = [w for w in word_alignments if w.symbol] + text = " ".join(w.symbol for w in word_alignments) annotations[key].append( AmiSegmentAnnotation( - text=normalize_text_ami(text, normalize=normalize), + text=text, speaker=key[1], gender=key[1][0], start_time=start, end_time=end, + words=word_alignments, ) ) @@ -378,7 +391,14 @@ def split_segment( words: List[Tuple[float, float, str]], max_words_per_segment: Optional[int] = None, merge_consecutive: bool = False, -): +) -> List[List[Tuple[float, float, str]]]: + """ + Given a list of words, return a list of segments (each segment is a list of words) + where each segment has at most max_words_per_segment words. If merge_consecutive + is True, then consecutive segments with less than max_words_per_segment words + will be merged together. 
+ """ + def split_(sequence, sep): chunk = [] for val in sequence: @@ -443,11 +463,8 @@ def split_on_comma_(segment): # flatten the list of lists subsegments = [item for sublist in subsegments for item in sublist] - # For each subsegment, we create a tuple of (start_time, end_time, text) - subsegments = [ - (subseg[0][0], subseg[-1][1], " ".join([w[2] for w in subseg])) - for subseg in filter(lambda s: len(s) > 0, subsegments) - ] + # Filter out empty subsegments + subsegments = list(filter(lambda s: len(s) > 0, subsegments)) return subsegments @@ -563,7 +580,9 @@ def prepare_supervision_ihm( continue for seg_idx, seg_info in enumerate(annotation): - duration = seg_info.end_time - seg_info.start_time + duration = add_durations( + seg_info.end_time, -seg_info.start_time, sampling_rate=16000 + ) # Some annotations in IHM setting exceed audio duration, so we # ignore such segments if seg_info.end_time > recording.duration: @@ -577,13 +596,14 @@ def prepare_supervision_ihm( SupervisionSegment( id=f"{recording.id}-{channel}-{seg_idx}", recording_id=recording.id, - start=seg_info.start_time, + start=round(seg_info.start_time, ndigits=4), duration=duration, channel=channel, language="English", speaker=seg_info.speaker, gender=seg_info.gender, text=seg_info.text, + alignment={"word": seg_info.words}, ) ) @@ -627,6 +647,7 @@ def prepare_supervision_other( speaker=seg_info.speaker, gender=seg_info.gender, text=seg_info.text, + alignment={"word": seg_info.words}, ) ) return SupervisionSet.from_segments(segments) From 850ce2c5fd9b511b95137dc7f1f59a4577828abb Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 18 May 2023 09:23:46 -0400 Subject: [PATCH 09/16] add word alignments for ICSI --- lhotse/recipes/icsi.py | 204 ++++++++++++++++++++++++++++------------- 1 file changed, 142 insertions(+), 62 deletions(-) diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py index 604d7852e..757461ac5 100644 --- a/lhotse/recipes/icsi.py +++ b/lhotse/recipes/icsi.py @@ -106,8 +106,8 @@ from lhotse.audio import AudioSource, Recording, RecordingSet, read_sph from lhotse.qa import fix_manifests from lhotse.recipes.utils import normalize_text_ami -from lhotse.supervision import SupervisionSegment, SupervisionSet -from lhotse.utils import Pathlike, Seconds, resumable_download +from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet +from lhotse.utils import Pathlike, Seconds, add_durations, resumable_download # fmt:off PARTITIONS = { @@ -135,15 +135,6 @@ # fmt:on -class IcsiSegmentAnnotation(NamedTuple): - text: str - speaker: str - channel: str - gender: str - start_time: Seconds - end_time: Seconds - - def download_audio( target_dir: Path, force_download: Optional[bool] = False, @@ -205,22 +196,30 @@ def download_icsi( download_audio(audio_dir, force_download, url, mic) # Annotations - logging.info("Downloading AMI annotations") + logging.info("Downloading ICSI annotations") if transcripts_dir.exists() and not force_download: logging.info( f"Skip downloading transcripts as they exist in: {transcripts_dir}" ) return target_dir - annotations_url = f"{url}/ICSICorpusAnnotations/ICSI_original_transcripts.zip" + + # We need the MRT transcripts for the speaker-to-channel mapping. 
The NXT transcripts
+    # are used for the actual annotations (since they contain word alignments)
+    annotations_url_mrt = f"{url}/ICSICorpusAnnotations/ICSI_original_transcripts.zip"
+    annotations_url_nxt = f"{url}/ICSICorpusAnnotations/ICSI_core_NXT.zip"
     resumable_download(
-        annotations_url,
+        annotations_url_mrt,
         filename=target_dir / "ICSI_original_transcripts.zip",
         force_download=force_download,
     )
+    resumable_download(
+        annotations_url_nxt,
+        filename=target_dir / "ICSI_core_NXT.zip",
+        force_download=force_download,
+    )
 
-    # Unzip annotations zip file
-    with zipfile.ZipFile(target_dir / "ICSI_original_transcripts.zip") as z:
+    with zipfile.ZipFile(target_dir / "ICSI_core_NXT.zip") as z:
         # Unzips transcripts to <target_dir>/'transcripts'
         # zip file also contains some documentation which will be unzipped to <target_dir>
         z.extractall(target_dir)
@@ -228,9 +227,22 @@ def download_icsi(
         if transcripts_dir:
             Path(target_dir / "transcripts").rename(transcripts_dir)
 
+    # From the MRT transcripts, we only need the transcripts/preambles.mrt file
+    with zipfile.ZipFile(target_dir / "ICSI_original_transcripts.zip") as z:
+        z.extract("transcripts/preambles.mrt", transcripts_dir)
+
     return target_dir
 
 
+class IcsiSegmentAnnotation(NamedTuple):
+    text: str
+    speaker: str
+    gender: str
+    start_time: Seconds
+    end_time: Seconds
+    words: List[AlignmentItem]
+
+
 def parse_icsi_annotations(
     transcripts_dir: Pathlike, normalize: str = "upper"
 ) -> Tuple[Dict[str, List[SupervisionSegment]], Dict[str, Dict[str, int]]]:
@@ -240,53 +252,119 @@ def parse_icsi_annotations(
 
     spk_to_channel_map = defaultdict(dict)
 
     # First we get global speaker ids and channels
-    for meeting_file in tqdm(
-        transcripts_dir.rglob("./*.mrt"), desc="Parsing ICSI mrt files"
-    ):
-        if meeting_file.stem == "preambles":
+    with open(transcripts_dir / "preambles.mrt") as f:
+        root = ET.parse(f).getroot()  # <Meetings>
+        for child in root:
+            if child.tag == "Meeting":
+                meeting_id = child.attrib["Session"]
+                for grandchild in child:
+                    if grandchild.tag == "Preamble":
+                        for greatgrandchild in grandchild:
+                            if greatgrandchild.tag == "Channels":
+                                channel_to_idx_map[meeting_id] = {
+                                    channel.attrib["Name"]: idx
+                                    for idx, channel in enumerate(greatgrandchild)
+                                }
+                            elif greatgrandchild.tag == "Participants":
+                                for speaker in greatgrandchild:
+                                    # some speakers may not have an associated channel in some meetings, so we
+                                    # assign them the SDM channel
+                                    spk_to_channel_map[meeting_id][
+                                        speaker.attrib["Name"]
+                                    ] = (
+                                        speaker.attrib["Channel"]
+                                        if "Channel" in speaker.attrib
+                                        else "chan6"
+                                    )
+
+    # Get the speaker segment times from the segments file
+    segments = {}
+    for file in (transcripts_dir / "Segments").glob("*.xml"):
+        meet_id, local_id, _ = file.stem.split(".")
+        spk_segments = []
+        spk_id = None
+        with open(file) as f:
+            tree = ET.parse(f)
+            for seg in tree.getroot():
+                if seg.tag != "segment":
+                    continue
+                if spk_id is None and "participant" in seg.attrib:
+                    spk_id = seg.attrib["participant"]
+                start_time = float(seg.attrib["starttime"])
+                end_time = float(seg.attrib["endtime"])
+                spk_segments.append((start_time, end_time))
+        if spk_id is None or len(spk_segments) == 0:
             continue
-        with open(meeting_file) as f:
-            meeting_id = meeting_file.stem
-            root = ET.parse(f).getroot()  # <Meeting>
-            for child in root:
-                if child.tag == "Preamble":
-                    for grandchild in child:
-                        if grandchild.tag == "Channels":
-                            channel_to_idx_map[meeting_id] = {
-                                channel.attrib["Name"]: idx
-                                for idx, channel in enumerate(grandchild)
-                            }
-                        elif grandchild.tag == "Participants":
-                            for speaker in grandchild:
-                                # some speakers may not have an associated channel in some meetings, so we
-                                # assign them the SDM channel
-                                spk_to_channel_map[meeting_id][
-                                    speaker.attrib["Name"]
-                                ] = (
-                                    speaker.attrib["Channel"]
-                                    if "Channel" in speaker.attrib
-                                    else "chan6"
-                                )
-                elif child.tag == "Transcript":
-                    for segment in child:
-                        if len(list(segment)) == 0 and "Participant" in segment.attrib:
-                            start_time = float(segment.attrib["StartTime"])
-                            end_time = float(segment.attrib["EndTime"])
-                            speaker = segment.attrib["Participant"]
-                            channel = spk_to_channel_map[meeting_id][speaker]
-                            text = normalize_text_ami(
-                                segment.text.strip(), normalize=normalize
-                            )
-                            annotations[(meeting_id, speaker, channel)].append(
-                                IcsiSegmentAnnotation(
-                                    text,
-                                    speaker,
-                                    channel,
-                                    speaker[0],
-                                    start_time,
-                                    end_time,
-                                )
-                            )
+        key = (meet_id, local_id)
+        channel = spk_to_channel_map[meet_id][spk_id]
+        segments[key] = (spk_id, channel, spk_segments)
+
+    # Now we go through each speaker's word-level annotations and store them
+    words = {}
+    for file in (transcripts_dir / "Words").glob("*.xml"):
+        meet_id, local_id, _ = file.stem.split(".")
+        key = (meet_id, local_id)
+        if key not in segments:
+            continue
+        else:
+            spk_id, channel, spk_segments = segments[key]
+
+        seg_words = []
+        combine_with_next = False
+        with open(file) as f:
+            tree = ET.parse(f)
+            for i, word in enumerate(tree.getroot()):
+                if (
+                    word.tag != "w"
+                    or "starttime" not in word.attrib
+                    or word.attrib["starttime"] == ""
+                    or "endtime" not in word.attrib
+                    or word.attrib["endtime"] == ""
+                ):
+                    continue
+                start_time = float(word.attrib["starttime"])
+                end_time = float(word.attrib["endtime"])
+                seg_words.append((start_time, end_time, word.text))
+        words[key] = (spk_id, channel, seg_words)
+
+    # Now we create segment-level annotations by combining the word-level
+    # annotations with the speaker segment times. We also normalize the text
+    # (if requested).
+
+    annotations = defaultdict(list)
+
+    for key, (spk_id, channel, spk_segments) in segments.items():
+        # Get the words for this speaker
+        _, _, spk_words = words[key]
+        # Now iterate over the speaker segments and create segment annotations
+        for seg_start, seg_end in spk_segments:
+            seg_words = list(
+                filter(lambda w: w[0] >= seg_start and w[1] <= seg_end, spk_words)
+            )
+            if len(seg_words) == 0:
+                continue
+            start = seg_words[0][0]
+            end = seg_words[-1][1]
+            word_alignments = [
+                AlignmentItem(
+                    start=round(w[0], ndigits=4),
+                    duration=add_durations(w[1], -w[0], sampling_rate=16000),
+                    symbol=normalize_text_ami(w[2], normalize=normalize),
+                )
+                for w in seg_words
+            ]
+            # Filter out empty words
+            word_alignments = [w for w in word_alignments if len(w.symbol) > 0]
+            text = " ".join(w.symbol for w in word_alignments)
+            annotations[key].append(
+                IcsiSegmentAnnotation(
+                    text=text,
+                    speaker=spk_id,
+                    gender=spk_id[0],
+                    start_time=start,
+                    end_time=end,
+                    words=word_alignments,
+                )
+            )
 
     return annotations, channel_to_idx_map
 
@@ -422,6 +500,7 @@ def prepare_supervision_ihm(
             speaker=seg_info.speaker,
             gender=seg_info.gender,
             text=seg_info.text,
+            alignment={"word": seg_info.words},
         )
     )
 
@@ -465,6 +544,7 @@ def prepare_supervision_other(
             speaker=seg_info.speaker,
             gender=seg_info.gender,
             text=seg_info.text,
+            alignment={"word": seg_info.words},
         )
     )
     return SupervisionSet.from_segments(segments)

From 4b39c6fe9b27188b0f97c980eef7b6e3d0dc97b0 Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Thu, 18 May 2023 09:26:05 -0400
Subject: [PATCH 10/16] remove unwanted whitespace

---
 lhotse/recipes/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lhotse/recipes/utils.py b/lhotse/recipes/utils.py
index 37fc00a99..b63ad0c66 100644
--- a/lhotse/recipes/utils.py
+++ b/lhotse/recipes/utils.py
@@ -147,7 +147,7 @@ def normalize_text_ami(text: str, normalize: str = "upper") -> str:
     text = re.sub(r"UH HUH", "UH-HUH", text)
     text = re.sub(r"(\b)O K(\b)", r"\g<1>OK\g<2>", text)
     text = re.sub(r"(\b)O_K(\b)", r"\g<1>OK\g<2>", text)
-    return text
+    return text.strip()

From 3c16b906d7d7ff7a22541e04f1f118509005d544 Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Thu, 18 May 2023 10:22:09 -0400
Subject: [PATCH 11/16] fix IHM preparation

---
 lhotse/recipes/icsi.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py
index 757461ac5..6f42896e6 100644
--- a/lhotse/recipes/icsi.py
+++ b/lhotse/recipes/icsi.py
@@ -245,7 +245,9 @@ class IcsiSegmentAnnotation(NamedTuple):
 
 def parse_icsi_annotations(
     transcripts_dir: Pathlike, normalize: str = "upper"
-) -> Tuple[Dict[str, List[SupervisionSegment]], Dict[str, Dict[str, int]]]:
+) -> Tuple[
+    Dict[Tuple[str, str, str], List[SupervisionSegment]], Dict[str, Dict[str, int]]
+]:
     annotations = defaultdict(list)
     # In Lhotse, channels are integers, so we map channel ids to integers for each session
     channel_to_idx_map = defaultdict(dict)
@@ -329,12 +331,13 @@ def parse_icsi_annotations(
 
     # Now we create segment-level annotations by combining the word-level
     # annotations with the speaker segment times. We also normalize the text
-    # (if requested).
+    # (if requested). The annotations are stored in a dict indexed by (meeting_id, spk_id, channel).
annotations = defaultdict(list) for key, (spk_id, channel, spk_segments) in segments.items(): # Get the words for this speaker _, _, spk_words = words[key] + new_key = (key[0], spk_id, channel) # Now iterate over the speaker segments and create segment annotations for seg_start, seg_end in spk_segments: seg_words = list( @@ -355,7 +358,7 @@ def parse_icsi_annotations( # Filter out empty words word_alignments = [w for w in word_alignments if len(w.symbol) > 0] text = " ".join(w.symbol for w in word_alignments) - annotations[key].append( + annotations[new_key].append( IcsiSegmentAnnotation( text=text, speaker=spk_id, From 9921575d85b6eaa5f58a2b98c69fd4bc81e7afef Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 18 May 2023 11:36:32 -0400 Subject: [PATCH 12/16] remove words with zero or negative duration --- lhotse/recipes/ami.py | 11 +++++++++-- lhotse/recipes/icsi.py | 8 ++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py index 73b47cc54..f668cb0b5 100644 --- a/lhotse/recipes/ami.py +++ b/lhotse/recipes/ami.py @@ -370,8 +370,15 @@ def parse_ami_annotations( ) for w in subseg ] - # Filter out empty words - word_alignments = [w for w in word_alignments if w.symbol] + word_alignments = [w for w in word_alignments if len(w.symbol) > 0] + if any(w.duration <= 0 for w in word_alignments): + logging.warning( + f"Segment {key[0]}.{key[1]}.{key[2]} at time {start}-{end} " + f"has a word with zero or negative duration." + ) + word_alignments = [ + w for w in word_alignments if w.duration > 0 + ] # type: ignore text = " ".join(w.symbol for w in word_alignments) annotations[key].append( AmiSegmentAnnotation( diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py index 6f42896e6..18699b0dc 100644 --- a/lhotse/recipes/icsi.py +++ b/lhotse/recipes/icsi.py @@ -357,6 +357,14 @@ def parse_icsi_annotations( ] # Filter out empty words word_alignments = [w for w in word_alignments if len(w.symbol) > 0] + if any(w.duration <= 0 for w in word_alignments): + logging.warning( + f"Segment {key[0]}.{spk_id}.{channel} at time {start}-{end} " + f"has a word with zero or negative duration." + ) + word_alignments = [ + w for w in word_alignments if w.duration > 0 + ] # type: ignore text = " ".join(w.symbol for w in word_alignments) annotations[new_key].append( IcsiSegmentAnnotation( From dba413f3bf844acb58f5bdab349334094d32de5b Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Thu, 18 May 2023 13:14:14 -0400 Subject: [PATCH 13/16] ensure word alignments respect segment boundary --- lhotse/recipes/ami.py | 32 ++++++++++++++++---------------- lhotse/recipes/icsi.py | 33 ++++++++++++++++----------------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py index f668cb0b5..c71db9e4a 100644 --- a/lhotse/recipes/ami.py +++ b/lhotse/recipes/ami.py @@ -362,23 +362,23 @@ def parse_ami_annotations( for subseg in subsegments: start = subseg[0][0] end = subseg[-1][1] - word_alignments = [ - AlignmentItem( - start=round(w[0], ndigits=4), - duration=add_durations(w[1], -w[0], sampling_rate=16000), - symbol=normalize_text_ami(w[2], normalize=normalize), - ) - for w in subseg - ] - word_alignments = [w for w in word_alignments if len(w.symbol) > 0] - if any(w.duration <= 0 for w in word_alignments): - logging.warning( - f"Segment {key[0]}.{key[1]}.{key[2]} at time {start}-{end} " - f"has a word with zero or negative duration." 
+ word_alignments = [] + for w in subseg: + w_start = max(start, round(w[0], ndigits=4)) + w_end = min(end, round(w[1], ndigits=4)) + w_dur = add_durations(w_end, -w_start, sampling_rate=16000) + w_symbol = normalize_text_ami(w[2], normalize=normalize) + if len(w_symbol) == 0: + continue + if w_dur <= 0: + logging.warning( + f"Segment {key[0]}.{key[1]}.{key[2]} at time {start}-{end} " + f"has a word with zero or negative duration. Skipping." + ) + continue + word_alignments.append( + AlignmentItem(start=w_start, duration=w_dur, symbol=w_symbol) ) - word_alignments = [ - w for w in word_alignments if w.duration > 0 - ] # type: ignore text = " ".join(w.symbol for w in word_alignments) annotations[key].append( AmiSegmentAnnotation( diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py index 18699b0dc..72b92a407 100644 --- a/lhotse/recipes/icsi.py +++ b/lhotse/recipes/icsi.py @@ -347,24 +347,23 @@ def parse_icsi_annotations( continue start = seg_words[0][0] end = seg_words[-1][1] - word_alignments = [ - AlignmentItem( - start=round(w[0], ndigits=4), - duration=add_durations(w[1], -w[0], sampling_rate=16000), - symbol=normalize_text_ami(w[2], normalize=normalize), - ) - for w in seg_words - ] - # Filter out empty words - word_alignments = [w for w in word_alignments if len(w.symbol) > 0] - if any(w.duration <= 0 for w in word_alignments): - logging.warning( - f"Segment {key[0]}.{spk_id}.{channel} at time {start}-{end} " - f"has a word with zero or negative duration." + word_alignments = [] + for w in seg_words: + w_start = max(start, round(w[0], ndigits=4)) + w_end = min(end, round(w[1], ndigits=4)) + w_dur = add_durations(w_end, -w_start, sampling_rate=16000) + w_symbol = normalize_text_ami(w[2], normalize=normalize) + if len(w_symbol) == 0: + continue + if w_dur <= 0: + logging.warning( + f"Segment {key[0]}.{spk_id}.{channel} at time {start}-{end} " + f"has a word with zero or negative duration. Skipping." 
+ ) + continue + word_alignments.append( + AlignmentItem(start=w_start, duration=w_dur, symbol=w_symbol) ) - word_alignments = [ - w for w in word_alignments if w.duration > 0 - ] # type: ignore text = " ".join(w.symbol for w in word_alignments) annotations[new_key].append( IcsiSegmentAnnotation( From 12be4242c7b28233dfcd9a569b52c3c2104015ab Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 22 May 2023 10:16:42 -0400 Subject: [PATCH 14/16] add save-to-wav option for icsi --- lhotse/bin/modes/recipes/icsi.py | 10 +++++++++- lhotse/recipes/icsi.py | 27 ++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/lhotse/bin/modes/recipes/icsi.py b/lhotse/bin/modes/recipes/icsi.py index 29c7f4edf..78a18473c 100644 --- a/lhotse/bin/modes/recipes/icsi.py +++ b/lhotse/bin/modes/recipes/icsi.py @@ -68,18 +68,26 @@ def icsi( default="kaldi", help="Type of text normalization to apply (kaldi style, by default)", ) +@click.option( + "--save-to-wav", + is_flag=True, + default=False, + help="If True and `mic` is sdm/ihm/mdm, save the recordings as WAV for faster processing.", +) def icsi( audio_dir: Pathlike, transcripts_dir: Pathlike, output_dir: Pathlike, mic: str, normalize_text: bool, + save_to_wav: bool, ): - """AMI data preparation.""" + """ICSI data preparation.""" prepare_icsi( audio_dir, transcripts_dir, output_dir=output_dir, mic=mic, normalize_text=normalize_text, + save_to_wav=save_to_wav, ) diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py index 72b92a407..124064009 100644 --- a/lhotse/recipes/icsi.py +++ b/lhotse/recipes/icsi.py @@ -100,6 +100,7 @@ from pathlib import Path from typing import Dict, List, NamedTuple, Optional, Tuple, Union +import soundfile as sf from tqdm.auto import tqdm from lhotse import validate_recordings_and_supervisions @@ -385,6 +386,8 @@ def parse_icsi_annotations( def prepare_audio_grouped( audio_paths: List[Pathlike], channel_to_idx_map: Dict[str, Dict[str, int]] = None, + save_to_wav: bool = False, + output_dir: Pathlike = None, ) -> RecordingSet: # Group together multiple channels from the same session. # We will use that to create a Recording with multiple sources (channels). @@ -404,6 +407,16 @@ def prepare_audio_grouped( } audio_sf, samplerate = read_sph(channel_paths[0]) + if save_to_wav: + session_dir = Path(output_dir) / "wavs" / session_name + session_dir.mkdir(parents=True, exist_ok=True) + for i, audio_path in enumerate(channel_paths): + audio, _ = read_sph(audio_path) + wav_path = session_dir / f"{audio_path.stem}.wav" + sf.write(wav_path, audio.T, samplerate) + # Replace the sph path with the wav path + channel_paths[i] = wav_path + recordings.append( Recording( id=session_name, @@ -436,7 +449,7 @@ def prepare_audio_single( for audio_path in tqdm(audio_paths, desc="Preparing audio"): session_name = audio_path.parts[-2] if audio_path.suffix == ".wav": - audio_sf = sf.SoundFile(str(audio_path)) + audio_sf = sf.SoundFile(audio_path) num_frames = audio_sf.frames num_channels = audio_sf.channels samplerate = audio_sf.samplerate @@ -566,6 +579,7 @@ def prepare_icsi( output_dir: Optional[Pathlike] = None, mic: Optional[str] = "ihm", normalize_text: str = "kaldi", + save_to_wav: bool = False, ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions @@ -574,6 +588,7 @@ def prepare_icsi( :param output_dir: Pathlike, the path where to write the manifests - `None` means manifests aren't stored on disk. 
:param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use. :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text + :param save_to_wav: bool, whether to save the sph audio to wav format :return: a Dict whose key is ('train', 'dev', 'test'), and the values are dicts of manifests under keys 'recordings' and 'supervisions'. """ @@ -588,6 +603,9 @@ def prepare_icsi( assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}" assert mic in MIC_TO_CHANNELS.keys(), f"Mic {mic} not supported" + if save_to_wav: + assert output_dir is not None, "output_dir must be specified when saving to wav" + if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -604,7 +622,10 @@ def prepare_icsi( if mic == "ihm" or mic == "mdm": audio_paths = audio_dir.rglob(f"chan[{channels}].sph") audio = prepare_audio_grouped( - list(audio_paths), channel_to_idx_map if mic == "ihm" else None + list(audio_paths), + channel_to_idx_map if mic == "ihm" else None, + save_to_wav, + output_dir, ) elif mic == "sdm" or mic == "ihm-mix": audio_paths = ( @@ -612,7 +633,7 @@ def prepare_icsi( if len(channels) else audio_dir.rglob("*.wav") ) - audio = prepare_audio_single(list(audio_paths)) + audio = prepare_audio_single(list(audio_paths), save_to_wav, output_dir) # Supervisions logging.info("Preparing supervision manifests") From c4b957df6387d7c13ba75a73998b0d54709f49a9 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 22 May 2023 10:37:07 -0400 Subject: [PATCH 15/16] add test for mixing cut with recording --- lhotse/cut/set.py | 2 +- test/cut/test_cut_mixing.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lhotse/cut/set.py b/lhotse/cut/set.py index 81851794c..ad5b3e1a2 100644 --- a/lhotse/cut/set.py +++ b/lhotse/cut/set.py @@ -2610,7 +2610,7 @@ def mix( ) snr = None - if reference_cut.num_features is not None: + if reference_cut.num_features is not None and mixed_in_cut.num_features is not None: assert ( reference_cut.num_features == mixed_in_cut.num_features ), "Cannot mix cuts with different feature dimensions." diff --git a/test/cut/test_cut_mixing.py b/test/cut/test_cut_mixing.py index 8cac69d48..2f0b06ab1 100644 --- a/test/cut/test_cut_mixing.py +++ b/test/cut/test_cut_mixing.py @@ -324,6 +324,12 @@ def test_mix_cut_snr(libri_cut): assert E(feats) > E(feats_snr) +def test_mix_cut_with_other_raises_error(libri_cut): + libri_cut = libri_cut.drop_features() + with pytest.raises(ValueError): + _ = libri_cut.mix(libri_cut.recording) + + def test_mix_cut_snr_truncate_snr_reference(libri_cut): mixed = libri_cut.pad(duration=20).mix(libri_cut, offset_other_by=10) mixed_snr = libri_cut.pad(duration=20).mix(libri_cut, offset_other_by=10, snr=10) From a253cf419a538939658a1747c612902eb054ad27 Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 22 May 2023 14:15:41 -0400 Subject: [PATCH 16/16] add read_sph_torchaudio --- lhotse/audio.py | 39 ++++++++++++++++++++++++++++--- test/cut/test_cut_augmentation.py | 2 +- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/lhotse/audio.py b/lhotse/audio.py index f6267850c..46045ca6a 100644 --- a/lhotse/audio.py +++ b/lhotse/audio.py @@ -1767,7 +1767,6 @@ def info( force_opus_sampling_rate: Optional[int] = None, force_read_audio: bool = False, ) -> LibsndfileCompatibleAudioInfo: - if force_read_audio: # This is a reliable fallback for situations when the user knows that audio files do not # have duration metadata in their headers. 
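The version check fixed in the next hunk trips over PEP 440 local version labels: CUDA builds of torchaudio report versions like "2.0.0+cu117", and the "+cu117" local segment makes a plain equality comparison fail, so the dispatcher check never matched on such builds. A minimal sketch of the pitfall and the `base_version` workaround, assuming the `packaging` library is installed (it is already imported by this code):

    from packaging import version

    v = version.parse("2.0.0+cu117")
    # Local version labels participate in equality comparisons per PEP 440:
    print(v == version.parse("2.0.0"))  # False
    # base_version drops the local label, restoring the intended comparison:
    print(version.parse(v.base_version) == version.parse("2.0.0"))  # True
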
@@ -1824,7 +1823,8 @@ def torchaudio_2_0_ffmpeg_enabled() -> bool: import torchaudio from packaging import version - ver = version.parse(torchaudio.__version__) + # Handle cases like '2.0.0+cu117' + ver = version.parse(version.parse(torchaudio.__version__).base_version) if ver == version.parse("2.0.0"): return os.environ.get("TORCHAUDIO_USE_BACKEND_DISPATCHER", "0") == "1" if ver >= version.parse("2.1.0"): @@ -2374,6 +2374,40 @@ def sph_info(path: Pathlike) -> LibsndfileCompatibleAudioInfo: def read_sph( sph_path: Pathlike, offset: Seconds = 0.0, duration: Optional[Seconds] = None +) -> Tuple[np.ndarray, int]: + """ + Reads SPH files either using torchaudio or using sph2pipe in a shell subprocess. + + :return: a tuple of audio samples and the sampling rate. + """ + try: + return read_sph_torchaudio(sph_path=sph_path, offset=offset, duration=duration) + except: + return read_sph_sph2pipe(sph_path=sph_path, offset=offset, duration=duration) + + +def read_sph_torchaudio( + sph_path: Pathlike, offset: Seconds = 0.0, duration: Optional[Seconds] = None +) -> Tuple[np.ndarray, int]: + """ + Reads SPH files using torchaudio. + + :return: a tuple of audio samples and the sampling rate. + """ + # Actual audio reading. + sph_path = str(sph_path) + try: + samples, sampling_rate = torchaudio_2_ffmpeg_load(sph_path, offset, duration) + except RuntimeError as e: + raise AudioLoadingError( + f"{e}\nThe torchaudio command for which the program failed is: " + f"torchaudio.load({sph_path}, frame_offset={int(offset * 100)}, num_frames={int(duration * 100)})" + ) + return samples, sampling_rate + + +def read_sph_sph2pipe( + sph_path: Pathlike, offset: Seconds = 0.0, duration: Optional[Seconds] = None ) -> Tuple[np.ndarray, int]: """ Reads SPH files using sph2pipe in a shell subprocess. @@ -2381,7 +2415,6 @@ def read_sph( :return: a tuple of audio samples and the sampling rate. """ - sph_path = Path(sph_path) # Construct the sph2pipe command depending on the arguments passed. diff --git a/test/cut/test_cut_augmentation.py b/test/cut/test_cut_augmentation.py index 7e9d04a7b..968c6e12d 100644 --- a/test/cut/test_cut_augmentation.py +++ b/test/cut/test_cut_augmentation.py @@ -420,7 +420,7 @@ def test_mixed_cut_start01_reverb_rir_with_fast_random( ): mixed_rvb = cut_with_supervision_start01.append( cut_with_supervision_start01 - ).reverb_rir() + ).reverb_rir(mix_first=False) assert mixed_rvb.start == 0 # MixedCut always starts at 0 assert mixed_rvb.duration == cut_with_supervision_start01.duration * 2 assert mixed_rvb.end == cut_with_supervision_start01.duration * 2
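
Taken together, the series makes ``MixedCut`` carry a list of lazy transforms that are applied after mixing inside ``load_audio``. A usage sketch of the new options (the manifest path and choice of cut are illustrative; on-the-fly RIR generation and loudness normalization assume the optional fast RIR generator and pyloudnorm dependencies are available):

    from lhotse import CutSet

    cuts = CutSet.from_file("cuts.jsonl.gz")  # hypothetical manifest path
    cut = next(iter(cuts))  # any cut with a recording

    # mix_first=True (the default) stores a single RIR transform on the MixedCut,
    # i.e. the same room response is applied to the mixed waveform:
    mixed = cut.append(cut).reverb_rir(mix_first=True)

    # mix_first=False reverberates each track separately instead, which is the
    # right choice when simulating distinct speakers in one room:
    mixed_multi = cut.append(cut).reverb_rir(mix_first=False)

    # Post-mix loudness normalization is stored as another lazy transform:
    mixed = mixed.normalize_loudness(target=-23.0, mix_first=True)

    audio = mixed.load_audio()  # transforms run here, after the tracks are mixed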