diff --git a/axlearn/audio/frontend.py b/axlearn/audio/frontend.py index 0051bd902..e6b507614 100644 --- a/axlearn/audio/frontend.py +++ b/axlearn/audio/frontend.py @@ -170,11 +170,6 @@ def __init__(self, cfg: Config, *, parent: Optional[Module]): if cfg.output_transformation is not None: self._output_transformation = maybe_instantiate(cfg.output_transformation) - self._pre_emphasis = None - if cfg.pre_emphasis is not None: - self._frame_size += 1 - self._pre_emphasis = cfg.pre_emphasis.instantiate() - fft_size = cfg.fft_size(self._frame_size) if cfg.fft is not None: self._fft = cfg.fft.set(n=fft_size).instantiate() @@ -190,6 +185,11 @@ def __init__(self, cfg: Config, *, parent: Optional[Module]): ) self._spectrogram = spectrogram.instantiate() + self._pre_emphasis = None + if cfg.pre_emphasis is not None: + self._frame_size += 1 + self._pre_emphasis = cfg.pre_emphasis.instantiate() + def forward(self, inputs: Tensor, *, paddings: Tensor) -> Dict[str, Tensor]: """Computes log-mel spectrogram features. diff --git a/axlearn/audio/frontend_test.py b/axlearn/audio/frontend_test.py index 85df807d2..c7cf11b6b 100644 --- a/axlearn/audio/frontend_test.py +++ b/axlearn/audio/frontend_test.py @@ -74,9 +74,10 @@ def jit_forward(inputs, paddings): dict(frame_size_ms=32, hop_size_ms=10), dict(frame_size_ms=31.9375, hop_size_ms=10), ], + pre_emphasis=[True, False], ) @pytest.mark.fp64 - def test_against_ref(self, frame_size_ms, hop_size_ms): + def test_against_ref(self, frame_size_ms, hop_size_ms, pre_emphasis): sample_rate, batch_size, max_seconds = 16_000, 4, 13 num_filters = 80 @@ -100,6 +101,7 @@ def test_against_ref(self, frame_size_ms, hop_size_ms): lower_edge_hertz=125.0, upper_edge_hertz=7600.0, mel_floor=1.0, + pre_emphasis=pre_emphasis, ) # Compute test outputs. @@ -110,6 +112,8 @@ def test_against_ref(self, frame_size_ms, hop_size_ms): hop_size_ms=hop_size_ms, mel_floor=1.0, ) + if not pre_emphasis: + cfg.pre_emphasis = None layer: LogMelFrontend = cfg.set(name="test").instantiate(parent=None) test_outputs = self._jit_forward(layer, inputs, paddings) @@ -340,6 +344,7 @@ def _ref_frontend( lower_edge_hertz: float, upper_edge_hertz: float, mel_floor: float, + pre_emphasis: bool, ): """Lingvo ASR frontend. @@ -348,13 +353,16 @@ def _ref_frontend( """ frame_size = int(round(_ms_to_samples(frame_size_ms, sample_rate=sample_rate))) frame_step = int(round(_ms_to_samples(hop_size_ms, sample_rate=sample_rate))) - + fft_size = int(max(512.0, math.pow(2, math.ceil(math.log(frame_size, 2))))) inputs, output_paddings = _ref_framer( - inputs=inputs, paddings=paddings, frame_size=frame_size + 1, frame_step=frame_step + inputs=inputs, + paddings=paddings, + frame_size=frame_size + int(pre_emphasis), + frame_step=frame_step, ) - inputs = _ref_pre_emphasis(inputs=inputs, coeff=coeff) + if pre_emphasis: + inputs = _ref_pre_emphasis(inputs=inputs, coeff=coeff) inputs = tf.signal.hann_window(frame_size, dtype=inputs.dtype) * inputs - fft_size = int(max(512.0, math.pow(2, math.ceil(math.log(frame_size + 1, 2))))) outputs = _ref_log_mel_spectrogram( inputs=inputs, fft_size=fft_size, diff --git a/axlearn/open_api/metrics/tool_use_execution_test.py b/axlearn/open_api/metrics/tool_use_execution_test.py index 0c0ac755e..214df0488 100644 --- a/axlearn/open_api/metrics/tool_use_execution_test.py +++ b/axlearn/open_api/metrics/tool_use_execution_test.py @@ -5,6 +5,7 @@ import json from unittest.mock import MagicMock, Mock, patch +import pytest from absl.testing import parameterized from axlearn.open_api.mock_utils import mock_openai_package @@ -137,6 +138,7 @@ def test_responses_without_tool_calls(self): pred_arguments='{"location": "cupertino"', ), ) + @pytest.mark.skip(reason="Flaky in CI. TODO(gyin94): Fix and re-enable.") def test_match_rules( self, target_arguments, accuracy, target_message_match_rules=None, pred_arguments=None ):