Skip to content

Commit

Permalink
Fix frame size for pre-emphasis. (apple#617)
Browse files: browse the repository at this point in the history.
  • Loading branch information
markblee authored Aug 2, 2024
1 parent 62d0d26 commit 339591e
Showing 3 changed files with 20 additions and 10 deletions.
10 changes: 5 additions & 5 deletions axlearn/audio/frontend.py
Original file line number | Diff line number | Diff line change
@@ -170,11 +170,6 @@ def __init__(self, cfg: Config, *, parent: Optional[Module]):
if cfg.output_transformation is not None:
self._output_transformation = maybe_instantiate(cfg.output_transformation)

- self._pre_emphasis = None
- if cfg.pre_emphasis is not None:
- self._frame_size += 1
- self._pre_emphasis = cfg.pre_emphasis.instantiate()
-
fft_size = cfg.fft_size(self._frame_size)
if cfg.fft is not None:
self._fft = cfg.fft.set(n=fft_size).instantiate()
@@ -190,6 +185,11 @@ def __init__(self, cfg: Config, *, parent: Optional[Module]):
)
self._spectrogram = spectrogram.instantiate()

+ self._pre_emphasis = None
+ if cfg.pre_emphasis is not None:
+ self._frame_size += 1
+ self._pre_emphasis = cfg.pre_emphasis.instantiate()
+
def forward(self, inputs: Tensor, *, paddings: Tensor) -> Dict[str, Tensor]:
"""Computes log-mel spectrogram features.
18 changes: 13 additions & 5 deletions axlearn/audio/frontend_test.py
Original file line number | Diff line number | Diff line change
@@ -74,9 +74,10 @@ def jit_forward(inputs, paddings):
dict(frame_size_ms=32, hop_size_ms=10),
dict(frame_size_ms=31.9375, hop_size_ms=10),
],
+ pre_emphasis=[True, False],
)
@pytest.mark.fp64
- def test_against_ref(self, frame_size_ms, hop_size_ms):
+ def test_against_ref(self, frame_size_ms, hop_size_ms, pre_emphasis):
sample_rate, batch_size, max_seconds = 16_000, 4, 13
num_filters = 80

@@ -100,6 +101,7 @@ def test_against_ref(self, frame_size_ms, hop_size_ms):
lower_edge_hertz=125.0,
upper_edge_hertz=7600.0,
mel_floor=1.0,
+ pre_emphasis=pre_emphasis,
)

# Compute test outputs.
@@ -110,6 +112,8 @@ def test_against_ref(self, frame_size_ms, hop_size_ms):
hop_size_ms=hop_size_ms,
mel_floor=1.0,
)
+ if not pre_emphasis:
+ cfg.pre_emphasis = None
layer: LogMelFrontend = cfg.set(name="test").instantiate(parent=None)
test_outputs = self._jit_forward(layer, inputs, paddings)

@@ -340,6 +344,7 @@ def _ref_frontend(
lower_edge_hertz: float,
upper_edge_hertz: float,
mel_floor: float,
+ pre_emphasis: bool,
):
"""Lingvo ASR frontend.
@@ -348,13 +353,16 @@ def _ref_frontend(
"""
frame_size = int(round(_ms_to_samples(frame_size_ms, sample_rate=sample_rate)))
frame_step = int(round(_ms_to_samples(hop_size_ms, sample_rate=sample_rate)))
-
+ fft_size = int(max(512.0, math.pow(2, math.ceil(math.log(frame_size, 2)))))
inputs, output_paddings = _ref_framer(
- inputs=inputs, paddings=paddings, frame_size=frame_size + 1, frame_step=frame_step
+ inputs=inputs,
+ paddings=paddings,
+ frame_size=frame_size + int(pre_emphasis),
+ frame_step=frame_step,
)
- inputs = _ref_pre_emphasis(inputs=inputs, coeff=coeff)
+ if pre_emphasis:
+ inputs = _ref_pre_emphasis(inputs=inputs, coeff=coeff)
inputs = tf.signal.hann_window(frame_size, dtype=inputs.dtype) * inputs
- fft_size = int(max(512.0, math.pow(2, math.ceil(math.log(frame_size + 1, 2)))))
outputs = _ref_log_mel_spectrogram(
inputs=inputs,
fft_size=fft_size,
2 changes: 2 additions & 0 deletions axlearn/open_api/metrics/tool_use_execution_test.py
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
import json
from unittest.mock import MagicMock, Mock, patch

+ import pytest
from absl.testing import parameterized

from axlearn.open_api.mock_utils import mock_openai_package
@@ -137,6 +138,7 @@ def test_responses_without_tool_calls(self):
pred_arguments='{"location": "cupertino"',
),
)
+ @pytest.mark.skip(reason="Flaky in CI. TODO(gyin94): Fix and re-enable.")
def test_match_rules(
self, target_arguments, accuracy, target_message_match_rules=None, pred_arguments=None
):

0 comments on commit 339591e

Please sign in to comment.