diff --git a/axlearn/audio/frontend.py b/axlearn/audio/frontend.py
index 0051bd902..e6b507614 100644
--- a/axlearn/audio/frontend.py
+++ b/axlearn/audio/frontend.py
@@ -170,11 +170,6 @@ def __init__(self, cfg: Config, *, parent: Optional[Module]):
         if cfg.output_transformation is not None:
             self._output_transformation = maybe_instantiate(cfg.output_transformation)
 
-        self._pre_emphasis = None
-        if cfg.pre_emphasis is not None:
-            self._frame_size += 1
-            self._pre_emphasis = cfg.pre_emphasis.instantiate()
-
         fft_size = cfg.fft_size(self._frame_size)
         if cfg.fft is not None:
             self._fft = cfg.fft.set(n=fft_size).instantiate()
@@ -190,6 +185,11 @@ def __init__(self, cfg: Config, *, parent: Optional[Module]):
         )
         self._spectrogram = spectrogram.instantiate()
 
+        self._pre_emphasis = None
+        if cfg.pre_emphasis is not None:
+            self._frame_size += 1
+            self._pre_emphasis = cfg.pre_emphasis.instantiate()
+
     def forward(self, inputs: Tensor, *, paddings: Tensor) -> Dict[str, Tensor]:
         """Computes log-mel spectrogram features.
 
diff --git a/axlearn/audio/frontend_test.py b/axlearn/audio/frontend_test.py
index 85df807d2..c7cf11b6b 100644
--- a/axlearn/audio/frontend_test.py
+++ b/axlearn/audio/frontend_test.py
@@ -74,9 +74,10 @@ def jit_forward(inputs, paddings):
             dict(frame_size_ms=32, hop_size_ms=10),
             dict(frame_size_ms=31.9375, hop_size_ms=10),
         ],
+        pre_emphasis=[True, False],
     )
     @pytest.mark.fp64
-    def test_against_ref(self, frame_size_ms, hop_size_ms):
+    def test_against_ref(self, frame_size_ms, hop_size_ms, pre_emphasis):
         sample_rate, batch_size, max_seconds = 16_000, 4, 13
         num_filters = 80
 
@@ -100,6 +101,7 @@ def test_against_ref(self, frame_size_ms, hop_size_ms):
             lower_edge_hertz=125.0,
             upper_edge_hertz=7600.0,
             mel_floor=1.0,
+            pre_emphasis=pre_emphasis,
         )
 
         # Compute test outputs.
@@ -110,6 +112,8 @@ def test_against_ref(self, frame_size_ms, hop_size_ms):
             hop_size_ms=hop_size_ms,
             mel_floor=1.0,
         )
+        if not pre_emphasis:
+            cfg.pre_emphasis = None
         layer: LogMelFrontend = cfg.set(name="test").instantiate(parent=None)
         test_outputs = self._jit_forward(layer, inputs, paddings)
 
@@ -340,6 +344,7 @@ def _ref_frontend(
     lower_edge_hertz: float,
     upper_edge_hertz: float,
     mel_floor: float,
+    pre_emphasis: bool,
 ):
     """Lingvo ASR frontend.
 
@@ -348,13 +353,16 @@ def _ref_frontend(
     """
     frame_size = int(round(_ms_to_samples(frame_size_ms, sample_rate=sample_rate)))
     frame_step = int(round(_ms_to_samples(hop_size_ms, sample_rate=sample_rate)))
-
+    fft_size = int(max(512.0, math.pow(2, math.ceil(math.log(frame_size, 2)))))
     inputs, output_paddings = _ref_framer(
-        inputs=inputs, paddings=paddings, frame_size=frame_size + 1, frame_step=frame_step
+        inputs=inputs,
+        paddings=paddings,
+        frame_size=frame_size + int(pre_emphasis),
+        frame_step=frame_step,
     )
-    inputs = _ref_pre_emphasis(inputs=inputs, coeff=coeff)
+    if pre_emphasis:
+        inputs = _ref_pre_emphasis(inputs=inputs, coeff=coeff)
     inputs = tf.signal.hann_window(frame_size, dtype=inputs.dtype) * inputs
-    fft_size = int(max(512.0, math.pow(2, math.ceil(math.log(frame_size + 1, 2)))))
     outputs = _ref_log_mel_spectrogram(
         inputs=inputs,
         fft_size=fft_size,
diff --git a/axlearn/open_api/metrics/tool_use_execution_test.py b/axlearn/open_api/metrics/tool_use_execution_test.py
index 0c0ac755e..214df0488 100644
--- a/axlearn/open_api/metrics/tool_use_execution_test.py
+++ b/axlearn/open_api/metrics/tool_use_execution_test.py
@@ -5,6 +5,7 @@
 import json
 from unittest.mock import MagicMock, Mock, patch
 
+import pytest
 from absl.testing import parameterized
 
 from axlearn.open_api.mock_utils import mock_openai_package
@@ -137,6 +138,7 @@ def test_responses_without_tool_calls(self):
             pred_arguments='{"location": "cupertino"',
         ),
     )
+    @pytest.mark.skip(reason="Flaky in CI. TODO(gyin94): Fix and re-enable.")
     def test_match_rules(
         self, target_arguments, accuracy, target_message_match_rules=None, pred_arguments=None
     ):