diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index d93796305..975fb730a 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -460,8 +460,6 @@ def test_processing_correctness(
     num_batches: int,
     simplify_rate: float,
 ):
-    if model_id == "allendou/Fun-ASR-Nano-2512-vllm":
-        pytest.skip("Cached audio `input_features` not matched. Fix later.")
     if model_id == "google/gemma-3n-E2B-it":
         pytest.skip("Fix later")
     if model_id == "OpenGVLab/InternVL2-2B":
diff --git a/vllm/transformers_utils/processors/funasr_processor.py b/vllm/transformers_utils/processors/funasr_processor.py
index 4807c87d3..c4cb2a2c4 100644
--- a/vllm/transformers_utils/processors/funasr_processor.py
+++ b/vllm/transformers_utils/processors/funasr_processor.py
@@ -361,11 +361,11 @@ class FunASRFeatureExtractor(SequenceFeatureExtractor):
 
         input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
 
-        self.frontend = WavFrontend(**self.frontend_conf)
+        frontend = WavFrontend(**self.frontend_conf, dither=self.dither)
         input_features, speech_lengths = self.extract_fbank(
             input_features[0],
             data_type=kwargs.get("data_type", "sound"),
-            frontend=self.frontend,
+            frontend=frontend,
             is_final=True,
         )
         olens = 1 + (speech_lengths - 3 + 2 * 1) // 2