diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index d93796305..975fb730a 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -460,8 +460,6 @@ def test_processing_correctness( num_batches: int, simplify_rate: float, ): - if model_id == "allendou/Fun-ASR-Nano-2512-vllm": - pytest.skip("Cached audio `input_features` not matched. Fix later.") if model_id == "google/gemma-3n-E2B-it": pytest.skip("Fix later") if model_id == "OpenGVLab/InternVL2-2B": diff --git a/vllm/transformers_utils/processors/funasr_processor.py b/vllm/transformers_utils/processors/funasr_processor.py index 4807c87d3..c4cb2a2c4 100644 --- a/vllm/transformers_utils/processors/funasr_processor.py +++ b/vllm/transformers_utils/processors/funasr_processor.py @@ -361,11 +361,11 @@ class FunASRFeatureExtractor(SequenceFeatureExtractor): input_features = padded_inputs.get("input_features").transpose(2, 0, 1) - self.frontend = WavFrontend(**self.frontend_conf) + frontend = WavFrontend(**self.frontend_conf, dither=self.dither) input_features, speech_lengths = self.extract_fbank( input_features[0], data_type=kwargs.get("data_type", "sound"), - frontend=self.frontend, + frontend=frontend, is_final=True, ) olens = 1 + (speech_lengths - 3 + 2 * 1) // 2