[Bugfix] Fix Whisper tokenization (#34011)

Signed-off-by: NickLucche <nlucches@redhat.com>
2026-02-07 03:42:52 +01:00
parent 906077181b
commit 55aeec04f5
1 changed files with 8 additions and 0 deletions
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -727,6 +727,14 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo
                **mm_kwargs,
                sampling_rate=feature_extractor.sampling_rate,
            )
+        # The HF WhisperProcessor passes **kwargs to both the tokenizer
+        # and the feature extractor. Text-tokenizer kwargs such as
+        # `truncation` and `max_length` must be removed when audio data
+        # is present; otherwise the feature extractor interprets
+        # `max_length` as a count of raw audio samples and truncates the audio.
+        tok_kwargs = {
+            k: v for k, v in tok_kwargs.items() if k not in ("truncation", "max_length")
+        }
        processed_outputs = super()._call_hf_processor(
            prompt=prompt,
            mm_data=mm_data,