From 55aeec04f52a9d347c3299c4f4d0df2683ffac00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 7 Feb 2026 03:42:52 +0100 Subject: [PATCH] [Bugfix] Fix Whisper tokenization (#34011) Signed-off-by: NickLucche --- vllm/model_executor/models/whisper.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index f62bffada..0c777e4a5 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -727,6 +727,14 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, ) + # The HF WhisperProcessor passes **kwargs to both the tokenizer + # and the feature extractor. Text-tokenizer kwargs like + # `truncation` and `max_length` must be removed when audio data + # is present, otherwise the feature extractor interprets + # `max_length` as raw audio samples and truncates the audio. + tok_kwargs = { + k: v for k, v in tok_kwargs.items() if k not in ("truncation", "max_length") + } processed_outputs = super()._call_hf_processor( prompt=prompt, mm_data=mm_data,