From 55aeec04f52a9d347c3299c4f4d0df2683ffac00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Sat, 7 Feb 2026 03:42:52 +0100
Subject: [PATCH] [Bugfix] Fix Whisper tokenization (#34011)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 vllm/model_executor/models/whisper.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index f62bffada..0c777e4a5 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -727,6 +727,14 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo
                 **mm_kwargs,
                 sampling_rate=feature_extractor.sampling_rate,
             )
+        # The HF WhisperProcessor passes **kwargs to both the tokenizer
+        # and the feature extractor. Text-tokenizer kwargs like
+        # `truncation` and `max_length` must be removed when audio data
+        # is present, otherwise the feature extractor interprets
+        # `max_length` as raw audio samples and truncates the audio.
+        tok_kwargs = {
+            k: v for k, v in tok_kwargs.items() if k not in ("truncation", "max_length")
+        }
         processed_outputs = super()._call_hf_processor(
             prompt=prompt,
             mm_data=mm_data,