diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index f62bffada..0c777e4a5 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -727,6 +727,14 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, ) + # The HF WhisperProcessor passes **kwargs to both the tokenizer + # and the feature extractor. Text-tokenizer kwargs like + # `truncation` and `max_length` must be removed when audio data + # is present, otherwise the feature extractor interprets + # `max_length` as raw audio samples and truncates the audio. + tok_kwargs = { + k: v for k, v in tok_kwargs.items() if k not in ("truncation", "max_length") + } processed_outputs = super()._call_hf_processor( prompt=prompt, mm_data=mm_data,