[Bugfix] Fix Whisper tokenization (#34011)

Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
Nicolò Lucchesi
2026-02-07 03:42:52 +01:00
committed by GitHub
parent 906077181b
commit 55aeec04f5

View File

@@ -727,6 +727,14 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo
**mm_kwargs,
sampling_rate=feature_extractor.sampling_rate,
)
# The HF WhisperProcessor forwards the same **kwargs to both the
# tokenizer and the feature extractor. Text-tokenizer kwargs such as
# `truncation` and `max_length` must therefore be removed when audio
# data is present; otherwise the feature extractor interprets
# `max_length` as a length in raw audio samples and truncates the audio.
tok_kwargs = {
k: v for k, v in tok_kwargs.items() if k not in ("truncation", "max_length")
}
processed_outputs = super()._call_hf_processor(
prompt=prompt,
mm_data=mm_data,