[Bugfix] Fix Whisper tokenization (#34011)
Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
@@ -727,6 +727,14 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo
**mm_kwargs,
sampling_rate=feature_extractor.sampling_rate,
)
# The HF WhisperProcessor passes **kwargs to both the tokenizer
# and the feature extractor. Text-tokenizer kwargs like
# `truncation` and `max_length` must be removed when audio data
# is present, otherwise the feature extractor interprets
# `max_length` as raw audio samples and truncates the audio.
tok_kwargs = {
k: v for k, v in tok_kwargs.items() if k not in ("truncation", "max_length")
}
processed_outputs = super()._call_hf_processor(
prompt=prompt,
mm_data=mm_data,
Reference in New Issue
Block a user