[VLM][Bugfix] Pass processor kwargs properly on init (#13516)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-02-19 21:13:50 +08:00
committed by GitHub
parent 52ce14d31f
commit 377d10bd14
44 changed files with 677 additions and 455 deletions

View File

@@ -29,7 +29,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.audio import resample_audio
from vllm.sequence import SequenceData
-from vllm.transformers_utils.processor import cached_get_processor
+from vllm.transformers_utils.processor import cached_processor_from_config
from .interfaces import SupportsMultiModal, SupportsTranscription
from .utils import AutoWeightsLoader, WeightsMapper, make_layers
@@ -579,7 +579,7 @@ def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
assert mm_counts["audio"] == 1
num_tokens = get_max_whisper_audio_tokens(ctx)
-processor = cached_get_processor(ctx.model_config.model)
+processor = cached_processor_from_config(ctx.model_config)
chunk_length = processor.feature_extractor.chunk_length
sampling_rate = processor.feature_extractor.sampling_rate
num_samples = chunk_length * sampling_rate
@@ -596,7 +596,7 @@ def input_processor_for_whisper(ctx: InputContext, inputs):
multi_modal_data["audio"] = multi_modal_data["audio"][0]
# Resample and process audio
audio, orig_sr = multi_modal_data["audio"]
-processor = cached_get_processor(ctx.model_config.model)
+processor = cached_processor_from_config(ctx.model_config)
target_sr = processor.feature_extractor.sampling_rate
audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr)
multi_modal_data["audio"] = (audio, target_sr)
@@ -618,7 +618,7 @@ def input_mapper_for_whisper(
if len(multi_modal_data) == 0:
return MultiModalKwargs()
-processor = cached_get_processor(ctx.model_config.model)
+processor = cached_processor_from_config(ctx.model_config)
sampling_rate = processor.feature_extractor.sampling_rate
audios = [audio for audio, _ in multi_modal_data]