[Bugfix] Fix mm_hashes not being passed (#15668)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-03-28 13:51:05 +08:00
committed by GitHub
parent cec8c7d7f8
commit 8693e47e6a
6 changed files with 15 additions and 10 deletions

View File

@@ -28,7 +28,7 @@ from vllm.model_executor.models.llama import LlamaModel
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalInputs, NestedTensors
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
@@ -1319,9 +1319,9 @@ def dummy_data_for_phi4mm(ctx: InputContext, seq_len: int,
def input_mapper_for_phi4mm_audio(ctx: InputContext,
data: object) -> MultiModalInputs:
data: object) -> MultiModalKwargs:
"""
This function is used to create the MultiModalInputs for the Phi4MM
This function is used to create the MultiModalKwargs for the Phi4MM
(audio) model.
Specifically, for audio, we extract the audio features from the sound
file and create pairs of audio features and audio embed lengths (the
@@ -1338,13 +1338,13 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext,
data (object): Audio data.
Returns:
MultiModalInputs: Multi-modal inputs.
MultiModalKwargs: Multi-modal inputs.
"""
if not isinstance(data, list):
data = [data]
if len(data) == 0:
return MultiModalInputs()
return MultiModalKwargs()
audio_features = []
for audio_input in data:
@@ -1365,7 +1365,7 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext,
[single_audio_embed_size],
)
audio_features.append(single_audio_feature_audio_len_pair)
return MultiModalInputs({"audio_features": audio_features})
return MultiModalKwargs({"audio_features": audio_features})
def input_mapper_for_phi4mm_image(ctx: InputContext, data: object):
@@ -1373,7 +1373,7 @@ def input_mapper_for_phi4mm_image(ctx: InputContext, data: object):
data = [data]
# data: list of PIL images
if len(data) == 0:
return MultiModalInputs()
return MultiModalKwargs()
hf_config = ctx.get_hf_config()
vision_encoder_name = hf_config.img_processor
if vision_encoder_name is None:
@@ -1385,7 +1385,7 @@ def input_mapper_for_phi4mm_image(ctx: InputContext, data: object):
image_input_dict = preprocess(data, dynamic_hd_size, vit_image_size,
vit_patch_size)
return MultiModalInputs({
return MultiModalKwargs({
"pixel_values":
image_input_dict["pixel_values"],
"image_sizes":