[Bugfix] Always apply MM processor even when no MM items are passed (#26240)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-10-05 18:10:20 +08:00
Committed by: GitHub
Parent: 432e1cbc23
Commit: b7e8e4e6be
6 changed files with 102 additions and 30 deletions
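
The gist of the fix: a multimodal model's HF processor may rewrite even a
pure-text prompt (for example by inserting special tokens), so skipping the
multimodal processor whenever no images, videos, or audios were attached
produced wrong token IDs. A minimal sketch of the behavior this commit
guarantees, built from the same helpers the new tests below use:

from vllm.config import ModelConfig
from vllm.inputs.preprocess import InputPreprocessor
from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs

# facebook/chameleon-7b is multimodal; its HF processor inserts a sep
# token even for pure-text prompts.
model_config = ModelConfig(model="facebook/chameleon-7b")
tokenizer = init_tokenizer_from_configs(model_config)
preprocessor = InputPreprocessor(model_config, tokenizer)

# No multi_modal_data is passed at all. Before this fix, such a request
# bypassed the MM processor and the sep token was missing.
processed = preprocessor.preprocess("")
assert tokenizer.vocab[tokenizer.sep_token] in processed["prompt_token_ids"]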

View File

@@ -46,7 +46,6 @@ from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                               init_distributed_environment,
                               initialize_model_parallel)
-from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.multimodal.utils import fetch_image
@@ -760,17 +759,24 @@ class VllmRunner:
         images: Optional[PromptImageInput] = None,
         videos: Optional[PromptVideoInput] = None,
         audios: Optional[PromptAudioInput] = None,
-    ) -> list[TextPrompt]:
+    ) -> list[dict[str, Any]]:
         if any(x is not None and len(x) != len(prompts)
                for x in [images, videos, audios]):
             raise ValueError(
                 "All non-None multimodal inputs must have the same length as "
                 "prompts")
 
-        inputs = []
+        inputs = list[dict[str, Any]]()
         for i, prompt in enumerate(prompts):
-            multi_modal_data = {}
+            prompt_dict = dict[str, Any]()
+            if isinstance(prompt, str):
+                prompt_dict["prompt"] = prompt
+            elif isinstance(prompt, list):
+                prompt_dict["prompt_token_ids"] = prompt
+            else:
+                prompt_dict["prompt_embeds"] = prompt
+
+            multi_modal_data = dict[str, Any]()
             if images is not None and (image := images[i]) is not None:
                 multi_modal_data["image"] = image
             if videos is not None and (video := videos[i]) is not None:
@@ -778,17 +784,10 @@ class VllmRunner:
             if audios is not None and (audio := audios[i]) is not None:
                 multi_modal_data["audio"] = audio
 
-            text_prompt_kwargs: dict[str, Any] = {
-                "multi_modal_data": multi_modal_data or None
-            }
-            if isinstance(prompt, str):
-                text_prompt_kwargs["prompt"] = prompt
-            elif isinstance(prompt, list):
-                text_prompt_kwargs["prompt_token_ids"] = prompt
-            else:
-                text_prompt_kwargs["prompt_embeds"] = prompt
+            if multi_modal_data:
+                prompt_dict["multi_modal_data"] = multi_modal_data
 
-            inputs.append(TextPrompt(**text_prompt_kwargs))
+            inputs.append(prompt_dict)
 
         return inputs
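
With this change, get_inputs returns plain prompt dicts instead of TextPrompt
objects, and multi_modal_data is omitted entirely (rather than being set to
None) when no items are attached, so preprocessing sees the same shapes an
end user would pass. An illustrative sketch of the dicts it now produces
(the values here are made up):

from PIL import Image

# Hypothetical outputs of the rewritten get_inputs, one per prompt kind.
text_only = {"prompt": "Hello, world"}           # str prompt, no MM key at all
pre_tokenized = {"prompt_token_ids": [1, 2, 3]}  # list prompt
with_image = {
    "prompt": "Describe the image",
    "multi_modal_data": {"image": Image.new("RGB", (8, 8))},  # only when non-empty
}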

View File

@@ -3,8 +3,11 @@
 import pytest
 
+from vllm.config import ModelConfig
 from vllm.inputs import zip_enc_dec_prompts
 from vllm.inputs.parse import parse_raw_prompts
+from vllm.inputs.preprocess import InputPreprocessor
+from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs
 
 pytestmark = pytest.mark.cpu_test
@@ -80,3 +83,50 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
         assert zipped['encoder_prompt'] == enc
         assert zipped['decoder_prompt'] == dec
         assert zipped['mm_processor_kwargs'] == exp_kwargs
+
+
+@pytest.mark.parametrize("model_id", [
+    "facebook/opt-125m",
+])
+@pytest.mark.parametrize("prompt", [
+    {
+        "prompt": "",
+        "multi_modal_data": {
+            "dummy": []
+        },
+    },
+    {
+        "prompt_token_ids": [],
+        "multi_modal_data": {
+            "dummy": []
+        },
+    },
+])
+def test_preprocessor_text_no_mm_inputs(model_id, prompt):
+    model_config = ModelConfig(model=model_id)
+    tokenizer = init_tokenizer_from_configs(model_config)
+    input_preprocessor = InputPreprocessor(model_config, tokenizer)
+
+    with pytest.raises(ValueError, match="does not support multimodal inputs"):
+        input_preprocessor.preprocess(prompt)
+
+
+@pytest.mark.parametrize("model_id", [
+    "facebook/chameleon-7b",
+])
+@pytest.mark.parametrize("prompt", [
+    "",
+    {
+        "prompt_token_ids": []
+    },
+])
+def test_preprocessor_always_mm_code_path(model_id, prompt):
+    model_config = ModelConfig(model=model_id)
+    tokenizer = init_tokenizer_from_configs(model_config)
+    input_preprocessor = InputPreprocessor(model_config, tokenizer)
+
+    # HF processor adds sep token
+    sep_token_id = tokenizer.vocab[tokenizer.sep_token]
+
+    processed_inputs = input_preprocessor.preprocess(prompt)
+    assert sep_token_id in processed_inputs["prompt_token_ids"]
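
Taken together, the two tests pin down the dispatch rule the fix establishes:
text-only models must reject any multimodal payload, while multimodal models
must always take the MM-processor path. A hypothetical outline of that rule
(not vLLM's actual implementation, which lives in vllm/inputs/preprocess.py;
tokenize_only and apply_mm_processor are placeholder names):

def tokenize_only(prompt):
    ...  # placeholder: plain tokenization path

def apply_mm_processor(prompt, mm_data):
    ...  # placeholder: HF multimodal processor path

def preprocess(prompt, model_is_multimodal: bool):
    mm_data = prompt.get("multi_modal_data") if isinstance(prompt, dict) else None

    if not model_is_multimodal:
        # Text-only models reject any multimodal payload outright.
        if mm_data:
            raise ValueError("Model does not support multimodal inputs")
        return tokenize_only(prompt)

    # The key change: multimodal models always go through the MM
    # processor, even when mm_data is empty, so HF processors that
    # rewrite the prompt (e.g. inserting a sep token) always run.
    return apply_mm_processor(prompt, mm_data or {})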