[Misc] Clean up renderers (#36770)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-03-12 00:39:29 +08:00
committed by GitHub
parent c84b519cf3
commit 196802dfa6
12 changed files with 136 additions and 220 deletions

View File

@@ -6,9 +6,6 @@ from functools import partial
import numpy as np
import pytest
from mistral_common.protocol.instruct.chunk import ImageChunk, TextChunk
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig
@@ -21,7 +18,10 @@ from vllm.config.multimodal import (
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
InputProcessingContext,
)
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.utils.mistral import is_mistral_tokenizer
@@ -74,20 +74,6 @@ def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
return mm_data
# For some multimodal models, tokenizer will always add bos_token
# at the beginning of prompt by default, causing hf_processor outputs
# incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES = {
"lfm2_vl": False,
"nemotron_parse": False,
"ovis": False,
"ovis2_5": False,
"paligemma": False,
"ultravox": False,
"whisper": False,
}
_IGNORE_MM_KEYS = {
# In Ultravox, the audio_features can be different depending on padding
# The slight difference should not be a problem though, since
@@ -152,63 +138,34 @@ def get_text_token_prompts(
parsed_data = processor.info.parse_mm_data(mm_data)
mm_counts = {k: len(vs) for k, vs in parsed_data.items()}
text_prompt: str | None
token_prompt: list[int]
if is_mistral_tokenizer(tokenizer):
# ChatCompletionRequest only supports ImageChunk natively;
# for other modalities (e.g. audio), fall back to the model's
# own dummy inputs builder which knows the right placeholders.
has_non_image = any(
k != "image" and count > 0 for k, count in mm_counts.items()
inputs = dummy_inputs.get_dummy_processor_inputs(
model_config.max_model_len,
mm_counts,
mm_options={},
# Assume all Mistral models define this extra argument
mm_data=mm_data, # type: ignore[call-arg]
)
if has_non_image:
inputs = dummy_inputs.get_dummy_processor_inputs(
model_config.max_model_len,
mm_counts,
mm_options={},
)
text_prompt = None
token_prompt = (
inputs.prompt
if isinstance(inputs.prompt, list)
else tokenizer.encode(inputs.prompt, add_special_tokens=False)
)
else:
images = parsed_data.get("image", [])
request = ChatCompletionRequest(
messages=[
UserMessage(
content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]
),
]
)
res = tokenizer.mistral.encode_chat_completion(request)
# Mistral does not support decode_tokens with
# skip_special_tokens=False
text_prompt = None
token_prompt = res.tokens
else:
inputs = dummy_inputs.get_dummy_processor_inputs(
model_config.max_model_len,
mm_counts,
mm_options={},
)
# Some models (e.g., Kimi-Audio) return token IDs directly instead of str
if isinstance(inputs.prompt, list):
text_prompt = None
token_prompt = inputs.prompt
else:
assert isinstance(inputs.prompt, str)
text_prompt = inputs.prompt
token_prompt = tokenizer.encode(
text_prompt,
add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True),
)
text_prompt: str | None
token_prompt: list[int]
if isinstance(inputs.prompt, list):
text_prompt = None
token_prompt = inputs.prompt
elif isinstance(inputs.prompt, str):
text_prompt = inputs.prompt
token_prompt = tokenizer.encode(
text_prompt,
**processor.info.get_default_tok_params().get_encode_kwargs(),
)
else:
raise TypeError(type(inputs.prompt))
return text_prompt, token_prompt
@@ -448,7 +405,7 @@ def test_processing_correctness(
)
if model_id == "mistralai/Voxtral-Mini-4B-Realtime-2602":
pytest.skip(
"Voxtral Realtime doesn't make use of any place-holder"
"Voxtral Realtime doesn't make use of any place-holder "
"tokens and hence cannot pass the processing "
"correctness test as is. Let's revisit adapting this "
"test once more realtime models exist."