[Misc] Clean up renderers (#36770)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -6,9 +6,6 @@ from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from mistral_common.protocol.instruct.chunk import ImageChunk, TextChunk
|
||||
from mistral_common.protocol.instruct.messages import UserMessage
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
@@ -21,7 +18,10 @@ from vllm.config.multimodal import (
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
|
||||
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
|
||||
from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
||||
from vllm.multimodal.processing import (
|
||||
BaseMultiModalProcessor,
|
||||
InputProcessingContext,
|
||||
)
|
||||
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
|
||||
from vllm.utils.mistral import is_mistral_tokenizer
|
||||
|
||||
@@ -74,20 +74,6 @@ def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
|
||||
return mm_data
|
||||
|
||||
|
||||
# For some multimodal models, tokenizer will always add bos_token
|
||||
# at the beginning of prompt by default, causing hf_processor outputs
|
||||
# incorrect token ids. So we need use `add_special_tokens=False` here
|
||||
# to leave bos_token to be added by the processor.
|
||||
_ADD_SPECIAL_TOKENS_OVERRIDES = {
|
||||
"lfm2_vl": False,
|
||||
"nemotron_parse": False,
|
||||
"ovis": False,
|
||||
"ovis2_5": False,
|
||||
"paligemma": False,
|
||||
"ultravox": False,
|
||||
"whisper": False,
|
||||
}
|
||||
|
||||
_IGNORE_MM_KEYS = {
|
||||
# In Ultravox, the audio_features can be different depending on padding
|
||||
# The slight difference should not be a problem though, since
|
||||
@@ -152,63 +138,34 @@ def get_text_token_prompts(
|
||||
parsed_data = processor.info.parse_mm_data(mm_data)
|
||||
mm_counts = {k: len(vs) for k, vs in parsed_data.items()}
|
||||
|
||||
text_prompt: str | None
|
||||
token_prompt: list[int]
|
||||
if is_mistral_tokenizer(tokenizer):
|
||||
# ChatCompletionRequest only supports ImageChunk natively;
|
||||
# for other modalities (e.g. audio), fall back to the model's
|
||||
# own dummy inputs builder which knows the right placeholders.
|
||||
has_non_image = any(
|
||||
k != "image" and count > 0 for k, count in mm_counts.items()
|
||||
inputs = dummy_inputs.get_dummy_processor_inputs(
|
||||
model_config.max_model_len,
|
||||
mm_counts,
|
||||
mm_options={},
|
||||
# Assume all Mistral models define this extra argument
|
||||
mm_data=mm_data, # type: ignore[call-arg]
|
||||
)
|
||||
|
||||
if has_non_image:
|
||||
inputs = dummy_inputs.get_dummy_processor_inputs(
|
||||
model_config.max_model_len,
|
||||
mm_counts,
|
||||
mm_options={},
|
||||
)
|
||||
text_prompt = None
|
||||
token_prompt = (
|
||||
inputs.prompt
|
||||
if isinstance(inputs.prompt, list)
|
||||
else tokenizer.encode(inputs.prompt, add_special_tokens=False)
|
||||
)
|
||||
else:
|
||||
images = parsed_data.get("image", [])
|
||||
request = ChatCompletionRequest(
|
||||
messages=[
|
||||
UserMessage(
|
||||
content=[
|
||||
TextChunk(text=""),
|
||||
*(ImageChunk(image=image) for image in images),
|
||||
]
|
||||
),
|
||||
]
|
||||
)
|
||||
res = tokenizer.mistral.encode_chat_completion(request)
|
||||
|
||||
# Mistral does not support decode_tokens with
|
||||
# skip_special_tokens=False
|
||||
text_prompt = None
|
||||
token_prompt = res.tokens
|
||||
else:
|
||||
inputs = dummy_inputs.get_dummy_processor_inputs(
|
||||
model_config.max_model_len,
|
||||
mm_counts,
|
||||
mm_options={},
|
||||
)
|
||||
# Some models (e.g., Kimi-Audio) return token IDs directly instead of str
|
||||
if isinstance(inputs.prompt, list):
|
||||
text_prompt = None
|
||||
token_prompt = inputs.prompt
|
||||
else:
|
||||
assert isinstance(inputs.prompt, str)
|
||||
text_prompt = inputs.prompt
|
||||
token_prompt = tokenizer.encode(
|
||||
text_prompt,
|
||||
add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True),
|
||||
)
|
||||
|
||||
text_prompt: str | None
|
||||
token_prompt: list[int]
|
||||
if isinstance(inputs.prompt, list):
|
||||
text_prompt = None
|
||||
token_prompt = inputs.prompt
|
||||
elif isinstance(inputs.prompt, str):
|
||||
text_prompt = inputs.prompt
|
||||
token_prompt = tokenizer.encode(
|
||||
text_prompt,
|
||||
**processor.info.get_default_tok_params().get_encode_kwargs(),
|
||||
)
|
||||
else:
|
||||
raise TypeError(type(inputs.prompt))
|
||||
|
||||
return text_prompt, token_prompt
|
||||
|
||||
@@ -448,7 +405,7 @@ def test_processing_correctness(
|
||||
)
|
||||
if model_id == "mistralai/Voxtral-Mini-4B-Realtime-2602":
|
||||
pytest.skip(
|
||||
"Voxtral Realtime doesn't make use of any place-holder"
|
||||
"Voxtral Realtime doesn't make use of any place-holder "
|
||||
"tokens and hence cannot pass the processing "
|
||||
"correctness test as is. Let's revisit adapting this "
|
||||
"test once more realtime models exist."
|
||||
|
||||
Reference in New Issue
Block a user