[Renderer] Separate out RendererConfig from ModelConfig (#30145)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
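In short: tokenization and multimodal-rendering settings move behind a dedicated RendererConfig that wraps the existing ModelConfig, and the multimodal entry points now consume the renderer config instead of the bare model config. A minimal sketch of the new wiring, assembled from the hunks below (the model name is a placeholder; any argument not visible in this diff is an assumption):

    from vllm.config import ModelConfig, RendererConfig
    from vllm.multimodal import MULTIMODAL_REGISTRY

    # Build the model config as before, then wrap it.
    model_config = ModelConfig(model="<your-model-id>")
    renderer_config = RendererConfig(model_config=model_config)

    # Registry entry points now take the renderer config; the model config
    # stays reachable as renderer_config.model_config.
    processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)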
@@ -25,7 +25,6 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.tokenizers import (
     MistralTokenizer,
     TokenizerLike,
-    cached_tokenizer_from_config,
 )

 from ....multimodal.utils import random_audio, random_image, random_video

@@ -212,31 +211,20 @@ def _test_processing_correctness(
     else:
         model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
         model_id = model_id_or_arch

     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")

-    model_config = ModelConfig(
-        model_id,
-        tokenizer=model_info.tokenizer or model_id,
-        tokenizer_mode=model_info.tokenizer_mode,
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
-        hf_overrides=model_info.hf_overrides,
+    renderer_config = model_info.build_renderer_config(
+        model=model_id,
         # Ensure that the cache can fit all of the data
         mm_processor_cache_gb=2048,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
         dtype=model_info.dtype,
     )
+    model_config = renderer_config.model_config

     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = model_cls._processor_factory
-    ctx = InputProcessingContext(
-        model_config,
-        tokenizer=cached_tokenizer_from_config(model_config),
-    )
+    ctx = InputProcessingContext.from_config(renderer_config)
     cache = MultiModalProcessorOnlyCache(model_config)

     processing_info = factories.info(ctx)

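The test-side construction above collapses into two calls. A hedged sketch of the pattern (model_info and model_id come from the HF example-model registry these tests use; only arguments visible in the hunk are assumed):

    # build_renderer_config replaces the hand-built ModelConfig; per-model
    # details (tokenizer, revision, trust_remote_code, ...) now come from
    # model_info itself.
    renderer_config = model_info.build_renderer_config(
        model=model_id,
        mm_processor_cache_gb=2048,  # cache sized to fit all test data
        dtype=model_info.dtype,
    )
    model_config = renderer_config.model_config  # derived, not rebuilt

    # from_config wires up the tokenizer internally, which is what lets the
    # explicit cached_tokenizer_from_config import go away.
    ctx = InputProcessingContext.from_config(renderer_config)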
@@ -40,7 +40,7 @@ def test_processor_override(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"video": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     tokenizer = processor.info.get_tokenizer()
     hf_processor_mm_kwargs = {"fps": fps}

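The processor-test hunks that follow all apply this same one-line migration. The before/after, assuming the tests' context helper (ctx) now exposes renderer_config alongside model_config:

    # Before: the registry keyed off the bare model config.
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

    # After: it takes the renderer config, which carries the model config
    # inside it for tests that still need it directly.
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)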
@@ -79,7 +79,7 @@ def test_video_loader_consistency(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"video": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {"fps": fps}

     # Build the image str / prompt based on the number of images we pass

@@ -162,7 +162,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     min_num = min_dynamic_patch if dynamic_image_size else 1

@@ -38,7 +38,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass

@@ -116,7 +116,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     min_num = min_dynamic_patch if dynamic_image_size else 1

@@ -30,7 +30,7 @@ def test_processor_override(
         limit_mm_per_prompt={"image": num_imgs},
         mm_processor_cache_gb=mm_processor_cache_gb,
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     config = processor.info.get_hf_config()
     tokenizer = processor.info.get_tokenizer()
     hf_processor = processor.info.get_hf_processor()

@@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     info = processor.info

     seen_aspect_ratios = set[float]()

@@ -140,7 +140,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)

     image_ratios = [
         (171, 152),

@@ -173,7 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)

     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()

@@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     info = processor.info

     seen_aspect_ratios = set[float]()

@@ -138,7 +138,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)

     image_ratios = [
         (171, 152),

@@ -171,7 +171,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)

     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()

@@ -24,7 +24,7 @@ def test_processor_override(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     prompt = "<image>" * num_imgs
     image = Image.new("RGB", size=(364, 364))
     mm_data = {"image": [image] * num_imgs}

@@ -83,7 +83,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)

     image_ratios = [
         (171, 152),

@@ -25,7 +25,7 @@ def test_profiling(model_id: str, max_model_len: int):
         limit_mm_per_prompt=mm_counts,
     )

-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     profiler = MultiModalProfiler(processor)

     decoder_dummy_data = profiler.get_decoder_dummy_data(

@@ -118,7 +118,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     min_num = min_dynamic_patch if dynamic_image_size else 1

@@ -39,7 +39,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass

@@ -39,7 +39,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass

@@ -34,7 +34,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     tokenizer = processor.info.get_tokenizer()
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

@@ -38,7 +38,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass

@@ -11,7 +11,7 @@ import pytest
 import torch.nn as nn
 from PIL import Image

-from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
+from vllm.config import ModelConfig, RendererConfig, VllmConfig, set_current_vllm_config
 from vllm.config.multimodal import (
     AudioDummyOptions,
     BaseDummyOptions,

@@ -31,7 +31,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.platforms import current_platform
-from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.torch_utils import set_default_torch_dtype

@@ -150,7 +149,10 @@ def initialize_dummy_model(
         backend="nccl",
     )
     initialize_model_parallel(tensor_model_parallel_size=1)
-    vllm_config = VllmConfig(model_config=model_config)
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
+    )
     with set_current_vllm_config(vllm_config=vllm_config):
         with set_default_torch_dtype(model_config.dtype):
             model = model_cls(vllm_config=vllm_config)

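VllmConfig no longer derives the renderer config on its own; callers pass it explicitly. A sketch of the updated dummy-model initialization, using only the calls shown in the hunk above:

    vllm_config = VllmConfig(
        model_config=model_config,
        # Constructed from the same model config, so both views agree.
        renderer_config=RendererConfig(model_config=model_config),
    )
    with set_current_vllm_config(vllm_config=vllm_config):
        with set_default_torch_dtype(model_config.dtype):
            model = model_cls(vllm_config=vllm_config)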
@@ -182,19 +184,12 @@ def test_model_tensor_schema(model_id: str):
     else:
         dtype = model_info.dtype

-    model_config = ModelConfig(
+    renderer_config = model_info.build_renderer_config(
         model_id,
-        tokenizer=model_info.tokenizer or model_id,
-        tokenizer_mode=model_info.tokenizer_mode,
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
         hf_overrides=hf_overrides_fn,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
         dtype=dtype,
     )
+    model_config = renderer_config.model_config

     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     assert supports_multimodal(model_cls)

@@ -212,10 +207,7 @@ def test_model_tensor_schema(model_id: str):
     if not any(inputs_parse_methods):
         pytest.skip(f"{model_arch} does not support tensor schema validation.")

-    ctx = InputProcessingContext(
-        model_config,
-        tokenizer=cached_tokenizer_from_config(model_config),
-    )
+    ctx = InputProcessingContext.from_config(renderer_config)
     processing_info = factories.info(ctx)
     supported_mm_limits = processing_info.get_supported_mm_limits()
     limit_mm_per_prompt = {

@@ -3,7 +3,7 @@
 import pytest

 from vllm.assets.image import ImageAsset
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, RendererConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY

@@ -13,8 +13,9 @@ def test_multimodal_processor(model_id):
         model=model_id,
         model_impl="transformers",
     )
+    renderer_config = RendererConfig(model_config=model_config)

-    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
+    mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)

     image_pil = ImageAsset("cherry_blossom").pil_image
     mm_data = {"image": image_pil}

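For callers that already hold a ModelConfig, the migration is a one-line wrap, as in the transformers-backend test above (all names are taken from the hunk):

    model_config = ModelConfig(
        model=model_id,
        model_impl="transformers",
    )
    renderer_config = RendererConfig(model_config=model_config)

    # create_processor now keys off the renderer config.
    mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)

The net effect: ModelConfig keeps describing the model itself, while rendering-time concerns (tokenizer setup, multimodal processing) are reached through RendererConfig.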