[Renderer] Separate out RendererConfig from ModelConfig (#30145)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
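In short: tokenization and multimodal-rendering settings move behind a dedicated RendererConfig that wraps the existing ModelConfig, and the multimodal entry points now consume the renderer config instead of the bare model config. A minimal sketch of the new wiring, assembled from the hunks below (the model name is a placeholder; any argument not visible in this diff is an assumption):

    from vllm.config import ModelConfig, RendererConfig
    from vllm.multimodal import MULTIMODAL_REGISTRY

    # Build the model config as before, then wrap it.
    model_config = ModelConfig(model="<your-model-id>")
    renderer_config = RendererConfig(model_config=model_config)

    # Registry entry points now take the renderer config; the model config
    # stays reachable as renderer_config.model_config.
    processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)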
@@ -25,7 +25,6 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.tokenizers import (
     MistralTokenizer,
     TokenizerLike,
-    cached_tokenizer_from_config,
 )

 from ....multimodal.utils import random_audio, random_image, random_video

@@ -212,31 +211,20 @@ def _test_processing_correctness(
     else:
         model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
         model_id = model_id_or_arch

     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")

-    model_config = ModelConfig(
-        model_id,
-        tokenizer=model_info.tokenizer or model_id,
-        tokenizer_mode=model_info.tokenizer_mode,
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
-        hf_overrides=model_info.hf_overrides,
+    renderer_config = model_info.build_renderer_config(
+        model=model_id,
         # Ensure that the cache can fit all of the data
         mm_processor_cache_gb=2048,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
         dtype=model_info.dtype,
     )
+    model_config = renderer_config.model_config

     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = model_cls._processor_factory
-    ctx = InputProcessingContext(
-        model_config,
-        tokenizer=cached_tokenizer_from_config(model_config),
-    )
+    ctx = InputProcessingContext.from_config(renderer_config)
     cache = MultiModalProcessorOnlyCache(model_config)

     processing_info = factories.info(ctx)

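The test-side construction above collapses into two calls. A hedged sketch of the pattern (model_info and model_id come from the HF example-model registry these tests use; only arguments visible in the hunk are assumed):

    # build_renderer_config replaces the hand-built ModelConfig; per-model
    # details (tokenizer, revision, trust_remote_code, ...) now come from
    # model_info itself.
    renderer_config = model_info.build_renderer_config(
        model=model_id,
        mm_processor_cache_gb=2048,  # cache sized to fit all test data
        dtype=model_info.dtype,
    )
    model_config = renderer_config.model_config  # derived, not rebuilt

    # from_config wires up the tokenizer internally, which is what lets the
    # explicit cached_tokenizer_from_config import go away.
    ctx = InputProcessingContext.from_config(renderer_config)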
@@ -40,7 +40,7 @@ def test_processor_override(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"video": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     tokenizer = processor.info.get_tokenizer()
     hf_processor_mm_kwargs = {"fps": fps}

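The processor-test hunks that follow all apply this same one-line migration. The before/after, assuming the tests' context helper (ctx) now exposes renderer_config alongside model_config:

    # Before: the registry keyed off the bare model config.
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

    # After: it takes the renderer config, which carries the model config
    # inside it for tests that still need it directly.
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)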
@@ -79,7 +79,7 @@ def test_video_loader_consistency(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"video": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {"fps": fps}

     # Build the image str / prompt based on the number of images we pass

@@ -162,7 +162,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     min_num = min_dynamic_patch if dynamic_image_size else 1

@@ -38,7 +38,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass

@@ -116,7 +116,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     min_num = min_dynamic_patch if dynamic_image_size else 1

@@ -30,7 +30,7 @@ def test_processor_override(
         limit_mm_per_prompt={"image": num_imgs},
         mm_processor_cache_gb=mm_processor_cache_gb,
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     config = processor.info.get_hf_config()
     tokenizer = processor.info.get_tokenizer()
     hf_processor = processor.info.get_hf_processor()

@@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     info = processor.info

     seen_aspect_ratios = set[float]()

@@ -140,7 +140,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)

     image_ratios = [
         (171, 152),

@@ -173,7 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)

     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()

@@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": 1},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     info = processor.info

     seen_aspect_ratios = set[float]()

@@ -138,7 +138,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)

     image_ratios = [
         (171, 152),

@@ -171,7 +171,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)

     seen_aspect_ratios = set[float]()
     image_sizes = list[ImageSize]()

@@ -24,7 +24,7 @@ def test_processor_override(
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     prompt = "<image>" * num_imgs
     image = Image.new("RGB", size=(364, 364))
     mm_data = {"image": [image] * num_imgs}

@@ -83,7 +83,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
         mm_processor_kwargs=None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)

     image_ratios = [
         (171, 152),

@@ -25,7 +25,7 @@ def test_profiling(model_id: str, max_model_len: int):
         limit_mm_per_prompt=mm_counts,
     )

-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     profiler = MultiModalProfiler(processor)

     decoder_dummy_data = profiler.get_decoder_dummy_data(

@@ -118,7 +118,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": len(size_factors)},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     min_num = min_dynamic_patch if dynamic_image_size else 1

@@ -39,7 +39,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass

@@ -39,7 +39,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass

@@ -34,7 +34,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     tokenizer = processor.info.get_tokenizer()
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

@@ -38,7 +38,7 @@ def test_processor_override(
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
         limit_mm_per_prompt={"image": num_imgs},
     )
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass

@@ -11,7 +11,7 @@ import pytest
 import torch.nn as nn
 from PIL import Image

-from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
+from vllm.config import ModelConfig, RendererConfig, VllmConfig, set_current_vllm_config
 from vllm.config.multimodal import (
     AudioDummyOptions,
     BaseDummyOptions,

@@ -31,7 +31,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.platforms import current_platform
-from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.torch_utils import set_default_torch_dtype

@@ -150,7 +149,10 @@ def initialize_dummy_model(
         backend="nccl",
     )
     initialize_model_parallel(tensor_model_parallel_size=1)
-    vllm_config = VllmConfig(model_config=model_config)
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        renderer_config=RendererConfig(model_config=model_config),
+    )
     with set_current_vllm_config(vllm_config=vllm_config):
         with set_default_torch_dtype(model_config.dtype):
             model = model_cls(vllm_config=vllm_config)

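VllmConfig no longer derives the renderer config on its own; callers pass it explicitly. A sketch of the updated dummy-model initialization, using only the calls shown in the hunk above:

    vllm_config = VllmConfig(
        model_config=model_config,
        # Constructed from the same model config, so both views agree.
        renderer_config=RendererConfig(model_config=model_config),
    )
    with set_current_vllm_config(vllm_config=vllm_config):
        with set_default_torch_dtype(model_config.dtype):
            model = model_cls(vllm_config=vllm_config)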
@@ -182,19 +184,12 @@ def test_model_tensor_schema(model_id: str):
     else:
         dtype = model_info.dtype

-    model_config = ModelConfig(
+    renderer_config = model_info.build_renderer_config(
         model_id,
-        tokenizer=model_info.tokenizer or model_id,
-        tokenizer_mode=model_info.tokenizer_mode,
-        revision=model_info.revision,
-        trust_remote_code=model_info.trust_remote_code,
         hf_overrides=hf_overrides_fn,
-        skip_tokenizer_init=model_info.require_embed_inputs,
-        enable_prompt_embeds=model_info.require_embed_inputs,
-        enable_mm_embeds=model_info.require_embed_inputs,
-        enforce_eager=model_info.enforce_eager,
         dtype=dtype,
     )
+    model_config = renderer_config.model_config

     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     assert supports_multimodal(model_cls)

@@ -212,10 +207,7 @@ def test_model_tensor_schema(model_id: str):
     if not any(inputs_parse_methods):
         pytest.skip(f"{model_arch} does not support tensor schema validation.")

-    ctx = InputProcessingContext(
-        model_config,
-        tokenizer=cached_tokenizer_from_config(model_config),
-    )
+    ctx = InputProcessingContext.from_config(renderer_config)
     processing_info = factories.info(ctx)
     supported_mm_limits = processing_info.get_supported_mm_limits()
     limit_mm_per_prompt = {

@@ -3,7 +3,7 @@
 import pytest

 from vllm.assets.image import ImageAsset
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, RendererConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY

@@ -13,8 +13,9 @@ def test_multimodal_processor(model_id):
         model=model_id,
         model_impl="transformers",
     )
+    renderer_config = RendererConfig(model_config=model_config)

-    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
+    mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)

     image_pil = ImageAsset("cherry_blossom").pil_image
     mm_data = {"image": image_pil}

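For callers that already hold a ModelConfig, the migration is a one-line wrap, as in the transformers-backend test above (all names are taken from the hunk):

    model_config = ModelConfig(
        model=model_id,
        model_impl="transformers",
    )
    renderer_config = RendererConfig(model_config=model_config)

    # create_processor now keys off the renderer config.
    mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)

The net effect: ModelConfig keeps describing the model itself, while rendering-time concerns (tokenizer setup, multimodal processing) are reached through RendererConfig.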