[Renderer] Separate out RendererConfig from ModelConfig (#30145)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-12-07 15:15:42 +08:00
committed by GitHub
parent a49d813fa8
commit 27f4c2fd46
105 changed files with 969 additions and 797 deletions

View File

@@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig
from vllm.config.observability import ObservabilityConfig
from vllm.config.parallel import EPLBConfig, ParallelConfig
from vllm.config.pooler import PoolerConfig
from vllm.config.renderer import RendererConfig
from vllm.config.scheduler import SchedulerConfig
from vllm.config.speculative import SpeculativeConfig
from vllm.config.speech_to_text import SpeechToTextConfig
@@ -81,6 +82,8 @@ __all__ = [
"ParallelConfig",
# From vllm.config.pooler
"PoolerConfig",
# From vllm.config.renderer
"RendererConfig",
# From vllm.config.scheduler
"SchedulerConfig",
# From vllm.config.speculative

View File

@@ -36,7 +36,6 @@ from vllm.transformers_utils.config import (
uses_xdrope_dim,
)
from vllm.transformers_utils.gguf_utils import (
is_gguf,
is_remote_gguf,
maybe_patch_hf_config_from_gguf,
split_remote_gguf,
@@ -83,7 +82,6 @@ TaskOption = Literal[
"transcription",
"draft",
]
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
LogprobsMode = Literal[
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@@ -131,18 +129,6 @@ class ModelConfig:
Note that the model may support other tasks using the same model runner.
"""
tokenizer: SkipValidation[str] = None # type: ignore
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode: TokenizerMode | str = "auto"
"""Tokenizer mode:\n
- "auto" will use the tokenizer from `mistral_common` for Mistral models
if available, otherwise it will use the "hf" tokenizer.\n
- "hf" will use the fast tokenizer if available.\n
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
and tokenizer."""
@@ -168,13 +154,6 @@ class ModelConfig:
hf_config_path: str | None = None
"""Name or path of the Hugging Face config to use. If unspecified, model
name or path will be used."""
allowed_local_media_path: str = ""
"""Allowing API requests to read local images or videos from directories
specified by the server file system. This is a security risk. Should only
be enabled in trusted environments."""
allowed_media_domains: list[str] | None = None
"""If set, only media URLs that belong to this domain can be used for
multi-modal inputs. """
revision: str | None = None
"""The specific model version to use. It can be a branch name, a tag name,
or a commit id. If unspecified, will use the default version."""
@@ -182,10 +161,6 @@ class ModelConfig:
"""The specific revision to use for the model code on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
tokenizer_revision: str | None = None
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
max_model_len: SkipValidation[int] = None # type: ignore
"""Model context length (prompt and output). If unspecified, will be
automatically derived from the model config.
@@ -230,10 +205,6 @@ class ModelConfig:
preventing potential numerical issues. Note that even if this is set to
False, cascade attention will be only used when the heuristic tells that
it's beneficial."""
skip_tokenizer_init: bool = False
"""Skip initialization of tokenizer and detokenizer. Expects valid
`prompt_token_ids` and `None` for prompt from the input. The generated
output will contain token ids."""
enable_prompt_embeds: bool = False
"""If `True`, enables passing text embeddings as inputs via the
`prompt_embeds` key.
@@ -294,8 +265,6 @@ class ModelConfig:
logits_processors: list[str | type[LogitsProcessor]] | None = None
"""One or more logits processors' fully-qualified class names or class
definitions"""
io_processor_plugin: str | None = None
"""IOProcessor plugin name to load at model startup"""
# Pooler config
pooler_config: PoolerConfig | None = None
@@ -308,7 +277,6 @@ class ModelConfig:
from the architecture of `self.model`."""
limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
enable_mm_embeds: InitVar[bool | None] = None
media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
mm_processor_cache_gb: InitVar[float | None] = None
mm_processor_cache_type: InitVar[MMCacheType | None] = None
@@ -335,18 +303,12 @@ class ModelConfig:
"runner",
"convert",
"task",
"tokenizer",
"tokenizer_mode",
"seed",
"hf_config_path",
"allowed_local_media_path",
"allowed_media_domains",
"tokenizer_revision",
"spec_target_max_model_len",
"enforce_eager",
"logprobs_mode",
"disable_cascade_attn",
"skip_tokenizer_init",
"served_model_name",
"config_format",
"hf_token",
@@ -354,11 +316,9 @@ class ModelConfig:
"logits_processor_pattern",
"override_attention_dtype",
"logits_processors",
"io_processor_plugin",
"pooler_config",
"multimodal_config",
"limit_mm_per_prompt",
"media_io_kwargs",
"mm_processor_kwargs",
"mm_processor_cache_gb",
"mm_processor_cache_type",
@@ -423,7 +383,6 @@ class ModelConfig:
# Multimodal config init vars
limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
enable_mm_embeds: bool | None,
media_io_kwargs: dict[str, dict[str, Any]] | None,
mm_processor_kwargs: dict[str, Any] | None,
mm_processor_cache_gb: float | None,
mm_processor_cache_type: MMCacheType | None,
@@ -438,13 +397,8 @@ class ModelConfig:
self.served_model_name = get_served_model_name(
self.model, self.served_model_name
)
self.model = maybe_model_redirect(self.model)
# The tokenizer is consistent with the model by default.
if self.tokenizer is None:
self.tokenizer = self.model
if self.tokenizer_revision is None:
self.tokenizer_revision = self.revision
self.tokenizer = maybe_model_redirect(self.tokenizer)
self.original_model = self.model
self.model = maybe_model_redirect(self.original_model)
if isinstance(self.hf_config_path, str):
self.hf_config_path = maybe_model_redirect(self.hf_config_path)
@@ -465,7 +419,7 @@ class ModelConfig:
hf_overrides_kw[key] = value
hf_overrides_fn = None
self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
self.maybe_pull_model_for_runai(self.model)
from vllm.platforms import current_platform
@@ -648,7 +602,8 @@ class ModelConfig:
)
self.original_max_model_len = self.max_model_len
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
self.recalculate_max_model_len(self.original_max_model_len)
# Init multimodal config if needed
if self._model_info.supports_multimodal:
if (
@@ -664,7 +619,6 @@ class ModelConfig:
mm_config_kwargs = dict(
limit_per_prompt=limit_mm_per_prompt,
enable_mm_embeds=enable_mm_embeds,
media_io_kwargs=media_io_kwargs,
mm_processor_kwargs=mm_processor_kwargs,
mm_processor_cache_gb=mm_processor_cache_gb,
mm_processor_cache_type=mm_processor_cache_type,
@@ -682,16 +636,8 @@ class ModelConfig:
self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
# Multimodal GGUF models must use original repo for mm processing
if is_gguf(self.tokenizer) and self.is_multimodal_model:
raise ValueError(
"Loading a multimodal GGUF model needs to use original "
"tokenizer. Please specify the unquantized hf model's "
"repo name or path using the --tokenizer argument."
)
if self.disable_sliding_window:
# Set after get_and_verify_max_len to ensure that max_model_len
# Set after recalculate_max_model_len to ensure that max_model_len
# can be correctly capped to sliding window size
self.hf_text_config.sliding_window = None
@@ -715,10 +661,9 @@ class ModelConfig:
@model_validator(mode="after")
def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
if not isinstance(self.tokenizer, str):
raise ValueError("tokenizer must be a string after __post_init__.")
if not isinstance(self.max_model_len, int):
raise ValueError("max_model_len must be an integer after __post_init__.")
return self
def _get_transformers_backend_cls(self) -> str:
@@ -767,49 +712,17 @@ class ModelConfig:
"""The architecture vllm actually used."""
return self._architecture
def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
"""Pull model/tokenizer from Object Storage to temporary
directory when needed.
Args:
model: Model name or path
tokenizer: Tokenizer name or path
"""
if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
def maybe_pull_model_for_runai(self, model: str) -> None:
"""Pull model from Object Storage to temporary directory when needed."""
if not is_runai_obj_uri(model):
return
if is_runai_obj_uri(model):
object_storage_model = ObjectStorageModel(url=model)
object_storage_model.pull_files(
model, allow_pattern=["*.model", "*.py", "*.json"]
)
self.model_weights = model
self.model = object_storage_model.dir
# If tokenizer is same as model, download to same directory
if model == tokenizer:
object_storage_model.pull_files(
model,
ignore_pattern=[
"*.pt",
"*.safetensors",
"*.bin",
"*.tensors",
"*.pth",
],
)
self.tokenizer = object_storage_model.dir
return
# Only download tokenizer if needed and not already handled
if is_runai_obj_uri(tokenizer):
object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
object_storage_tokenizer.pull_files(
model,
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
)
self.tokenizer = object_storage_tokenizer.dir
object_storage_model = ObjectStorageModel(url=model)
object_storage_model.pull_files(
model, allow_pattern=["*.model", "*.py", "*.json"]
)
self.model_weights = model
self.model = object_storage_model.dir
def _get_encoder_config(self):
model = self.model
@@ -1712,30 +1625,38 @@ class ModelConfig:
return dense_modules[-1]["out_features"]
return self.get_hidden_size()
def get_and_verify_max_len(self, max_model_len: int):
def recalculate_max_model_len(
self,
original_max_model_len: int | None,
*,
tokenizer: str | None = None,
tokenizer_revision: str | None = None,
) -> None:
# Consider max_model_len in tokenizer_config only when
# pooling models use absolute position_embedding.
# NOTE: For simplicity we assume `args.model == args.tokenizer`
# when no tokenizer is passed, so we fall back to the model (and its revision).
tokenizer_config = None
if (
self.runner_type == "pooling"
and getattr(self.hf_config, "position_embedding_type", "") == "absolute"
):
tokenizer_config = try_get_tokenizer_config(
self.tokenizer,
tokenizer or self.model,
trust_remote_code=self.trust_remote_code,
revision=self.tokenizer_revision,
revision=tokenizer_revision or self.revision,
)
max_model_len = _get_and_verify_max_len(
self.max_model_len = _get_and_verify_max_len(
hf_config=self.hf_text_config,
tokenizer_config=tokenizer_config,
max_model_len=max_model_len,
max_model_len=original_max_model_len,
disable_sliding_window=self.disable_sliding_window,
sliding_window=self.get_sliding_window(),
spec_target_max_model_len=self.spec_target_max_model_len,
encoder_config=self.encoder_config,
)
logger.info("Using max model len %s", max_model_len)
return max_model_len
logger.info("Using max model len %s", self.max_model_len)
@property
def attn_type(self) -> AttnTypeStr:

View File

@@ -79,10 +79,6 @@ class MultiModalConfig:
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
mm_processor_kwargs: dict[str, object] | None = None
"""Arguments to be forwarded to the model's processor for multi-modal data,
e.g., image processor. Overrides for the multi-modal processor obtained

109
vllm/config/renderer.py Normal file
View File

@@ -0,0 +1,109 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Literal
from pydantic import Field, SkipValidation
from pydantic.dataclasses import dataclass
from vllm.config.model import ModelConfig
from vllm.config.utils import config
from vllm.transformers_utils.gguf_utils import is_gguf
from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
from vllm.transformers_utils.utils import maybe_model_redirect
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
@config
@dataclass
class RendererConfig:
    """Configuration for the renderer.

    Holds the tokenizer and media-input settings, plus a reference to the
    `ModelConfig` from which unspecified tokenizer settings are derived in
    `__post_init__`.
    """

    # NOTE: In reality, this is a required argument.
    # We provide a dummy default value here to generate the CLI args.
    model_config: SkipValidation[ModelConfig] = None  # type: ignore
    """Provides model context to the renderer."""

    tokenizer: str = ""
    """Name or path of the Hugging Face tokenizer to use. If unspecified, model
    name or path will be used."""
    tokenizer_mode: TokenizerMode | str = "auto"
    """Tokenizer mode:\n
    - "auto" will use the tokenizer from `mistral_common` for Mistral models
    if available, otherwise it will use the "hf" tokenizer.\n
    - "hf" will use the fast tokenizer if available.\n
    - "slow" will always use the slow tokenizer.\n
    - "mistral" will always use the tokenizer from `mistral_common`.\n
    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
    - Other custom values can be supported via plugins."""
    tokenizer_revision: str | None = None
    """The specific revision to use for the tokenizer on the Hugging Face Hub.
    It can be a branch name, a tag name, or a commit id. If unspecified, will
    use the default version."""
    skip_tokenizer_init: bool = False
    """Skip initialization of tokenizer and detokenizer. Expects valid
    `prompt_token_ids` and `None` for prompt from the input. The generated
    output will contain token ids."""

    io_processor_plugin: str | None = None
    """IOProcessor plugin name to load at model startup."""

    media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
    """Additional args passed to process media inputs, keyed by modalities.
    For example, to set num_frames for video, set
    `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
    allowed_local_media_path: str = ""
    """Allowing API requests to read local images or videos from directories
    specified by the server file system. This is a security risk. Should only
    be enabled in trusted environments."""
    allowed_media_domains: list[str] | None = None
    """If set, only media URLs that belong to this domain can be used for
    multi-modal inputs. """

    @property
    def trust_remote_code(self) -> bool:
        """Mirror `trust_remote_code` of the attached model config."""
        return self.model_config.trust_remote_code

    def __post_init__(self) -> None:
        """Resolve tokenizer defaults and validate the tokenizer source.

        Fills in `tokenizer` and `tokenizer_revision` from the model config
        (or from the `ModelConfig` class-level defaults when no model config
        is attached), applies the model redirect, pulls the tokenizer from
        object storage when it is a Run:ai object URI, and rejects GGUF
        tokenizers for multimodal models.

        Raises:
            ValueError: If the tokenizer is a GGUF file and the attached
                model config reports a multimodal model.
        """
        model_config = self.model_config

        # The tokenizer is consistent with the model by default.
        if not self.tokenizer:
            self.tokenizer = (
                ModelConfig.model
                if model_config is None
                else model_config.original_model
            )
        if not self.tokenizer_revision:
            self.tokenizer_revision = (
                ModelConfig.revision if model_config is None else model_config.revision
            )

        # Keep the user-facing name before it is redirected or replaced by a
        # local directory.
        self.original_tokenizer = self.tokenizer
        self.tokenizer = maybe_model_redirect(self.original_tokenizer)
        self.maybe_pull_tokenizer_for_runai(self.tokenizer)

        # Multimodal GGUF models must use original repo for mm processing.
        # NOTE: Without a model config there is no model to inspect, so treat
        # it as non-multimodal. Reading `ModelConfig.is_multimodal_model` on
        # the class does not give a per-instance bool; if it is a property
        # (presumably so -- confirm in vllm/config/model.py) the descriptor is
        # always truthy and every GGUF tokenizer would be rejected.
        is_multimodal_model = (
            model_config is not None and model_config.is_multimodal_model
        )
        if is_gguf(self.tokenizer) and is_multimodal_model:
            raise ValueError(
                "Loading a multimodal GGUF model needs to use original "
                "tokenizer. Please specify the unquantized hf model's "
                "repo name or path using the --tokenizer argument."
            )

    def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None:
        """Pull tokenizer from Object Storage to temporary directory when needed.

        Args:
            tokenizer: Tokenizer name or path. Nothing happens unless
                `is_runai_obj_uri` recognizes it.
        """
        if not is_runai_obj_uri(tokenizer):
            return

        object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
        # Skip weight files; they are excluded by extension here.
        object_storage_tokenizer.pull_files(
            tokenizer,
            ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
        )
        # From here on the tokenizer refers to the pulled directory.
        self.tokenizer = object_storage_tokenizer.dir

View File

@@ -322,16 +322,11 @@ class SpeculativeConfig:
self.draft_model_config = ModelConfig(
model=self.model,
runner="draft",
tokenizer=self.target_model_config.tokenizer,
tokenizer_mode=self.target_model_config.tokenizer_mode,
trust_remote_code=self.target_model_config.trust_remote_code,
allowed_local_media_path=self.target_model_config.allowed_local_media_path,
allowed_media_domains=self.target_model_config.allowed_media_domains,
dtype=self.target_model_config.dtype,
seed=self.target_model_config.seed,
revision=self.revision,
code_revision=self.code_revision,
tokenizer_revision=self.target_model_config.tokenizer_revision,
spec_target_max_model_len=self.target_model_config.max_model_len,
quantization=self.quantization,
enforce_eager=self.target_model_config.enforce_eager,

View File

@@ -39,6 +39,7 @@ from .lora import LoRAConfig
from .model import ModelConfig
from .observability import ObservabilityConfig
from .parallel import ParallelConfig
from .renderer import RendererConfig
from .scheduler import SchedulerConfig
from .speculative import SpeculativeConfig
from .structured_outputs import StructuredOutputsConfig
@@ -181,6 +182,8 @@ class VllmConfig:
# try to download a model
model_config: ModelConfig = Field(default=None)
"""Model configuration."""
renderer_config: RendererConfig = Field(default_factory=RendererConfig)
"""Renderer configuration."""
cache_config: CacheConfig = Field(default_factory=CacheConfig)
"""Cache configuration."""
parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
@@ -741,7 +744,7 @@ class VllmConfig:
from vllm.multimodal import MULTIMODAL_REGISTRY
self.scheduler_config.max_num_encoder_input_tokens = (
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.renderer_config)
)
logger.debug(
"Encoder-decoder model detected: setting "
@@ -1186,11 +1189,13 @@ class VllmConfig:
computed_compile_ranges_split_points
)
def recalculate_max_model_len(self, max_model_len: int):
# Can only be called in try_verify_and_update_config
model_config = self.model_config
max_model_len = model_config.get_and_verify_max_len(max_model_len)
self.model_config.max_model_len = max_model_len
def recalculate_max_model_len(self, original_max_model_len: int | None) -> None:
# Can only be called during try_verify_and_update_config
self.model_config.recalculate_max_model_len(
original_max_model_len,
tokenizer=self.renderer_config.tokenizer,
tokenizer_revision=self.renderer_config.tokenizer_revision,
)
def try_verify_and_update_config(self):
if self.model_config is None:
@@ -1264,11 +1269,11 @@ class VllmConfig:
return (
f"model={self.model_config.model!r}, "
f"speculative_config={self.speculative_config!r}, "
f"tokenizer={self.model_config.tokenizer!r}, "
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
f"tokenizer_mode={self.model_config.tokenizer_mode}, "
f"tokenizer={self.renderer_config.tokenizer!r}, "
f"skip_tokenizer_init={self.renderer_config.skip_tokenizer_init}, "
f"tokenizer_mode={self.renderer_config.tokenizer_mode}, "
f"revision={self.model_config.revision}, "
f"tokenizer_revision={self.model_config.tokenizer_revision}, "
f"tokenizer_revision={self.renderer_config.tokenizer_revision}, "
f"trust_remote_code={self.model_config.trust_remote_code}, "
f"dtype={self.model_config.dtype}, "
f"max_seq_len={self.model_config.max_model_len}, "