[Misc] Move config fields to MultiModalConfig (#17343)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-04-29 14:37:21 +08:00
committed by GitHub
parent cde384cd92
commit ebb3930d28
8 changed files with 62 additions and 36 deletions

View File

@@ -263,6 +263,10 @@ class ModelConfig:
the model name will be the same as `model`.
limit_mm_per_prompt: Maximum number of data items per modality
per prompt. Only applicable for multimodal models.
mm_processor_kwargs: Overrides for the multi-modal processor obtained
from `AutoProcessor.from_pretrained`.
disable_mm_preprocessor_cache: If True, disable caching of the
processed multi-modal inputs.
use_async_output_proc: Whether to use async output processor.
Defaults to True.
config_format: The config format which shall be loaded.
@@ -273,10 +277,6 @@ class ModelConfig:
hf_overrides: If a dictionary, contains arguments to be forwarded to the
HuggingFace config. If a callable, it is called to update the
HuggingFace config.
mm_processor_kwargs: Arguments to be forwarded to the model's processor
for multi-modal data, e.g., image processor.
disable_mm_preprocessor_cache: If true, then disables caching of the
multi-modal preprocessor/mapper. (not recommended)
override_neuron_config: Initialize non default neuron config or
override default neuron config that are specific to Neuron devices,
this argument will be used to configure the neuron config that
@@ -320,7 +320,6 @@ class ModelConfig:
factors.append(self.max_logprobs)
factors.append(self.disable_sliding_window)
factors.append(self.trust_remote_code)
factors.append(self.mm_processor_kwargs)
factors.append(self.generation_config)
factors.append(self.model_impl)
factors.append(self.override_generation_config)
@@ -359,12 +358,12 @@ class ModelConfig:
skip_tokenizer_init: bool = False,
served_model_name: Optional[Union[str, list[str]]] = None,
limit_mm_per_prompt: Optional[dict[str, int]] = None,
mm_processor_kwargs: Optional[dict[str, Any]] = None,
disable_mm_preprocessor_cache: bool = False,
use_async_output_proc: bool = True,
config_format: ConfigFormat = ConfigFormat.AUTO,
hf_token: Optional[Union[bool, str]] = None,
hf_overrides: Optional[HfOverrides] = None,
mm_processor_kwargs: Optional[dict[str, Any]] = None,
disable_mm_preprocessor_cache: bool = False,
override_neuron_config: Optional[dict[str, Any]] = None,
override_pooler_config: Optional["PoolerConfig"] = None,
logits_processor_pattern: Optional[str] = None,
@@ -469,8 +468,6 @@ class ModelConfig:
self.model, hf_token=hf_token, revision=revision)
self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
self.use_async_output_proc = use_async_output_proc
self.mm_processor_kwargs = mm_processor_kwargs
self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache
# Set enforce_eager to False if the value is unset.
if self.enforce_eager is None:
@@ -515,7 +512,10 @@ class ModelConfig:
self.served_model_name = get_served_model_name(model,
served_model_name)
self.multimodal_config = self._init_multimodal_config(
limit_mm_per_prompt)
limit_mm_per_prompt=limit_mm_per_prompt,
mm_processor_kwargs=mm_processor_kwargs,
disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
)
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
@@ -581,14 +581,27 @@ class ModelConfig:
self.tokenizer = s3_tokenizer.dir
def _init_multimodal_config(
self, limit_mm_per_prompt: Optional[dict[str, int]]
self,
limit_mm_per_prompt: Optional[dict[str, int]],
mm_processor_kwargs: Optional[dict[str, Any]],
disable_mm_preprocessor_cache: bool,
) -> Optional["MultiModalConfig"]:
if self.registry.is_multimodal_model(self.architectures):
return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
return MultiModalConfig(
limit_per_prompt=limit_mm_per_prompt or {},
mm_processor_kwargs=mm_processor_kwargs or {},
disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
)
if limit_mm_per_prompt:
raise ValueError("`limit_mm_per_prompt` is only supported for "
"multimodal models.")
if mm_processor_kwargs:
raise ValueError("`mm_processor_kwargs` is only supported for "
"multimodal models.")
if disable_mm_preprocessor_cache:
raise ValueError("`disable_mm_preprocessor_cache` is only "
"supported for multimodal models.")
return None
@@ -2776,7 +2789,23 @@ class MultiModalConfig:
Defaults to 1 (V0) or 999 (V1) for each modality.
For example, to allow up to 16 images and 2 videos per prompt:
``{"images": 16, "videos": 2}``
:code:`{"images": 16, "videos": 2}`
"""
mm_processor_kwargs: Optional[dict[str, object]] = None
"""
Overrides for the multi-modal processor obtained from
:meth:`transformers.AutoProcessor.from_pretrained`.
The available overrides depend on the model that is being run.
For example, for Phi-3-Vision:
:code:`{"num_crops": 4}`.
"""
disable_mm_preprocessor_cache: bool = False
"""
If :code:`True`, disable caching of the processed multi-modal inputs.
"""
def compute_hash(self) -> str:
@@ -4080,8 +4109,6 @@ class VllmConfig:
f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa
f"use_async_output_proc={self.model_config.use_async_output_proc}, "
f"disable_mm_preprocessor_cache={self.model_config.disable_mm_preprocessor_cache!r}, " # noqa
f"mm_processor_kwargs={self.model_config.mm_processor_kwargs}, "
f"pooler_config={self.model_config.pooler_config!r}, "
f"compilation_config={self.compilation_config!r}")