Move MultiModalConfig from config/__init__.py to config/multimodal.py (#24659)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -9,7 +9,7 @@ from unittest.mock import MagicMock
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.config import MultiModalConfig
|
from vllm.config.multimodal import MultiModalConfig
|
||||||
from vllm.engine.multiprocessing.client import MQLLMEngineClient
|
from vllm.engine.multiprocessing.client import MQLLMEngineClient
|
||||||
from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
|
from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
|
||||||
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ from unittest.mock import MagicMock
|
|||||||
import pytest
|
import pytest
|
||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
|
|
||||||
from vllm.config import MultiModalConfig
|
from vllm.config.multimodal import MultiModalConfig
|
||||||
from vllm.engine.multiprocessing.client import MQLLMEngineClient
|
from vllm.engine.multiprocessing.client import MQLLMEngineClient
|
||||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.config import ModelConfig, ParallelConfig, VllmConfig
|
from vllm.config import ModelConfig, ParallelConfig, VllmConfig
|
||||||
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.cache import (MultiModalCache,
|
from vllm.multimodal.cache import (MultiModalCache,
|
||||||
MultiModalProcessorCacheItem,
|
MultiModalProcessorCacheItem,
|
||||||
MultiModalProcessorCacheItemMetadata,
|
MultiModalProcessorCacheItemMetadata,
|
||||||
@@ -17,7 +18,6 @@ from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
|
|||||||
MultiModalKwargsItems,
|
MultiModalKwargsItems,
|
||||||
MultiModalSharedField)
|
MultiModalSharedField)
|
||||||
from vllm.multimodal.processing import PromptInsertion
|
from vllm.multimodal.processing import PromptInsertion
|
||||||
from vllm.multimodal.registry import MultiModalRegistry
|
|
||||||
|
|
||||||
|
|
||||||
def _dummy_elem(
|
def _dummy_elem(
|
||||||
@@ -96,7 +96,9 @@ def _create_vllm_config(
|
|||||||
enable_ipc: bool,
|
enable_ipc: bool,
|
||||||
):
|
):
|
||||||
return VllmConfig(
|
return VllmConfig(
|
||||||
model_config=ModelConfig(mm_processor_cache_gb=mm_processor_cache_gb),
|
model_config=ModelConfig(
|
||||||
|
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||||
|
mm_processor_cache_gb=mm_processor_cache_gb),
|
||||||
parallel_config=ParallelConfig(
|
parallel_config=ParallelConfig(
|
||||||
data_parallel_size=1 if enable_ipc else 2),
|
data_parallel_size=1 if enable_ipc else 2),
|
||||||
)
|
)
|
||||||
@@ -113,15 +115,16 @@ def _compare_caches(
|
|||||||
n_iter: int = 100,
|
n_iter: int = 100,
|
||||||
seed: int = 0,
|
seed: int = 0,
|
||||||
):
|
):
|
||||||
mm_registry = MultiModalRegistry()
|
cache_0_p0 = processor_cache_from_config(config_0, MULTIMODAL_REGISTRY)
|
||||||
cache_0_p0 = processor_cache_from_config(config_0, mm_registry)
|
cache_0_p1 = engine_receiver_cache_from_config(config_0,
|
||||||
cache_0_p1 = engine_receiver_cache_from_config(config_0, mm_registry)
|
MULTIMODAL_REGISTRY)
|
||||||
cache_1_p0 = processor_cache_from_config(config_1, mm_registry)
|
cache_1_p0 = processor_cache_from_config(config_1, MULTIMODAL_REGISTRY)
|
||||||
cache_1_p1 = engine_receiver_cache_from_config(config_1, mm_registry)
|
cache_1_p1 = engine_receiver_cache_from_config(config_1,
|
||||||
|
MULTIMODAL_REGISTRY)
|
||||||
|
|
||||||
cache_size_gb = max(
|
cache_size_gb = max(
|
||||||
config_0.model_config.mm_processor_cache_gb,
|
config_0.model_config.multimodal_config.mm_processor_cache_gb,
|
||||||
config_1.model_config.mm_processor_cache_gb,
|
config_1.model_config.multimodal_config.mm_processor_cache_gb,
|
||||||
)
|
)
|
||||||
item_size_gb = int(cache_size_gb / item_capacity)
|
item_size_gb = int(cache_size_gb / item_capacity)
|
||||||
|
|
||||||
|
|||||||
@@ -6,9 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.compilation.backends import VllmBackend
|
from vllm.compilation.backends import VllmBackend
|
||||||
from vllm.config import (ModelConfig, PoolerConfig, VllmConfig, get_field,
|
from vllm.config import ModelConfig, PoolerConfig, VllmConfig, update_config
|
||||||
update_config)
|
|
||||||
from vllm.config.load import LoadConfig
|
from vllm.config.load import LoadConfig
|
||||||
|
from vllm.config.utils import get_field
|
||||||
from vllm.model_executor.layers.pooler import PoolingType
|
from vllm.model_executor.layers.pooler import PoolingType
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ def _mk_processor(monkeypatch,
|
|||||||
raising=True)
|
raising=True)
|
||||||
monkeypatch.setattr(ModelConfig,
|
monkeypatch.setattr(ModelConfig,
|
||||||
"__post_init__",
|
"__post_init__",
|
||||||
lambda self: None,
|
lambda self, *args: None,
|
||||||
raising=True)
|
raising=True)
|
||||||
monkeypatch.setattr(UnspecifiedPlatform,
|
monkeypatch.setattr(UnspecifiedPlatform,
|
||||||
"is_async_output_supported",
|
"is_async_output_supported",
|
||||||
|
|||||||
@@ -11,13 +11,12 @@ import json
|
|||||||
import os
|
import os
|
||||||
import textwrap
|
import textwrap
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import Mapping
|
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from dataclasses import MISSING, Field, field, fields, is_dataclass, replace
|
from dataclasses import InitVar, field, fields, is_dataclass, replace
|
||||||
from functools import cached_property, lru_cache
|
from functools import cached_property, lru_cache
|
||||||
from importlib.util import find_spec
|
from importlib.util import find_spec
|
||||||
from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional,
|
from typing import (TYPE_CHECKING, Any, Callable, Literal, Optional, Protocol,
|
||||||
Protocol, TypeVar, Union, cast, get_args)
|
TypeVar, Union, cast, get_args)
|
||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
import torch
|
import torch
|
||||||
@@ -37,6 +36,8 @@ from vllm.config.kv_events import KVEventsConfig
|
|||||||
from vllm.config.kv_transfer import KVTransferConfig
|
from vllm.config.kv_transfer import KVTransferConfig
|
||||||
from vllm.config.load import LoadConfig
|
from vllm.config.load import LoadConfig
|
||||||
from vllm.config.lora import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
|
from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
|
||||||
|
MultiModalConfig)
|
||||||
from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
|
from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
|
||||||
ParallelConfig)
|
ParallelConfig)
|
||||||
from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
|
from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
|
||||||
@@ -238,31 +239,12 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]:
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def get_field(cls: ConfigType, name: str) -> Field:
|
|
||||||
"""Get the default factory field of a dataclass by name. Used for getting
|
|
||||||
default factory fields in `EngineArgs`."""
|
|
||||||
if not is_dataclass(cls):
|
|
||||||
raise TypeError("The given class is not a dataclass.")
|
|
||||||
cls_fields = {f.name: f for f in fields(cls)}
|
|
||||||
if name not in cls_fields:
|
|
||||||
raise ValueError(f"Field '{name}' not found in {cls.__name__}.")
|
|
||||||
named_field: Field = cls_fields[name]
|
|
||||||
if (default_factory := named_field.default_factory) is not MISSING:
|
|
||||||
return field(default_factory=default_factory)
|
|
||||||
if (default := named_field.default) is not MISSING:
|
|
||||||
return field(default=default)
|
|
||||||
raise ValueError(
|
|
||||||
f"{cls.__name__}.{name} must have a default value or default factory.")
|
|
||||||
|
|
||||||
|
|
||||||
def is_init_field(cls: ConfigType, name: str) -> bool:
|
def is_init_field(cls: ConfigType, name: str) -> bool:
|
||||||
return next(f for f in fields(cls) if f.name == name).init
|
return next(f for f in fields(cls) if f.name == name).init
|
||||||
|
|
||||||
|
|
||||||
TokenizerMode = Literal["auto", "slow", "mistral", "custom"]
|
TokenizerMode = Literal["auto", "slow", "mistral", "custom"]
|
||||||
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
|
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
|
||||||
MMEncoderTPMode = Literal["weights", "data"]
|
|
||||||
MMCacheType = Literal["shm", "lru"]
|
|
||||||
|
|
||||||
|
|
||||||
class LogprobsMode(enum.Enum):
|
class LogprobsMode(enum.Enum):
|
||||||
@@ -407,20 +389,6 @@ class ModelConfig:
|
|||||||
that this name(s) will also be used in `model_name` tag content of
|
that this name(s) will also be used in `model_name` tag content of
|
||||||
prometheus metrics, if multiple names provided, metrics tag will take the
|
prometheus metrics, if multiple names provided, metrics tag will take the
|
||||||
first one."""
|
first one."""
|
||||||
limit_mm_per_prompt: dict[str, int] = field(default_factory=dict)
|
|
||||||
"""Maximum number of data items per modality per prompt. Only applicable
|
|
||||||
for multimodal models."""
|
|
||||||
interleave_mm_strings: bool = False
|
|
||||||
"""Enable fully interleaved support for multimodal prompts, while using
|
|
||||||
--chat-template-content-format=string. Defaults to False."""
|
|
||||||
skip_mm_profiling: bool = False
|
|
||||||
"""When enabled, skips multimodal memory profiling and only profiles with
|
|
||||||
language backbone model during engine initialization.
|
|
||||||
"""
|
|
||||||
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
|
||||||
"""Additional args passed to process media inputs, keyed by modalities.
|
|
||||||
For example, to set num_frames for video, set
|
|
||||||
`--media-io-kwargs '{"video": {"num_frames": 40} }'` """
|
|
||||||
use_async_output_proc: bool = True
|
use_async_output_proc: bool = True
|
||||||
"""Whether to use async output processor."""
|
"""Whether to use async output processor."""
|
||||||
config_format: Union[str, ConfigFormat] = "auto"
|
config_format: Union[str, ConfigFormat] = "auto"
|
||||||
@@ -436,41 +404,6 @@ class ModelConfig:
|
|||||||
hf_overrides: HfOverrides = field(default_factory=dict)
|
hf_overrides: HfOverrides = field(default_factory=dict)
|
||||||
"""If a dictionary, contains arguments to be forwarded to the Hugging Face
|
"""If a dictionary, contains arguments to be forwarded to the Hugging Face
|
||||||
config. If a callable, it is called to update the HuggingFace config."""
|
config. If a callable, it is called to update the HuggingFace config."""
|
||||||
mm_processor_kwargs: Optional[dict[str, Any]] = None
|
|
||||||
"""Arguments to be forwarded to the model's processor for multi-modal data,
|
|
||||||
e.g., image processor. Overrides for the multi-modal processor obtained
|
|
||||||
from `AutoProcessor.from_pretrained`. The available overrides depend on the
|
|
||||||
model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`.
|
|
||||||
"""
|
|
||||||
mm_processor_cache_gb: float = 4
|
|
||||||
"""The size (in GiB) of the multi-modal processor cache, which is used to
|
|
||||||
avoid re-processing past multi-modal inputs.
|
|
||||||
|
|
||||||
This cache is duplicated for each API process and engine core process,
|
|
||||||
resulting in a total memory usage of
|
|
||||||
`mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
|
|
||||||
|
|
||||||
Set to `0` to disable this cache completely (not recommended)."""
|
|
||||||
mm_processor_cache_type: MMCacheType = "lru"
|
|
||||||
"""Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
|
|
||||||
use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
|
|
||||||
mm_shm_cache_max_object_size_mb: int = 128
|
|
||||||
"""Size limit (in MiB) for each object stored in the multi-modal processor
|
|
||||||
shared memory cache. Only effective when `mm_processor_cache_type` is
|
|
||||||
`"shm"`."""
|
|
||||||
mm_encoder_tp_mode: MMEncoderTPMode = "weights"
|
|
||||||
"""Indicates how to optimize multi-modal encoder inference using
|
|
||||||
tensor parallelism (TP).
|
|
||||||
|
|
||||||
- `"weights"`: Within the same vLLM engine, split the weights of
|
|
||||||
each layer across TP ranks. (default TP behavior)
|
|
||||||
- `"data"`: Within the same vLLM engine, split the batched input data
|
|
||||||
across TP ranks to process the data in parallel, while hosting
|
|
||||||
the full weights on each TP rank.
|
|
||||||
This batch-level DP is not to be confused with API request-level
|
|
||||||
DP (which is controlled by `--data-parallel-size`).
|
|
||||||
This is only supported on a per-model basis and falls back to
|
|
||||||
`"weights"` if the encoder does not support DP."""
|
|
||||||
pooler_config: Optional["PoolerConfig"] = field(init=False)
|
pooler_config: Optional["PoolerConfig"] = field(init=False)
|
||||||
"""Pooler config which controls the behaviour of output pooling in pooling
|
"""Pooler config which controls the behaviour of output pooling in pooling
|
||||||
models."""
|
models."""
|
||||||
@@ -513,6 +446,18 @@ class ModelConfig:
|
|||||||
io_processor_plugin: Optional[str] = None
|
io_processor_plugin: Optional[str] = None
|
||||||
"""IOProcessor plugin name to load at model startup"""
|
"""IOProcessor plugin name to load at model startup"""
|
||||||
|
|
||||||
|
# Multimodal config and init vars
|
||||||
|
multimodal_config: Optional[MultiModalConfig] = None
|
||||||
|
limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None
|
||||||
|
media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
|
||||||
|
mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
|
||||||
|
mm_processor_cache_gb: InitVar[Optional[float]] = None
|
||||||
|
mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None
|
||||||
|
mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None
|
||||||
|
mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None
|
||||||
|
interleave_mm_strings: InitVar[Optional[bool]] = None
|
||||||
|
skip_mm_profiling: InitVar[Optional[bool]] = None
|
||||||
|
|
||||||
def compute_hash(self) -> str:
|
def compute_hash(self) -> str:
|
||||||
"""
|
"""
|
||||||
WARNING: Whenever a new field is added to this config,
|
WARNING: Whenever a new field is added to this config,
|
||||||
@@ -546,7 +491,18 @@ class ModelConfig:
|
|||||||
assert_hashable(str_factors)
|
assert_hashable(str_factors)
|
||||||
return hashlib.sha256(str(factors).encode()).hexdigest()
|
return hashlib.sha256(str(factors).encode()).hexdigest()
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(
|
||||||
|
self,
|
||||||
|
# Multimodal config init vars
|
||||||
|
limit_mm_per_prompt: Optional[dict[str, int]],
|
||||||
|
media_io_kwargs: Optional[dict[str, dict[str, Any]]],
|
||||||
|
mm_processor_kwargs: Optional[dict[str, Any]],
|
||||||
|
mm_processor_cache_gb: Optional[float],
|
||||||
|
mm_processor_cache_type: Optional[MMCacheType],
|
||||||
|
mm_shm_cache_max_object_size_mb: Optional[int],
|
||||||
|
mm_encoder_tp_mode: Optional[MMEncoderTPMode],
|
||||||
|
interleave_mm_strings: Optional[bool],
|
||||||
|
skip_mm_profiling: Optional[bool]) -> None:
|
||||||
# Set the default seed to 0 in V1.
|
# Set the default seed to 0 in V1.
|
||||||
# NOTE(woosuk): In V0, we set the default seed to None because the
|
# NOTE(woosuk): In V0, we set the default seed to None because the
|
||||||
# driver worker shares the same process as the user process, and thus
|
# driver worker shares the same process as the user process, and thus
|
||||||
@@ -777,7 +733,33 @@ class ModelConfig:
|
|||||||
|
|
||||||
self.original_max_model_len = self.max_model_len
|
self.original_max_model_len = self.max_model_len
|
||||||
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
|
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
|
||||||
self.multimodal_config = self._init_multimodal_config()
|
# Init multimodal config if needed
|
||||||
|
if self._model_info.supports_multimodal:
|
||||||
|
if (mm_encoder_tp_mode == "data" and
|
||||||
|
not self._model_info.supports_multimodal_encoder_tp_data):
|
||||||
|
logger.warning_once(
|
||||||
|
"This model does not support `--mm-encoder-tp-mode data`. "
|
||||||
|
"Falling back to `--mm-encoder-tp-mode weights`.")
|
||||||
|
mm_encoder_tp_mode = "weights"
|
||||||
|
|
||||||
|
mm_config_kwargs = dict(
|
||||||
|
limit_per_prompt=limit_mm_per_prompt,
|
||||||
|
media_io_kwargs=media_io_kwargs,
|
||||||
|
mm_processor_kwargs=mm_processor_kwargs,
|
||||||
|
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||||
|
mm_processor_cache_type=mm_processor_cache_type,
|
||||||
|
mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb,
|
||||||
|
mm_encoder_tp_mode=mm_encoder_tp_mode,
|
||||||
|
interleave_mm_strings=interleave_mm_strings,
|
||||||
|
skip_mm_profiling=skip_mm_profiling,
|
||||||
|
)
|
||||||
|
|
||||||
|
mm_config_kwargs = {
|
||||||
|
k: v
|
||||||
|
for k, v in mm_config_kwargs.items() if v is not None
|
||||||
|
}
|
||||||
|
|
||||||
|
self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
|
||||||
|
|
||||||
if self.disable_sliding_window:
|
if self.disable_sliding_window:
|
||||||
# Set after get_and_verify_max_len to ensure that max_model_len
|
# Set after get_and_verify_max_len to ensure that max_model_len
|
||||||
@@ -875,30 +857,6 @@ class ModelConfig:
|
|||||||
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"])
|
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"])
|
||||||
self.tokenizer = object_storage_tokenizer.dir
|
self.tokenizer = object_storage_tokenizer.dir
|
||||||
|
|
||||||
def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:
|
|
||||||
if self._model_info.supports_multimodal:
|
|
||||||
if (self.mm_encoder_tp_mode == "data" and
|
|
||||||
not self._model_info.supports_multimodal_encoder_tp_data):
|
|
||||||
logger.warning_once(
|
|
||||||
"This model does not support `--mm-encoder-tp-mode data`. "
|
|
||||||
"Falling back to `--mm-encoder-tp-mode weights`.")
|
|
||||||
self.mm_encoder_tp_mode = "weights"
|
|
||||||
|
|
||||||
return MultiModalConfig(
|
|
||||||
limit_per_prompt=self.limit_mm_per_prompt,
|
|
||||||
media_io_kwargs=self.media_io_kwargs,
|
|
||||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
|
||||||
mm_processor_cache_gb=self.mm_processor_cache_gb,
|
|
||||||
mm_processor_cache_type=self.mm_processor_cache_type,
|
|
||||||
mm_shm_cache_max_object_size_mb=self.
|
|
||||||
mm_shm_cache_max_object_size_mb,
|
|
||||||
mm_encoder_tp_mode=self.mm_encoder_tp_mode,
|
|
||||||
interleave_mm_strings=self.interleave_mm_strings,
|
|
||||||
skip_mm_profiling=self.skip_mm_profiling,
|
|
||||||
)
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def _get_encoder_config(self):
|
def _get_encoder_config(self):
|
||||||
return get_sentence_transformer_tokenizer_config(
|
return get_sentence_transformer_tokenizer_config(
|
||||||
self.model, self.revision)
|
self.model, self.revision)
|
||||||
@@ -2417,129 +2375,6 @@ class SpeculativeConfig:
|
|||||||
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
|
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
|
||||||
|
|
||||||
|
|
||||||
@config
|
|
||||||
@dataclass
|
|
||||||
class MultiModalConfig:
|
|
||||||
"""Controls the behavior of multimodal models."""
|
|
||||||
|
|
||||||
limit_per_prompt: dict[str, int] = \
|
|
||||||
cast(dict[str, int], get_field(ModelConfig, "limit_mm_per_prompt"))
|
|
||||||
"""
|
|
||||||
The maximum number of input items allowed per prompt for each modality.
|
|
||||||
Defaults to 1 (V0) or 999 (V1) for each modality.
|
|
||||||
|
|
||||||
For example, to allow up to 16 images and 2 videos per prompt:
|
|
||||||
`{"image": 16, "video": 2}`
|
|
||||||
"""
|
|
||||||
|
|
||||||
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
|
||||||
"""Additional args passed to process media inputs, keyed by modalities.
|
|
||||||
For example, to set num_frames for video, set
|
|
||||||
`--media-io-kwargs '{"video": {"num_frames": 40} }'` """
|
|
||||||
|
|
||||||
mm_processor_kwargs: Optional[dict[str, object]] = None
|
|
||||||
"""
|
|
||||||
Overrides for the multi-modal processor obtained from
|
|
||||||
`transformers.AutoProcessor.from_pretrained`.
|
|
||||||
|
|
||||||
The available overrides depend on the model that is being run.
|
|
||||||
|
|
||||||
For example, for Phi-3-Vision:
|
|
||||||
`{"num_crops": 4}`.
|
|
||||||
"""
|
|
||||||
|
|
||||||
mm_processor_cache_gb: float = 4
|
|
||||||
"""
|
|
||||||
The size (in GiB) of the multi-modal processor cache, which is used to
|
|
||||||
|
|
||||||
This cache is duplicated for each API process and engine core process,
|
|
||||||
resulting in a total memory usage of
|
|
||||||
`mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
|
|
||||||
|
|
||||||
Set to `0` to disable this cache completely (not recommended).
|
|
||||||
"""
|
|
||||||
|
|
||||||
mm_processor_cache_type: MMCacheType = "lru"
|
|
||||||
"""Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
|
|
||||||
use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
|
|
||||||
|
|
||||||
mm_shm_cache_max_object_size_mb: int = 128
|
|
||||||
"""Size limit (in MiB) for each object stored in the multi-modal processor
|
|
||||||
shared memory cache. Only effective when `mm_processor_cache_type` is
|
|
||||||
`"shm"`."""
|
|
||||||
|
|
||||||
mm_encoder_tp_mode: MMEncoderTPMode = "weights"
|
|
||||||
"""
|
|
||||||
Indicates how to optimize multi-modal encoder inference using
|
|
||||||
tensor parallelism (TP).
|
|
||||||
|
|
||||||
- `"weights"`: Within the same vLLM engine, split the weights of
|
|
||||||
each layer across TP ranks. (default TP behavior)
|
|
||||||
- `"data"`: Within the same vLLM engine, split the batched input data
|
|
||||||
across TP ranks to process the data in parallel, while hosting
|
|
||||||
the full weights on each TP rank.
|
|
||||||
This batch-level DP is not to be confused with API request-level
|
|
||||||
DP (which is controlled by `--data-parallel-size`).
|
|
||||||
This is only supported on a per-model basis and falls back to
|
|
||||||
`"weights"` if the encoder does not support DP.
|
|
||||||
"""
|
|
||||||
|
|
||||||
interleave_mm_strings: bool = False
|
|
||||||
"""
|
|
||||||
Enable fully interleaved support for multimodal prompts.
|
|
||||||
"""
|
|
||||||
|
|
||||||
skip_mm_profiling: bool = False
|
|
||||||
"""
|
|
||||||
When enabled, skips multimodal memory profiling and only profiles with
|
|
||||||
language backbone model during engine initialization.
|
|
||||||
|
|
||||||
This reduces engine startup time but shifts the responsibility to users for
|
|
||||||
estimating the peak memory usage of the activation of multimodal encoder and
|
|
||||||
embedding cache.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def compute_hash(self) -> str:
|
|
||||||
"""
|
|
||||||
WARNING: Whenever a new field is added to this config,
|
|
||||||
ensure that it is included in the factors list if
|
|
||||||
it affects the computation graph.
|
|
||||||
|
|
||||||
Provide a hash that uniquely identifies all the configs
|
|
||||||
that affect the structure of the computation
|
|
||||||
graph from input ids/embeddings to the final hidden states,
|
|
||||||
excluding anything before input ids/embeddings and after
|
|
||||||
the final hidden states.
|
|
||||||
"""
|
|
||||||
# no factors to consider.
|
|
||||||
# this config will not affect the computation graph.
|
|
||||||
factors: list[Any] = []
|
|
||||||
hash_str = hashlib.md5(str(factors).encode(),
|
|
||||||
usedforsecurity=False).hexdigest()
|
|
||||||
return hash_str
|
|
||||||
|
|
||||||
def get_limit_per_prompt(self, modality: str) -> int:
|
|
||||||
"""
|
|
||||||
Get the maximum number of input items allowed per prompt
|
|
||||||
for the given modality.
|
|
||||||
"""
|
|
||||||
return self.limit_per_prompt.get(
|
|
||||||
modality,
|
|
||||||
999 if envs.VLLM_USE_V1 else 1,
|
|
||||||
)
|
|
||||||
|
|
||||||
def merge_mm_processor_kwargs(
|
|
||||||
self,
|
|
||||||
inference_kwargs: Mapping[str, object],
|
|
||||||
) -> dict[str, object]:
|
|
||||||
"""
|
|
||||||
Get the keyword arguments to pass to the multi-modal processor
|
|
||||||
according to the extra arguments passed during inference.
|
|
||||||
"""
|
|
||||||
kwargs = self.mm_processor_kwargs or {}
|
|
||||||
return kwargs | dict(inference_kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@config
|
@config
|
||||||
@dataclass
|
@dataclass
|
||||||
class PoolerConfig:
|
class PoolerConfig:
|
||||||
|
|||||||
120
vllm/config/multimodal.py
Normal file
120
vllm/config/multimodal.py
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
from collections.abc import Mapping
|
||||||
|
from dataclasses import field
|
||||||
|
from typing import Any, Literal, Optional
|
||||||
|
|
||||||
|
from pydantic.dataclasses import dataclass
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
|
from vllm.config.utils import config
|
||||||
|
|
||||||
|
MMEncoderTPMode = Literal["weights", "data"]
|
||||||
|
MMCacheType = Literal["shm", "lru"]
|
||||||
|
|
||||||
|
|
||||||
|
@config
|
||||||
|
@dataclass
|
||||||
|
class MultiModalConfig:
|
||||||
|
"""Controls the behavior of multimodal models."""
|
||||||
|
|
||||||
|
limit_per_prompt: dict[str, int] = field(default_factory=dict)
|
||||||
|
"""The maximum number of input items allowed per prompt for each modality.
|
||||||
|
Defaults to 1 (V0) or 999 (V1) for each modality.
|
||||||
|
|
||||||
|
For example, to allow up to 16 images and 2 videos per prompt:
|
||||||
|
`{"image": 16, "video": 2}`"""
|
||||||
|
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||||
|
"""Additional args passed to process media inputs, keyed by modalities.
|
||||||
|
For example, to set num_frames for video, set
|
||||||
|
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
|
||||||
|
mm_processor_kwargs: Optional[dict[str, object]] = None
|
||||||
|
"""Arguments to be forwarded to the model's processor for multi-modal data,
|
||||||
|
e.g., image processor. Overrides for the multi-modal processor obtained
|
||||||
|
from `transformers.AutoProcessor.from_pretrained`.
|
||||||
|
|
||||||
|
The available overrides depend on the model that is being run.
|
||||||
|
|
||||||
|
For example, for Phi-3-Vision:
|
||||||
|
`{"num_crops": 4}`."""
|
||||||
|
mm_processor_cache_gb: float = 4
|
||||||
|
"""The size (in GiB) of the multi-modal processor cache, which is used to
|
||||||
|
avoid re-processing past multi-modal inputs.
|
||||||
|
|
||||||
|
This cache is duplicated for each API process and engine core process,
|
||||||
|
resulting in a total memory usage of
|
||||||
|
`mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
|
||||||
|
|
||||||
|
Set to `0` to disable this cache completely (not recommended)."""
|
||||||
|
mm_processor_cache_type: MMCacheType = "lru"
|
||||||
|
"""Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
|
||||||
|
use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
|
||||||
|
mm_shm_cache_max_object_size_mb: int = 128
|
||||||
|
"""Size limit (in MiB) for each object stored in the multi-modal processor
|
||||||
|
shared memory cache. Only effective when `mm_processor_cache_type` is
|
||||||
|
`"shm"`."""
|
||||||
|
mm_encoder_tp_mode: MMEncoderTPMode = "weights"
|
||||||
|
"""Indicates how to optimize multi-modal encoder inference using tensor
|
||||||
|
parallelism (TP).
|
||||||
|
|
||||||
|
- `"weights"`: Within the same vLLM engine, split the weights of
|
||||||
|
each layer across TP ranks. (default TP behavior)\n
|
||||||
|
- `"data"`: Within the same vLLM engine, split the batched input data
|
||||||
|
across TP ranks to process the data in parallel, while hosting
|
||||||
|
the full weights on each TP rank.
|
||||||
|
This batch-level DP is not to be confused with API request-level
|
||||||
|
DP (which is controlled by `--data-parallel-size`).
|
||||||
|
This is only supported on a per-model basis and falls back to
|
||||||
|
`"weights"` if the encoder does not support DP."""
|
||||||
|
interleave_mm_strings: bool = False
|
||||||
|
"""Enable fully interleaved support for multimodal prompts, while using
|
||||||
|
--chat-template-content-format=string."""
|
||||||
|
skip_mm_profiling: bool = False
|
||||||
|
"""When enabled, skips multimodal memory profiling and only profiles with
|
||||||
|
language backbone model during engine initialization.
|
||||||
|
|
||||||
|
This reduces engine startup time but shifts the responsibility to users for
|
||||||
|
estimating the peak memory usage of the activation of multimodal encoder and
|
||||||
|
embedding cache."""
|
||||||
|
|
||||||
|
def compute_hash(self) -> str:
|
||||||
|
"""
|
||||||
|
WARNING: Whenever a new field is added to this config,
|
||||||
|
ensure that it is included in the factors list if
|
||||||
|
it affects the computation graph.
|
||||||
|
|
||||||
|
Provide a hash that uniquely identifies all the configs
|
||||||
|
that affect the structure of the computation
|
||||||
|
graph from input ids/embeddings to the final hidden states,
|
||||||
|
excluding anything before input ids/embeddings and after
|
||||||
|
the final hidden states.
|
||||||
|
"""
|
||||||
|
# no factors to consider.
|
||||||
|
# this config will not affect the computation graph.
|
||||||
|
factors: list[Any] = []
|
||||||
|
hash_str = hashlib.md5(str(factors).encode(),
|
||||||
|
usedforsecurity=False).hexdigest()
|
||||||
|
return hash_str
|
||||||
|
|
||||||
|
def get_limit_per_prompt(self, modality: str) -> int:
|
||||||
|
"""
|
||||||
|
Get the maximum number of input items allowed per prompt
|
||||||
|
for the given modality.
|
||||||
|
"""
|
||||||
|
return self.limit_per_prompt.get(
|
||||||
|
modality,
|
||||||
|
999 if envs.VLLM_USE_V1 else 1,
|
||||||
|
)
|
||||||
|
|
||||||
|
def merge_mm_processor_kwargs(
|
||||||
|
self,
|
||||||
|
inference_kwargs: Mapping[str, object],
|
||||||
|
) -> dict[str, object]:
|
||||||
|
"""
|
||||||
|
Get the keyword arguments to pass to the multi-modal processor
|
||||||
|
according to the extra arguments passed during inference.
|
||||||
|
"""
|
||||||
|
kwargs = self.mm_processor_kwargs or {}
|
||||||
|
return kwargs | dict(inference_kwargs)
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from dataclasses import MISSING, Field, field, fields, is_dataclass
|
||||||
from typing import TYPE_CHECKING, TypeVar
|
from typing import TYPE_CHECKING, TypeVar
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -27,3 +28,20 @@ def config(cls: ConfigT) -> ConfigT:
|
|||||||
script, which is invoked during the pre-commit checks.
|
script, which is invoked during the pre-commit checks.
|
||||||
"""
|
"""
|
||||||
return cls
|
return cls
|
||||||
|
|
||||||
|
|
||||||
|
def get_field(cls: ConfigType, name: str) -> Field:
|
||||||
|
"""Get the default factory field of a dataclass by name. Used for getting
|
||||||
|
default factory fields in `EngineArgs`."""
|
||||||
|
if not is_dataclass(cls):
|
||||||
|
raise TypeError("The given class is not a dataclass.")
|
||||||
|
cls_fields = {f.name: f for f in fields(cls)}
|
||||||
|
if name not in cls_fields:
|
||||||
|
raise ValueError(f"Field '{name}' not found in {cls.__name__}.")
|
||||||
|
named_field: Field = cls_fields[name]
|
||||||
|
if (default_factory := named_field.default_factory) is not MISSING:
|
||||||
|
return field(default_factory=default_factory)
|
||||||
|
if (default := named_field.default) is not MISSING:
|
||||||
|
return field(default=default)
|
||||||
|
raise ValueError(
|
||||||
|
f"{cls.__name__}.{name} must have a default value or default factory.")
|
||||||
|
|||||||
@@ -27,12 +27,14 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
|
|||||||
DistributedExecutorBackend, EPLBConfig,
|
DistributedExecutorBackend, EPLBConfig,
|
||||||
GuidedDecodingBackend, HfOverrides, KVEventsConfig,
|
GuidedDecodingBackend, HfOverrides, KVEventsConfig,
|
||||||
KVTransferConfig, LoadConfig, LogprobsMode,
|
KVTransferConfig, LoadConfig, LogprobsMode,
|
||||||
LoRAConfig, MambaDType, MMCacheType, MMEncoderTPMode,
|
LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig,
|
||||||
ModelConfig, ModelDType, ModelImpl, MultiModalConfig,
|
ModelDType, ModelImpl, ObservabilityConfig,
|
||||||
ObservabilityConfig, ParallelConfig, PoolerConfig,
|
ParallelConfig, PoolerConfig, PrefixCachingHashAlgo,
|
||||||
PrefixCachingHashAlgo, RunnerOption, SchedulerConfig,
|
RunnerOption, SchedulerConfig, SchedulerPolicy,
|
||||||
SchedulerPolicy, SpeculativeConfig, TaskOption,
|
SpeculativeConfig, TaskOption, TokenizerMode,
|
||||||
TokenizerMode, VllmConfig, get_attr_docs, get_field)
|
VllmConfig, get_attr_docs)
|
||||||
|
from vllm.config.multimodal import MMCacheType, MultiModalConfig
|
||||||
|
from vllm.config.utils import get_field
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.platforms import CpuArchEnum, current_platform
|
from vllm.platforms import CpuArchEnum, current_platform
|
||||||
from vllm.plugins import load_general_plugins
|
from vllm.plugins import load_general_plugins
|
||||||
|
|||||||
@@ -800,9 +800,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self._tracker = tracker
|
self._tracker = tracker
|
||||||
|
multimodal_config = self._tracker.model_config.multimodal_config
|
||||||
|
media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
|
||||||
self._connector = MediaConnector(
|
self._connector = MediaConnector(
|
||||||
media_io_kwargs=self._tracker._model_config.media_io_kwargs,
|
media_io_kwargs=media_io_kwargs,
|
||||||
allowed_local_media_path=tracker.allowed_local_media_path,
|
allowed_local_media_path=tracker.allowed_local_media_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -883,8 +884,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self._tracker = tracker
|
self._tracker = tracker
|
||||||
|
multimodal_config = self._tracker.model_config.multimodal_config
|
||||||
|
media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
|
||||||
self._connector = MediaConnector(
|
self._connector = MediaConnector(
|
||||||
media_io_kwargs=self._tracker._model_config.media_io_kwargs,
|
media_io_kwargs=media_io_kwargs,
|
||||||
allowed_local_media_path=tracker.allowed_local_media_path,
|
allowed_local_media_path=tracker.allowed_local_media_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -229,7 +229,8 @@ class MultiModalProcessingInfo(BaseProcessingInfo):
|
|||||||
def get_max_image_tokens(self) -> int:
|
def get_max_image_tokens(self) -> int:
|
||||||
width, height = self.get_max_image_size()
|
width, height = self.get_max_image_size()
|
||||||
processor = self.get_hf_processor()
|
processor = self.get_hf_processor()
|
||||||
mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {}
|
multimodal_config = self.ctx.model_config.multimodal_config
|
||||||
|
mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
|
||||||
mm_tokens = processor._get_num_multimodal_tokens(
|
mm_tokens = processor._get_num_multimodal_tokens(
|
||||||
image_sizes=([height, width], ), **mm_processor_kwargs)
|
image_sizes=([height, width], ), **mm_processor_kwargs)
|
||||||
image_tokens = mm_tokens["num_image_tokens"][0]
|
image_tokens = mm_tokens["num_image_tokens"][0]
|
||||||
@@ -380,8 +381,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
|
|||||||
# Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1
|
# Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1
|
||||||
mm_positions = torch.where(mm_token_type_ids == 1)[1]
|
mm_positions = torch.where(mm_token_type_ids == 1)[1]
|
||||||
images = mm_items.get_items("image", ImageProcessorItems)
|
images = mm_items.get_items("image", ImageProcessorItems)
|
||||||
mm_processor_kwargs = (self.info.ctx.model_config.mm_processor_kwargs
|
multimodal_config = self.info.ctx.model_config.multimodal_config
|
||||||
or {})
|
mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
|
||||||
image_sizes = []
|
image_sizes = []
|
||||||
for item_idx in range(len(images)):
|
for item_idx in range(len(images)):
|
||||||
image_size = images.get_image_size(item_idx)
|
image_size = images.get_image_size(item_idx)
|
||||||
|
|||||||
Reference in New Issue
Block a user