[V1] Consolidate MM cache size to vllm.envs (#13239)
vllm/envs.py
@@ -55,6 +55,7 @@ if TYPE_CHECKING:
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_VIDEO_FETCH_TIMEOUT: int = 30
     VLLM_AUDIO_FETCH_TIMEOUT: int = 10
+    VLLM_MM_INPUT_CACHE_SIZE: int = 256
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
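For orientation, a minimal standalone sketch (not vllm's full envs.py) of the pattern this hunk extends: the if TYPE_CHECKING block only declares names and types for static checkers, while runtime values are resolved lazily through the environment_variables table in the next hunk.

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; no value is bound here at runtime.
    # Runtime access goes through the module-level __getattr__ instead.
    VLLM_MM_INPUT_CACHE_SIZE: int = 256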
@@ -401,15 +402,21 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
 
     # Timeout for fetching videos when serving multimodal models
-    # Default is 15 seconds
+    # Default is 30 seconds
     "VLLM_VIDEO_FETCH_TIMEOUT":
-    lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "15")),
+    lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")),
 
     # Timeout for fetching audio when serving multimodal models
     # Default is 10 seconds
     "VLLM_AUDIO_FETCH_TIMEOUT":
     lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),
 
+    # Cache size for multimodal feature/input cache for multimodal models
+    # in unit of number of multimodal data items (e.g. image, video, audio).
+    # Default is 256 multimodal data items.
+    "VLLM_MM_INPUT_CACHE_SIZE":
+    lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_SIZE", "256")),
+
     # Path to the XLA persistent cache directory.
     # Only used for XLA devices such as TPUs.
     "VLLM_XLA_CACHE_PATH":
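A minimal self-contained sketch of how entries in this table become module attributes, assuming the lazy module-level __getattr__ (PEP 562) pattern that vllm/envs.py uses; only the new entry is reproduced here.

import os
from typing import Any, Callable, Dict

environment_variables: Dict[str, Callable[[], Any]] = {
    # Each value is a thunk, so os.environ is consulted at access time,
    # not when the table is built.
    "VLLM_MM_INPUT_CACHE_SIZE":
    lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_SIZE", "256")),
}

def __getattr__(name: str) -> Any:
    # Invoked for names not found in the module namespace (PEP 562).
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

Note that a `from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE`, as the hunks below use, triggers this lookup once and binds the result, so the environment variable must be set before vllm is first imported.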
vllm/multimodal/registry.py
@@ -8,6 +8,7 @@ from typing import (TYPE_CHECKING, Any, Dict, Generic, Mapping, Optional,
 
 import torch.nn as nn
 
+from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE
 from vllm.inputs import InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -28,9 +29,6 @@ if TYPE_CHECKING:
 
 logger = init_logger(__name__)
 
-# TODO: Tune the MM cache size
-MM_CACHE_SIZE = 256
-
 N = TypeVar("N", bound=Type[nn.Module])
 _I = TypeVar("_I", bound=BaseProcessingInfo)
 _I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
@@ -121,7 +119,7 @@ class MultiModalRegistry:
 
         self._limits_by_model = _MultiModalLimits()
 
-        self._processing_cache = ProcessingCache(MM_CACHE_SIZE)
+        self._processing_cache = ProcessingCache(VLLM_MM_INPUT_CACHE_SIZE)
 
     def register_plugin(self, plugin: MultiModalPlugin) -> None:
        """
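The capacity passed to ProcessingCache counts multimodal items, not bytes, per the env-var comment above. As an illustrative analogue (this is not vllm's implementation), an item-count-bounded LRU looks like:

from collections import OrderedDict
from typing import Generic, Optional, TypeVar

K = TypeVar("K")
V = TypeVar("V")

class ItemLRU(Generic[K, V]):
    """Illustrative LRU bounded by number of entries, matching the
    256-item default of VLLM_MM_INPUT_CACHE_SIZE."""

    def __init__(self, capacity: int) -> None:
        self.capacity = capacity
        self._data: "OrderedDict[K, V]" = OrderedDict()

    def get(self, key: K) -> Optional[V]:
        if key not in self._data:
            return None
        self._data.move_to_end(key)  # mark as most recently used
        return self._data[key]

    def put(self, key: K, value: V) -> None:
        self._data[key] = value
        self._data.move_to_end(key)
        while len(self._data) > self.capacity:
            self._data.popitem(last=False)  # evict least recently used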
vllm/v1/engine/mm_input_cache.py
@@ -3,6 +3,7 @@
 from typing import Any, Dict, List, Optional
 
 from vllm.config import ModelConfig
+from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE
 from vllm.logger import init_logger
 from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
                              MultiModalKwargs, MultiModalRegistry)
@@ -28,9 +29,8 @@ logger = init_logger(__name__)
 # client (=P0) and server (=P1) processes.
 
 # Both Client and Server must use the same cache size
-# (to perform mirrored caching)
-# TODO: Tune the MM cache size
-MM_CACHE_SIZE = 256
+# (to perform mirrored caching). This cache size is set by the environment
+# variable VLLM_MM_INPUT_CACHE_SIZE.
 
 
 # TODO(ywang96): Deprecate this class once all multimodal models migrate to use
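A hypothetical sketch of the mirrored-caching idea described in that comment (not the real MMInputCacheClient/MMInputCacheServer): because both processes run the same LRU policy over the same key sequence with the same VLLM_MM_INPUT_CACHE_SIZE, a hit on the client guarantees the entry is still resident on the server, so only the hash needs to cross the process boundary.

from collections import OrderedDict
from typing import Optional

CACHE_SIZE = 256  # both sides must read the same VLLM_MM_INPUT_CACHE_SIZE

def _touch(cache: "OrderedDict[str, dict]", key: str, value: dict) -> None:
    # Insert or refresh a key, evicting the least recently used on overflow.
    cache[key] = value
    cache.move_to_end(key)
    while len(cache) > CACHE_SIZE:
        cache.popitem(last=False)

def client_send(cache: "OrderedDict[str, dict]", mm_hash: str,
                mm_kwargs: dict) -> Optional[dict]:
    # P0 side: on a hit, the mirrored server cache must also hold mm_hash,
    # so the (potentially large) kwargs payload is omitted from the RPC.
    hit = mm_hash in cache
    _touch(cache, mm_hash, mm_kwargs)
    return None if hit else mm_kwargs

def server_receive(cache: "OrderedDict[str, dict]", mm_hash: str,
                   payload: Optional[dict]) -> dict:
    # P1 side: payload is None exactly when the client hit; recover the
    # value from the local mirror and apply the identical LRU update.
    value = cache[mm_hash] if payload is None else payload
    _touch(cache, mm_hash, value)
    return value

Because both caches apply identical updates in the same order, their contents stay in lockstep, which is why the two LRUCache constructions below must take their size from the same environment variable.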
@@ -50,7 +50,8 @@ class MMInputCacheClient:
 
         # Init cache
         self.use_cache = not model_config.disable_mm_preprocessor_cache
-        self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)
+        self.mm_cache = LRUCache[str,
+                                 MultiModalKwargs](VLLM_MM_INPUT_CACHE_SIZE)
 
         # DEBUG: Set to None to disable
         self.mm_debug_cache_hit_ratio_steps = None
@@ -127,7 +128,8 @@ class MMInputCacheServer:
 
     def __init__(self, model_config):
         self.use_cache = not model_config.disable_mm_preprocessor_cache
-        self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)
+        self.mm_cache = LRUCache[str,
+                                 MultiModalKwargs](VLLM_MM_INPUT_CACHE_SIZE)
 
     def get_and_update(
         self,