[Core] Shared memory based object store for Multimodal data caching and IPC (#20452)
Signed-off-by: donglu <donglu@cohere.com>
This commit is contained in:
@@ -262,6 +262,7 @@ def is_init_field(cls: ConfigType, name: str) -> bool:
|
||||
TokenizerMode = Literal["auto", "slow", "mistral", "custom"]
|
||||
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
|
||||
# Allowed values for `mm_encoder_tp_mode`, which indicates how multi-modal
# encoder inference is optimized using tensor parallelism (TP).
MMEncoderTPMode = Literal["weights", "data"]
|
||||
# Allowed values for `mm_processor_cache_type`:
#   "shm" - shared memory FIFO cache
#   "lru" - mirrored LRU cache
MMCacheType = Literal["shm", "lru"]
|
||||
|
||||
|
||||
class LogprobsMode(enum.Enum):
|
||||
@@ -450,6 +451,13 @@ class ModelConfig:
|
||||
`mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
|
||||
|
||||
Set to `0` to disable this cache completely (not recommended)."""
|
||||
mm_processor_cache_type: MMCacheType = "lru"
|
||||
"""Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
|
||||
use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
|
||||
mm_shm_cache_max_object_size_mb: int = 128
|
||||
"""Size limit (in MiB) for each object stored in the multi-modal processor
|
||||
shared memory cache. Only effective when `mm_processor_cache_type` is
|
||||
`"shm"`."""
|
||||
mm_encoder_tp_mode: MMEncoderTPMode = "weights"
|
||||
"""Indicates how to optimize multi-modal encoder inference using
|
||||
tensor parallelism (TP).
|
||||
@@ -881,6 +889,9 @@ class ModelConfig:
|
||||
media_io_kwargs=self.media_io_kwargs,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
mm_processor_cache_gb=self.mm_processor_cache_gb,
|
||||
mm_processor_cache_type=self.mm_processor_cache_type,
|
||||
mm_shm_cache_max_object_size_mb=self.
|
||||
mm_shm_cache_max_object_size_mb,
|
||||
mm_encoder_tp_mode=self.mm_encoder_tp_mode,
|
||||
interleave_mm_strings=self.interleave_mm_strings,
|
||||
skip_mm_profiling=self.skip_mm_profiling,
|
||||
@@ -2448,6 +2459,15 @@ class MultiModalConfig:
|
||||
Set to `0` to disable this cache completely (not recommended).
|
||||
"""
|
||||
|
||||
mm_processor_cache_type: MMCacheType = "lru"
|
||||
"""Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
|
||||
use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
|
||||
|
||||
mm_shm_cache_max_object_size_mb: int = 128
|
||||
"""Size limit (in MiB) for each object stored in the multi-modal processor
|
||||
shared memory cache. Only effective when `mm_processor_cache_type` is
|
||||
`"shm"`."""
|
||||
|
||||
mm_encoder_tp_mode: MMEncoderTPMode = "weights"
|
||||
"""
|
||||
Indicates how to optimize multi-modal encoder inference using
|
||||
|
||||
Reference in New Issue
Block a user