[Core] Use key-only cache for BaseMultiModalProcessor (#23018)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

commit 69244e67e6 (parent 8dbf6ed7be)
Author: Cyrus Leung
Date: 2025-08-27 14:19:13 +08:00
Committed by: GitHub

29 changed files with 954 additions and 394 deletions


@@ -351,7 +351,7 @@ class EngineArgs:
     mm_processor_kwargs: Optional[Dict[str, Any]] = \
         MultiModalConfig.mm_processor_kwargs
     disable_mm_preprocessor_cache: bool = False  # DEPRECATED
-    mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb
+    mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
     mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode
     skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
     # LoRA fields
@@ -1293,18 +1293,6 @@ class EngineArgs:
             worker_extension_cls=self.worker_extension_cls,
         )
-        if model_config.is_multimodal_model:
-            dp_supports_mm_processor_cache = (self.data_parallel_size == 1
-                                              or data_parallel_external_lb)
-            if (not dp_supports_mm_processor_cache
-                    and model_config.mm_processor_cache_gb > 0):
-                logger.warning(
-                    "Multi-modal processor cache is disabled because "
-                    "it is not compatible with data parallelism when "
-                    "there does not exist a one-to-one correspondance "
-                    "between API and engine core processes.")
-                model_config.set_mm_processor_cache_gb(0)
         speculative_config = self.create_speculative_config(
             target_model_config=model_config,
             target_parallel_config=parallel_config,
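The deleted block used to turn the multi-modal processor cache off whenever data parallelism broke the one-to-one correspondence between API and engine core processes; with the key-only cache named in the commit title, that guard is removed outright. Below is an illustrative sketch, not vLLM's actual implementation, of what a key-only cache means: only content hashes are remembered (here in an LRU-ordered `OrderedDict`), never the processed values. Every name in it (`KeyOnlyCache`, `key_for`, `seen`, `add`) is hypothetical.

```python
import hashlib
from collections import OrderedDict


class KeyOnlyCache:
    """Sketch of a "key-only" cache: it remembers which item keys (content
    hashes) were seen recently, but never stores the processed values."""

    def __init__(self, capacity: int = 1024) -> None:
        self.capacity = capacity
        self._keys: OrderedDict[str, None] = OrderedDict()  # LRU order

    @staticmethod
    def key_for(data: bytes) -> str:
        # Hash the raw multimodal input to get a stable cache key.
        return hashlib.blake2b(data, digest_size=16).hexdigest()

    def seen(self, key: str) -> bool:
        """Return True if the key is cached, refreshing its LRU position."""
        if key in self._keys:
            self._keys.move_to_end(key)
            return True
        return False

    def add(self, key: str) -> None:
        """Record a key, evicting the least recently used one when full."""
        self._keys[key] = None
        self._keys.move_to_end(key)
        if len(self._keys) > self.capacity:
            self._keys.popitem(last=False)


# Sender side (e.g. an API process): ship the full multimodal payload only
# the first time a key is seen; afterwards the key alone identifies the item.
cache = KeyOnlyCache(capacity=4)
image_bytes = b"\x89PNG...fake image payload..."
k = KeyOnlyCache.key_for(image_bytes)
if not cache.seen(k):
    cache.add(k)
    payload = {"key": k, "data": image_bytes}  # first occurrence: send data
else:
    payload = {"key": k}                       # repeat: key only
```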