[Frontend] Use engine argument to control MM cache size (#22441)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-08-08 00:47:10 +08:00
committed by GitHub
parent 8c9da6be22
commit 139d155781
13 changed files with 101 additions and 47 deletions

View File

@@ -358,8 +358,8 @@ class EngineArgs:
"media_io_kwargs")
mm_processor_kwargs: Optional[Dict[str, Any]] = \
MultiModalConfig.mm_processor_kwargs
disable_mm_preprocessor_cache: bool = \
MultiModalConfig.disable_mm_preprocessor_cache
disable_mm_preprocessor_cache: bool = False # DEPRECATED
mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb
# LoRA fields
enable_lora: bool = False
enable_lora_bias: bool = LoRAConfig.bias_enabled
@@ -720,8 +720,11 @@ class EngineArgs:
"--mm-processor-kwargs",
**multimodal_kwargs["mm_processor_kwargs"])
multimodal_group.add_argument(
"--disable-mm-preprocessor-cache",
**multimodal_kwargs["disable_mm_preprocessor_cache"])
"--mm-processor-cache-gb",
**multimodal_kwargs["mm_processor_cache_gb"])
multimodal_group.add_argument("--disable-mm-preprocessor-cache",
type=bool,
deprecated=True)
multimodal_group.add_argument(
"--interleave-mm-strings",
**multimodal_kwargs["interleave_mm_strings"])
@@ -886,6 +889,23 @@ class EngineArgs:
self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
self.load_format = "runai_streamer"
if self.disable_mm_preprocessor_cache:
logger.warning(
"`--disable-mm-preprocessor-cache` is deprecated "
"and will be removed in v0.13. "
"Please use `--mm-processor-cache-gb 0` instead.", )
self.mm_processor_cache_gb = 0
elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
logger.warning(
"VLLM_MM_INPUT_CACHE_GIB` is deprecated "
"and will be removed in v0.13. "
"Please use `--mm-processor-cache-gb %d` instead.",
envs.VLLM_MM_INPUT_CACHE_GIB,
)
self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB
return ModelConfig(
model=self.model,
hf_config_path=self.hf_config_path,
@@ -922,7 +942,7 @@ class EngineArgs:
use_async_output_proc=not self.disable_async_output_proc,
config_format=self.config_format,
mm_processor_kwargs=self.mm_processor_kwargs,
disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
mm_processor_cache_gb=self.mm_processor_cache_gb,
override_neuron_config=self.override_neuron_config,
override_pooler_config=self.override_pooler_config,
logits_processor_pattern=self.logits_processor_pattern,
@@ -1234,13 +1254,13 @@ class EngineArgs:
dp_supports_mm_processor_cache = (self.data_parallel_size == 1
or data_parallel_external_lb)
if (not dp_supports_mm_processor_cache
and not model_config.disable_mm_preprocessor_cache):
and model_config.mm_processor_cache_gb > 0):
logger.warning(
"Multi-modal processor cache is disabled because "
"it is not compatible with data parallelism when "
"there does not exist a one-to-one correspondance "
"between API and engine core processes.")
model_config.set_disable_mm_preprocessor_cache(True)
model_config.set_mm_processor_cache_gb(0)
speculative_config = self.create_speculative_config(
target_model_config=model_config,