[Frontend] Use engine argument to control MM cache size (#22441)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-08 00:47:10 +08:00
parent 8c9da6be22
commit 139d155781
13 changed files with 101 additions and 47 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -358,8 +358,8 @@ class EngineArgs:
                                                      "media_io_kwargs")
    mm_processor_kwargs: Optional[Dict[str, Any]] = \
        MultiModalConfig.mm_processor_kwargs
-    disable_mm_preprocessor_cache: bool = \
-        MultiModalConfig.disable_mm_preprocessor_cache
+    disable_mm_preprocessor_cache: bool = False  # DEPRECATED
+    mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb
    # LoRA fields
    enable_lora: bool = False
    enable_lora_bias: bool = LoRAConfig.bias_enabled
@@ -720,8 +720,11 @@ class EngineArgs:
            "--mm-processor-kwargs",
            **multimodal_kwargs["mm_processor_kwargs"])
        multimodal_group.add_argument(
-            "--disable-mm-preprocessor-cache",
-            **multimodal_kwargs["disable_mm_preprocessor_cache"])
+            "--mm-processor-cache-gb",
+            **multimodal_kwargs["mm_processor_cache_gb"])
+        # Deprecated switch: store_true so it is passed without a value.
+        multimodal_group.add_argument("--disable-mm-preprocessor-cache",
+                                      action="store_true", deprecated=True)
        multimodal_group.add_argument(
            "--interleave-mm-strings",
            **multimodal_kwargs["interleave_mm_strings"])
@@ -886,6 +889,23 @@ class EngineArgs:
            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
            self.load_format = "runai_streamer"

+        if self.disable_mm_preprocessor_cache:
+            logger.warning(
+                "`--disable-mm-preprocessor-cache` is deprecated "
+                "and will be removed in v0.13. "
+                "Please use `--mm-processor-cache-gb 0` instead.", )
+            # Honor the deprecated flag by forcing the new size setting to 0.
+            self.mm_processor_cache_gb = 0
+        elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
+            logger.warning(
+                "`VLLM_MM_INPUT_CACHE_GIB` is deprecated "
+                "and will be removed in v0.13. "
+                "Please use `--mm-processor-cache-gb %d` instead.",
+                envs.VLLM_MM_INPUT_CACHE_GIB,
+            )
+            # Carry the deprecated env var's value over to the new setting.
+            self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB
+
        return ModelConfig(
            model=self.model,
            hf_config_path=self.hf_config_path,
@@ -922,7 +942,7 @@ class EngineArgs:
            use_async_output_proc=not self.disable_async_output_proc,
            config_format=self.config_format,
            mm_processor_kwargs=self.mm_processor_kwargs,
-            disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
+            mm_processor_cache_gb=self.mm_processor_cache_gb,
            override_neuron_config=self.override_neuron_config,
            override_pooler_config=self.override_pooler_config,
            logits_processor_pattern=self.logits_processor_pattern,
@@ -1234,13 +1254,13 @@ class EngineArgs:
            dp_supports_mm_processor_cache = (self.data_parallel_size == 1
                                              or data_parallel_external_lb)
            if (not dp_supports_mm_processor_cache
-                    and not model_config.disable_mm_preprocessor_cache):
+                    and model_config.mm_processor_cache_gb > 0):
                logger.warning(
                    "Multi-modal processor cache is disabled because "
                    "it is not compatible with data parallelism when "
                    "there does not exist a one-to-one correspondance "
                    "between API and engine core processes.")
-                model_config.set_disable_mm_preprocessor_cache(True)
+                model_config.set_mm_processor_cache_gb(0)

        speculative_config = self.create_speculative_config(
            target_model_config=model_config,