[Frontend] Use engine argument to control MM cache size (#22441)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-08-08 00:47:10 +08:00
committed by GitHub
parent 8c9da6be22
commit 139d155781
13 changed files with 101 additions and 47 deletions

View File

@@ -62,9 +62,7 @@ def run_test(
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
-    vllm_runner_kwargs_: dict[str, Any] = {
-        "disable_mm_preprocessor_cache": True,
-    }
+    vllm_runner_kwargs_: dict[str, Any] = {"mm_processor_cache_gb": 0}
     if model_info.tokenizer:
         vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
     if model_info.tokenizer_mode: