[VLM] Limit multimodal input cache by memory (#14805)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-15 17:52:05 +08:00
parent 9ed6ee92d6
commit 3556a41434
13 changed files with 159 additions and 55 deletions
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -48,7 +48,7 @@ def _test_processing_correctness(
        tokenizer=cached_tokenizer_from_config(model_config),
    )
    # Ensure that it can fit all of the data
-    cache = ProcessingCache(capacity=1 << 30)
+    cache = ProcessingCache(capacity_gb=2048)

    processing_info = factories.info(ctx)
    supported_mm_limits = processing_info.get_supported_mm_limits()