Enforce valid max_num_batched_tokens when disable_chunked_mm_input=True (#16447)

Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-11 02:09:52 -06:00
parent f7030df3be
commit aa3b3d76e0
3 changed files with 18 additions and 1 deletion
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1030,7 +1030,7 @@ class EngineArgs:
            action=StoreBoolean,
            default=EngineArgs.disable_chunked_mm_input,
            nargs="?",
-            const="False",
+            const="True",
            help="Disable multimodal input chunking attention for V1. "
            "If set to true and chunked prefill is enabled, we do not want to"
            " partially schedule a multimodal item. This ensures that if a "
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -133,6 +133,14 @@ def _compute_encoder_budget_multimodal(
    _, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(),
                                    key=lambda item: item[1])

+    if (scheduler_config.disable_chunked_mm_input and max_tokens_per_mm_item
+            > scheduler_config.max_num_batched_tokens):
+        raise ValueError(
+            "Chunked MM input disabled but max_tokens_per_mm_item "
+            f"({max_tokens_per_mm_item}) is larger than max_num_batched_tokens"
+            f" ({scheduler_config.max_num_batched_tokens}). Please increase "
+            "max_num_batched_tokens.")
+
    encoder_compute_budget = max(scheduler_config.max_num_encoder_input_tokens,
                                 max_tokens_per_mm_item)
    encoder_cache_size = max(scheduler_config.encoder_cache_size,