[V1] Add disable_chunked_mm_input arg to disable partial mm input prefill (#15837)

Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-08 00:24:07 -06:00
parent 87918e40c4
commit 8e5314a468
4 changed files with 80 additions and 0 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1721,6 +1721,14 @@ class SchedulerConfig:

    chunked_prefill_enabled: bool = field(init=False)

+    # If set to true and chunked prefill is enabled, we do not want to
+    # partially schedule a multimodal item. Only used in V1.
+    # This ensures that if a request has a mixed prompt
+    # (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
+    # some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
+    # it will be scheduled as TTTT in one step and IIIIIIIIII in the next.
+    disable_chunked_mm_input: bool = False
+
    # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
    # or "mod.custom_class".
    scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"