Enforce valid max_num_batched_tokens when disable_chunked_mm_input=True (#16447)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
@@ -1030,7 +1030,7 @@ class EngineArgs:
|
||||
action=StoreBoolean,
|
||||
default=EngineArgs.disable_chunked_mm_input,
|
||||
nargs="?",
|
||||
-const="False",
|
||||
+const="True",
|
||||
help="Disable multimodal input chunking attention for V1. "
|
||||
"If set to true and chunked prefill is enabled, we do not want to"
|
||||
" partially schedule a multimodal item. This ensures that if a "
|
||||
|
||||
@@ -133,6 +133,14 @@ def _compute_encoder_budget_multimodal(
|
||||
_, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(),
|
||||
key=lambda item: item[1])
|
||||
|
||||
+if (scheduler_config.disable_chunked_mm_input and max_tokens_per_mm_item
|
||||
+> scheduler_config.max_num_batched_tokens):
|
||||
+raise ValueError(
|
||||
+"Chunked MM input disabled but max_tokens_per_mm_item "
|
||||
+f"({max_tokens_per_mm_item}) is larger than max_num_batched_tokens"
|
||||
+f" ({scheduler_config.max_num_batched_tokens}). Please increase "
|
||||
+"max_num_batched_tokens.")
|
||||
|
||||
encoder_compute_budget = max(scheduler_config.max_num_encoder_input_tokens,
|
||||
max_tokens_per_mm_item)
|
||||
encoder_cache_size = max(scheduler_config.encoder_cache_size,
|
||||
|
||||
Reference in New Issue
Block a user