diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 2cc7900eb..9f01af9a8 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -1098,7 +1098,7 @@ class MultiModalEncDecInputs(MultiModalInputs): Note: Even text-only encoder-decoder models are currently implemented as multi-modal models for convenience. - (Example: https://github.com/neuralmagic/bart-plugin) + (Example: https://github.com/vllm-project/bart-plugin) """ encoder_prompt_token_ids: list[int] diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 745d9ffec..9f0643e4f 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -185,7 +185,13 @@ class Scheduler(SchedulerInterface): # NOTE: Text-only encoder-decoder models are implemented as # multi-modal models for convenience - # Example: https://github.com/neuralmagic/bart-plugin + # Example: https://github.com/vllm-project/bart-plugin + if self.is_encoder_decoder: + assert mm_budget and len(mm_budget.mm_max_toks_per_item) <= 1, ( + "Encoder-decoder models are expected to implement the " + "multimodal interface with at most one modality." + ) + self.max_num_encoder_input_tokens = ( mm_budget.encoder_compute_budget if mm_budget else 0 ) @@ -200,7 +206,7 @@ class Scheduler(SchedulerInterface): # TODO (NickLucche): Generalize to models with variable-length encoder inputs. self._num_encoder_max_input_tokens = ( mm_budget.mm_max_toks_per_item[mm_budget.get_modality_with_max_tokens()] - if mm_budget + if mm_budget and mm_budget.mm_max_toks_per_item else 0 )