[Misc] Update code for encoder-decoder models (#33900)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-06 11:38:39 +08:00
parent a32cb49b60
commit 035a6cb09a
2 changed files with 9 additions and 3 deletions
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -1098,7 +1098,7 @@ class MultiModalEncDecInputs(MultiModalInputs):

    Note: Even text-only encoder-decoder models are currently implemented
    as multi-modal models for convenience.
-    (Example: https://github.com/neuralmagic/bart-plugin)
+    (Example: https://github.com/vllm-project/bart-plugin)
    """

    encoder_prompt_token_ids: list[int]
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -185,7 +185,13 @@ class Scheduler(SchedulerInterface):

        # NOTE: Text-only encoder-decoder models are implemented as
        # multi-modal models for convenience
-        # Example: https://github.com/neuralmagic/bart-plugin
+        # Example: https://github.com/vllm-project/bart-plugin
+        if self.is_encoder_decoder:
+            assert mm_budget and len(mm_budget.mm_max_toks_per_item) <= 1, (
+                "Encoder-decoder models are expected to implement the "
+                "multimodal interface with at most one modality."
+            )
+
        self.max_num_encoder_input_tokens = (
            mm_budget.encoder_compute_budget if mm_budget else 0
        )
@@ -200,7 +206,7 @@ class Scheduler(SchedulerInterface):
        # TODO (NickLucche): Generalize to models with variable-length encoder inputs.
        self._num_encoder_max_input_tokens = (
            mm_budget.mm_max_toks_per_item[mm_budget.get_modality_with_max_tokens()]
-            if mm_budget
+            if mm_budget and mm_budget.mm_max_toks_per_item
            else 0
        )