diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 2cc7900eb..9f01af9a8 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -1098,7 +1098,7 @@ class MultiModalEncDecInputs(MultiModalInputs):
 
     Note: Even text-only encoder-decoder models are currently implemented
     as multi-modal models for convenience.
-    (Example: https://github.com/neuralmagic/bart-plugin)
+    (Example: https://github.com/vllm-project/bart-plugin)
     """
 
     encoder_prompt_token_ids: list[int]
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 745d9ffec..9f0643e4f 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -185,7 +185,13 @@ class Scheduler(SchedulerInterface):
 
         # NOTE: Text-only encoder-decoder models are implemented as
         # multi-modal models for convenience
-        # Example: https://github.com/neuralmagic/bart-plugin
+        # Example: https://github.com/vllm-project/bart-plugin
+        if self.is_encoder_decoder:
+            assert mm_budget and len(mm_budget.mm_max_toks_per_item) <= 1, (
+                "Encoder-decoder models are expected to implement the "
+                "multimodal interface with at most one modality."
+            )
+
         self.max_num_encoder_input_tokens = (
             mm_budget.encoder_compute_budget if mm_budget else 0
         )
@@ -200,7 +206,7 @@ class Scheduler(SchedulerInterface):
         # TODO (NickLucche): Generalize to models with variable-length encoder inputs.
         self._num_encoder_max_input_tokens = (
             mm_budget.mm_max_toks_per_item[mm_budget.get_modality_with_max_tokens()]
-            if mm_budget
+            if mm_budget and mm_budget.mm_max_toks_per_item
             else 0
         )