From 467886a0c48b37552c8a2f3bdea99e96f2e98f8c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 3 Mar 2026 13:47:45 -0800
Subject: [PATCH] [Model Runner V2] Fix inputs_embeds=None bug for MM models (#35917)

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu/model_runner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 35dd617ee..17a5be7d7 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -907,9 +907,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         )
 
         inputs_embeds = None
-        if self.supports_mm_inputs and self.is_first_pp_rank and not dummy_run:
+        if self.supports_mm_inputs and self.is_first_pp_rank:
             # Run MM encoder (if needed) and get multimodal embeddings.
             # Only first PP rank prepares multimodal embeddings.
+            # NOTE(woosuk): We must call get_mm_embeddings even during dummy runs
+            # to obtain inputs_embeds, because the compiled model expects this input.
             inputs_embeds = self.model_state.get_mm_embeddings(
                 scheduler_output.scheduled_encoder_inputs,
                 input_batch,