From ddbb0d230a3592106ac9f5f7f4e9a861863fcbee Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Tue, 10 Mar 2026 00:24:58 -0700
Subject: [PATCH] [Model Runner V2] Fix mm input embeddings lookup (#36588)

Signed-off-by: Nick Hill
---
 vllm/v1/worker/gpu/model_states/default.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu/model_states/default.py b/vllm/v1/worker/gpu/model_states/default.py
index f0b0e20c5..770c65049 100644
--- a/vllm/v1/worker/gpu/model_states/default.py
+++ b/vllm/v1/worker/gpu/model_states/default.py
@@ -98,8 +98,11 @@ class DefaultModelState(ModelState):
             req_states.prefill_len.np[input_batch.idx_mapping_np],
             req_states.num_computed_prefill_tokens[input_batch.idx_mapping_np],
         )
+        # Use unpadded input_ids to match is_mm_embed size (num_tokens).
+        # input_batch.input_ids may be padded for CUDA graphs.
+        input_ids_unpadded = input_batch.input_ids[: input_batch.num_tokens]
         inputs_embeds = self.encoder_runner.get_inputs_embeds(
-            input_batch.input_ids, mm_embeds, is_mm_embed
+            input_ids_unpadded, mm_embeds, is_mm_embed
         )
         return inputs_embeds[: input_batch.num_tokens_after_padding]
 