[Core][Bugfix] Fix Online MM Beam Search (#19688)

Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
This commit is contained in:
Alex Brooks
2025-06-19 11:18:07 -06:00
committed by GitHub
parent 01220ce89a
commit ead2110297
3 changed files with 45 additions and 12 deletions

View File

@@ -88,9 +88,18 @@ class EngineClient(ABC):
 if processed_inputs["type"] == "embeds":
     raise NotImplementedError
-prompt_token_ids = processed_inputs["prompt_token_ids"]
+# This is a workaround to fix multimodal beam search; this is a
+# bandaid fix for 2 small problems:
+# 1. Multi_modal_data on the processed_inputs currently resolves to
+#    `None`.
+# 2. preprocessing above expands the multimodal placeholders. However,
+#    this happens again in generation, so the double expansion causes
+#    a mismatch.
+# TODO - would be ideal to handle this more gracefully.
+prompt_token_ids = prompt.get("prompt_token_ids")
+multi_modal_data = prompt.get("multi_modal_data")
 prompt_text = processed_inputs.get("prompt")
-multi_modal_data = processed_inputs.get("multi_modal_data")
 mm_processor_kwargs = processed_inputs.get("mm_processor_kwargs")
 tokenized_length = len(prompt_token_ids)