[Frontend] Add Support for MM Encoder/Decoder Beam Search (Offline) (#36153)

Signed-off-by: Alex Brooks <albrooks@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Alex Brooks
2026-03-06 02:16:56 -07:00
committed by GitHub
parent 5b3ba94ab4
commit 10f4db4dbe
5 changed files with 191 additions and 29 deletions

View File

@@ -843,7 +843,10 @@ class VllmRunner:
def get_inputs(
self,
prompts: list[str] | list[torch.Tensor] | list[list[int]],
prompts: list[str]
| list[torch.Tensor]
| list[list[int]]
| list[dict[str, Any]],
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
@@ -857,26 +860,32 @@ class VllmRunner:
inputs = list[dict[str, Any]]()
for i, prompt in enumerate(prompts):
prompt_dict = dict[str, Any]()
if isinstance(prompt, str):
prompt_dict["prompt"] = prompt
elif isinstance(prompt, list):
prompt_dict["prompt_token_ids"] = prompt
# If we're passing an encoder/decoder prompt, we assume it
# already contains the multimodal data in the prompt
if isinstance(prompt, dict):
assert images is None and audios is None and videos is None
inputs.append(prompt.copy())
else:
prompt_dict["prompt_embeds"] = prompt
prompt_dict = dict[str, Any]()
if isinstance(prompt, str):
prompt_dict["prompt"] = prompt
elif isinstance(prompt, list):
prompt_dict["prompt_token_ids"] = prompt
else:
prompt_dict["prompt_embeds"] = prompt
multi_modal_data = dict[str, Any]()
if images is not None and (image := images[i]) is not None:
multi_modal_data["image"] = image
if videos is not None and (video := videos[i]) is not None:
multi_modal_data["video"] = video
if audios is not None and (audio := audios[i]) is not None:
multi_modal_data["audio"] = audio
multi_modal_data = dict[str, Any]()
if images is not None and (image := images[i]) is not None:
multi_modal_data["image"] = image
if videos is not None and (video := videos[i]) is not None:
multi_modal_data["video"] = video
if audios is not None and (audio := audios[i]) is not None:
multi_modal_data["audio"] = audio
if multi_modal_data:
prompt_dict["multi_modal_data"] = multi_modal_data
if multi_modal_data:
prompt_dict["multi_modal_data"] = multi_modal_data
inputs.append(prompt_dict)
inputs.append(prompt_dict)
return inputs