[Frontend] Add Support for MM Encoder/Decoder Beam Search (Offline) (#36153)
Signed-off-by: Alex Brooks <albrooks@redhat.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -843,7 +843,10 @@ class VllmRunner:

     def get_inputs(
         self,
-        prompts: list[str] | list[torch.Tensor] | list[list[int]],
+        prompts: list[str]
+        | list[torch.Tensor]
+        | list[list[int]]
+        | list[dict[str, Any]],
         images: PromptImageInput | None = None,
         videos: PromptVideoInput | None = None,
         audios: PromptAudioInput | None = None,
@@ -857,26 +860,32 @@ class VllmRunner:


         inputs = list[dict[str, Any]]()
         for i, prompt in enumerate(prompts):
-            prompt_dict = dict[str, Any]()
-            if isinstance(prompt, str):
-                prompt_dict["prompt"] = prompt
-            elif isinstance(prompt, list):
-                prompt_dict["prompt_token_ids"] = prompt
-            else:
-                prompt_dict["prompt_embeds"] = prompt
+            # If we're passing an encoder/decoder prompt, we assume it
+            # already contains the multimodal data in the prompt
+            if isinstance(prompt, dict):
+                assert images is None and audios is None and videos is None
+                inputs.append(prompt.copy())
+            else:
+                prompt_dict = dict[str, Any]()
+                if isinstance(prompt, str):
+                    prompt_dict["prompt"] = prompt
+                elif isinstance(prompt, list):
+                    prompt_dict["prompt_token_ids"] = prompt
+                else:
+                    prompt_dict["prompt_embeds"] = prompt

-            multi_modal_data = dict[str, Any]()
-            if images is not None and (image := images[i]) is not None:
-                multi_modal_data["image"] = image
-            if videos is not None and (video := videos[i]) is not None:
-                multi_modal_data["video"] = video
-            if audios is not None and (audio := audios[i]) is not None:
-                multi_modal_data["audio"] = audio
+                multi_modal_data = dict[str, Any]()
+                if images is not None and (image := images[i]) is not None:
+                    multi_modal_data["image"] = image
+                if videos is not None and (video := videos[i]) is not None:
+                    multi_modal_data["video"] = video
+                if audios is not None and (audio := audios[i]) is not None:
+                    multi_modal_data["audio"] = audio

-            if multi_modal_data:
-                prompt_dict["multi_modal_data"] = multi_modal_data
+                if multi_modal_data:
+                    prompt_dict["multi_modal_data"] = multi_modal_data

-            inputs.append(prompt_dict)
+                inputs.append(prompt_dict)

         return inputs
Reference in New Issue
Block a user