[Core][VLM] Support image embeddings as input (#6613)

This commit is contained in:
Roger Wang
2024-08-12 01:16:06 -07:00
committed by GitHub
parent ec2affa8ae
commit e6e42e4b17
13 changed files with 517 additions and 138 deletions

View File

@@ -49,6 +49,17 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI
"multi_modal_data": {"image": image},
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
# Inference with image embeddings as input
image_embeds = torch.load(...) # torch.Tensor of shape (1, image_feature_size, hidden_size of the language model)
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {"image": image_embeds},
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)