[Core][VLM] Support image embeddings as input (#6613)

This commit is contained in:
Roger Wang
2024-08-12 01:16:06 -07:00
committed by GitHub
parent ec2affa8ae
commit e6e42e4b17
13 changed files with 517 additions and 138 deletions

View File

@@ -49,6 +49,17 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI
"multi_modal_data": {"image": image},
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
# Inference with image embeddings as input
image_embeds = torch.load(...) # torch.Tensor of shape (1, image_feature_size, hidden_size of the language model)
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {"image": image_embeds},
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)