[Model] support input embeddings for qwen2vl (#8856)

2024-09-30 11:16:10 +08:00
parent f13a07b1f8
commit e01ab595d8
3 changed files with 135 additions and 70 deletions
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -281,7 +281,7 @@ Multimodal Language Models
    -
  * - :code:`Qwen2VLForConditionalGeneration`
    - Qwen2-VL
-    - Image\ :sup:`+` / Video\ :sup:`+`
+    - Image\ :sup:`E+` / Video\ :sup:`+`
    - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
    -
  * - :code:`UltravoxModel`
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -60,7 +60,24 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
+
+    # Inference with image embeddings as input, together with additional parameters
+    # This example uses Qwen2-VL, which takes additional parameters (here, image_grid_thw) for calculating positional encoding.
+    image_embeds = torch.load(...) # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
+    image_grid_thw = torch.load(...) # torch.Tensor of shape (1, 3)
+    mm_data['image'] = {
+        "image_embeds": image_embeds,
+        "image_grid_thw":  image_grid_thw,
+    }
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": mm_data,
+    })
    
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
    # Batch inference
    image_1 = PIL.Image.open(...)
    image_2 = PIL.Image.open(...)