[Model] support input embeddings for qwen2vl (#8856)
This commit is contained in:
@@ -281,7 +281,7 @@ Multimodal Language Models
|
||||
-
|
||||
* - :code:`Qwen2VLForConditionalGeneration`
|
||||
- Qwen2-VL
|
||||
- Image\ :sup:`+` / Video\ :sup:`+`
|
||||
- Image\ :sup:`E+` / Video\ :sup:`+`
|
||||
- :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
|
||||
-
|
||||
* - :code:`UltravoxModel`
|
||||
|
||||
@@ -60,7 +60,24 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
|
||||
# Inference with image embeddings plus additional parameters as input
|
||||
# Specifically, this example uses Qwen2VL with the new input format, because the model needs additional parameters (here, image_grid_thw) to calculate its positional encoding.
|
||||
image_embeds = torch.load(...) # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
|
||||
image_grid_thw = torch.load(...) # torch.Tensor of shape (1, 3)
|
||||
mm_data['image'] = {
|
||||
"image_embeds": image_embeds,
|
||||
"image_grid_thw": image_grid_thw,
|
||||
}
|
||||
outputs = llm.generate({
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": mm_data,
|
||||
})
|
||||
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
|
||||
# Batch inference
|
||||
image_1 = PIL.Image.open(...)
|
||||
image_2 = PIL.Image.open(...)
|
||||
|
||||
Reference in New Issue
Block a user