[Model] VLM2Vec, the first multimodal embedding model in vLLM (#9303)
examples/offline_inference_vision_language_embedding.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from vllm import LLM
from vllm.assets.image import ImageAsset

image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
prompt = "<|image_1|> Represent the given image with the following question: What is in the image"  # noqa: E501

# Create an LLM.
llm = LLM(
    model="TIGER-Lab/VLM2Vec-Full",
    trust_remote_code=True,
    max_model_len=4096,
    max_num_seqs=2,
    mm_processor_kwargs={"num_crops": 16},
)

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})

# Print the outputs.
for output in outputs:
    print(output.outputs.embedding)  # list of 3072 floats
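For context, a minimal follow-up sketch (not part of this commit) of how the resulting image embedding could be compared against a text embedding from the same model using cosine similarity. It assumes that llm.encode() also accepts a plain text prompt for VLM2Vec and that the example caption string is just an illustration.

# Sketch (not in the original example): cosine similarity between the image
# embedding above and a text embedding. Reuses `llm` and `outputs` from the
# script; assumes llm.encode() accepts a text-only prompt for this model.
import math

text_outputs = llm.encode("A photo of cherry blossoms in full bloom.")

image_emb = outputs[0].outputs.embedding
text_emb = text_outputs[0].outputs.embedding

dot = sum(a * b for a, b in zip(image_emb, text_emb))
norm = math.sqrt(sum(a * a for a in image_emb)) * math.sqrt(sum(b * b for b in text_emb))
print("cosine similarity:", dot / norm)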