[Model] VLM2Vec, the first multimodal embedding model in vLLM (#9303)
examples/offline_inference_vision_language_embedding.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from vllm import LLM
from vllm.assets.image import ImageAsset

image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
prompt = "<|image_1|> Represent the given image with the following question: What is in the image"  # noqa: E501

# Create an LLM.
llm = LLM(
    model="TIGER-Lab/VLM2Vec-Full",
    trust_remote_code=True,
    max_model_len=4096,
    max_num_seqs=2,
    mm_processor_kwargs={"num_crops": 16},
)

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})

# Print the outputs.
for output in outputs:
    print(output.outputs.embedding)  # list of 3072 floats
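For context, a minimal follow-up sketch (not part of this commit) of how the resulting image embedding could be compared against a text embedding from the same model using cosine similarity. It assumes that llm.encode() also accepts a plain text prompt for VLM2Vec and that the example caption string is just an illustration.

# Sketch (not in the original example): cosine similarity between the image
# embedding above and a text embedding. Reuses `llm` and `outputs` from the
# script; assumes llm.encode() accepts a text-only prompt for this model.
import math

text_outputs = llm.encode("A photo of cherry blossoms in full bloom.")

image_emb = outputs[0].outputs.embedding
text_emb = text_outputs[0].outputs.embedding

dot = sum(a * b for a, b in zip(image_emb, text_emb))
norm = math.sqrt(sum(a * a for a in image_emb)) * math.sqrt(sum(b * b for b in text_emb))
print("cosine similarity:", dot / norm)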