[Doc] Improve MM Pooling model documentation (#25966)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -10,6 +10,7 @@ on HuggingFace model repository.
 from argparse import Namespace
 from dataclasses import asdict
+from pathlib import Path
 from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
 
 from PIL.Image import Image
@@ -19,6 +20,9 @@ from vllm.entrypoints.score_utils import ScoreMultiModalParam
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser
 
+ROOT_DIR = Path(__file__).parent.parent.parent
+EXAMPLES_DIR = ROOT_DIR / "examples"
+
 
 class TextQuery(TypedDict):
     modality: Literal["text"]
@@ -82,23 +86,27 @@ def run_e5_v(query: Query) -> ModelRequestData:
     )
 
 
-def run_vlm2vec(query: Query) -> ModelRequestData:
+def _get_vlm2vec_prompt_image(query: Query, image_token: str):
     if query["modality"] == "text":
         text = query["text"]
         prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
         image = None
     elif query["modality"] == "image":
-        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
+        prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image."  # noqa: E501
         image = query["image"]
     elif query["modality"] == "text+image":
         text = query["text"]
-        prompt = (
-            f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
-        )
+        prompt = f"{image_token} Represent the given image with the following question: {text}"  # noqa: E501
         image = query["image"]
     else:
         modality = query["modality"]
-        raise ValueError(f"Unsupported query modality: '{modality}'")
+        raise ValueError(f"Unsupported query modality: {modality!r}")
+
+    return prompt, image
+
+
+def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
+    prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")
 
     engine_args = EngineArgs(
         model="TIGER-Lab/VLM2Vec-Full",
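The refactor above moves prompt construction into a shared helper so both VLM2Vec variants differ only in their image placeholder token. Illustratively (a sketch, not part of this commit; the query dict follows the TextQuery shape defined earlier in the file):

# Hypothetical call showing what the shared helper returns for a text query.
prompt, image = _get_vlm2vec_prompt_image(
    {"modality": "text", "text": "a red double-decker bus"},
    image_token="<|image_1|>",
)
# prompt == "Find me an everyday image that matches the given caption: a red double-decker bus"
# image is None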
@@ -116,6 +124,66 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
     )
 
 
+def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
+    # vLLM does not support LoRA adapters on the multi-modal encoder,
+    # so we merge the weights first
+    from huggingface_hub.constants import HF_HUB_CACHE
+    from peft import PeftConfig, PeftModel
+    from transformers import AutoModelForImageTextToText, AutoProcessor
+
+    from vllm.entrypoints.chat_utils import load_chat_template
+
+    model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
+
+    base_model = AutoModelForImageTextToText.from_pretrained(model_id)
+    lora_model = PeftModel.from_pretrained(
+        base_model,
+        model_id,
+        config=PeftConfig.from_pretrained(model_id),
+    )
+    model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
+    model._hf_peft_config_loaded = False  # Needed to save the merged model
+
+    processor = AutoProcessor.from_pretrained(
+        model_id,
+        # `min_pixels` and `max_pixels` are deprecated
+        size={"shortest_edge": 3136, "longest_edge": 12845056},
+    )
+    processor.chat_template = load_chat_template(
+        # The original chat template is not correct
+        EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
+    )
+
+    merged_path = str(
+        Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
+    )
+    print(f"Saving merged model to {merged_path}...")
+    print(
+        "NOTE: This directory is not tracked by `huggingface_hub`, "
+        "so you have to delete it manually if you no longer want it."
+    )
+    model.save_pretrained(merged_path)
+    processor.save_pretrained(merged_path)
+    print("Done!")
+
+    prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>")
+
+    engine_args = EngineArgs(
+        model=merged_path,
+        runner="pooling",
+        max_model_len=4096,
+        trust_remote_code=True,
+        mm_processor_kwargs={"num_crops": 4},
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image=image,
+    )
+
+
 def run_jinavl_reranker(query: Query) -> ModelRequestData:
     if query["modality"] != "text+images":
         raise ValueError(f"Unsupported query modality: '{query['modality']}'")
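Because the merged checkpoint is saved as a regular model directory, a quick sanity check (hypothetical, not part of this commit; `merged_path` is the directory computed above) is to reload it without `peft`:

# Hypothetical check that the merge round-trips: the saved directory should
# load as a plain Qwen2-VL checkpoint with no remaining LoRA parameters.
from transformers import AutoModelForImageTextToText, AutoProcessor

merged = AutoModelForImageTextToText.from_pretrained(merged_path)
processor = AutoProcessor.from_pretrained(merged_path)
assert not any("lora" in name for name, _ in merged.named_parameters())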
@@ -232,7 +300,8 @@ def run_score(model: str, modality: QueryModality, seed: Optional[int]):
 
 model_example_map = {
     "e5_v": run_e5_v,
-    "vlm2vec": run_vlm2vec,
+    "vlm2vec_phi3v": run_vlm2vec_phi3v,
+    "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
     "jinavl_reranker": run_jinavl_reranker,
 }
 
@@ -246,7 +315,7 @@ def parse_args():
         "--model-name",
         "-m",
         type=str,
-        default="vlm2vec",
+        default="vlm2vec_phi3v",
         choices=model_example_map.keys(),
         help="The name of the embedding model.",
     )
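With these changes, the renamed Phi-3-V runner becomes the default `--model-name`, and either variant can also be driven directly. A minimal sketch (assuming vLLM's offline `LLM.embed` API and the `ModelRequestData` returned by the runners above; not part of this commit):

# Hypothetical driver: embed a text query with the renamed Phi-3-V runner.
# asdict and EngineArgs come from this example's existing imports.
from dataclasses import asdict

from vllm import LLM

request = run_vlm2vec_phi3v({"modality": "text", "text": "A cat in the snow."})
llm = LLM(**asdict(request.engine_args))
(output,) = llm.embed({"prompt": request.prompt})
print(len(output.outputs.embedding))  # embedding dimensionality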