[Doc] Improve MM Pooling model documentation (#25966)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -10,6 +10,7 @@ on HuggingFace model repository.
 from argparse import Namespace
 from dataclasses import asdict
+from pathlib import Path
 from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
 
 from PIL.Image import Image
@@ -19,6 +20,9 @@ from vllm.entrypoints.score_utils import ScoreMultiModalParam
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser
 
+ROOT_DIR = Path(__file__).parent.parent.parent
+EXAMPLES_DIR = ROOT_DIR / "examples"
+
 
 class TextQuery(TypedDict):
     modality: Literal["text"]
@@ -82,23 +86,27 @@ def run_e5_v(query: Query) -> ModelRequestData:
     )
 
 
-def run_vlm2vec(query: Query) -> ModelRequestData:
+def _get_vlm2vec_prompt_image(query: Query, image_token: str):
     if query["modality"] == "text":
         text = query["text"]
         prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
         image = None
     elif query["modality"] == "image":
-        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
+        prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image."  # noqa: E501
         image = query["image"]
     elif query["modality"] == "text+image":
         text = query["text"]
-        prompt = (
-            f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
-        )
+        prompt = f"{image_token} Represent the given image with the following question: {text}"  # noqa: E501
         image = query["image"]
     else:
         modality = query["modality"]
-        raise ValueError(f"Unsupported query modality: '{modality}'")
+        raise ValueError(f"Unsupported query modality: {modality!r}")
+
+    return prompt, image
+
+
+def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
+    prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")
 
     engine_args = EngineArgs(
         model="TIGER-Lab/VLM2Vec-Full",
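The refactor above moves prompt construction into a shared helper so both VLM2Vec variants differ only in their image placeholder token. Illustratively (a sketch, not part of this commit; the query dict follows the TextQuery shape defined earlier in the file):

# Hypothetical call showing what the shared helper returns for a text query.
prompt, image = _get_vlm2vec_prompt_image(
    {"modality": "text", "text": "a red double-decker bus"},
    image_token="<|image_1|>",
)
# prompt == "Find me an everyday image that matches the given caption: a red double-decker bus"
# image is None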
@@ -116,6 +124,66 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
     )
 
 
+def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
+    # vLLM does not support LoRA adapters on the multi-modal encoder,
+    # so we merge the weights first
+    from huggingface_hub.constants import HF_HUB_CACHE
+    from peft import PeftConfig, PeftModel
+    from transformers import AutoModelForImageTextToText, AutoProcessor
+
+    from vllm.entrypoints.chat_utils import load_chat_template
+
+    model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
+
+    base_model = AutoModelForImageTextToText.from_pretrained(model_id)
+    lora_model = PeftModel.from_pretrained(
+        base_model,
+        model_id,
+        config=PeftConfig.from_pretrained(model_id),
+    )
+    model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
+    model._hf_peft_config_loaded = False  # Needed to save the merged model
+
+    processor = AutoProcessor.from_pretrained(
+        model_id,
+        # `min_pixels` and `max_pixels` are deprecated
+        size={"shortest_edge": 3136, "longest_edge": 12845056},
+    )
+    processor.chat_template = load_chat_template(
+        # The original chat template is not correct
+        EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
+    )
+
+    merged_path = str(
+        Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
+    )
+    print(f"Saving merged model to {merged_path}...")
+    print(
+        "NOTE: This directory is not tracked by `huggingface_hub`, "
+        "so you have to delete it manually if you no longer want it."
+    )
+    model.save_pretrained(merged_path)
+    processor.save_pretrained(merged_path)
+    print("Done!")
+
+    prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>")
+
+    engine_args = EngineArgs(
+        model=merged_path,
+        runner="pooling",
+        max_model_len=4096,
+        trust_remote_code=True,
+        mm_processor_kwargs={"num_crops": 4},
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image=image,
+    )
+
+
 def run_jinavl_reranker(query: Query) -> ModelRequestData:
     if query["modality"] != "text+images":
         raise ValueError(f"Unsupported query modality: '{query['modality']}'")
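Because the merged checkpoint is saved as a regular model directory, a quick sanity check (hypothetical, not part of this commit; `merged_path` is the directory computed above) is to reload it without `peft`:

# Hypothetical check that the merge round-trips: the saved directory should
# load as a plain Qwen2-VL checkpoint with no remaining LoRA parameters.
from transformers import AutoModelForImageTextToText, AutoProcessor

merged = AutoModelForImageTextToText.from_pretrained(merged_path)
processor = AutoProcessor.from_pretrained(merged_path)
assert not any("lora" in name for name, _ in merged.named_parameters())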
@@ -232,7 +300,8 @@ def run_score(model: str, modality: QueryModality, seed: Optional[int]):
 
 model_example_map = {
     "e5_v": run_e5_v,
-    "vlm2vec": run_vlm2vec,
+    "vlm2vec_phi3v": run_vlm2vec_phi3v,
+    "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
     "jinavl_reranker": run_jinavl_reranker,
 }
 
@@ -246,7 +315,7 @@ def parse_args():
         "--model-name",
         "-m",
         type=str,
-        default="vlm2vec",
+        default="vlm2vec_phi3v",
         choices=model_example_map.keys(),
         help="The name of the embedding model.",
     )
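With these changes, the renamed Phi-3-V runner becomes the default `--model-name`, and either variant can also be driven directly. A minimal sketch (assuming vLLM's offline `LLM.embed` API and the `ModelRequestData` returned by the runners above; not part of this commit):

# Hypothetical driver: embed a text query with the renamed Phi-3-V runner.
# asdict and EngineArgs come from this example's existing imports.
from dataclasses import asdict

from vllm import LLM

request = run_vlm2vec_phi3v({"modality": "text", "text": "A cat in the snow."})
llm = LLM(**asdict(request.engine_args))
(output,) = llm.embed({"prompt": request.prompt})
print(len(output.outputs.embedding))  # embedding dimensionality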