[Model] Always use Transformers backend for PaliGemma and Gemma3-MM (#26715)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-10-17 13:03:35 +08:00
committed by GitHub
parent 9c2c2287a0
commit 8c017b3490
12 changed files with 54 additions and 1219 deletions

View File

@@ -113,25 +113,6 @@ VLM_TEST_SETTINGS = {
dtype="bfloat16" if current_platform.is_cpu() else "auto",
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"paligemma": VLMTestInfo(
models=["google/paligemma-3b-mix-224"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
img_idx_to_prompt=lambda idx: "",
# Paligemma uses its own sample prompts because the default one fails
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "caption es",
"cherry_blossom": "What is in the picture?",
}
),
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="bfloat16",
marks=[
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
],
),
"qwen2_5_vl": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
@@ -196,14 +177,24 @@ VLM_TEST_SETTINGS = {
# Gemma3 has bidirectional mask on images
"gemma3-transformers": VLMTestInfo(
models=["google/gemma-3-4b-it"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda vid_prompt: f"<'<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n", # noqa: E501
max_model_len=4096,
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<start_of_image>What is the season?",
}
),
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=8192,
auto_cls=AutoModelForImageTextToText,
# TODO: Support `do_pan_and_scan` in transformers backend
# patch_hf_runner=model_utils.gemma3_patch_hf_runner,
vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "transformers",
# "mm_processor_kwargs": {"do_pan_and_scan": True},
},
marks=[pytest.mark.core_model],
),
@@ -222,6 +213,27 @@ VLM_TEST_SETTINGS = {
},
marks=[pytest.mark.core_model],
),
# PaliGemma has PrefixLM attention
"paligemma-transformers": VLMTestInfo(
models=["google/paligemma-3b-mix-224"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
img_idx_to_prompt=lambda idx: "",
# PaliGemma uses its own sample prompts because the default one fails
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "caption es",
"cherry_blossom": "What is in the picture?",
}
),
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "transformers",
},
marks=[pytest.mark.core_model],
),
# Pixel values from processor are not 4D or 5D arrays
"qwen2_5_vl-transformers": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
@@ -348,24 +360,6 @@ VLM_TEST_SETTINGS = {
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[large_gpu_mark(min_gb=32)],
),
"gemma3": VLMTestInfo(
models=["google/gemma-3-4b-it"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<start_of_image>What is the season?",
}
),
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
num_logprobs=10,
),
"glm4v": VLMTestInfo(
models=["zai-org/glm-4v-9b"],
test_type=VLMTestType.IMAGE,