[Model] Always use Transformers backend for PaliGemma and Gemma3-MM (#26715)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-10-17 13:03:35 +08:00
committed by GitHub
parent 9c2c2287a0
commit 8c017b3490
12 changed files with 54 additions and 1219 deletions

View File

@@ -113,25 +113,6 @@ VLM_TEST_SETTINGS = {
dtype="bfloat16" if current_platform.is_cpu() else "auto",
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
"paligemma": VLMTestInfo(
models=["google/paligemma-3b-mix-224"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
img_idx_to_prompt=lambda idx: "",
# Paligemma uses its own sample prompts because the default one fails
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "caption es",
"cherry_blossom": "What is in the picture?",
}
),
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="bfloat16",
marks=[
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
],
),
"qwen2_5_vl": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
@@ -196,14 +177,24 @@ VLM_TEST_SETTINGS = {
# Gemma3 has bidirectional mask on images
"gemma3-transformers": VLMTestInfo(
models=["google/gemma-3-4b-it"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda vid_prompt: f"<'<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n", # noqa: E501
max_model_len=4096,
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<start_of_image>What is the season?",
}
),
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=8192,
auto_cls=AutoModelForImageTextToText,
# TODO: Support `do_pan_and_scan` in transformers backend
# patch_hf_runner=model_utils.gemma3_patch_hf_runner,
vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "transformers",
# "mm_processor_kwargs": {"do_pan_and_scan": True},
},
marks=[pytest.mark.core_model],
),
@@ -222,6 +213,27 @@ VLM_TEST_SETTINGS = {
},
marks=[pytest.mark.core_model],
),
# PaliGemma has PrefixLM attention
"paligemma-transformers": VLMTestInfo(
models=["google/paligemma-3b-mix-224"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
img_idx_to_prompt=lambda idx: "",
# PaliGemma uses its own sample prompts because the default one fails
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "caption es",
"cherry_blossom": "What is in the picture?",
}
),
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "transformers",
},
marks=[pytest.mark.core_model],
),
# Pixel values from processor are not 4D or 5D arrays
"qwen2_5_vl-transformers": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
@@ -348,24 +360,6 @@ VLM_TEST_SETTINGS = {
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[large_gpu_mark(min_gb=32)],
),
"gemma3": VLMTestInfo(
models=["google/gemma-3-4b-it"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<start_of_image>What is the season?",
}
),
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
num_logprobs=10,
),
"glm4v": VLMTestInfo(
models=["zai-org/glm-4v-9b"],
test_type=VLMTestType.IMAGE,

View File

@@ -328,16 +328,6 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
hf_model.processor = processor
orig_generate = hf_model.model.generate
def _generate(self, *args, **kwargs):
# FIXME: https://github.com/huggingface/transformers/issues/38333
kwargs["disable_compile"] = True
return orig_generate(*args, **kwargs)
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
return hf_model

View File

@@ -222,7 +222,6 @@ def _test_processing_correctness(
_ADD_SPECIAL_TOKENS_OVERRIDES = {
"ovis": False,
"ovis2_5": False,
"paligemma": False,
"ultravox": False,
"whisper": False,
}
@@ -333,7 +332,6 @@ def _test_processing_correctness_one(
"deepseek-ai/deepseek-vl2-tiny",
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
"adept/fuyu-8b",
"google/gemma-3-4b-it",
"google/gemma-3n-E2B-it",
"zai-org/glm-4v-9b",
"zai-org/GLM-4.1V-9B-Thinking",
@@ -370,8 +368,6 @@ def _test_processing_correctness_one(
"AIDC-AI/Ovis1.6-Llama3.2-3B",
"AIDC-AI/Ovis2-1B",
"AIDC-AI/Ovis2.5-2B",
"google/paligemma-3b-mix-224",
"google/paligemma2-3b-ft-docci-448",
"microsoft/Phi-3.5-vision-instruct",
"microsoft/Phi-4-multimodal-instruct",
"mistralai/Pixtral-12B-2409",

View File

@@ -48,7 +48,6 @@ ARCH_NEEDS_EXTRAS = [
"Idefics3ForConditionalGeneration",
"LlavaForConditionalGeneration",
"MiniCPMV",
"PaliGemmaForConditionalGeneration",
]
REPO_ID_TO_SKIP = {
"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test",