[Model] Always use Transformers backend for PaliGemma and Gemma3-MM (#26715)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-17 13:03:35 +08:00
parent 9c2c2287a0
commit 8c017b3490
12 changed files with 54 additions and 1219 deletions
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -113,25 +113,6 @@ VLM_TEST_SETTINGS = {
        dtype="bfloat16" if current_platform.is_cpu() else "auto",
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
-    "paligemma": VLMTestInfo(
-        models=["google/paligemma-3b-mix-224"],
-        test_type=VLMTestType.IMAGE,
-        prompt_formatter=identity,
-        img_idx_to_prompt=lambda idx: "",
-        # Paligemma uses its own sample prompts because the default one fails
-        single_image_prompts=IMAGE_ASSETS.prompts(
-            {
-                "stop_sign": "caption es",
-                "cherry_blossom": "What is in the picture?",
-            }
-        ),
-        auto_cls=AutoModelForImageTextToText,
-        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
-        dtype="bfloat16",
-        marks=[
-            pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
-        ],
-    ),
    "qwen2_5_vl": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
@@ -196,14 +177,24 @@ VLM_TEST_SETTINGS = {
    # Gemma3 has bidirectional mask on images
    "gemma3-transformers": VLMTestInfo(
        models=["google/gemma-3-4b-it"],
-        test_type=VLMTestType.IMAGE,
-        prompt_formatter=lambda vid_prompt: f"<'<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
-        max_model_len=4096,
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<start_of_image>What is the season?",
+            }
+        ),
+        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
+        max_model_len=8192,
        auto_cls=AutoModelForImageTextToText,
+        # TODO: Support `do_pan_and_scan` in the Transformers backend
+        # patch_hf_runner=model_utils.gemma3_patch_hf_runner,
        vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
+            # "mm_processor_kwargs": {"do_pan_and_scan": True},
        },
        marks=[pytest.mark.core_model],
    ),
@@ -222,6 +213,27 @@ VLM_TEST_SETTINGS = {
        },
        marks=[pytest.mark.core_model],
    ),
+    # PaliGemma has PrefixLM attention
+    "paligemma-transformers": VLMTestInfo(
+        models=["google/paligemma-3b-mix-224"],
+        test_type=VLMTestType.IMAGE,
+        prompt_formatter=identity,
+        img_idx_to_prompt=lambda idx: "",
+        # PaliGemma uses its own sample prompts because the default ones fail
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "caption es",
+                "cherry_blossom": "What is in the picture?",
+            }
+        ),
+        auto_cls=AutoModelForImageTextToText,
+        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
+        image_size_factors=[(0.25, 0.5, 1.0)],
+        vllm_runner_kwargs={
+            "model_impl": "transformers",
+        },
+        marks=[pytest.mark.core_model],
+    ),
    # Pixel values from processor are not 4D or 5D arrays
    "qwen2_5_vl-transformers": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
@@ -348,24 +360,6 @@ VLM_TEST_SETTINGS = {
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[large_gpu_mark(min_gb=32)],
    ),
-    "gemma3": VLMTestInfo(
-        models=["google/gemma-3-4b-it"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
-        single_image_prompts=IMAGE_ASSETS.prompts(
-            {
-                "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
-                "cherry_blossom": "<start_of_image>What is the season?",
-            }
-        ),
-        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
-        max_model_len=4096,
-        max_num_seqs=2,
-        auto_cls=AutoModelForImageTextToText,
-        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
-        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
-        num_logprobs=10,
-    ),
    "glm4v": VLMTestInfo(
        models=["zai-org/glm-4v-9b"],
        test_type=VLMTestType.IMAGE,
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -328,16 +328,6 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:

    hf_model.processor = processor

-    orig_generate = hf_model.model.generate
-
-    def _generate(self, *args, **kwargs):
-        # FIXME: https://github.com/huggingface/transformers/issues/38333
-        kwargs["disable_compile"] = True
-
-        return orig_generate(*args, **kwargs)
-
-    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
-
    return hf_model


--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -222,7 +222,6 @@ def _test_processing_correctness(
 _ADD_SPECIAL_TOKENS_OVERRIDES = {
    "ovis": False,
    "ovis2_5": False,
-    "paligemma": False,
    "ultravox": False,
    "whisper": False,
 }
@@ -333,7 +332,6 @@ def _test_processing_correctness_one(
        "deepseek-ai/deepseek-vl2-tiny",
        "baidu/ERNIE-4.5-VL-28B-A3B-PT",
        "adept/fuyu-8b",
-        "google/gemma-3-4b-it",
        "google/gemma-3n-E2B-it",
        "zai-org/glm-4v-9b",
        "zai-org/GLM-4.1V-9B-Thinking",
@@ -370,8 +368,6 @@ def _test_processing_correctness_one(
        "AIDC-AI/Ovis1.6-Llama3.2-3B",
        "AIDC-AI/Ovis2-1B",
        "AIDC-AI/Ovis2.5-2B",
-        "google/paligemma-3b-mix-224",
-        "google/paligemma2-3b-ft-docci-448",
        "microsoft/Phi-3.5-vision-instruct",
        "microsoft/Phi-4-multimodal-instruct",
        "mistralai/Pixtral-12B-2409",
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -48,7 +48,6 @@ ARCH_NEEDS_EXTRAS = [
    "Idefics3ForConditionalGeneration",
    "LlavaForConditionalGeneration",
    "MiniCPMV",
-    "PaliGemmaForConditionalGeneration",
 ]
 REPO_ID_TO_SKIP = {
    "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test",