[VLM] Support pan-and-scan for Gemma3 multi-modal processor (#14672)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Roger Wang <ywang@roblox.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: Roger Wang <ywang@roblox.com>
2025-03-13 17:23:12 +08:00
parent a73122de96
commit 382403921f
9 changed files with 315 additions and 81 deletions
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -9,7 +9,7 @@ from pathlib import PosixPath

 import pytest
 from packaging.version import Version
-from transformers import AutoModelForVision2Seq
+from transformers import AutoModelForPreTraining, AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION

 from vllm.platforms import current_platform
@@ -234,6 +234,23 @@ VLM_TEST_SETTINGS = {
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
+    "gemma3": VLMTestInfo(
+        models=["google/gemma-3-4b-it"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<start_of_image>What is the season?",  # noqa: E501
+        }),
+        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        # TODO: Use AutoModelForVision2Seq once transformers supports this
+        auto_cls=AutoModelForPreTraining,
+        dtype="bfloat16",
+        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
+        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
+    ),
    "glm4v": VLMTestInfo(
        models=["THUDM/glm-4v-9b"],
        test_type=VLMTestType.IMAGE,