[VLM] Support pan-and-scan for Gemma3 multi-modal processor (#14672)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-03-13 17:23:12 +08:00
parent a73122de96
commit 382403921f
9 changed files with 315 additions and 81 deletions
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -123,10 +123,14 @@ def run_gemma3(questions: list[str], modality: str):
    assert modality == "image"
    model_name = "google/gemma-3-4b-it"

-    llm = LLM(model=model_name,
-              max_model_len=2048,
-              max_num_seqs=2,
-              disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+    llm = LLM(
+        model=model_name,
+        max_model_len=2048,
+        max_num_seqs=2,
+        # Default is False; setting it to True is not supported in V1 yet
+        mm_processor_kwargs={"do_pan_and_scan": True},
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )

    prompts = [("<bos><start_of_turn>user\n"
                f"<start_of_image>{question}<end_of_turn>\n"