[CI/Build] Use AutoModelForImageTextToText to load VLMs in tests (#14945)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-18 02:35:17 +08:00
parent 5340b0e221
commit b89fb2a4a1
3 changed files with 19 additions and 19 deletions
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -4,8 +4,8 @@ from typing import Optional, overload

 import pytest
 import torch
-from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
-                          BatchEncoding)
+from transformers import (AutoConfig, AutoModelForImageTextToText,
+                          AutoTokenizer, BatchEncoding)

 from vllm import LLM, SamplingParams
 from vllm.attention.backends.flash_attn import FlashAttentionMetadata
@@ -234,7 +234,7 @@ def _run_test(
                   dtype=dtype,
                   model_kwargs={"device_map": "auto"},
                   postprocess_inputs=process,
-                   auto_cls=AutoModelForVision2Seq) as hf_model:
+                   auto_cls=AutoModelForImageTextToText) as hf_model:
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,