[CI/Build] Use AutoModelForImageTextToText to load VLMs in tests (#14945)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-03-18 02:35:17 +08:00
committed by GitHub
parent 5340b0e221
commit b89fb2a4a1
3 changed files with 19 additions and 19 deletions

View File

@@ -2,7 +2,7 @@
import pytest
import torch.nn.functional as F
-from transformers import AutoModelForVision2Seq
+from transformers import AutoModelForImageTextToText
from vllm.platforms import current_platform
@@ -70,7 +70,7 @@ def _run_test(
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
with hf_runner(model, dtype=dtype,
-               auto_cls=AutoModelForVision2Seq) as hf_model:
+               auto_cls=AutoModelForImageTextToText) as hf_model:
# Patch the issue where generation_config.json is missing
hf_model.processor.patch_size = \
hf_model.model.config.vision_config.patch_size