[VLM] Separate text-only and vision variants of the same model architecture (#13157)

2025-02-13 22:19:15 +08:00
parent 02ed8a1fbe
commit 1bc3b5e71b
15 changed files with 1728 additions and 1642 deletions
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -155,10 +155,7 @@ VLM_TEST_SETTINGS = {
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        marks=[pytest.mark.skipif(
-                TRANSFORMERS_VERSION < "4.49.0",
-                reason="HF model requires transformers>=4.49.0",
-            ), pytest.mark.core_model, pytest.mark.cpu_model],
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    #### Extended model tests
    "aria": VLMTestInfo(
@@ -215,7 +212,6 @@ VLM_TEST_SETTINGS = {
            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",   # noqa: E501
        }),
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
-        vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}},  # noqa: E501
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
        postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
@@ -240,7 +236,7 @@ VLM_TEST_SETTINGS = {
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    ),
-    "glm4": VLMTestInfo(
+    "glm4v": VLMTestInfo(
        models=["THUDM/glm-4v-9b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=identity,
@@ -351,7 +347,6 @@ VLM_TEST_SETTINGS = {
        postprocess_inputs=model_utils.cast_dtype_post_processor(
            "pixel_values"
        ),
-        vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}},  # noqa: E501
        get_stop_token_ids=lambda tok: [128009],
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
@@ -437,7 +432,7 @@ VLM_TEST_SETTINGS = {
        auto_cls=AutoModelForVision2Seq,
        marks=[large_gpu_mark(min_gb=48)],
    ),
-    "qwen": VLMTestInfo(
+    "qwen_vl": VLMTestInfo(
        models=["Qwen/Qwen-VL"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,