[VLM] Separate text-only and vision variants of the same model architecture (#13157)

Cyrus Leung authored on 2025-02-13 22:19:15 +08:00, committed by GitHub
parent 02ed8a1fbe
commit 1bc3b5e71b
15 changed files with 1728 additions and 1642 deletions

tests/distributed/test_pipeline_parallel.py

@@ -6,6 +6,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 all workers in a node other than the head node, which can cause the test
 to fail.
 """
+import json
 import os
 from dataclasses import dataclass
 from typing import List, Literal, NamedTuple, Optional
@@ -15,6 +16,7 @@ import pytest
 
 from vllm.config import TaskOption
 from vllm.logger import init_logger
+from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import compare_two_settings, fork_new_process_for_each_test
 
 logger = init_logger("test_pipeline_parallel")
@@ -31,10 +33,7 @@ class ParallelSetup(NamedTuple):
 
 class PPTestOptions(NamedTuple):
     multi_node_only: bool
-    trust_remote_code: bool
-    tokenizer_mode: Optional[str]
     load_format: Optional[str] = None
-    hf_overrides: Optional[str] = None
 
 
 @dataclass
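
Note: with `trust_remote_code`, `tokenizer_mode`, and `hf_overrides` dropped, PPTestOptions now carries only per-test knobs; the model-specific flags are looked up in the registry inside `_compare_tp` below. A minimal sketch of how the slimmed-down tuple behaves (values are illustrative):

    from typing import NamedTuple, Optional

    class PPTestOptions(NamedTuple):
        multi_node_only: bool
        load_format: Optional[str] = None

    opts = PPTestOptions(multi_node_only=False, load_format="dummy")
    # NamedTuples unpack positionally, matching the two-element
    # destructuring used in _compare_tp below:
    multi_node_only, load_format = opts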
@@ -64,10 +63,7 @@ class PPTestSettings:
         pp_base: int = 2,
         multi_node_only: bool = False,
         task: TaskOption = "auto",
-        trust_remote_code: bool = False,
-        tokenizer_mode: Optional[str] = None,
         load_format: Optional[str] = None,
-        hf_overrides: Optional[str] = None,
     ):
         return PPTestSettings(
             parallel_setups=[
@@ -97,10 +93,7 @@ class PPTestSettings:
vllm_major_versions=["0", "0", "1"],
task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides),
load_format=load_format),
)
@staticmethod
@@ -110,10 +103,7 @@ class PPTestSettings:
         pp_base: int = 2,
         task: TaskOption = "auto",
         multi_node_only: bool = False,
-        trust_remote_code: bool = False,
-        tokenizer_mode: Optional[str] = None,
         load_format: Optional[str] = None,
-        hf_overrides: Optional[str] = None,
     ):
         return PPTestSettings(
             parallel_setups=[
@@ -126,19 +116,16 @@ class PPTestSettings:
vllm_major_versions=["0"],
task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides),
load_format=load_format),
)
def iter_params(self, model_name: str):
def iter_params(self, model_id: str):
opts = self.test_options
for parallel_setup in self.parallel_setups:
for backend, vllm_major_version in zip(self.distributed_backends,
self.vllm_major_versions):
yield (model_name, parallel_setup, backend, vllm_major_version,
yield (model_id, parallel_setup, backend, vllm_major_version,
self.task, opts)
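Note: `iter_params` is the bridge between a settings object and pytest's parametrization: it yields one tuple per (parallel setup, backend) pair, each led by the model ID. A rough sketch of the expansion, runnable given this file's definitions:

    settings = PPTestSettings.fast()
    for params in settings.iter_params("microsoft/phi-2"):
        # Each tuple has the shape consumed by the parametrized tests below:
        # (model_id, parallel_setup, distributed_backend, vllm_major_version,
        #  task, test_options)
        print(params)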
@@ -150,16 +137,16 @@ TEXT_GENERATION_MODELS = {
     # [Decoder-only]
     # Uses Llama
     # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
-    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True),  # noqa: E501
-    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
-    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"),  # noqa: E501
+    "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
+    "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
     "bigscience/bloomz-1b1": PPTestSettings.fast(),
-    "THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
-    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True),  # noqa: E501
-    "databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
-    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
+    "THUDM/chatglm3-6b": PPTestSettings.fast(),
+    "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
+    "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
+    "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
     "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
     "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
     "tiiuae/falcon-7b": PPTestSettings.fast(),
     "google/gemma-2b": PPTestSettings.fast(),
@@ -172,36 +159,36 @@ TEXT_GENERATION_MODELS = {
"ibm/PowerMoE-3b": PPTestSettings.fast(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
"internlm/internlm2-chat-7b": PPTestSettings.fast(),
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"state-spaces/mamba-130m-hf": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
"shanearora/OLMo-7B-1124-hf": PPTestSettings.fast(),
"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"), # noqa: E501
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501
# FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
# [Encoder-only]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
@@ -211,7 +198,7 @@ EMBEDDING_MODELS = { # type: ignore[var-annotated]
     # [Text-only]
     "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
     "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
-    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True),  # noqa: E501
+    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
 }
 
 MULTIMODAL_MODELS = {
@@ -219,20 +206,20 @@ MULTIMODAL_MODELS = {
"Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
"facebook/chameleon-7b": PPTestSettings.fast(),
"adept/fuyu-8b": PPTestSettings.fast(),
"THUDM/glm-4v-9b": PPTestSettings.fast(trust_remote_code=True),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
"THUDM/glm-4v-9b": PPTestSettings.fast(),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(trust_remote_code=True),
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(),
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
# [Encoder-decoder]
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
@@ -258,7 +245,7 @@ TEST_MODELS = [
 
 
 def _compare_tp(
-    model_name: str,
+    model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
     vllm_major_version: str,
@@ -267,6 +254,7 @@ def _compare_tp(
     num_gpus_available: int,
     *,
     method: Literal["generate", "encode"],
+    is_multimodal: bool,
 ):
     (
         tp_size,
@@ -274,13 +262,32 @@ def _compare_tp(
         eager_mode,
         chunked_prefill,
     ) = parallel_setup
-    (
-        multi_node_only,
-        trust_remote_code,
-        tokenizer_mode,
-        load_format,
-        hf_overrides,
-    ) = test_options
+
+    multi_node_only, load_format = test_options
+
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    model_info.check_transformers_version(on_fail="skip")
+
+    trust_remote_code = model_info.trust_remote_code
+    tokenizer_mode = model_info.tokenizer_mode
+    hf_overrides = model_info.hf_overrides
+
+    if load_format == "dummy":
+        # Avoid OOM
+        text_overrides = {
+            "num_layers": 1,
+            "num_hidden_layers": 1,
+            "num_experts": 2,
+            "num_experts_per_tok": 2,
+            "num_local_experts": 2,
+        }
+
+        if is_multimodal:
+            hf_overrides.update({"text_config": text_overrides})
+        else:
+            hf_overrides.update(text_overrides)
+    else:
+        model_info.check_available_online(on_fail="skip")
 
     if num_gpus_available < tp_size * pp_size:
         pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
@@ -312,7 +319,7 @@ def _compare_tp(
     if load_format:
         common_args.extend(["--load-format", load_format])
     if hf_overrides:
-        common_args.extend(["--hf-overrides", hf_overrides])
+        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
 
     specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
     if distributed_backend == "ray" and (vllm_major_version == "1"
@@ -355,11 +362,7 @@ def _compare_tp(
     ]
 
     try:
-        compare_two_settings(model_name,
-                             pp_args,
-                             tp_args,
-                             pp_env,
-                             method=method)
+        compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method)
     except Exception:
         if pp_env is None:
             raise
@@ -369,17 +372,16 @@ def _compare_tp(
 
 
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
-     "vllm_major_version", "task", "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
+     "task", "test_options"),
     [
-        params for model_name, settings in TEXT_GENERATION_MODELS.items()
-        for params in settings.iter_params(model_name)
-        if model_name in TEST_MODELS
+        params for model_id, settings in TEXT_GENERATION_MODELS.items()
+        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
     ],
 )
 @fork_new_process_for_each_test
 def test_tp_language_generation(
-    model_name: str,
+    model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
     vllm_major_version: str,
@@ -387,28 +389,28 @@ def test_tp_language_generation(
     test_options: PPTestOptions,
     num_gpus_available,
 ):
-    _compare_tp(model_name,
+    _compare_tp(model_id,
                 parallel_setup,
                 distributed_backend,
                 vllm_major_version,
                 task,
                 test_options,
                 num_gpus_available,
-                method="generate")
+                method="generate",
+                is_multimodal=False)
 
 
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
-     "vllm_major_version", "task", "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
+     "task", "test_options"),
     [
-        params for model_name, settings in EMBEDDING_MODELS.items()
-        for params in settings.iter_params(model_name)
-        if model_name in TEST_MODELS
+        params for model_id, settings in EMBEDDING_MODELS.items()
+        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
     ],
 )
 @fork_new_process_for_each_test
 def test_tp_language_embedding(
-    model_name: str,
+    model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
     vllm_major_version: str,
@@ -416,28 +418,28 @@ def test_tp_language_embedding(
     test_options: PPTestOptions,
     num_gpus_available,
 ):
-    _compare_tp(model_name,
+    _compare_tp(model_id,
                 parallel_setup,
                 distributed_backend,
                 vllm_major_version,
                 task,
                 test_options,
                 num_gpus_available,
-                method="encode")
+                method="encode",
+                is_multimodal=False)
 
 
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
-     "vllm_major_version", "task", "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
+     "task", "test_options"),
     [
-        params for model_name, settings in MULTIMODAL_MODELS.items()
-        for params in settings.iter_params(model_name)
-        if model_name in TEST_MODELS
+        params for model_id, settings in MULTIMODAL_MODELS.items()
+        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
     ],
 )
 @fork_new_process_for_each_test
 def test_tp_multimodal_generation(
-    model_name: str,
+    model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
     vllm_major_version: str,
@@ -445,11 +447,12 @@ def test_tp_multimodal_generation(
     test_options: PPTestOptions,
     num_gpus_available,
 ):
-    _compare_tp(model_name,
+    _compare_tp(model_id,
                 parallel_setup,
                 distributed_backend,
                 vllm_major_version,
                 task,
                 test_options,
                 num_gpus_available,
-                method="generate")
+                method="generate",
+                is_multimodal=True)
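
Note: a typical local invocation of one of these parametrized tests, assuming the file lives at tests/distributed/test_pipeline_parallel.py (the path is inferred from the relative imports and the logger name):

    pytest tests/distributed/test_pipeline_parallel.py -k test_tp_language_generation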