[Models] Add remaining model PP support (#7168)

Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Murali Andoorveedu
Date: 2024-10-03 19:56:58 -07:00
Committed by: GitHub
parent 303d44790a
commit 0f6d7a9a34
69 changed files with 2585 additions and 1344 deletions


@@ -6,6 +6,8 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
to fail.
"""
import os
from dataclasses import dataclass
from typing import List, NamedTuple, Optional
import pytest
@@ -18,79 +20,236 @@ logger = init_logger("test_pipeline_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
eager_mode: bool
chunked_prefill: bool
@dataclass
class PPTestSettings:
parallel_setups: List[ParallelSetup]
distributed_backends: List[str]
trust_remote_code: bool
tokenizer_mode: Optional[str]
@staticmethod
def detailed(
*,
tp_base: int = 1,
pp_base: int = 2,
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
):
return PPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base,
pp_size=pp_base,
eager_mode=False,
chunked_prefill=False),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
eager_mode=True,
chunked_prefill=False),
ParallelSetup(tp_size=2 * tp_base,
pp_size=pp_base,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=2 * tp_base,
pp_size=pp_base,
eager_mode=True,
chunked_prefill=False),
],
distributed_backends=["mp", "ray"],
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
)
@staticmethod
def fast(
*,
tp_base: int = 1,
pp_base: int = 2,
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
):
return PPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base,
pp_size=pp_base,
eager_mode=True,
chunked_prefill=False),
],
distributed_backends=["mp"],
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
)
def iter_params(self, model_name: str):
for parallel_setup in self.parallel_setups:
for distributed_backend in self.distributed_backends:
yield (model_name, parallel_setup, distributed_backend,
self.trust_remote_code, self.tokenizer_mode)
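# Illustrative sketch, not part of this commit: iter_params pairs every
# ParallelSetup with every distributed backend, so detailed() (5 setups x
# ["mp", "ray"]) expands to 10 cases per model, while fast() (1 setup x
# ["mp"]) expands to a single case. For a hypothetical model name:
#
#   settings = PPTestSettings.fast(trust_remote_code=True)
#   list(settings.iter_params("org/example-model"))
#   # -> [("org/example-model",
#   #      ParallelSetup(tp_size=1, pp_size=2, eager_mode=True,
#   #                    chunked_prefill=False),
#   #      "mp", True, None)]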
# yapf: disable
GENERATION_MODEL_SETTINGS = {
# [DETAILED TESTS]
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
# [FAST TESTS]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
# TODO: Test on larger GPU
# "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"bigscience/bloomz-1b1": PPTestSettings.fast(),
"THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501
# TODO: Test on larger GPU
# "databricks/dbrx-instruct": PPTestSettings.fast(),
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
"tiiuae/falcon-7b": PPTestSettings.fast(),
"google/gemma-2b": PPTestSettings.fast(),
"google/gemma-2-9b": PPTestSettings.fast(),
"gpt2": PPTestSettings.fast(),
"bigcode/starcoder": PPTestSettings.fast(),
"EleutherAI/gpt-j-6b": PPTestSettings.fast(),
"EleutherAI/pythia-12b": PPTestSettings.fast(),
"ibm/PowerLM-3b": PPTestSettings.fast(),
"ibm/PowerMoE-3b": PPTestSettings.fast(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
"core42/jais-13b-chat": PPTestSettings.fast(),
# TODO: Implement PP
# "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
"mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
"microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
# FIXME: https://github.com/vllm-project/vllm/issues/8553
# "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
# FIXME: Cannot load tokenizer in latest transformers version
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
}
EMBEDDING_MODEL_SETTINGS = { # type: ignore[var-annotated]
# [FAST TESTS]
# Uses Llama
# "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
}
MULTIMODAL_MODEL_SETTINGS = {
# [FAST TESTS]
"Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
"facebook/chameleon-7b": PPTestSettings.fast(),
"adept/fuyu-8b": PPTestSettings.fast(),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
}
CONDITIONAL_GENERATION_MODEL_SETTINGS = { # type: ignore[var-annotated]
# [FAST TESTS]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
}
# yapf: enable
MODEL_SETTINGS = {
**GENERATION_MODEL_SETTINGS,
**EMBEDDING_MODEL_SETTINGS,
**MULTIMODAL_MODEL_SETTINGS,
}
# You can update this on your local machine to run specific tests
TEST_MODELS = [
"meta-llama/Meta-Llama-3-8B",
"facebook/chameleon-7b",
"OpenGVLab/InternVL2-1B",
"microsoft/Phi-3-vision-128k-instruct",
"mistralai/Pixtral-12B-2409",
"fixie-ai/ultravox-v0_3",
]
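# Illustrative note, not part of this commit: with the TEST_MODELS list above,
# the parametrization below expands to 15 cases in total: 10 for
# meta-llama/Meta-Llama-3-8B (detailed(): 5 parallel setups x 2 backends) and
# one fast() case for each of the remaining 5 models.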
@pytest.mark.parametrize(
("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
"MODEL_NAME, DIST_BACKEND"),
("model_name", "parallel_setup", "distributed_backend",
"trust_remote_code", "tokenizer_mode"),
[
-(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-# NOTE: InternVL2 multi-node tests are flaky,
-# use mp backend to skip the multi-node tests
-(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
-(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
-(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
-(1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp"),
-# TP only models
-(2, 1, 1, 0, 0, "adept/fuyu-8b", "mp"),
params for model_name, settings in MODEL_SETTINGS.items()
for params in settings.iter_params(model_name)
if model_name in TEST_MODELS
],
)
@fork_new_process_for_each_test
-def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
-TRUST_REMOTE_CODE, MODEL_NAME, DIST_BACKEND):
-if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
def test_compare_tp(model_name: str, parallel_setup: ParallelSetup,
distributed_backend: str, trust_remote_code: bool,
tokenizer_mode: Optional[str], num_gpus_available):
tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
if num_gpus_available < tp_size:
pytest.skip(f"Need at least {tp_size} GPUs to run the test")
if VLLM_MULTI_NODE and distributed_backend == "mp":
pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend")
-pp_args = [
common_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--max-model-len",
"8192",
"--pipeline-parallel-size",
str(PP_SIZE),
"--tensor-parallel-size",
str(TP_SIZE),
"--distributed-executor-backend",
DIST_BACKEND,
"2048",
"--max-num-seqs",
"8",
]
if chunked_prefill:
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
common_args.extend(["--tokenizer-mode", tokenizer_mode])
-# compare without pipeline parallelism
-# NOTE: use mp backend for TP
-# PP tests might involve multiple nodes, and ray might
-# schedule all workers in a node other than the head node,
-# which can cause the test to fail.
-tp_args = [
-# use half precision for speed and memory savings in CI environment
-"--dtype",
-"float16",
-"--max-model-len",
-"8192",
-"--tensor-parallel-size",
-str(max(TP_SIZE, 2)), # We only use 2 GPUs in the CI.
-"--distributed-executor-backend",
-"mp",
-]
-if CHUNKED_PREFILL:
-pp_args.append("--enable-chunked-prefill")
-tp_args.append("--enable-chunked-prefill")
-if EAGER_MODE:
-pp_args.append("--enforce-eager")
-tp_args.append("--enforce-eager")
-if TRUST_REMOTE_CODE:
-pp_args.append("--trust-remote-code")
-tp_args.append("--trust-remote-code")
-pp_env = None
-if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2
-and CHUNKED_PREFILL):
if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
and chunked_prefill):
# Test Ray ADAG for a subset of the tests
pp_env = {
"VLLM_USE_RAY_COMPILED_DAG": "1",
@@ -99,11 +258,35 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
}
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of aDAG issue.
pp_args.append("--disable-frontend-multiprocessing")
tp_args.append("--disable-frontend-multiprocessing")
common_args.append("--disable-frontend-multiprocessing")
else:
pp_env = None
pp_args = [
*common_args,
"--pipeline-parallel-size",
str(pp_size),
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
distributed_backend,
]
# compare without pipeline parallelism
# NOTE: use mp backend for TP
# PP tests might involve multiple nodes, and ray might
# schedule all workers in a node other than the head node,
# which can cause the test to fail.
tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--distributed-executor-backend",
"mp",
]
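# Illustrative sketch, not part of this commit: for a fast() case such as
# facebook/chameleon-7b with its single ParallelSetup(tp_size=1, pp_size=2,
# eager_mode=True, chunked_prefill=False) on the "mp" backend, the two
# argument lists handed to compare_two_settings would be roughly:
#
#   pp_args = ["--dtype", "float16", "--max-model-len", "2048",
#              "--max-num-seqs", "8", "--enforce-eager",
#              "--pipeline-parallel-size", "2", "--tensor-parallel-size", "1",
#              "--distributed-executor-backend", "mp"]
#   tp_args = ["--dtype", "float16", "--max-model-len", "2048",
#              "--max-num-seqs", "8", "--enforce-eager",
#              "--tensor-parallel-size", "1",
#              "--distributed-executor-backend", "mp"]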
try:
-compare_two_settings(MODEL_NAME, pp_args, tp_args, pp_env)
compare_two_settings(model_name, pp_args, tp_args, pp_env)
except Exception:
if pp_env is None:
raise