[Model] Add user-configurable task for models that support both generation and embedding (#9424)

Author: Cyrus Leung
Date: 2024-10-19 02:31:58 +08:00
Committed by: GitHub
Parent: 7dbe738d65
Commit: 051eaf6db3
33 changed files with 451 additions and 201 deletions
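The user-facing effect of this change is a task selector that disambiguates checkpoints whose architecture can serve both text generation and embedding; the hunks below, which appear to come from the distributed pipeline-parallel test module, thread that selector through the test settings and down to the server arguments. A minimal offline-inference sketch of the feature itself, assuming the task keyword is accepted by vllm.LLM end-to-end as this commit wires it through; the model name is only an illustration of a dual-capable architecture:

    from vllm import LLM

    # Assumption: "task" takes the TaskOption values ("auto", "generate", "embedding").
    # The checkpoint is illustrative; any model whose architecture supports both
    # generation and embedding is the intended audience for this selector.
    llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embedding")
    outputs = llm.encode(["vLLM can now pick the task explicitly."])
    print(len(outputs[0].outputs.embedding))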


@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional
import pytest
+from vllm.config import TaskOption
from vllm.logger import init_logger
from ..utils import compare_two_settings, fork_new_process_for_each_test
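The newly imported TaskOption is the type that every patched signature below accepts. A sketch of its likely shape, assuming it is a plain Literal alias in vllm/config.py; the exact value set is an assumption rather than something read from this diff:

    from typing import Literal

    # Assumed definition of vllm.config.TaskOption at the time of this commit;
    # "auto" defers the decision to whatever the model config implies.
    TaskOption = Literal["auto", "generate", "embedding"]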
@@ -31,6 +32,7 @@ class ParallelSetup(NamedTuple):
class PPTestSettings:
parallel_setups: List[ParallelSetup]
distributed_backends: List[str]
+task: TaskOption
trust_remote_code: bool
tokenizer_mode: Optional[str]
@@ -39,6 +41,7 @@ class PPTestSettings:
*,
tp_base: int = 1,
pp_base: int = 2,
+task: TaskOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
):
@@ -66,6 +69,7 @@ class PPTestSettings:
chunked_prefill=False),
],
distributed_backends=["mp", "ray"],
+task=task,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
)
@@ -75,6 +79,7 @@ class PPTestSettings:
*,
tp_base: int = 1,
pp_base: int = 2,
+task: TaskOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
):
@@ -86,6 +91,7 @@ class PPTestSettings:
chunked_prefill=False),
],
distributed_backends=["mp"],
+task=task,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
)
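With the new keyword on both factories, an entry in the per-model settings dictionaries can pin the task explicitly instead of relying on auto-detection. A hypothetical entry mirroring the EMBEDDING_MODEL_SETTINGS dictionary referenced by the parametrize blocks further down; the model name is illustrative, not taken from this commit:

    # Hypothetical settings entry; PPTestSettings.fast() is the factory patched above.
    EMBEDDING_MODEL_SETTINGS = {
        "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embedding"),
    }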
@@ -94,7 +100,7 @@ class PPTestSettings:
for parallel_setup in self.parallel_setups:
for distributed_backend in self.distributed_backends:
yield (model_name, parallel_setup, distributed_backend,
-self.trust_remote_code, self.tokenizer_mode)
+self.task, self.trust_remote_code, self.tokenizer_mode)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
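Each tuple yielded by iter_params now carries the task between the backend and the remote-code flag, so every parametrized test below unpacks six values per case. A condensed sketch of how the parameter lists are assembled, assuming iter_params is invoked once per model exactly as in the decorators that follow:

    # Condensed sketch; mirrors the list comprehensions inside the
    # @pytest.mark.parametrize decorators below.
    params = [
        case
        for model_name, settings in GENERATION_MODEL_SETTINGS.items()
        for case in settings.iter_params(model_name)
    ]
    # Each case is now:
    # (model_name, parallel_setup, distributed_backend, task,
    #  trust_remote_code, tokenizer_mode)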
@@ -213,6 +219,7 @@ def _compare_tp(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
+task: TaskOption,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
num_gpus_available: int,
@@ -240,6 +247,8 @@ def _compare_tp(
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
+if task != "auto":
+common_args.extend(["--task", task])
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
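As I understand compare_two_settings (imported at the top of the file), it launches the server under the two parallel configurations being compared and checks that they agree, so forwarding the task as a plain CLI flag here is all the embedding cases need to exercise the right code path. An illustrative fragment of the resulting argument list for a task="embedding" case; the other engine flags that _compare_tp also appends are elided:

    # Illustrative fragment of common_args for an embedding-task case; flags for
    # dtype, context length, etc. that the surrounding code also appends are elided.
    common_args = [
        "--enforce-eager",
        "--task", "embedding",
        "--trust-remote-code",  # only when the settings request it
    ]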
@@ -297,7 +306,7 @@ def _compare_tp(
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
("model_name", "parallel_setup", "distributed_backend", "task",
"trust_remote_code", "tokenizer_mode"),
[
params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
@@ -310,6 +319,7 @@ def test_tp_language_generation(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
+task: TaskOption,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
num_gpus_available,
@@ -317,6 +327,7 @@ def test_tp_language_generation(
_compare_tp(model_name,
parallel_setup,
distributed_backend,
+task,
trust_remote_code,
tokenizer_mode,
num_gpus_available,
@@ -324,7 +335,7 @@ def test_tp_language_generation(
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
("model_name", "parallel_setup", "distributed_backend", "task",
"trust_remote_code", "tokenizer_mode"),
[
params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
@@ -337,6 +348,7 @@ def test_tp_language_embedding(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
+task: TaskOption,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
num_gpus_available,
@@ -344,6 +356,7 @@ def test_tp_language_embedding(
_compare_tp(model_name,
parallel_setup,
distributed_backend,
+task,
trust_remote_code,
tokenizer_mode,
num_gpus_available,
@@ -351,7 +364,7 @@ def test_tp_language_embedding(
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
("model_name", "parallel_setup", "distributed_backend", "task",
"trust_remote_code", "tokenizer_mode"),
[
params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
@@ -364,6 +377,7 @@ def test_tp_multimodal_generation(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
+task: TaskOption,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
num_gpus_available,
@@ -371,6 +385,7 @@ def test_tp_multimodal_generation(
_compare_tp(model_name,
parallel_setup,
distributed_backend,
+task,
trust_remote_code,
tokenizer_mode,
num_gpus_available,