[V1][core] Implement pipeline parallel on Ray (#12996)
This commit is contained in:
@@ -40,10 +40,23 @@ class PPTestOptions(NamedTuple):
|
||||
@dataclass
|
||||
class PPTestSettings:
|
||||
parallel_setups: List[ParallelSetup]
|
||||
# NOTE: the length of distributed_backends and
|
||||
# vllm_major_versions should be the same, and they
|
||||
# are first zipped together to iterate over all
|
||||
# test settings.
|
||||
distributed_backends: List[str]
|
||||
# vllm major version: "0" for V0, "1" for V1
|
||||
vllm_major_versions: List[str]
|
||||
task: TaskOption
|
||||
test_options: PPTestOptions
|
||||
|
||||
def __post_init__(self):
|
||||
if len(self.distributed_backends) != len(self.vllm_major_versions):
|
||||
raise ValueError(
|
||||
f"Length mismatch: distributed_backends "
|
||||
f"({len(self.distributed_backends)}) != "
|
||||
f"vllm_major_versions ({len(self.vllm_major_versions)})")
|
||||
|
||||
@staticmethod
|
||||
def detailed(
|
||||
*,
|
||||
@@ -79,7 +92,9 @@ class PPTestSettings:
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
],
|
||||
distributed_backends=["mp", "ray"],
|
||||
# only ray is supported for V1
|
||||
distributed_backends=["mp", "ray", "ray"],
|
||||
vllm_major_versions=["0", "0", "1"],
|
||||
task=task,
|
||||
test_options=PPTestOptions(multi_node_only=multi_node_only,
|
||||
trust_remote_code=trust_remote_code,
|
||||
@@ -108,6 +123,7 @@ class PPTestSettings:
|
||||
chunked_prefill=False),
|
||||
],
|
||||
distributed_backends=["mp"],
|
||||
vllm_major_versions=["0"],
|
||||
task=task,
|
||||
test_options=PPTestOptions(multi_node_only=multi_node_only,
|
||||
trust_remote_code=trust_remote_code,
|
||||
@@ -120,8 +136,9 @@ class PPTestSettings:
|
||||
opts = self.test_options
|
||||
|
||||
for parallel_setup in self.parallel_setups:
|
||||
for distributed_backend in self.distributed_backends:
|
||||
yield (model_name, parallel_setup, distributed_backend,
|
||||
for backend, vllm_major_version in zip(self.distributed_backends,
|
||||
self.vllm_major_versions):
|
||||
yield (model_name, parallel_setup, backend, vllm_major_version,
|
||||
self.task, opts)
|
||||
|
||||
|
||||
@@ -244,6 +261,7 @@ def _compare_tp(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available: int,
|
||||
@@ -296,10 +314,13 @@ def _compare_tp(
|
||||
if hf_overrides:
|
||||
common_args.extend(["--hf-overrides", hf_overrides])
|
||||
|
||||
if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
|
||||
and chunked_prefill):
|
||||
# Test Ray ADAG for a subset of the tests
|
||||
specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
|
||||
if distributed_backend == "ray" and (vllm_major_version == "1"
|
||||
or specific_case):
|
||||
# For V1, test Ray ADAG for all the tests
|
||||
# For V0, test Ray ADAG for a subset of the tests
|
||||
pp_env = {
|
||||
"VLLM_USE_V1": vllm_major_version,
|
||||
"VLLM_USE_RAY_COMPILED_DAG": "1",
|
||||
"VLLM_USE_RAY_SPMD_WORKER": "1",
|
||||
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
|
||||
@@ -348,8 +369,8 @@ def _compare_tp(
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "task",
|
||||
"test_options"),
|
||||
("model_name", "parallel_setup", "distributed_backend",
|
||||
"vllm_major_version", "task", "test_options"),
|
||||
[
|
||||
params for model_name, settings in TEXT_GENERATION_MODELS.items()
|
||||
for params in settings.iter_params(model_name)
|
||||
@@ -361,6 +382,7 @@ def test_tp_language_generation(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
@@ -368,6 +390,7 @@ def test_tp_language_generation(
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
task,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
@@ -375,8 +398,8 @@ def test_tp_language_generation(
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "task",
|
||||
"test_options"),
|
||||
("model_name", "parallel_setup", "distributed_backend",
|
||||
"vllm_major_version", "task", "test_options"),
|
||||
[
|
||||
params for model_name, settings in EMBEDDING_MODELS.items()
|
||||
for params in settings.iter_params(model_name)
|
||||
@@ -388,6 +411,7 @@ def test_tp_language_embedding(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
@@ -395,6 +419,7 @@ def test_tp_language_embedding(
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
task,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
@@ -402,8 +427,8 @@ def test_tp_language_embedding(
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_name", "parallel_setup", "distributed_backend", "task",
|
||||
"test_options"),
|
||||
("model_name", "parallel_setup", "distributed_backend",
|
||||
"vllm_major_version", "task", "test_options"),
|
||||
[
|
||||
params for model_name, settings in MULTIMODAL_MODELS.items()
|
||||
for params in settings.iter_params(model_name)
|
||||
@@ -415,6 +440,7 @@ def test_tp_multimodal_generation(
|
||||
model_name: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
task: TaskOption,
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
@@ -422,6 +448,7 @@ def test_tp_multimodal_generation(
|
||||
_compare_tp(model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
task,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
|
||||
Reference in New Issue
Block a user