Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -111,8 +111,7 @@ class MockSubscriber:
|
||||
self.last_seq = -1
|
||||
self.decoder = msgspec.msgpack.Decoder(type=decode_type)
|
||||
|
||||
def receive_one(self,
|
||||
timeout=1000) -> Union[tuple[int, SampleBatch], None]:
|
||||
def receive_one(self, timeout=1000) -> Union[tuple[int, SampleBatch], None]:
|
||||
"""Receive a single message with timeout"""
|
||||
if not self.sub.poll(timeout):
|
||||
return None
|
||||
@@ -135,8 +134,7 @@ class MockSubscriber:
|
||||
|
||||
self.replay_sockets[socket_idx].send(start_seq.to_bytes(8, "big"))
|
||||
|
||||
def receive_replay(self,
|
||||
socket_idx: int = 0) -> list[tuple[int, SampleBatch]]:
|
||||
def receive_replay(self, socket_idx: int = 0) -> list[tuple[int, SampleBatch]]:
|
||||
"""Receive replayed messages from a specific replay socket"""
|
||||
if not self.replay_sockets:
|
||||
raise ValueError("Replay sockets not initialized")
|
||||
|
||||
@@ -12,7 +12,8 @@ import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
|
||||
from vllm.distributed.device_communicators.custom_all_reduce import ( # noqa
|
||||
CustomAllreduce)
|
||||
CustomAllreduce,
|
||||
)
|
||||
|
||||
# create a cpu process group for communicating metadata (ipc handle)
|
||||
dist.init_process_group(backend="gloo")
|
||||
@@ -52,7 +53,8 @@ for p in pointers:
|
||||
assert ord(host_data[i]) == byte_value, (
|
||||
f"Rank {rank} failed"
|
||||
f" to verify buffer {p}. Expected {byte_value}, "
|
||||
f"got {ord(host_data[i])}")
|
||||
f"got {ord(host_data[i])}"
|
||||
)
|
||||
|
||||
print(f"Rank {rank} verified all buffers")
|
||||
|
||||
|
||||
@@ -13,13 +13,19 @@ import pytest
|
||||
import ray
|
||||
import torch
|
||||
|
||||
from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
|
||||
tensor_model_parallel_all_gather,
|
||||
tensor_model_parallel_all_reduce,
|
||||
tensor_model_parallel_reduce_scatter)
|
||||
from vllm.distributed import (
|
||||
broadcast_tensor_dict,
|
||||
get_pp_group,
|
||||
tensor_model_parallel_all_gather,
|
||||
tensor_model_parallel_all_reduce,
|
||||
tensor_model_parallel_reduce_scatter,
|
||||
)
|
||||
|
||||
from ..utils import (init_test_distributed_environment, multi_gpu_test,
|
||||
multi_process_parallel)
|
||||
from ..utils import (
|
||||
init_test_distributed_environment,
|
||||
multi_gpu_test,
|
||||
multi_process_parallel,
|
||||
)
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
@@ -37,12 +43,11 @@ def all_reduce_test_worker(
|
||||
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
num_elements = 8
|
||||
all_tensors = [
|
||||
torch.arange(num_elements, dtype=torch.float32, device="cuda") *
|
||||
(r + 1) for r in range(tp_size)
|
||||
torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
|
||||
for r in range(tp_size)
|
||||
]
|
||||
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
|
||||
t = all_tensors[rank % tp_size]
|
||||
@@ -51,28 +56,31 @@ def all_reduce_test_worker(
|
||||
|
||||
|
||||
@ray.remote(num_gpus=1, max_calls=1)
|
||||
def reduce_scatter_test_worker(monkeypatch: pytest.MonkeyPatch, tp_size: int,
|
||||
pp_size: int, rank: int,
|
||||
distributed_init_port: str):
|
||||
def reduce_scatter_test_worker(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
rank: int,
|
||||
distributed_init_port: str,
|
||||
):
|
||||
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
|
||||
# so that each worker can see all the GPUs
|
||||
# they will be able to set the device to the correct GPU
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
|
||||
num_elements = 8
|
||||
all_tensors = [
|
||||
torch.arange(num_elements, dtype=torch.float32, device="cuda") *
|
||||
(r + 1) for r in range(tp_size)
|
||||
torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
|
||||
for r in range(tp_size)
|
||||
]
|
||||
|
||||
index = rank % tp_size
|
||||
partition_size = num_elements // tp_size
|
||||
all_reduce = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
|
||||
expected = all_reduce[index * partition_size:(index + 1) * partition_size]
|
||||
expected = all_reduce[index * partition_size : (index + 1) * partition_size]
|
||||
t = all_tensors[index]
|
||||
t = tensor_model_parallel_reduce_scatter(t, 0)
|
||||
torch.testing.assert_close(t, expected)
|
||||
@@ -92,8 +100,7 @@ def all_gather_test_worker(
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
num_dimensions = 3
|
||||
tensor_size = list(range(2, num_dimensions + 2))
|
||||
total_size = 1
|
||||
@@ -101,8 +108,10 @@ def all_gather_test_worker(
|
||||
total_size *= s
|
||||
for all_gather_dimension in range(num_dimensions):
|
||||
all_tensors = [
|
||||
torch.arange(total_size, dtype=torch.float32,
|
||||
device="cuda").reshape(tensor_size) * (r + 1)
|
||||
torch.arange(total_size, dtype=torch.float32, device="cuda").reshape(
|
||||
tensor_size
|
||||
)
|
||||
* (r + 1)
|
||||
for r in range(tp_size)
|
||||
]
|
||||
expected = torch.cat(all_tensors, dim=all_gather_dimension)
|
||||
@@ -125,8 +134,7 @@ def broadcast_tensor_dict_test_worker(
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
test_dict = {
|
||||
# device tensor
|
||||
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
|
||||
@@ -134,10 +142,7 @@ def broadcast_tensor_dict_test_worker(
|
||||
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
|
||||
"c": "test",
|
||||
"d": [1, 2, 3],
|
||||
"e": {
|
||||
"a": 1,
|
||||
"b": 2
|
||||
},
|
||||
"e": {"a": 1, "b": 2},
|
||||
# empty tensor
|
||||
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
|
||||
}
|
||||
@@ -166,8 +171,7 @@ def send_recv_tensor_dict_test_worker(
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
|
||||
test_dict = {
|
||||
# device tensor
|
||||
@@ -176,10 +180,7 @@ def send_recv_tensor_dict_test_worker(
|
||||
"b": torch.arange(16, dtype=torch.int8, device="cpu"),
|
||||
"c": "test",
|
||||
"d": [1, 2, 3],
|
||||
"e": {
|
||||
"a": 1,
|
||||
"b": 2
|
||||
},
|
||||
"e": {"a": 1, "b": 2},
|
||||
# empty tensor
|
||||
"f": torch.tensor([], dtype=torch.float32, device="cuda"),
|
||||
}
|
||||
@@ -211,8 +212,7 @@ def send_recv_test_worker(
|
||||
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
|
||||
size = 64
|
||||
test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")
|
||||
@@ -229,10 +229,10 @@ def send_recv_test_worker(
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("test_target", [
|
||||
all_reduce_test_worker, all_gather_test_worker,
|
||||
broadcast_tensor_dict_test_worker
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"test_target",
|
||||
[all_reduce_test_worker, all_gather_test_worker, broadcast_tensor_dict_test_worker],
|
||||
)
|
||||
def test_multi_process_tensor_parallel(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size: int,
|
||||
@@ -244,7 +244,8 @@ def test_multi_process_tensor_parallel(
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("pp_size", [2])
|
||||
@pytest.mark.parametrize(
|
||||
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
|
||||
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]
|
||||
)
|
||||
def test_multi_process_pipeline_parallel(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
pp_size: int,
|
||||
@@ -256,11 +257,16 @@ def test_multi_process_pipeline_parallel(
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("pp_size", [2])
|
||||
@pytest.mark.parametrize("test_target", [
|
||||
send_recv_test_worker, send_recv_tensor_dict_test_worker,
|
||||
all_reduce_test_worker, all_gather_test_worker,
|
||||
broadcast_tensor_dict_test_worker
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"test_target",
|
||||
[
|
||||
send_recv_test_worker,
|
||||
send_recv_tensor_dict_test_worker,
|
||||
all_reduce_test_worker,
|
||||
all_gather_test_worker,
|
||||
broadcast_tensor_dict_test_worker,
|
||||
],
|
||||
)
|
||||
def test_multi_process_tensor_parallel_pipeline_parallel(
|
||||
tp_size: int,
|
||||
pp_size: int,
|
||||
|
||||
@@ -7,6 +7,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
all workers in a node other than the head node, which can cause the test
|
||||
to fail.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
@@ -56,7 +57,8 @@ class CPTestSettings:
|
||||
raise ValueError(
|
||||
f"Length mismatch: distributed_backends "
|
||||
f"({len(self.distributed_backends)}) != "
|
||||
f"vllm_major_versions ({len(self.vllm_major_versions)})")
|
||||
f"vllm_major_versions ({len(self.vllm_major_versions)})"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def detailed(
|
||||
@@ -74,29 +76,39 @@ class CPTestSettings:
|
||||
for dcp_multiplier in [0.5, 1]:
|
||||
for chunked_prefill_val in [True]:
|
||||
parallel_setups.append(
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=pp_multiplier * pp_base,
|
||||
dcp_size=int(dcp_multiplier *
|
||||
tp_base),
|
||||
eager_mode=eager_mode_val,
|
||||
chunked_prefill=chunked_prefill_val))
|
||||
ParallelSetup(
|
||||
tp_size=tp_base,
|
||||
pp_size=pp_multiplier * pp_base,
|
||||
dcp_size=int(dcp_multiplier * tp_base),
|
||||
eager_mode=eager_mode_val,
|
||||
chunked_prefill=chunked_prefill_val,
|
||||
)
|
||||
)
|
||||
return CPTestSettings(
|
||||
parallel_setups=parallel_setups,
|
||||
distributed_backends=["mp"],
|
||||
vllm_major_versions=["1"],
|
||||
runner=runner,
|
||||
test_options=CPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
test_options=CPTestOptions(
|
||||
multi_node_only=multi_node_only, load_format=load_format
|
||||
),
|
||||
)
|
||||
|
||||
def iter_params(self, model_id: str):
|
||||
opts = self.test_options
|
||||
|
||||
for parallel_setup in self.parallel_setups:
|
||||
for backend, vllm_major_version in zip(self.distributed_backends,
|
||||
self.vllm_major_versions):
|
||||
yield (model_id, parallel_setup, backend, vllm_major_version,
|
||||
self.runner, opts)
|
||||
for backend, vllm_major_version in zip(
|
||||
self.distributed_backends, self.vllm_major_versions
|
||||
):
|
||||
yield (
|
||||
model_id,
|
||||
parallel_setup,
|
||||
backend,
|
||||
vllm_major_version,
|
||||
self.runner,
|
||||
opts,
|
||||
)
|
||||
|
||||
|
||||
def _compare_cp_with_tp(
|
||||
@@ -148,8 +160,10 @@ def _compare_cp_with_tp(
|
||||
if num_gpus_available < tp_size * pp_size:
|
||||
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
|
||||
if VLLM_MULTI_NODE and distributed_backend == "mp":
|
||||
pytest.skip("Skipping multi-node pipeline parallel test for "
|
||||
"multiprocessing distributed backend")
|
||||
pytest.skip(
|
||||
"Skipping multi-node pipeline parallel test for "
|
||||
"multiprocessing distributed backend"
|
||||
)
|
||||
if multi_node_only and not VLLM_MULTI_NODE:
|
||||
pytest.skip("Not in multi-node setting")
|
||||
|
||||
@@ -178,8 +192,7 @@ def _compare_cp_with_tp(
|
||||
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||
|
||||
cp_env = tp_env = {
|
||||
"VLLM_USE_V1":
|
||||
vllm_major_version, # Note(hc): DCP supports the V1 engine only
|
||||
"VLLM_USE_V1": vllm_major_version, # Note(hc): DCP supports the V1 engine only
|
||||
}
|
||||
|
||||
cp_args = [
|
||||
@@ -205,13 +218,15 @@ def _compare_cp_with_tp(
|
||||
]
|
||||
|
||||
try:
|
||||
compare_two_settings(model_id,
|
||||
cp_args,
|
||||
tp_args,
|
||||
cp_env,
|
||||
tp_env,
|
||||
method=method,
|
||||
max_wait_seconds=720)
|
||||
compare_two_settings(
|
||||
model_id,
|
||||
cp_args,
|
||||
tp_args,
|
||||
cp_env,
|
||||
tp_env,
|
||||
method=method,
|
||||
max_wait_seconds=720,
|
||||
)
|
||||
except Exception:
|
||||
testing_ray_compiled_graph = cp_env is not None
|
||||
if testing_ray_compiled_graph and vllm_major_version == "0":
|
||||
@@ -224,9 +239,10 @@ def _compare_cp_with_tp(
|
||||
|
||||
CP_TEXT_GENERATION_MODELS = {
|
||||
# [MLA attention only]
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat":
|
||||
[CPTestSettings.detailed(),
|
||||
CPTestSettings.detailed(tp_base=2)],
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": [
|
||||
CPTestSettings.detailed(),
|
||||
CPTestSettings.detailed(tp_base=2),
|
||||
],
|
||||
}
|
||||
|
||||
CP_TEST_MODELS = [
|
||||
@@ -237,11 +253,19 @@ CP_TEST_MODELS = [
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
|
||||
"runner", "test_options"),
|
||||
(
|
||||
"model_id",
|
||||
"parallel_setup",
|
||||
"distributed_backend",
|
||||
"vllm_major_version",
|
||||
"runner",
|
||||
"test_options",
|
||||
),
|
||||
[
|
||||
params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
|
||||
for setting in settings for params in setting.iter_params(model_id)
|
||||
params
|
||||
for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
|
||||
for setting in settings
|
||||
for params in setting.iter_params(model_id)
|
||||
if model_id in CP_TEST_MODELS
|
||||
],
|
||||
)
|
||||
@@ -255,12 +279,14 @@ def test_cp_generation(
|
||||
test_options: CPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_cp_with_tp(model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=False)
|
||||
_compare_cp_with_tp(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=False,
|
||||
)
|
||||
|
||||
@@ -8,12 +8,14 @@ import ray
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.communication_op import ( # noqa
|
||||
tensor_model_parallel_all_reduce)
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa
|
||||
from vllm.distributed.parallel_state import get_tp_group, graph_capture
|
||||
|
||||
from ..utils import (ensure_model_parallel_initialized,
|
||||
init_test_distributed_environment, multi_process_parallel)
|
||||
from ..utils import (
|
||||
ensure_model_parallel_initialized,
|
||||
init_test_distributed_environment,
|
||||
multi_process_parallel,
|
||||
)
|
||||
|
||||
random.seed(42)
|
||||
test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
|
||||
@@ -33,8 +35,7 @@ def graph_allreduce(
|
||||
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
ensure_model_parallel_initialized(tp_size, pp_size)
|
||||
group = get_tp_group().device_group
|
||||
|
||||
@@ -60,18 +61,15 @@ def graph_allreduce(
|
||||
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
|
||||
with graph_capture(device=device) as graph_capture_context:
|
||||
# use integers so result matches NCCL exactly
|
||||
inp1 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
inp2 = torch.randint(1,
|
||||
16, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
inp1 = torch.randint(
|
||||
1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
|
||||
)
|
||||
inp2 = torch.randint(
|
||||
1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph,
|
||||
stream=graph_capture_context.stream):
|
||||
with torch.cuda.graph(graph, stream=graph_capture_context.stream):
|
||||
for i in range(num_communication):
|
||||
out1 = tensor_model_parallel_all_reduce(inp1)
|
||||
# the input buffer is immediately modified to test
|
||||
@@ -96,8 +94,7 @@ def eager_allreduce(
|
||||
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
|
||||
# we use the first group to communicate once
|
||||
# and the second group to communicate twice
|
||||
@@ -132,5 +129,4 @@ def test_custom_allreduce(
|
||||
world_size = tp_size * pipeline_parallel_size
|
||||
if world_size > torch.cuda.device_count():
|
||||
pytest.skip("Not enough GPUs to run the test.")
|
||||
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
|
||||
test_target)
|
||||
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from ..entrypoints.openai.test_oot_registration import (
|
||||
run_and_test_dummy_opt_api_server)
|
||||
from ..entrypoints.openai.test_oot_registration import run_and_test_dummy_opt_api_server
|
||||
|
||||
|
||||
def test_distributed_oot(dummy_opt_path: str):
|
||||
|
||||
@@ -10,10 +10,12 @@ from vllm.distributed.eplb.rebalance_algo import rebalance_experts
|
||||
def test_basic_rebalance():
|
||||
"""Test basic rebalancing functionality"""
|
||||
# Example from https://github.com/deepseek-ai/eplb
|
||||
weight = torch.tensor([
|
||||
[90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
|
||||
[20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
|
||||
])
|
||||
weight = torch.tensor(
|
||||
[
|
||||
[90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
|
||||
[20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
|
||||
]
|
||||
)
|
||||
|
||||
num_layers = weight.shape[0]
|
||||
num_replicas = 16
|
||||
@@ -21,45 +23,49 @@ def test_basic_rebalance():
|
||||
num_nodes = 2
|
||||
num_gpus = 8
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
|
||||
num_groups, num_nodes,
|
||||
num_gpus)
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify output shapes
|
||||
assert phy2log.shape == (
|
||||
2,
|
||||
16,
|
||||
), f"Expected `phy2log` shape (2, 16), got {phy2log.shape}"
|
||||
assert (log2phy.shape[0] == 2
|
||||
), f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}"
|
||||
assert (
|
||||
log2phy.shape[1] == 12
|
||||
), f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}"
|
||||
assert log2phy.shape[0] == 2, (
|
||||
f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}"
|
||||
)
|
||||
assert log2phy.shape[1] == 12, (
|
||||
f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}"
|
||||
)
|
||||
assert logcnt.shape == (
|
||||
2,
|
||||
12,
|
||||
), f"Expected `logcnt` shape (2, 12), got {logcnt.shape}"
|
||||
|
||||
# Verify physical to logical expert mapping range is correct
|
||||
assert torch.all(phy2log >= 0) and torch.all(
|
||||
phy2log < 12), "Physical to logical mapping should be in range [0, 12)"
|
||||
assert torch.all(phy2log >= 0) and torch.all(phy2log < 12), (
|
||||
"Physical to logical mapping should be in range [0, 12)"
|
||||
)
|
||||
|
||||
# Verify expert count reasonableness
|
||||
assert torch.all(
|
||||
logcnt >= 1), "Each logical expert should have at least 1 replica"
|
||||
assert (
|
||||
torch.sum(logcnt, dim=1).sum() == num_replicas *
|
||||
num_layers), f"Total replicas should be {num_replicas * num_layers}"
|
||||
assert torch.all(logcnt >= 1), "Each logical expert should have at least 1 replica"
|
||||
assert torch.sum(logcnt, dim=1).sum() == num_replicas * num_layers, (
|
||||
f"Total replicas should be {num_replicas * num_layers}"
|
||||
)
|
||||
|
||||
# Verify expected output
|
||||
expected_phy2log = torch.tensor([
|
||||
[5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1],
|
||||
[7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1],
|
||||
])
|
||||
expected_phy2log = torch.tensor(
|
||||
[
|
||||
[5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1],
|
||||
[7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1],
|
||||
]
|
||||
)
|
||||
assert torch.all(phy2log == expected_phy2log)
|
||||
|
||||
expected_logcnt = torch.tensor([[1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1],
|
||||
[1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]])
|
||||
expected_logcnt = torch.tensor(
|
||||
[[1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1], [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]]
|
||||
)
|
||||
assert torch.all(logcnt == expected_logcnt)
|
||||
|
||||
|
||||
@@ -71,9 +77,9 @@ def test_single_gpu_case():
|
||||
num_nodes = 1
|
||||
num_gpus = 1
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
|
||||
num_groups, num_nodes,
|
||||
num_gpus)
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify shapes
|
||||
assert phy2log.shape == (1, 4)
|
||||
@@ -93,19 +99,19 @@ def test_equal_weights():
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
|
||||
num_groups, num_nodes,
|
||||
num_gpus)
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify shapes
|
||||
assert phy2log.shape == (1, 8)
|
||||
assert logcnt.shape == (1, 8)
|
||||
|
||||
# With equal weights, each expert should have exactly one replica
|
||||
assert torch.all(
|
||||
logcnt == 1
|
||||
), "With equal weights and no replication, " \
|
||||
"each expert should have exactly 1 replica"
|
||||
assert torch.all(logcnt == 1), (
|
||||
"With equal weights and no replication, "
|
||||
"each expert should have exactly 1 replica"
|
||||
)
|
||||
|
||||
|
||||
def test_extreme_weight_imbalance():
|
||||
@@ -116,35 +122,37 @@ def test_extreme_weight_imbalance():
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
|
||||
num_groups, num_nodes,
|
||||
num_gpus)
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify shapes
|
||||
assert phy2log.shape == (1, 12)
|
||||
assert logcnt.shape == (1, 8)
|
||||
|
||||
# Expert with highest weight (index 0) should have more replicas
|
||||
assert (
|
||||
logcnt[0, 0]
|
||||
> logcnt[0, 1]), "Expert with highest weight should have more replicas"
|
||||
assert logcnt[0, 0] > logcnt[0, 1], (
|
||||
"Expert with highest weight should have more replicas"
|
||||
)
|
||||
|
||||
|
||||
def test_multiple_layers():
|
||||
"""Test multiple layers case"""
|
||||
weight = torch.tensor([
|
||||
[10, 20, 30, 40, 50, 60], # First layer
|
||||
[60, 50, 40, 30, 20, 10], # Second layer (opposite weight pattern)
|
||||
[25, 25, 25, 25, 25, 25], # Third layer (equal weights)
|
||||
])
|
||||
weight = torch.tensor(
|
||||
[
|
||||
[10, 20, 30, 40, 50, 60], # First layer
|
||||
[60, 50, 40, 30, 20, 10], # Second layer (opposite weight pattern)
|
||||
[25, 25, 25, 25, 25, 25], # Third layer (equal weights)
|
||||
]
|
||||
)
|
||||
num_replicas = 8
|
||||
num_groups = 2
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
|
||||
num_groups, num_nodes,
|
||||
num_gpus)
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify shapes
|
||||
assert phy2log.shape == (3, 8)
|
||||
@@ -152,12 +160,12 @@ def test_multiple_layers():
|
||||
|
||||
# Verify expert allocation is reasonable for each layer
|
||||
for layer in range(3):
|
||||
assert torch.all(phy2log[layer] >= 0) and torch.all(
|
||||
phy2log[layer] < 6
|
||||
), f"Layer {layer} physical to logical mapping" \
|
||||
"should be in range [0, 6)"
|
||||
assert (torch.sum(logcnt[layer]) == num_replicas
|
||||
), f"Layer {layer} total replicas should be {num_replicas}"
|
||||
assert torch.all(phy2log[layer] >= 0) and torch.all(phy2log[layer] < 6), (
|
||||
f"Layer {layer} physical to logical mappingshould be in range [0, 6)"
|
||||
)
|
||||
assert torch.sum(logcnt[layer]) == num_replicas, (
|
||||
f"Layer {layer} total replicas should be {num_replicas}"
|
||||
)
|
||||
|
||||
|
||||
def test_parameter_validation():
|
||||
@@ -179,17 +187,19 @@ def test_parameter_validation():
|
||||
|
||||
def test_small_scale_hierarchical():
|
||||
"""Test small-scale hierarchical load balancing"""
|
||||
weight = torch.tensor([
|
||||
[100, 50, 200, 75, 150, 25, 300, 80], # 8 experts
|
||||
])
|
||||
weight = torch.tensor(
|
||||
[
|
||||
[100, 50, 200, 75, 150, 25, 300, 80], # 8 experts
|
||||
]
|
||||
)
|
||||
num_replicas = 12
|
||||
num_groups = 4 # 4 groups, 2 experts each
|
||||
num_nodes = 2 # 2 nodes
|
||||
num_gpus = 4 # 4 GPUs
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
|
||||
num_groups, num_nodes,
|
||||
num_gpus)
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Verify basic constraints
|
||||
assert phy2log.shape == (1, 12)
|
||||
@@ -199,8 +209,9 @@ def test_small_scale_hierarchical():
|
||||
|
||||
# Expert with highest weight should have more replicas
|
||||
max_weight_expert = torch.argmax(weight[0])
|
||||
assert (logcnt[0, max_weight_expert]
|
||||
>= 2), "Highest weight expert should have multiple replicas"
|
||||
assert logcnt[0, max_weight_expert] >= 2, (
|
||||
"Highest weight expert should have multiple replicas"
|
||||
)
|
||||
|
||||
|
||||
def test_global_load_balance_fallback():
|
||||
@@ -213,9 +224,9 @@ def test_global_load_balance_fallback():
|
||||
num_nodes = 2
|
||||
num_gpus = 4
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
|
||||
num_groups, num_nodes,
|
||||
num_gpus)
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Should work normally, just using global load balancing strategy
|
||||
assert phy2log.shape == (1, 8)
|
||||
@@ -235,9 +246,9 @@ def test_device_compatibility(device):
|
||||
num_nodes = 1
|
||||
num_gpus = 2
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
|
||||
num_groups, num_nodes,
|
||||
num_gpus)
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
|
||||
# Function will convert to CPU internally, but should handle different
|
||||
# device inputs normally
|
||||
@@ -250,7 +261,8 @@ def test_additional_cases():
|
||||
|
||||
# Test case 1: Large-scale distributed setup
|
||||
weight1 = torch.tensor(
|
||||
[[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]])
|
||||
[[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]]
|
||||
)
|
||||
phy2log1, log2phy1, logcnt1 = rebalance_experts(weight1, 24, 8, 4, 8)
|
||||
|
||||
assert phy2log1.shape == (1, 24)
|
||||
@@ -258,10 +270,12 @@ def test_additional_cases():
|
||||
assert torch.sum(logcnt1) == 24
|
||||
|
||||
# Test case 2: Different weight distributions
|
||||
weight2 = torch.tensor([
|
||||
[200, 150, 100, 50, 25, 12], # Decreasing weights
|
||||
[12, 25, 50, 100, 150, 200], # Increasing weights
|
||||
])
|
||||
weight2 = torch.tensor(
|
||||
[
|
||||
[200, 150, 100, 50, 25, 12], # Decreasing weights
|
||||
[12, 25, 50, 100, 150, 200], # Increasing weights
|
||||
]
|
||||
)
|
||||
phy2log2, log2phy2, logcnt2 = rebalance_experts(weight2, 10, 3, 1, 2)
|
||||
|
||||
assert phy2log2.shape == (2, 10)
|
||||
@@ -274,19 +288,21 @@ def test_additional_cases():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
weight = torch.tensor([
|
||||
[90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
|
||||
[20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
|
||||
])
|
||||
weight = torch.tensor(
|
||||
[
|
||||
[90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
|
||||
[20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
|
||||
]
|
||||
)
|
||||
|
||||
num_replicas = 16
|
||||
num_groups = 4
|
||||
num_nodes = 2
|
||||
num_gpus = 8
|
||||
|
||||
phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
|
||||
num_groups, num_nodes,
|
||||
num_gpus)
|
||||
phy2log, log2phy, logcnt = rebalance_experts(
|
||||
weight, num_replicas, num_groups, num_nodes, num_gpus
|
||||
)
|
||||
print(phy2log)
|
||||
|
||||
test_basic_rebalance()
|
||||
|
||||
@@ -9,11 +9,12 @@ import pytest
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
from vllm.distributed.eplb.rebalance_execute import (
|
||||
rearrange_expert_weights_inplace)
|
||||
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
|
||||
get_tp_group,
|
||||
init_distributed_environment)
|
||||
from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
|
||||
from vllm.distributed.parallel_state import (
|
||||
ensure_model_parallel_initialized,
|
||||
get_tp_group,
|
||||
init_distributed_environment,
|
||||
)
|
||||
from vllm.utils import update_environment_variables
|
||||
|
||||
|
||||
@@ -22,13 +23,13 @@ def distributed_run(fn, world_size):
|
||||
processes: list[multiprocessing.Process] = []
|
||||
for i in range(number_of_processes):
|
||||
env: dict[str, str] = {}
|
||||
env['RANK'] = str(i)
|
||||
env['LOCAL_RANK'] = str(i)
|
||||
env['WORLD_SIZE'] = str(number_of_processes)
|
||||
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
|
||||
env['MASTER_ADDR'] = 'localhost'
|
||||
env['MASTER_PORT'] = '12345'
|
||||
p = multiprocessing.Process(target=fn, args=(env, ))
|
||||
env["RANK"] = str(i)
|
||||
env["LOCAL_RANK"] = str(i)
|
||||
env["WORLD_SIZE"] = str(number_of_processes)
|
||||
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
|
||||
env["MASTER_ADDR"] = "localhost"
|
||||
env["MASTER_PORT"] = "12345"
|
||||
p = multiprocessing.Process(target=fn, args=(env,))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
@@ -45,7 +46,7 @@ def worker_fn_wrapper(fn):
|
||||
# and update the environment variables in the function
|
||||
def wrapped_fn(env):
|
||||
update_environment_variables(env)
|
||||
local_rank = os.environ['LOCAL_RANK']
|
||||
local_rank = os.environ["LOCAL_RANK"]
|
||||
device = torch.device(f"cuda:{local_rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_distributed_environment()
|
||||
@@ -60,20 +61,20 @@ def worker_fn_wrapper(fn):
|
||||
|
||||
|
||||
def create_expert_indices_with_redundancy(
|
||||
num_layers: int,
|
||||
num_logical_experts: int,
|
||||
total_physical_experts: int,
|
||||
redundancy_config: list[int], # redundancy for each logical expert
|
||||
num_layers: int,
|
||||
num_logical_experts: int,
|
||||
total_physical_experts: int,
|
||||
redundancy_config: list[int], # redundancy for each logical expert
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create expert indices with redundancy.
|
||||
|
||||
|
||||
Args:
|
||||
num_layers: number of layers
|
||||
num_logical_experts: number of logical experts
|
||||
total_physical_experts: total number of physical experts
|
||||
redundancy_config: redundancy for each logical expert
|
||||
|
||||
|
||||
Returns:
|
||||
indices: Shape (num_layers, total_physical_experts)
|
||||
"""
|
||||
@@ -106,11 +107,11 @@ def create_expert_weights(
|
||||
) -> list[list[torch.Tensor]]:
|
||||
"""
|
||||
Create fake expert weights tensor for testing.
|
||||
|
||||
|
||||
Use `arange` to generate predictable weights values, based on logical
|
||||
expert ID.
|
||||
All replicas of the same logical expert should have the same weights.
|
||||
|
||||
|
||||
Args:
|
||||
physical_to_logical_mapping: Shape (num_layers, num_local_experts)
|
||||
mapping[layer, physical_pos] = logical_expert_id
|
||||
@@ -120,27 +121,27 @@ def create_expert_weights(
|
||||
for layer in range(num_layers):
|
||||
layer_weights = []
|
||||
for weight_idx, hidden_size in enumerate(hidden_sizes):
|
||||
weight_tensor = torch.zeros(num_local_experts,
|
||||
hidden_size,
|
||||
device=device,
|
||||
dtype=torch.float32)
|
||||
weight_tensor = torch.zeros(
|
||||
num_local_experts, hidden_size, device=device, dtype=torch.float32
|
||||
)
|
||||
|
||||
for local_expert in range(num_local_experts):
|
||||
# Get the logical expert ID for this physical expert
|
||||
global_pos = rank * num_local_experts + local_expert
|
||||
logical_expert_id = physical_to_logical_mapping[
|
||||
layer, global_pos].item()
|
||||
layer, global_pos
|
||||
].item()
|
||||
|
||||
# Generate weights based on logical expert ID
|
||||
# (so that all replicas of the same logical expert have the
|
||||
# same weights)
|
||||
base_value = (logical_expert_id * 1000 + layer * 100 +
|
||||
weight_idx * 10)
|
||||
weight_tensor[local_expert] = torch.arange(base_value,
|
||||
base_value +
|
||||
hidden_size,
|
||||
device=device,
|
||||
dtype=torch.float32)
|
||||
base_value = logical_expert_id * 1000 + layer * 100 + weight_idx * 10
|
||||
weight_tensor[local_expert] = torch.arange(
|
||||
base_value,
|
||||
base_value + hidden_size,
|
||||
device=device,
|
||||
dtype=torch.float32,
|
||||
)
|
||||
|
||||
layer_weights.append(weight_tensor)
|
||||
expert_weights.append(layer_weights)
|
||||
@@ -182,12 +183,15 @@ def verify_expert_weights_after_shuffle(
|
||||
|
||||
# Check if the weights are correct
|
||||
actual_weights = weight_tensor[local_expert]
|
||||
expected_base = (expected_logical_expert * 1000 + layer * 100 +
|
||||
weight_idx * 10)
|
||||
expected_weights = torch.arange(expected_base,
|
||||
expected_base + hidden_size,
|
||||
device=actual_weights.device,
|
||||
dtype=actual_weights.dtype)
|
||||
expected_base = (
|
||||
expected_logical_expert * 1000 + layer * 100 + weight_idx * 10
|
||||
)
|
||||
expected_weights = torch.arange(
|
||||
expected_base,
|
||||
expected_base + hidden_size,
|
||||
device=actual_weights.device,
|
||||
dtype=actual_weights.dtype,
|
||||
)
|
||||
|
||||
torch.testing.assert_close(
|
||||
actual_weights,
|
||||
@@ -195,7 +199,8 @@ def verify_expert_weights_after_shuffle(
|
||||
msg=f"Layer {layer}, weight {weight_idx},"
|
||||
f"local expert {local_expert}: "
|
||||
f"weights do not match. "
|
||||
f"Expected logical expert {expected_logical_expert}")
|
||||
f"Expected logical expert {expected_logical_expert}",
|
||||
)
|
||||
|
||||
|
||||
def verify_redundant_experts_have_same_weights(
|
||||
@@ -222,23 +227,23 @@ def verify_redundant_experts_have_same_weights(
|
||||
total_physical_experts,
|
||||
hidden_size,
|
||||
device=expert_weights[layer][weight_idx].device,
|
||||
dtype=expert_weights[layer][weight_idx].dtype)
|
||||
dtype=expert_weights[layer][weight_idx].dtype,
|
||||
)
|
||||
|
||||
# Use all_gather to collect expert weights from current node
|
||||
# expert_weights[layer][weight_idx] shape:
|
||||
# [num_local_experts, hidden_size]
|
||||
local_weights = expert_weights[layer][
|
||||
weight_idx] # [num_local_experts, hidden_size]
|
||||
weight_idx
|
||||
] # [num_local_experts, hidden_size]
|
||||
|
||||
# Split tensor along dim 0 into a list for all_gather
|
||||
gathered_weights_list = torch.chunk(gathered_weights,
|
||||
world_size,
|
||||
dim=0)
|
||||
gathered_weights_list = torch.chunk(gathered_weights, world_size, dim=0)
|
||||
|
||||
torch.distributed.all_gather(
|
||||
# Output list: each element corresponds to one rank's weights
|
||||
list(gathered_weights_list),
|
||||
local_weights # Input: current rank's local weights
|
||||
local_weights, # Input: current rank's local weights
|
||||
)
|
||||
|
||||
all_weights.append(gathered_weights)
|
||||
@@ -266,7 +271,8 @@ def verify_redundant_experts_have_same_weights(
|
||||
msg=f"Layer {layer}, weight {weight_idx},"
|
||||
f"logical expert {logical_expert_id}: "
|
||||
f"Physical expert {physical_pos} has different weights"
|
||||
f"than expected")
|
||||
f"than expected",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -290,10 +296,11 @@ def verify_redundant_experts_have_same_weights(
|
||||
# 4 GPU, 8 experts per GPU
|
||||
# 16 logical experts, 32 physical experts, 16 redundant experts
|
||||
(4, 8, 8, 16),
|
||||
])
|
||||
def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
|
||||
num_local_experts,
|
||||
num_logical_experts):
|
||||
],
|
||||
)
|
||||
def test_rearrange_expert_weights_with_redundancy(
|
||||
world_size, num_layers, num_local_experts, num_logical_experts
|
||||
):
|
||||
"""Test the functionality of rearranging expert weights with redundancy."""
|
||||
|
||||
if torch.cuda.device_count() < world_size:
|
||||
@@ -304,8 +311,8 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
|
||||
# Initialize model parallel (using tensor parallel as an entrypoint
|
||||
# to expert parallel)
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size,
|
||||
pipeline_model_parallel_size=1)
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
ep_group = get_tp_group().cpu_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
@@ -316,8 +323,9 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
|
||||
hidden_sizes = [32, 64] # Two different weight matrices
|
||||
|
||||
# Create old expert indices (with redundancy)
|
||||
redundancy_config = create_redundancy_config(num_logical_experts,
|
||||
total_physical_experts)
|
||||
redundancy_config = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
|
||||
old_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
@@ -328,7 +336,8 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
|
||||
|
||||
# Create new expert indices (with redundancy)
|
||||
new_redundancy_config = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts)
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
new_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
num_logical_experts,
|
||||
@@ -337,9 +346,9 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
|
||||
)
|
||||
|
||||
# Create expert weights
|
||||
expert_weights = create_expert_weights(num_layers, num_local_experts,
|
||||
hidden_sizes, ep_rank, device,
|
||||
old_indices)
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
|
||||
)
|
||||
|
||||
# Execute weight rearrangement
|
||||
rearrange_expert_weights_inplace(
|
||||
@@ -383,8 +392,8 @@ def test_rearrange_expert_weights_no_change(world_size):
|
||||
@worker_fn_wrapper
|
||||
def worker_fn():
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size,
|
||||
pipeline_model_parallel_size=1)
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
ep_group = get_tp_group().cpu_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
@@ -401,12 +410,12 @@ def test_rearrange_expert_weights_no_change(world_size):
|
||||
|
||||
# Same indices - no change
|
||||
indices = create_expert_indices_with_redundancy(
|
||||
num_layers, num_logical_experts, total_physical_experts,
|
||||
redundancy_config)
|
||||
num_layers, num_logical_experts, total_physical_experts, redundancy_config
|
||||
)
|
||||
|
||||
expert_weights = create_expert_weights(num_layers, num_local_experts,
|
||||
hidden_sizes, ep_rank, device,
|
||||
indices)
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
|
||||
)
|
||||
|
||||
# Save original weights
|
||||
original_weights = []
|
||||
@@ -422,7 +431,8 @@ def test_rearrange_expert_weights_no_change(world_size):
|
||||
indices, # Same indices
|
||||
expert_weights,
|
||||
ep_group,
|
||||
is_profile=False)
|
||||
is_profile=False,
|
||||
)
|
||||
|
||||
# Verify that the weights have not changed
|
||||
for layer in range(num_layers):
|
||||
@@ -430,8 +440,8 @@ def test_rearrange_expert_weights_no_change(world_size):
|
||||
torch.testing.assert_close(
|
||||
expert_weights[layer][weight_idx],
|
||||
original_weights[layer][weight_idx],
|
||||
msg=f"Layer {layer}, weight {weight_idx} should remain "
|
||||
f"unchanged")
|
||||
msg=f"Layer {layer}, weight {weight_idx} should remain unchanged",
|
||||
)
|
||||
|
||||
distributed_run(worker_fn, world_size)
|
||||
|
||||
@@ -446,8 +456,8 @@ def test_rearrange_expert_weights_profile_mode(world_size):
|
||||
@worker_fn_wrapper
|
||||
def worker_fn():
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size,
|
||||
pipeline_model_parallel_size=1)
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
ep_group = get_tp_group().cpu_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
@@ -460,21 +470,23 @@ def test_rearrange_expert_weights_profile_mode(world_size):
|
||||
hidden_sizes = [32]
|
||||
|
||||
# Create different index distributions
|
||||
old_redundancy = create_redundancy_config(num_logical_experts,
|
||||
total_physical_experts)
|
||||
new_redundancy = create_redundancy_config(num_logical_experts,
|
||||
total_physical_experts)
|
||||
old_redundancy = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
new_redundancy = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
|
||||
old_indices = create_expert_indices_with_redundancy(
|
||||
num_layers, num_logical_experts, total_physical_experts,
|
||||
old_redundancy)
|
||||
num_layers, num_logical_experts, total_physical_experts, old_redundancy
|
||||
)
|
||||
new_indices = create_expert_indices_with_redundancy(
|
||||
num_layers, num_logical_experts, total_physical_experts,
|
||||
new_redundancy)
|
||||
num_layers, num_logical_experts, total_physical_experts, new_redundancy
|
||||
)
|
||||
|
||||
expert_weights = create_expert_weights(num_layers, num_local_experts,
|
||||
hidden_sizes, ep_rank, device,
|
||||
old_indices)
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
|
||||
)
|
||||
|
||||
# Save original weights
|
||||
original_weights = []
|
||||
@@ -490,7 +502,7 @@ def test_rearrange_expert_weights_profile_mode(world_size):
|
||||
new_indices,
|
||||
expert_weights,
|
||||
ep_group,
|
||||
is_profile=True # Profile mode
|
||||
is_profile=True, # Profile mode
|
||||
)
|
||||
|
||||
# In profile mode, the weights should remain unchanged
|
||||
@@ -499,6 +511,7 @@ def test_rearrange_expert_weights_profile_mode(world_size):
|
||||
torch.testing.assert_close(
|
||||
expert_weights[layer][weight_idx],
|
||||
original_weights[layer][weight_idx],
|
||||
msg="In profile mode, the weights should remain unchanged")
|
||||
msg="In profile mode, the weights should remain unchanged",
|
||||
)
|
||||
|
||||
distributed_run(worker_fn, world_size)
|
||||
|
||||
@@ -6,24 +6,29 @@ import time
|
||||
import msgspec
|
||||
import pytest
|
||||
|
||||
from vllm.distributed.kv_events import (EventBatch, EventPublisherFactory,
|
||||
NullEventPublisher)
|
||||
from vllm.distributed.kv_events import (
|
||||
EventBatch,
|
||||
EventPublisherFactory,
|
||||
NullEventPublisher,
|
||||
)
|
||||
|
||||
DP_RANK = 0
|
||||
|
||||
|
||||
class EventSample(
|
||||
msgspec.Struct,
|
||||
tag=True, # type: ignore
|
||||
array_like=True # type: ignore
|
||||
msgspec.Struct,
|
||||
tag=True, # type: ignore
|
||||
array_like=True, # type: ignore
|
||||
):
|
||||
"""Test event for publisher testing"""
|
||||
|
||||
id: int
|
||||
value: str
|
||||
|
||||
|
||||
class SampleBatch(EventBatch):
|
||||
"""Test event batch for publisher testing"""
|
||||
|
||||
events: list[EventSample]
|
||||
|
||||
|
||||
@@ -44,10 +49,8 @@ def test_basic_publishing(publisher, subscriber):
|
||||
|
||||
seq, received = result
|
||||
assert seq == 0, "Sequence number mismatch"
|
||||
assert received.ts == pytest.approx(test_batch.ts,
|
||||
abs=0.1), ("Timestamp mismatch")
|
||||
assert len(received.events) == len(
|
||||
test_batch.events), ("Number of events mismatch")
|
||||
assert received.ts == pytest.approx(test_batch.ts, abs=0.1), "Timestamp mismatch"
|
||||
assert len(received.events) == len(test_batch.events), "Number of events mismatch"
|
||||
|
||||
for i, event in enumerate(received.events):
|
||||
assert event.id == i, "Event id mismatch"
|
||||
@@ -88,9 +91,9 @@ def test_replay_mechanism(publisher, subscriber):
|
||||
assert len(replayed) > 0, "No replayed messages received"
|
||||
seqs = [seq for seq, _ in replayed]
|
||||
assert all(seq >= 10 for seq in seqs), "Replayed messages not in order"
|
||||
assert seqs == list(range(min(seqs),
|
||||
max(seqs) +
|
||||
1)), ("Replayed messages not consecutive")
|
||||
assert seqs == list(range(min(seqs), max(seqs) + 1)), (
|
||||
"Replayed messages not consecutive"
|
||||
)
|
||||
|
||||
|
||||
def test_buffer_limit(publisher, subscriber, publisher_config):
|
||||
@@ -126,6 +129,7 @@ def test_topic_filtering(publisher_config):
|
||||
pub = EventPublisherFactory.create(publisher_config, DP_RANK)
|
||||
|
||||
from .conftest import MockSubscriber
|
||||
|
||||
sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo")
|
||||
sub_bar = MockSubscriber(publisher_config.endpoint, None, "bar")
|
||||
|
||||
@@ -137,11 +141,13 @@ def test_topic_filtering(publisher_config):
|
||||
|
||||
foo_received = [sub_foo.receive_one(timeout=200) for _ in range(3)]
|
||||
assert all(msg is not None for msg in foo_received), (
|
||||
"Subscriber with matching topic should receive messages")
|
||||
"Subscriber with matching topic should receive messages"
|
||||
)
|
||||
|
||||
bar_received = [sub_bar.receive_one(timeout=200) for _ in range(3)]
|
||||
assert all(msg is None for msg in bar_received), (
|
||||
"Subscriber with non-matching topic should receive no messages")
|
||||
"Subscriber with non-matching topic should receive no messages"
|
||||
)
|
||||
finally:
|
||||
pub.shutdown()
|
||||
sub_foo.close()
|
||||
@@ -178,8 +184,7 @@ def test_high_volume(publisher, subscriber):
|
||||
|
||||
publisher_thread.join()
|
||||
|
||||
assert len(received) >= num_batches * 0.9, (
|
||||
"We should have received most messages")
|
||||
assert len(received) >= num_batches * 0.9, "We should have received most messages"
|
||||
|
||||
seqs = [seq for seq, _ in received]
|
||||
assert sorted(seqs) == seqs, "Sequence numbers should be in order"
|
||||
@@ -209,13 +214,15 @@ def test_data_parallel_rank_tagging(publisher_config):
|
||||
# For TCP endpoints: tcp://localhost:5557 -> tcp://localhost:5557, tcp://localhost:5558
|
||||
expected_endpoint_0 = base_endpoint # rank 0 gets port + 0 = same port
|
||||
expected_endpoint_1 = base_endpoint.replace(
|
||||
":5557", ":5558") # rank 1 gets port + 1
|
||||
":5557", ":5558"
|
||||
) # rank 1 gets port + 1
|
||||
else:
|
||||
# For inproc endpoints: inproc://test -> inproc://test_dp0, inproc://test_dp1
|
||||
expected_endpoint_0 = base_endpoint # rank 0 gets base
|
||||
expected_endpoint_1 = base_endpoint + "_dp1" # rank 1 gets _dp1
|
||||
|
||||
from .conftest import MockSubscriber
|
||||
|
||||
sub_0 = MockSubscriber(expected_endpoint_0, None, publisher_config.topic)
|
||||
sub_1 = MockSubscriber(expected_endpoint_1, None, publisher_config.topic)
|
||||
|
||||
@@ -241,15 +248,15 @@ def test_data_parallel_rank_tagging(publisher_config):
|
||||
|
||||
# Verify DP rank tagging
|
||||
assert received_0.data_parallel_rank == 0, (
|
||||
f"Expected DP rank 0, got {received_0.data_parallel_rank}")
|
||||
f"Expected DP rank 0, got {received_0.data_parallel_rank}"
|
||||
)
|
||||
assert received_1.data_parallel_rank == 1, (
|
||||
f"Expected DP rank 1, got {received_1.data_parallel_rank}")
|
||||
f"Expected DP rank 1, got {received_1.data_parallel_rank}"
|
||||
)
|
||||
|
||||
# Verify event content is correct
|
||||
assert len(
|
||||
received_0.events) == 2, "Wrong number of events from rank 0"
|
||||
assert len(
|
||||
received_1.events) == 3, "Wrong number of events from rank 1"
|
||||
assert len(received_0.events) == 2, "Wrong number of events from rank 0"
|
||||
assert len(received_1.events) == 3, "Wrong number of events from rank 1"
|
||||
|
||||
finally:
|
||||
pub_0.shutdown()
|
||||
|
||||
@@ -46,28 +46,24 @@ class EPTestSettings:
|
||||
):
|
||||
return EPTestSettings(
|
||||
parallel_setups=[
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=False),
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=True),
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
ParallelSetup(tp_size=2 * tp_base,
|
||||
eager_mode=False,
|
||||
chunked_prefill=True),
|
||||
ParallelSetup(tp_size=2 * tp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=False),
|
||||
ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=True),
|
||||
ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False),
|
||||
ParallelSetup(
|
||||
tp_size=2 * tp_base, eager_mode=False, chunked_prefill=True
|
||||
),
|
||||
ParallelSetup(
|
||||
tp_size=2 * tp_base, eager_mode=True, chunked_prefill=False
|
||||
),
|
||||
],
|
||||
distributed_backends=["mp", "ray"],
|
||||
runner=runner,
|
||||
test_options=EPTestOptions(trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
hf_overrides=hf_overrides),
|
||||
test_options=EPTestOptions(
|
||||
trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
hf_overrides=hf_overrides,
|
||||
),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -82,16 +78,16 @@ class EPTestSettings:
|
||||
):
|
||||
return EPTestSettings(
|
||||
parallel_setups=[
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False),
|
||||
ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False),
|
||||
],
|
||||
distributed_backends=["mp"],
|
||||
runner=runner,
|
||||
test_options=EPTestOptions(trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
hf_overrides=hf_overrides),
|
||||
test_options=EPTestOptions(
|
||||
trust_remote_code=trust_remote_code,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
load_format=load_format,
|
||||
hf_overrides=hf_overrides,
|
||||
),
|
||||
)
|
||||
|
||||
def iter_params(self, model_name: str):
|
||||
@@ -99,8 +95,13 @@ class EPTestSettings:
|
||||
|
||||
for parallel_setup in self.parallel_setups:
|
||||
for distributed_backend in self.distributed_backends:
|
||||
yield (model_name, parallel_setup, distributed_backend,
|
||||
self.runner, opts)
|
||||
yield (
|
||||
model_name,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
self.runner,
|
||||
opts,
|
||||
)
|
||||
|
||||
|
||||
# NOTE: You can adjust tp_base locally to fit the model in GPU
|
||||
|
||||
@@ -6,8 +6,7 @@ import pytest
|
||||
from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
|
||||
|
||||
|
||||
def verify_round_robin_pattern(expert_map, ep_rank, ep_size,
|
||||
global_num_experts):
|
||||
def verify_round_robin_pattern(expert_map, ep_rank, ep_size, global_num_experts):
|
||||
"""Verify that the expert map follows the round_robin pattern."""
|
||||
# Calculate expected local experts (supporting non-divisible cases)
|
||||
base_experts = global_num_experts // ep_size
|
||||
@@ -30,24 +29,21 @@ def verify_round_robin_pattern(expert_map, ep_rank, ep_size,
|
||||
if global_expert_id in expected_expert_ids:
|
||||
local_expert_id = expert_map[global_expert_id]
|
||||
expected_local_id = expected_expert_ids.index(global_expert_id)
|
||||
assert (
|
||||
local_expert_id == expected_local_id
|
||||
), f"Global expert {global_expert_id} should map to local expert " \
|
||||
assert local_expert_id == expected_local_id, (
|
||||
f"Global expert {global_expert_id} should map to local expert "
|
||||
f"{expected_local_id}, got {local_expert_id}"
|
||||
)
|
||||
else:
|
||||
assert (
|
||||
expert_map[global_expert_id] == -1
|
||||
), f"Global expert {global_expert_id} should not be mapped to " \
|
||||
f"this rank"
|
||||
assert expert_map[global_expert_id] == -1, (
|
||||
f"Global expert {global_expert_id} should not be mapped to this rank"
|
||||
)
|
||||
|
||||
# Verify that all local expert IDs are consecutive starting from 0
|
||||
local_expert_ids = [
|
||||
expert_map[global_id] for global_id in expected_expert_ids
|
||||
]
|
||||
local_expert_ids = [expert_map[global_id] for global_id in expected_expert_ids]
|
||||
expected_local_ids = list(range(local_num_experts))
|
||||
assert (
|
||||
local_expert_ids == expected_local_ids
|
||||
), f"Expected local expert IDs {expected_local_ids}, got {local_expert_ids}"
|
||||
assert local_expert_ids == expected_local_ids, (
|
||||
f"Expected local expert IDs {expected_local_ids}, got {local_expert_ids}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("expert_placement_strategy", ["round_robin"])
|
||||
@@ -78,8 +74,9 @@ def test_expert_placement_various_sizes(expert_placement_strategy, world_size):
|
||||
|
||||
for test_global_experts, test_ep_size in test_cases:
|
||||
# Ensure ep_size matches world_size
|
||||
assert (test_ep_size == world_size
|
||||
), f"ep_size {test_ep_size} must equal world_size {world_size}"
|
||||
assert test_ep_size == world_size, (
|
||||
f"ep_size {test_ep_size} must equal world_size {world_size}"
|
||||
)
|
||||
|
||||
# Test each rank
|
||||
for ep_rank in range(world_size):
|
||||
@@ -98,21 +95,22 @@ def test_expert_placement_various_sizes(expert_placement_strategy, world_size):
|
||||
expert_placement_strategy=expert_placement_strategy,
|
||||
)
|
||||
|
||||
assert (
|
||||
test_local_experts == expected_test_local
|
||||
), f"For {test_global_experts} experts on {test_ep_size} ranks, " \
|
||||
f"rank {ep_rank}: expected {expected_test_local} local" \
|
||||
assert test_local_experts == expected_test_local, (
|
||||
f"For {test_global_experts} experts on {test_ep_size} ranks, "
|
||||
f"rank {ep_rank}: expected {expected_test_local} local"
|
||||
f"experts, got {test_local_experts}"
|
||||
)
|
||||
|
||||
if test_expert_map is not None:
|
||||
assert test_expert_map.shape == (
|
||||
test_global_experts,
|
||||
), f"Expected expert map shape ({test_global_experts},), " \
|
||||
assert test_expert_map.shape == (test_global_experts,), (
|
||||
f"Expected expert map shape ({test_global_experts},), "
|
||||
f"got {test_expert_map.shape}"
|
||||
)
|
||||
|
||||
# Verify round_robin pattern for this test case
|
||||
verify_round_robin_pattern(test_expert_map, ep_rank,
|
||||
test_ep_size, test_global_experts)
|
||||
verify_round_robin_pattern(
|
||||
test_expert_map, ep_rank, test_ep_size, test_global_experts
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("expert_placement_strategy", ["round_robin"])
|
||||
@@ -147,28 +145,81 @@ def test_determine_expert_map_comprehensive():
|
||||
# expert_placement_strategy, expected_local, expected_map_pattern)
|
||||
test_cases = [
|
||||
# Round robin placement tests
|
||||
(2, 0, 8, "round_robin", 4, [0, -1, 1, -1, 2, -1, 3,
|
||||
-1]), # rank 0 gets even experts
|
||||
(2, 1, 8, "round_robin", 4, [-1, 0, -1, 1, -1, 2, -1,
|
||||
3]), # rank 1 gets odd experts
|
||||
(2, 0, 9, "round_robin", 5, [0, -1, 1, -1, 2, -1, 3, -1, 4
|
||||
]), # rank 0 gets 5 experts (even + last)
|
||||
(2, 1, 9, "round_robin", 4, [-1, 0, -1, 1, -1, 2, -1, 3,
|
||||
-1]), # rank 1 gets 4 experts (odd)
|
||||
|
||||
(
|
||||
2,
|
||||
0,
|
||||
8,
|
||||
"round_robin",
|
||||
4,
|
||||
[0, -1, 1, -1, 2, -1, 3, -1],
|
||||
), # rank 0 gets even experts
|
||||
(
|
||||
2,
|
||||
1,
|
||||
8,
|
||||
"round_robin",
|
||||
4,
|
||||
[-1, 0, -1, 1, -1, 2, -1, 3],
|
||||
), # rank 1 gets odd experts
|
||||
(
|
||||
2,
|
||||
0,
|
||||
9,
|
||||
"round_robin",
|
||||
5,
|
||||
[0, -1, 1, -1, 2, -1, 3, -1, 4],
|
||||
), # rank 0 gets 5 experts (even + last)
|
||||
(
|
||||
2,
|
||||
1,
|
||||
9,
|
||||
"round_robin",
|
||||
4,
|
||||
[-1, 0, -1, 1, -1, 2, -1, 3, -1],
|
||||
), # rank 1 gets 4 experts (odd)
|
||||
# 4-rank tests
|
||||
(4, 0, 8, "round_robin", 2, [0, -1, -1, -1, 1, -1, -1,
|
||||
-1]), # rank 0 gets experts 0, 4
|
||||
(4, 1, 8, "round_robin", 2, [-1, 0, -1, -1, -1, 1, -1,
|
||||
-1]), # rank 1 gets experts 1, 5
|
||||
(4, 2, 8, "round_robin", 2, [-1, -1, 0, -1, -1, -1, 1,
|
||||
-1]), # rank 2 gets experts 2, 6
|
||||
(4, 3, 8, "round_robin", 2, [-1, -1, -1, 0, -1, -1, -1,
|
||||
1]), # rank 3 gets experts 3, 7
|
||||
(
|
||||
4,
|
||||
0,
|
||||
8,
|
||||
"round_robin",
|
||||
2,
|
||||
[0, -1, -1, -1, 1, -1, -1, -1],
|
||||
), # rank 0 gets experts 0, 4
|
||||
(
|
||||
4,
|
||||
1,
|
||||
8,
|
||||
"round_robin",
|
||||
2,
|
||||
[-1, 0, -1, -1, -1, 1, -1, -1],
|
||||
), # rank 1 gets experts 1, 5
|
||||
(
|
||||
4,
|
||||
2,
|
||||
8,
|
||||
"round_robin",
|
||||
2,
|
||||
[-1, -1, 0, -1, -1, -1, 1, -1],
|
||||
), # rank 2 gets experts 2, 6
|
||||
(
|
||||
4,
|
||||
3,
|
||||
8,
|
||||
"round_robin",
|
||||
2,
|
||||
[-1, -1, -1, 0, -1, -1, -1, 1],
|
||||
), # rank 3 gets experts 3, 7
|
||||
]
|
||||
|
||||
for ep_size, ep_rank, global_num_experts, expert_placement_strategy, \
|
||||
expected_local, expected_map_pattern in test_cases:
|
||||
for (
|
||||
ep_size,
|
||||
ep_rank,
|
||||
global_num_experts,
|
||||
expert_placement_strategy,
|
||||
expected_local,
|
||||
expected_map_pattern,
|
||||
) in test_cases:
|
||||
local_num_experts, expert_map = determine_expert_map(
|
||||
ep_size=ep_size,
|
||||
ep_rank=ep_rank,
|
||||
@@ -176,19 +227,21 @@ def test_determine_expert_map_comprehensive():
|
||||
expert_placement_strategy=expert_placement_strategy,
|
||||
)
|
||||
|
||||
assert local_num_experts == expected_local, \
|
||||
f"ep_size={ep_size}, ep_rank={ep_rank}, " \
|
||||
f"global_num_experts={global_num_experts}, " \
|
||||
f"expert_placement_strategy={expert_placement_strategy}: " \
|
||||
assert local_num_experts == expected_local, (
|
||||
f"ep_size={ep_size}, ep_rank={ep_rank}, "
|
||||
f"global_num_experts={global_num_experts}, "
|
||||
f"expert_placement_strategy={expert_placement_strategy}: "
|
||||
f"expected {expected_local} local experts, got {local_num_experts}"
|
||||
)
|
||||
|
||||
if expected_map_pattern is None:
|
||||
assert expert_map is None, "Expected expert_map to be None"
|
||||
else:
|
||||
assert expert_map is not None, "Expected expert_map to not be None"
|
||||
actual_map = expert_map.tolist()
|
||||
assert actual_map == expected_map_pattern, \
|
||||
f"ep_size={ep_size}, ep_rank={ep_rank}, " \
|
||||
f"global_num_experts={global_num_experts}, " \
|
||||
f"expert_placement_strategy={expert_placement_strategy}: " \
|
||||
assert actual_map == expected_map_pattern, (
|
||||
f"ep_size={ep_size}, ep_rank={ep_rank}, "
|
||||
f"global_num_experts={global_num_experts}, "
|
||||
f"expert_placement_strategy={expert_placement_strategy}: "
|
||||
f"expected map {expected_map_pattern}, got {actual_map}"
|
||||
)
|
||||
|
||||
@@ -1,10 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.config import (DeviceConfig, KVTransferConfig, ModelConfig,
|
||||
VllmConfig, set_current_vllm_config)
|
||||
from vllm.config import (
|
||||
DeviceConfig,
|
||||
KVTransferConfig,
|
||||
ModelConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.distributed.kv_transfer.kv_connector.utils import (
|
||||
get_kv_connector_cache_layout)
|
||||
get_kv_connector_cache_layout,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger("test_expert_parallel")
|
||||
@@ -23,8 +29,9 @@ def test_get_kv_connector_cache_layout_with_lmcache_connector():
|
||||
kv_connector="LMCacheConnectorV1",
|
||||
kv_role="kv_both",
|
||||
)
|
||||
vllm_config = VllmConfig(device_config=DeviceConfig("cpu"),
|
||||
kv_transfer_config=kv_transfer_config)
|
||||
vllm_config = VllmConfig(
|
||||
device_config=DeviceConfig("cpu"), kv_transfer_config=kv_transfer_config
|
||||
)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
# Test with default settings
|
||||
layout = get_kv_connector_cache_layout()
|
||||
@@ -37,9 +44,11 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
|
||||
kv_role="kv_both",
|
||||
)
|
||||
model_config = ModelConfig()
|
||||
vllm_config = VllmConfig(device_config=DeviceConfig("cpu"),
|
||||
model_config=model_config,
|
||||
kv_transfer_config=kv_transfer_config)
|
||||
vllm_config = VllmConfig(
|
||||
device_config=DeviceConfig("cpu"),
|
||||
model_config=model_config,
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
# Test with default settings
|
||||
layout = get_kv_connector_cache_layout()
|
||||
@@ -47,25 +56,22 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
|
||||
|
||||
|
||||
def test_get_kv_connector_cache_layout_with_multi_connector():
|
||||
kv_transfer_config = KVTransferConfig(kv_connector="MultiConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={
|
||||
"connectors": [{
|
||||
"kv_connector":
|
||||
"SharedStorageConnector",
|
||||
"kv_role":
|
||||
"kv_both"
|
||||
}, {
|
||||
"kv_connector":
|
||||
"NixlConnector",
|
||||
"kv_role":
|
||||
"kv_both"
|
||||
}]
|
||||
})
|
||||
kv_transfer_config = KVTransferConfig(
|
||||
kv_connector="MultiConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={
|
||||
"connectors": [
|
||||
{"kv_connector": "SharedStorageConnector", "kv_role": "kv_both"},
|
||||
{"kv_connector": "NixlConnector", "kv_role": "kv_both"},
|
||||
]
|
||||
},
|
||||
)
|
||||
model_config = ModelConfig()
|
||||
vllm_config = VllmConfig(device_config=DeviceConfig("cpu"),
|
||||
model_config=model_config,
|
||||
kv_transfer_config=kv_transfer_config)
|
||||
vllm_config = VllmConfig(
|
||||
device_config=DeviceConfig("cpu"),
|
||||
model_config=model_config,
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
# Test with default settings
|
||||
layout = get_kv_connector_cache_layout()
|
||||
|
||||
@@ -24,14 +24,13 @@ from vllm.utils import get_ip
|
||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||
|
||||
|
||||
@pytest.mark.skipif(not VLLM_MULTI_NODE,
|
||||
reason="Need at least 2 nodes to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
not VLLM_MULTI_NODE, reason="Need at least 2 nodes to run the test."
|
||||
)
|
||||
def test_multi_node_assignment() -> None:
|
||||
|
||||
# NOTE: important to keep this class definition here
|
||||
# to let ray use cloudpickle to serialize it.
|
||||
class Actor:
|
||||
|
||||
def get_ip(self):
|
||||
return get_ip()
|
||||
|
||||
@@ -41,8 +40,7 @@ def test_multi_node_assignment() -> None:
|
||||
|
||||
current_ip = get_ip()
|
||||
workers = []
|
||||
for bundle_id, bundle in enumerate(
|
||||
config.placement_group.bundle_specs):
|
||||
for bundle_id, bundle in enumerate(config.placement_group.bundle_specs):
|
||||
if not bundle.get("GPU", 0):
|
||||
continue
|
||||
scheduling_strategy = PlacementGroupSchedulingStrategy(
|
||||
|
||||
@@ -11,15 +11,17 @@ import torch.multiprocessing as mp
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.distributed.device_communicators.cuda_communicator import (
|
||||
CudaCommunicator)
|
||||
from vllm.distributed.device_communicators.pynccl import (
|
||||
register_nccl_symmetric_ops)
|
||||
from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
|
||||
from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric_ops
|
||||
from vllm.distributed.device_communicators.pynccl_allocator import (
|
||||
get_nccl_mem_pool, is_symmetric_memory_enabled)
|
||||
from vllm.distributed.parallel_state import (get_tp_group,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
get_nccl_mem_pool,
|
||||
is_symmetric_memory_enabled,
|
||||
)
|
||||
from vllm.distributed.parallel_state import (
|
||||
get_tp_group,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import update_environment_variables
|
||||
|
||||
@@ -38,31 +40,32 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
|
||||
torch.cuda.set_device(device)
|
||||
torch.set_default_device(device)
|
||||
torch.set_default_dtype(dtype)
|
||||
update_environment_variables({
|
||||
"RANK": str(local_rank),
|
||||
"LOCAL_RANK": str(local_rank),
|
||||
"WORLD_SIZE": str(world_size),
|
||||
"MASTER_ADDR": "localhost",
|
||||
"MASTER_PORT": "12345",
|
||||
})
|
||||
update_environment_variables(
|
||||
{
|
||||
"RANK": str(local_rank),
|
||||
"LOCAL_RANK": str(local_rank),
|
||||
"WORLD_SIZE": str(world_size),
|
||||
"MASTER_ADDR": "localhost",
|
||||
"MASTER_PORT": "12345",
|
||||
}
|
||||
)
|
||||
|
||||
init_distributed_environment()
|
||||
initialize_model_parallel(tensor_model_parallel_size=world_size)
|
||||
|
||||
cuda_communicator = typing.cast(CudaCommunicator,
|
||||
get_tp_group().device_communicator)
|
||||
cuda_communicator = typing.cast(
|
||||
CudaCommunicator, get_tp_group().device_communicator
|
||||
)
|
||||
pynccl_comm = cuda_communicator.pynccl_comm
|
||||
if get_nccl_mem_pool() is None:
|
||||
pytest.skip("NCCL allocator compilation failed "
|
||||
"(probably missing NCCL headers).")
|
||||
pytest.skip(
|
||||
"NCCL allocator compilation failed (probably missing NCCL headers)."
|
||||
)
|
||||
if not is_symmetric_memory_enabled():
|
||||
pytest.skip("NCCL symmetric memory allreduce is disabled.")
|
||||
|
||||
register_nccl_symmetric_ops(pynccl_comm)
|
||||
input = torch.randint(1,
|
||||
23, (test_size_elements, ),
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
input = torch.randint(1, 23, (test_size_elements,), dtype=dtype, device=device)
|
||||
input_clone = input.clone()
|
||||
output = torch.ops.vllm.all_reduce_symmetric_with_copy(input)
|
||||
assert output is not None
|
||||
@@ -77,8 +80,7 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
|
||||
reason="NCCLSymmMemAllreduce is only available for CUDA platforms.",
|
||||
)
|
||||
@pytest.mark.parametrize("world_size", [2])
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
|
||||
reason="Only test on CUDA")
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
|
||||
def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size):
|
||||
if world_size > torch.cuda.device_count():
|
||||
pytest.skip("Not enough GPUs to run the test.")
|
||||
@@ -88,7 +90,5 @@ def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size):
|
||||
monkeypatch.setenv("NCCL_NVLS_ENABLE", "1")
|
||||
monkeypatch.setenv("NCCL_CUMEM_ENABLE", "1")
|
||||
|
||||
mp.spawn(nccl_symm_mem_allreduce_worker,
|
||||
args=(world_size, ),
|
||||
nprocs=world_size)
|
||||
mp.spawn(nccl_symm_mem_allreduce_worker, args=(world_size,), nprocs=world_size)
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
@@ -32,12 +32,15 @@ if __name__ == "__main__":
|
||||
# Expected node count based on environment variable)
|
||||
expected = int(os.environ.get("NUM_NODES", "1"))
|
||||
|
||||
assert test_result == expected, \
|
||||
f"Expected {expected} nodes, got {test_result}"
|
||||
assert test_result == expected, f"Expected {expected} nodes, got {test_result}"
|
||||
|
||||
if pg == dist.group.WORLD:
|
||||
print(f"Node count test passed! Got {test_result} nodes "
|
||||
f"when using torch distributed!")
|
||||
print(
|
||||
f"Node count test passed! Got {test_result} nodes "
|
||||
f"when using torch distributed!"
|
||||
)
|
||||
else:
|
||||
print(f"Node count test passed! Got {test_result} nodes "
|
||||
f"when using StatelessProcessGroup!")
|
||||
print(
|
||||
f"Node count test passed! Got {test_result} nodes "
|
||||
f"when using StatelessProcessGroup!"
|
||||
)
|
||||
|
||||
@@ -7,6 +7,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
all workers in a node other than the head node, which can cause the test
|
||||
to fail.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
@@ -55,26 +56,17 @@ class PPTestSettings:
|
||||
):
|
||||
return PPTestSettings(
|
||||
parallel_setups=[
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=False),
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=2 * pp_base,
|
||||
eager_mode=False),
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=2 * pp_base,
|
||||
eager_mode=True),
|
||||
ParallelSetup(tp_size=2 * tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=False),
|
||||
ParallelSetup(tp_size=2 * tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=True),
|
||||
ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=False),
|
||||
ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=False),
|
||||
ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=True),
|
||||
ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=False),
|
||||
ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=True),
|
||||
],
|
||||
distributed_backends=["mp", "ray"],
|
||||
runner=runner,
|
||||
test_options=PPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
test_options=PPTestOptions(
|
||||
multi_node_only=multi_node_only, load_format=load_format
|
||||
),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -86,17 +78,15 @@ class PPTestSettings:
|
||||
multi_node_only: bool = False,
|
||||
load_format: Optional[str] = None,
|
||||
):
|
||||
|
||||
return PPTestSettings(
|
||||
parallel_setups=[
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=pp_base,
|
||||
eager_mode=True),
|
||||
ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=True),
|
||||
],
|
||||
distributed_backends=["mp"],
|
||||
runner=runner,
|
||||
test_options=PPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
test_options=PPTestOptions(
|
||||
multi_node_only=multi_node_only, load_format=load_format
|
||||
),
|
||||
)
|
||||
|
||||
def iter_params(self, model_id: str):
|
||||
@@ -281,8 +271,10 @@ def _compare_tp(
|
||||
if num_gpus_available < tp_size * pp_size:
|
||||
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
|
||||
if VLLM_MULTI_NODE and distributed_backend == "mp":
|
||||
pytest.skip("Skipping multi-node pipeline parallel test for "
|
||||
"multiprocessing distributed backend")
|
||||
pytest.skip(
|
||||
"Skipping multi-node pipeline parallel test for "
|
||||
"multiprocessing distributed backend"
|
||||
)
|
||||
if multi_node_only and not VLLM_MULTI_NODE:
|
||||
pytest.skip("Not in multi-node setting")
|
||||
|
||||
@@ -357,20 +349,16 @@ def _compare_tp(
|
||||
"mp",
|
||||
]
|
||||
|
||||
compare_two_settings(model_id,
|
||||
pp_args,
|
||||
tp_args,
|
||||
pp_env,
|
||||
tp_env,
|
||||
method=method)
|
||||
compare_two_settings(model_id, pp_args, tp_args, pp_env, tp_env, method=method)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "runner",
|
||||
"test_options"),
|
||||
("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
|
||||
[
|
||||
params for model_id, settings in TEXT_GENERATION_MODELS.items()
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
params
|
||||
for model_id, settings in TEXT_GENERATION_MODELS.items()
|
||||
for params in settings.iter_params(model_id)
|
||||
if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
@@ -382,22 +370,25 @@ def test_tp_language_generation(
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=False)
|
||||
_compare_tp(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=False,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "runner",
|
||||
"test_options"),
|
||||
("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
|
||||
[
|
||||
params for model_id, settings in EMBEDDING_MODELS.items()
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
params
|
||||
for model_id, settings in EMBEDDING_MODELS.items()
|
||||
for params in settings.iter_params(model_id)
|
||||
if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
@@ -409,22 +400,25 @@ def test_tp_language_embedding(
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="encode",
|
||||
is_multimodal=False)
|
||||
_compare_tp(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="encode",
|
||||
is_multimodal=False,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "runner",
|
||||
"test_options"),
|
||||
("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
|
||||
[
|
||||
params for model_id, settings in MULTIMODAL_MODELS.items()
|
||||
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
|
||||
params
|
||||
for model_id, settings in MULTIMODAL_MODELS.items()
|
||||
for params in settings.iter_params(model_id)
|
||||
if model_id in TEST_MODELS
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
@@ -436,11 +430,13 @@ def test_tp_multimodal_generation(
|
||||
test_options: PPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_tp(model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=True)
|
||||
_compare_tp(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=True,
|
||||
)
|
||||
|
||||
@@ -9,7 +9,6 @@ from vllm.distributed.utils import get_pp_indices
|
||||
|
||||
|
||||
def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
|
||||
def _verify(partition_str, num_layers, pp_size, goldens):
|
||||
@@ -57,7 +56,8 @@ def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
|
||||
(5, 3, 0, (0, 2)),
|
||||
(5, 3, 1, (2, 4)),
|
||||
(5, 3, 2, (4, 5)),
|
||||
])
|
||||
],
|
||||
)
|
||||
def test_uneven_auto_partition(
|
||||
num_hidden_layers: int,
|
||||
pp_size: int,
|
||||
|
||||
@@ -12,12 +12,18 @@ if TYPE_CHECKING:
|
||||
from typing_extensions import LiteralString
|
||||
|
||||
|
||||
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
|
||||
(2, "JackFram/llama-160m"),
|
||||
])
|
||||
@pytest.mark.parametrize("ATTN_BACKEND", [
|
||||
"FLASH_ATTN",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"PP_SIZE, MODEL_NAME",
|
||||
[
|
||||
(2, "JackFram/llama-160m"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"ATTN_BACKEND",
|
||||
[
|
||||
"FLASH_ATTN",
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_pp_cudagraph(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
|
||||
@@ -9,13 +9,15 @@ import pytest
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
from vllm.distributed.communication_op import ( # noqa
|
||||
tensor_model_parallel_all_reduce)
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
|
||||
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
|
||||
get_world_group, graph_capture,
|
||||
init_distributed_environment)
|
||||
from vllm.distributed.parallel_state import (
|
||||
ensure_model_parallel_initialized,
|
||||
get_world_group,
|
||||
graph_capture,
|
||||
init_distributed_environment,
|
||||
)
|
||||
from vllm.utils import update_environment_variables
|
||||
|
||||
|
||||
@@ -24,13 +26,13 @@ def distributed_run(fn, world_size):
|
||||
processes: list[multiprocessing.Process] = []
|
||||
for i in range(number_of_processes):
|
||||
env: dict[str, str] = {}
|
||||
env['RANK'] = str(i)
|
||||
env['LOCAL_RANK'] = str(i)
|
||||
env['WORLD_SIZE'] = str(number_of_processes)
|
||||
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
|
||||
env['MASTER_ADDR'] = 'localhost'
|
||||
env['MASTER_PORT'] = '12345'
|
||||
p = multiprocessing.Process(target=fn, args=(env, ))
|
||||
env["RANK"] = str(i)
|
||||
env["LOCAL_RANK"] = str(i)
|
||||
env["WORLD_SIZE"] = str(number_of_processes)
|
||||
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
|
||||
env["MASTER_ADDR"] = "localhost"
|
||||
env["MASTER_PORT"] = "12345"
|
||||
p = multiprocessing.Process(target=fn, args=(env,))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
@@ -47,7 +49,7 @@ def worker_fn_wrapper(fn):
|
||||
# and update the environment variables in the function
|
||||
def wrapped_fn(env):
|
||||
update_environment_variables(env)
|
||||
local_rank = os.environ['LOCAL_RANK']
|
||||
local_rank = os.environ["LOCAL_RANK"]
|
||||
device = torch.device(f"cuda:{local_rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_distributed_environment()
|
||||
@@ -58,17 +60,18 @@ def worker_fn_wrapper(fn):
|
||||
|
||||
@worker_fn_wrapper
|
||||
def worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
tensor = torch.ones(16, 1024, 1024,
|
||||
dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
tensor = pynccl_comm.all_reduce(tensor)
|
||||
torch.cuda.synchronize()
|
||||
assert torch.all(tensor == pynccl_comm.world_size).cpu().item()
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl():
|
||||
distributed_run(worker_fn, 2)
|
||||
|
||||
@@ -78,7 +81,7 @@ def multiple_allreduce_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
groups = [
|
||||
torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
|
||||
torch.distributed.new_group(ranks=[2, 3], backend="gloo")
|
||||
torch.distributed.new_group(ranks=[2, 3], backend="gloo"),
|
||||
]
|
||||
group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
|
||||
pynccl_comm = PyNcclCommunicator(group=group, device=device)
|
||||
@@ -95,8 +98,9 @@ def multiple_allreduce_worker_fn():
|
||||
assert torch.all(tensor == 2).cpu().item()
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_multiple_allreduce():
|
||||
# this tests pynccl for multiple tp groups, in a standalone way
|
||||
# i.e. call `pynccl_comm.all_reduce` directly
|
||||
@@ -121,8 +125,9 @@ def multiple_allreduce_with_vllm_worker_fn():
|
||||
assert torch.all(tensor == 2).cpu().item()
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_multiple_allreduce_with_vllm():
|
||||
# this tests pynccl for multiple tp groups, together with vllm
|
||||
# i.e. call `tensor_model_parallel_all_reduce`
|
||||
@@ -133,10 +138,11 @@ def test_pynccl_multiple_allreduce_with_vllm():
|
||||
def worker_fn_with_cudagraph():
|
||||
with torch.no_grad():
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
# run something in the default stream to initialize torch engine
|
||||
a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
|
||||
a = torch.ones((4, 4), device=f"cuda:{pynccl_comm.rank}")
|
||||
torch.cuda.synchronize()
|
||||
with torch.cuda.graph(graph):
|
||||
a_out = pynccl_comm.all_reduce(a)
|
||||
@@ -148,84 +154,90 @@ def worker_fn_with_cudagraph():
|
||||
|
||||
@worker_fn_wrapper
|
||||
def all_gather_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
|
||||
rank = pynccl_comm.rank
|
||||
world_size = pynccl_comm.world_size
|
||||
device = f'cuda:{pynccl_comm.rank}'
|
||||
device = f"cuda:{pynccl_comm.rank}"
|
||||
|
||||
num_elems = 1000
|
||||
tensor = torch.arange(num_elems, dtype=torch.float32,
|
||||
device=device) + rank * num_elems
|
||||
result = torch.zeros(num_elems * world_size,
|
||||
dtype=torch.float32,
|
||||
device=device)
|
||||
tensor = (
|
||||
torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems
|
||||
)
|
||||
result = torch.zeros(num_elems * world_size, dtype=torch.float32, device=device)
|
||||
|
||||
expected = torch.cat([
|
||||
torch.arange(num_elems, dtype=torch.float32) + r * num_elems
|
||||
for r in range(world_size)
|
||||
]).to(device)
|
||||
expected = torch.cat(
|
||||
[
|
||||
torch.arange(num_elems, dtype=torch.float32) + r * num_elems
|
||||
for r in range(world_size)
|
||||
]
|
||||
).to(device)
|
||||
|
||||
pynccl_comm.all_gather(result, tensor)
|
||||
torch.cuda.synchronize()
|
||||
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_all_gather():
|
||||
distributed_run(all_gather_worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def all_gatherv_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
|
||||
rank = pynccl_comm.rank
|
||||
world_size = pynccl_comm.world_size
|
||||
device = f'cuda:{pynccl_comm.rank}'
|
||||
device = f"cuda:{pynccl_comm.rank}"
|
||||
|
||||
assert world_size <= 8
|
||||
sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
|
||||
num_elems = sizes[rank]
|
||||
tensor = torch.arange(num_elems, dtype=torch.float32,
|
||||
device=device) + rank * 100
|
||||
tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100
|
||||
result = torch.zeros(sum(sizes), dtype=torch.float32, device=device)
|
||||
|
||||
expected = torch.cat([
|
||||
torch.arange(sizes[r], dtype=torch.float32) + r * 100
|
||||
for r in range(world_size)
|
||||
]).to(device)
|
||||
expected = torch.cat(
|
||||
[
|
||||
torch.arange(sizes[r], dtype=torch.float32) + r * 100
|
||||
for r in range(world_size)
|
||||
]
|
||||
).to(device)
|
||||
|
||||
pynccl_comm.all_gatherv(result, tensor, sizes=sizes)
|
||||
torch.cuda.synchronize()
|
||||
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_all_gatherv():
|
||||
distributed_run(all_gatherv_worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def reduce_scatter_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
|
||||
rank = pynccl_comm.rank
|
||||
world_size = pynccl_comm.world_size
|
||||
device = f'cuda:{pynccl_comm.rank}'
|
||||
device = f"cuda:{pynccl_comm.rank}"
|
||||
|
||||
num_elems = 1000
|
||||
tensor = torch.arange(num_elems, dtype=torch.float32,
|
||||
device=device) + rank * num_elems
|
||||
assert (num_elems % world_size == 0)
|
||||
result = torch.zeros(num_elems // world_size,
|
||||
dtype=torch.float32,
|
||||
device=device)
|
||||
tensor = (
|
||||
torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems
|
||||
)
|
||||
assert num_elems % world_size == 0
|
||||
result = torch.zeros(num_elems // world_size, dtype=torch.float32, device=device)
|
||||
|
||||
# Calculate expected result for this rank's chunk
|
||||
scattered_size = num_elems // world_size
|
||||
@@ -233,34 +245,37 @@ def reduce_scatter_worker_fn():
|
||||
torch.arange(num_elems, dtype=torch.float32) + r * num_elems
|
||||
for r in range(world_size)
|
||||
]
|
||||
expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size]
|
||||
for tensor in all_tensors).to(device)
|
||||
expected = sum(
|
||||
tensor[rank * scattered_size : (rank + 1) * scattered_size]
|
||||
for tensor in all_tensors
|
||||
).to(device)
|
||||
|
||||
pynccl_comm.reduce_scatter(result, tensor)
|
||||
torch.cuda.synchronize()
|
||||
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_reduce_scatter():
|
||||
distributed_run(reduce_scatter_worker_fn, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def reduce_scatterv_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
|
||||
rank = pynccl_comm.rank
|
||||
world_size = pynccl_comm.world_size
|
||||
device = f'cuda:{pynccl_comm.rank}'
|
||||
device = f"cuda:{pynccl_comm.rank}"
|
||||
|
||||
assert world_size <= 8
|
||||
sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
|
||||
num_elems = sum(sizes)
|
||||
tensor = torch.arange(num_elems, dtype=torch.float32,
|
||||
device=device) + rank * 100
|
||||
tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100
|
||||
result = torch.zeros(sizes[rank], dtype=torch.float32, device=device)
|
||||
|
||||
# Calculate expected result for this rank's chunk
|
||||
@@ -278,41 +293,41 @@ def reduce_scatterv_worker_fn():
|
||||
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_reduce_scatterv():
|
||||
distributed_run(reduce_scatterv_worker_fn, 2)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_with_cudagraph():
|
||||
distributed_run(worker_fn_with_cudagraph, 2)
|
||||
|
||||
|
||||
@worker_fn_wrapper
|
||||
def send_recv_worker_fn():
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
if pynccl_comm.rank == 0:
|
||||
tensor = torch.ones(16, 1024, 1024,
|
||||
dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
else:
|
||||
tensor = torch.empty(16, 1024, 1024,
|
||||
dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
tensor = torch.empty(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
|
||||
|
||||
if pynccl_comm.rank == 0:
|
||||
pynccl_comm.send(tensor,
|
||||
dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
|
||||
pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
|
||||
else:
|
||||
pynccl_comm.recv(tensor,
|
||||
src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
|
||||
pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
|
||||
torch.cuda.synchronize()
|
||||
assert torch.all(tensor == 1).cpu().item()
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_send_recv():
|
||||
distributed_run(send_recv_worker_fn, 2)
|
||||
|
||||
@@ -322,27 +337,20 @@ def multiple_send_recv_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
groups = [
|
||||
torch.distributed.new_group(ranks=[0, 2], backend="gloo"),
|
||||
torch.distributed.new_group(ranks=[1, 3], backend="gloo")
|
||||
torch.distributed.new_group(ranks=[1, 3], backend="gloo"),
|
||||
]
|
||||
group = groups[0] if torch.distributed.get_rank() in [0, 2] else groups[1]
|
||||
pynccl_comm = PyNcclCommunicator(group=group, device=device)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
elif torch.distributed.get_rank() == 1:
|
||||
tensor = 2 * torch.ones(
|
||||
16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
tensor = 2 * torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
else:
|
||||
tensor = torch.empty(16,
|
||||
1024,
|
||||
1024,
|
||||
dtype=torch.float32,
|
||||
device=device)
|
||||
tensor = torch.empty(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
if torch.distributed.get_rank() in [0, 1]:
|
||||
pynccl_comm.send(tensor,
|
||||
dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
|
||||
pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
|
||||
else:
|
||||
pynccl_comm.recv(tensor,
|
||||
src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
|
||||
pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
|
||||
torch.cuda.synchronize()
|
||||
if torch.distributed.get_rank() in [0, 2]:
|
||||
assert torch.all(tensor == 1).cpu().item()
|
||||
@@ -350,14 +358,16 @@ def multiple_send_recv_worker_fn():
|
||||
assert torch.all(tensor == 2).cpu().item()
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_multiple_send_recv():
|
||||
distributed_run(multiple_send_recv_worker_fn, 4)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 4,
|
||||
reason="Need at least 4 GPUs to run the test.")
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
|
||||
)
|
||||
def test_pynccl_broadcast():
|
||||
distributed_run(broadcast_worker_fn, 4)
|
||||
|
||||
@@ -366,19 +376,17 @@ def test_pynccl_broadcast():
|
||||
def broadcast_worker_fn():
|
||||
# Test broadcast for every root rank.
|
||||
# Essentially this is an all-gather operation.
|
||||
pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
|
||||
device=get_world_group().device)
|
||||
pynccl_comm = PyNcclCommunicator(
|
||||
get_world_group().cpu_group, device=get_world_group().device
|
||||
)
|
||||
recv_tensors = [
|
||||
torch.empty(16,
|
||||
1024,
|
||||
1024,
|
||||
dtype=torch.float32,
|
||||
device=pynccl_comm.device)
|
||||
torch.empty(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device)
|
||||
for i in range(pynccl_comm.world_size)
|
||||
]
|
||||
recv_tensors[pynccl_comm.rank] = torch.ones(
|
||||
16, 1024, 1024, dtype=torch.float32,
|
||||
device=pynccl_comm.device) * pynccl_comm.rank
|
||||
recv_tensors[pynccl_comm.rank] = (
|
||||
torch.ones(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device)
|
||||
* pynccl_comm.rank
|
||||
)
|
||||
|
||||
for i in range(pynccl_comm.world_size):
|
||||
pynccl_comm.broadcast(recv_tensors[i], src=i)
|
||||
|
||||
@@ -8,20 +8,20 @@ import ray
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.communication_op import ( # noqa
|
||||
tensor_model_parallel_all_reduce)
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa
|
||||
from vllm.distributed.parallel_state import get_tp_group, graph_capture
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import (ensure_model_parallel_initialized,
|
||||
init_test_distributed_environment, multi_process_parallel)
|
||||
from ..utils import (
|
||||
ensure_model_parallel_initialized,
|
||||
init_test_distributed_environment,
|
||||
multi_process_parallel,
|
||||
)
|
||||
|
||||
torch.manual_seed(42)
|
||||
random.seed(44)
|
||||
# Size over 8MB is sufficient for custom quick allreduce.
|
||||
test_sizes = [
|
||||
random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)
|
||||
]
|
||||
test_sizes = [random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)]
|
||||
for i, v in enumerate(test_sizes):
|
||||
test_sizes[i] -= v % 8
|
||||
|
||||
@@ -38,8 +38,7 @@ def graph_quickreduce(
|
||||
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
ensure_model_parallel_initialized(tp_size, pp_size)
|
||||
group = get_tp_group().device_group
|
||||
|
||||
@@ -64,18 +63,15 @@ def graph_quickreduce(
|
||||
for sz in test_sizes:
|
||||
for dtype in [torch.float16, torch.bfloat16]:
|
||||
with graph_capture(device=device) as graph_capture_context:
|
||||
inp1 = torch.randint(1,
|
||||
23, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
inp2 = torch.randint(-23,
|
||||
1, (sz, ),
|
||||
dtype=dtype,
|
||||
device=torch.cuda.current_device())
|
||||
inp1 = torch.randint(
|
||||
1, 23, (sz,), dtype=dtype, device=torch.cuda.current_device()
|
||||
)
|
||||
inp2 = torch.randint(
|
||||
-23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device()
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
with torch.cuda.graph(graph,
|
||||
stream=graph_capture_context.stream):
|
||||
with torch.cuda.graph(graph, stream=graph_capture_context.stream):
|
||||
for _ in range(num_communication):
|
||||
out1 = tensor_model_parallel_all_reduce(inp1)
|
||||
dist.all_reduce(inp1, group=group)
|
||||
@@ -99,39 +95,42 @@ def eager_quickreduce(
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
init_test_distributed_environment(tp_size, pp_size, rank,
|
||||
distributed_init_port)
|
||||
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
|
||||
|
||||
# Size over 8MB is sufficient for custom quick allreduce.
|
||||
sz = 16 * 1024 * 1024
|
||||
fa = get_tp_group().device_communicator.qr_comm
|
||||
inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)],
|
||||
dtype=torch.float16,
|
||||
device=device)
|
||||
inp = torch.tensor(
|
||||
[1.0 * ((i) % 23) for i in range(sz)], dtype=torch.float16, device=device
|
||||
)
|
||||
out = fa.quick_all_reduce(inp)
|
||||
torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)
|
||||
|
||||
inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)],
|
||||
dtype=torch.bfloat16,
|
||||
device=device)
|
||||
inp = torch.tensor(
|
||||
[1.0 * ((i) % 23) for i in range(sz)], dtype=torch.bfloat16, device=device
|
||||
)
|
||||
out = fa.quick_all_reduce(inp)
|
||||
torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_rocm(),
|
||||
reason="only test quick allreduce for rocm")
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_rocm(), reason="only test quick allreduce for rocm"
|
||||
)
|
||||
@pytest.mark.parametrize("quant_mode", ["FP", "INT8", "INT6", "INT4"])
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
|
||||
@pytest.mark.parametrize("test_target", [graph_quickreduce, eager_quickreduce])
|
||||
def test_custom_quick_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
|
||||
pipeline_parallel_size, test_target,
|
||||
quant_mode):
|
||||
def test_custom_quick_allreduce(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
tp_size,
|
||||
pipeline_parallel_size,
|
||||
test_target,
|
||||
quant_mode,
|
||||
):
|
||||
world_size = tp_size * pipeline_parallel_size
|
||||
if world_size > torch.cuda.device_count():
|
||||
pytest.skip("Not enough GPUs to run the test.")
|
||||
|
||||
monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode)
|
||||
|
||||
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
|
||||
test_target)
|
||||
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
|
||||
|
||||
@@ -22,15 +22,13 @@ if __name__ == "__main__":
|
||||
dist.broadcast_object_list(recv, src=0)
|
||||
ip, port = recv
|
||||
|
||||
stateless_pg = StatelessProcessGroup.create(ip, port, rank,
|
||||
dist.get_world_size())
|
||||
stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())
|
||||
|
||||
for pg in [dist.group.WORLD, stateless_pg]:
|
||||
test_result = all(in_the_same_node_as(pg, source_rank=0))
|
||||
|
||||
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
|
||||
assert test_result == expected, \
|
||||
f"Expected {expected}, got {test_result}"
|
||||
assert test_result == expected, f"Expected {expected}, got {test_result}"
|
||||
if pg == dist.group.WORLD:
|
||||
print("Same node test passed! when using torch distributed!")
|
||||
else:
|
||||
|
||||
@@ -7,6 +7,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
all workers in a node other than the head node, which can cause the test
|
||||
to fail.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
@@ -56,7 +57,8 @@ class SPTestSettings:
|
||||
raise ValueError(
|
||||
f"Length mismatch: distributed_backends "
|
||||
f"({len(self.distributed_backends)}) != "
|
||||
f"vllm_major_versions ({len(self.vllm_major_versions)})")
|
||||
f"vllm_major_versions ({len(self.vllm_major_versions)})"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def detailed(
|
||||
@@ -72,18 +74,22 @@ class SPTestSettings:
|
||||
for pp_multiplier in [1, 2]:
|
||||
for chunked_prefill_val in [False, True]:
|
||||
parallel_setups.append(
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=pp_multiplier * pp_base,
|
||||
enable_fusion=False,
|
||||
eager_mode=eager_mode_val,
|
||||
chunked_prefill=chunked_prefill_val))
|
||||
ParallelSetup(
|
||||
tp_size=tp_base,
|
||||
pp_size=pp_multiplier * pp_base,
|
||||
enable_fusion=False,
|
||||
eager_mode=eager_mode_val,
|
||||
chunked_prefill=chunked_prefill_val,
|
||||
)
|
||||
)
|
||||
return SPTestSettings(
|
||||
parallel_setups=parallel_setups,
|
||||
distributed_backends=["mp", "ray"],
|
||||
vllm_major_versions=["1", "1"],
|
||||
runner=runner,
|
||||
test_options=SPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
test_options=SPTestOptions(
|
||||
multi_node_only=multi_node_only, load_format=load_format
|
||||
),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -100,18 +106,22 @@ class SPTestSettings:
|
||||
for pp_multiplier in [1, 2]:
|
||||
for chunked_prefill_val in [False, True]:
|
||||
parallel_setups.append(
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=pp_multiplier * pp_base,
|
||||
enable_fusion=False,
|
||||
eager_mode=eager_mode_val,
|
||||
chunked_prefill=chunked_prefill_val))
|
||||
ParallelSetup(
|
||||
tp_size=tp_base,
|
||||
pp_size=pp_multiplier * pp_base,
|
||||
enable_fusion=False,
|
||||
eager_mode=eager_mode_val,
|
||||
chunked_prefill=chunked_prefill_val,
|
||||
)
|
||||
)
|
||||
return SPTestSettings(
|
||||
parallel_setups=parallel_setups,
|
||||
distributed_backends=["mp", "ray"],
|
||||
vllm_major_versions=["1", "1"],
|
||||
runner=runner,
|
||||
test_options=SPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
test_options=SPTestOptions(
|
||||
multi_node_only=multi_node_only, load_format=load_format
|
||||
),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -126,28 +136,39 @@ class SPTestSettings:
|
||||
parallel_setups = []
|
||||
for fusion_val in [False, True]:
|
||||
parallel_setups.append(
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=pp_base,
|
||||
enable_fusion=fusion_val,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False))
|
||||
ParallelSetup(
|
||||
tp_size=tp_base,
|
||||
pp_size=pp_base,
|
||||
enable_fusion=fusion_val,
|
||||
eager_mode=True,
|
||||
chunked_prefill=False,
|
||||
)
|
||||
)
|
||||
return SPTestSettings(
|
||||
parallel_setups=parallel_setups,
|
||||
distributed_backends=["mp", "ray"],
|
||||
vllm_major_versions=["1", "1"],
|
||||
runner=runner,
|
||||
test_options=SPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
test_options=SPTestOptions(
|
||||
multi_node_only=multi_node_only, load_format=load_format
|
||||
),
|
||||
)
|
||||
|
||||
def iter_params(self, model_id: str):
|
||||
opts = self.test_options
|
||||
|
||||
for parallel_setup in self.parallel_setups:
|
||||
for backend, vllm_major_version in zip(self.distributed_backends,
|
||||
self.vllm_major_versions):
|
||||
yield (model_id, parallel_setup, backend, vllm_major_version,
|
||||
self.runner, opts)
|
||||
for backend, vllm_major_version in zip(
|
||||
self.distributed_backends, self.vllm_major_versions
|
||||
):
|
||||
yield (
|
||||
model_id,
|
||||
parallel_setup,
|
||||
backend,
|
||||
vllm_major_version,
|
||||
self.runner,
|
||||
opts,
|
||||
)
|
||||
|
||||
|
||||
def _compare_sp(
|
||||
@@ -200,8 +221,10 @@ def _compare_sp(
|
||||
if num_gpus_available < tp_size * pp_size:
|
||||
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
|
||||
if VLLM_MULTI_NODE and distributed_backend == "mp":
|
||||
pytest.skip("Skipping multi-node pipeline parallel test for "
|
||||
"multiprocessing distributed backend")
|
||||
pytest.skip(
|
||||
"Skipping multi-node pipeline parallel test for "
|
||||
"multiprocessing distributed backend"
|
||||
)
|
||||
if multi_node_only and not VLLM_MULTI_NODE:
|
||||
pytest.skip("Not in multi-node setting")
|
||||
|
||||
@@ -232,13 +255,13 @@ def _compare_sp(
|
||||
common_args.append("--skip-tokenizer-init")
|
||||
|
||||
compilation_config = {
|
||||
'level': 3,
|
||||
'custom_ops': ["+rms_norm"],
|
||||
'compile_sizes': [4, 8],
|
||||
'pass_config': {
|
||||
'enable_sequence_parallelism': True,
|
||||
'enable_fusion': enable_fusion,
|
||||
'enable_noop': True,
|
||||
"level": 3,
|
||||
"custom_ops": ["+rms_norm"],
|
||||
"compile_sizes": [4, 8],
|
||||
"pass_config": {
|
||||
"enable_sequence_parallelism": True,
|
||||
"enable_fusion": enable_fusion,
|
||||
"enable_noop": True,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -270,12 +293,9 @@ def _compare_sp(
|
||||
]
|
||||
|
||||
try:
|
||||
compare_two_settings(model_id,
|
||||
tp_sp_args,
|
||||
tp_args,
|
||||
tp_sp_env,
|
||||
tp_env,
|
||||
method=method)
|
||||
compare_two_settings(
|
||||
model_id, tp_sp_args, tp_args, tp_sp_env, tp_env, method=method
|
||||
)
|
||||
except Exception:
|
||||
testing_ray_compiled_graph = tp_sp_env is not None
|
||||
if testing_ray_compiled_graph and vllm_major_version == "0":
|
||||
@@ -301,10 +321,17 @@ SP_TEST_MODELS = [
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
|
||||
"runner", "test_options"),
|
||||
(
|
||||
"model_id",
|
||||
"parallel_setup",
|
||||
"distributed_backend",
|
||||
"vllm_major_version",
|
||||
"runner",
|
||||
"test_options",
|
||||
),
|
||||
[
|
||||
params for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
|
||||
params
|
||||
for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
|
||||
for params in settings.iter_params(model_id)
|
||||
if model_id in SP_TEST_MODELS
|
||||
],
|
||||
@@ -319,12 +346,14 @@ def test_tp_sp_generation(
|
||||
test_options: SPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_sp(model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=False)
|
||||
_compare_sp(
|
||||
model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=False,
|
||||
)
|
||||
|
||||
@@ -26,13 +26,13 @@ def distributed_run(fn, world_size):
|
||||
processes = []
|
||||
for i in range(number_of_processes):
|
||||
env = {}
|
||||
env['RANK'] = str(i)
|
||||
env['LOCAL_RANK'] = str(i)
|
||||
env['WORLD_SIZE'] = str(number_of_processes)
|
||||
env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
|
||||
env['MASTER_ADDR'] = 'localhost'
|
||||
env['MASTER_PORT'] = '12345'
|
||||
p = multiprocessing.Process(target=fn, args=(env, ))
|
||||
env["RANK"] = str(i)
|
||||
env["LOCAL_RANK"] = str(i)
|
||||
env["WORLD_SIZE"] = str(number_of_processes)
|
||||
env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
|
||||
env["MASTER_ADDR"] = "localhost"
|
||||
env["MASTER_PORT"] = "12345"
|
||||
p = multiprocessing.Process(target=fn, args=(env,))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
@@ -57,25 +57,23 @@ def worker_fn_wrapper(fn):
|
||||
|
||||
@worker_fn_wrapper
|
||||
def worker_fn():
|
||||
|
||||
rank = dist.get_rank()
|
||||
if rank == 0:
|
||||
port = get_open_port()
|
||||
ip = '127.0.0.1'
|
||||
ip = "127.0.0.1"
|
||||
dist.broadcast_object_list([ip, port], src=0)
|
||||
else:
|
||||
recv = [None, None]
|
||||
dist.broadcast_object_list(recv, src=0)
|
||||
ip, port = recv # type: ignore
|
||||
|
||||
stateless_pg = StatelessProcessGroup.create(ip, port, rank,
|
||||
dist.get_world_size())
|
||||
stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())
|
||||
|
||||
for pg in [dist.group.WORLD, stateless_pg]:
|
||||
|
||||
writer_rank = 2
|
||||
broadcaster = MessageQueue.create_from_process_group(
|
||||
pg, 40 * 1024, 2, writer_rank)
|
||||
pg, 40 * 1024, 2, writer_rank
|
||||
)
|
||||
if rank == writer_rank:
|
||||
seed = random.randint(0, 1000)
|
||||
dist.broadcast_object_list([seed], writer_rank)
|
||||
|
||||
@@ -5,7 +5,8 @@ import traceback
|
||||
import unittest
|
||||
|
||||
from vllm.distributed.device_communicators.shm_object_storage import (
|
||||
SingleWriterShmRingBuffer)
|
||||
SingleWriterShmRingBuffer,
|
||||
)
|
||||
|
||||
|
||||
class TestSingleWriterShmRingBuffer(unittest.TestCase):
|
||||
@@ -25,18 +26,21 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
|
||||
"""Test opening an existing buffer"""
|
||||
# First create a buffer
|
||||
self.ring_buffer = SingleWriterShmRingBuffer(
|
||||
data_buffer_size=self.buffer_size, create=True)
|
||||
data_buffer_size=self.buffer_size, create=True
|
||||
)
|
||||
|
||||
# Then open it with another instance
|
||||
reader_buffer = SingleWriterShmRingBuffer(*self.ring_buffer.handle())
|
||||
self.assertFalse(reader_buffer.is_writer)
|
||||
self.assertEqual(reader_buffer.shared_memory.name,
|
||||
self.ring_buffer.shared_memory.name)
|
||||
self.assertEqual(
|
||||
reader_buffer.shared_memory.name, self.ring_buffer.shared_memory.name
|
||||
)
|
||||
|
||||
def test_buffer_access(self):
|
||||
"""Test accessing allocated buffers"""
|
||||
self.ring_buffer = SingleWriterShmRingBuffer(
|
||||
data_buffer_size=self.buffer_size, create=True)
|
||||
data_buffer_size=self.buffer_size, create=True
|
||||
)
|
||||
|
||||
size = 100
|
||||
address, monotonic_id = self.ring_buffer.allocate_buf(size)
|
||||
@@ -44,11 +48,11 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
|
||||
# Write some test data
|
||||
test_data = b"Hello, World!" * 7 # 91 bytes
|
||||
with self.ring_buffer.access_buf(address) as (data_buf, metadata):
|
||||
data_buf[0:len(test_data)] = test_data
|
||||
data_buf[0 : len(test_data)] = test_data
|
||||
|
||||
# Read it back
|
||||
with self.ring_buffer.access_buf(address) as (data_buf2, metadata2):
|
||||
read_data = bytes(data_buf2[0:len(test_data)])
|
||||
read_data = bytes(data_buf2[0 : len(test_data)])
|
||||
read_id = metadata2[0]
|
||||
|
||||
self.assertEqual(read_data, test_data)
|
||||
@@ -58,7 +62,8 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
|
||||
"""Test that MemoryError is raised when buffer is full"""
|
||||
small_buffer_size = 200
|
||||
self.ring_buffer = SingleWriterShmRingBuffer(
|
||||
data_buffer_size=small_buffer_size, create=True)
|
||||
data_buffer_size=small_buffer_size, create=True
|
||||
)
|
||||
|
||||
# Fill up the buffer
|
||||
self.ring_buffer.allocate_buf(100)
|
||||
@@ -72,7 +77,8 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
|
||||
"""Test allocation and freeing of buffers"""
|
||||
small_buffer_size = 200
|
||||
self.ring_buffer = SingleWriterShmRingBuffer(
|
||||
data_buffer_size=small_buffer_size, create=True)
|
||||
data_buffer_size=small_buffer_size, create=True
|
||||
)
|
||||
|
||||
size = 80
|
||||
# Write some data
|
||||
@@ -81,7 +87,7 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
|
||||
address, monotonic_id = self.ring_buffer.allocate_buf(size)
|
||||
with self.ring_buffer.access_buf(address) as (data_buf, metadata):
|
||||
data_buf[0:4] = (0).to_bytes(4, "little") # 0 for not in-use
|
||||
data_buf[4:len(test_data) + 4] = test_data
|
||||
data_buf[4 : len(test_data) + 4] = test_data
|
||||
print(self.ring_buffer.metadata)
|
||||
freed_ids = self.ring_buffer.free_buf(lambda *args: True)
|
||||
print(f" Freed IDs: {freed_ids}")
|
||||
@@ -90,7 +96,8 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
|
||||
def test_clear_buffer(self):
|
||||
"""Test clearing the buffer"""
|
||||
self.ring_buffer = SingleWriterShmRingBuffer(
|
||||
data_buffer_size=self.buffer_size, create=True)
|
||||
data_buffer_size=self.buffer_size, create=True
|
||||
)
|
||||
|
||||
# Allocate some buffers
|
||||
for _ in range(3):
|
||||
@@ -121,8 +128,7 @@ def main():
|
||||
# Manual demonstration
|
||||
try:
|
||||
print("Creating ring buffer...")
|
||||
writer_buffer = SingleWriterShmRingBuffer(data_buffer_size=2048,
|
||||
create=True)
|
||||
writer_buffer = SingleWriterShmRingBuffer(data_buffer_size=2048, create=True)
|
||||
reader_buffer = SingleWriterShmRingBuffer(*writer_buffer.handle())
|
||||
|
||||
print(f"Buffer created with name: {writer_buffer.shared_memory.name}")
|
||||
@@ -140,7 +146,7 @@ def main():
|
||||
# Write some test data
|
||||
with writer_buffer.access_buf(address) as (data_buf, metadata):
|
||||
test_message = f"Test message {i}".encode()
|
||||
data_buf[0:len(test_message)] = test_message
|
||||
data_buf[0 : len(test_message)] = test_message
|
||||
|
||||
except MemoryError as e:
|
||||
print(f" Failed to allocate {size} bytes: {e}")
|
||||
|
||||
@@ -12,28 +12,33 @@ import torch
|
||||
|
||||
# Assuming these are imported from your module
|
||||
from vllm.distributed.device_communicators.shm_object_storage import (
|
||||
MsgpackSerde, SingleWriterShmObjectStorage, SingleWriterShmRingBuffer)
|
||||
from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
|
||||
MultiModalSharedField)
|
||||
MsgpackSerde,
|
||||
SingleWriterShmObjectStorage,
|
||||
SingleWriterShmRingBuffer,
|
||||
)
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalFieldElem,
|
||||
MultiModalKwargsItem,
|
||||
MultiModalSharedField,
|
||||
)
|
||||
|
||||
|
||||
def _dummy_elem(modality: str, key: str, size: int):
|
||||
return MultiModalFieldElem(
|
||||
modality=modality,
|
||||
key=key,
|
||||
data=torch.empty((size, ), dtype=torch.int8),
|
||||
data=torch.empty((size,), dtype=torch.int8),
|
||||
field=MultiModalSharedField(1),
|
||||
)
|
||||
|
||||
|
||||
def _dummy_item(modality: str, size_by_key: dict[str, int]):
|
||||
return MultiModalKwargsItem.from_elems([
|
||||
_dummy_elem(modality, key, size) for key, size in size_by_key.items()
|
||||
])
|
||||
return MultiModalKwargsItem.from_elems(
|
||||
[_dummy_elem(modality, key, size) for key, size in size_by_key.items()]
|
||||
)
|
||||
|
||||
|
||||
class TestSingleWriterShmObjectStorage(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test fixtures before each test method."""
|
||||
ring_buffer = SingleWriterShmRingBuffer(
|
||||
@@ -208,8 +213,7 @@ class TestSingleWriterShmObjectStorage(unittest.TestCase):
|
||||
with self.assertRaises(ValueError) as context:
|
||||
self.storage.get(address, monotonic_id + 100)
|
||||
|
||||
self.assertIn("has been modified or is invalid", \
|
||||
str(context.exception))
|
||||
self.assertIn("has been modified or is invalid", str(context.exception))
|
||||
|
||||
def test_clear_storage(self):
|
||||
"""Test clearing the storage."""
|
||||
@@ -234,8 +238,7 @@ class TestSingleWriterShmObjectStorage(unittest.TestCase):
|
||||
# Reader process function
|
||||
def reader_process(process_id, storage_handle, items_to_read):
|
||||
"""Reader process that connects to existing shared memory and reads data."""
|
||||
reader_storage = SingleWriterShmObjectStorage.create_from_handle(
|
||||
storage_handle)
|
||||
reader_storage = SingleWriterShmObjectStorage.create_from_handle(storage_handle)
|
||||
|
||||
print(f"Reader {process_id} started")
|
||||
|
||||
@@ -276,11 +279,7 @@ def run_multiprocess_example():
|
||||
|
||||
# Test basic data types
|
||||
test_data = [
|
||||
("user_data", {
|
||||
"name": "Alice",
|
||||
"age": 30,
|
||||
"scores": [95, 87, 92]
|
||||
}),
|
||||
("user_data", {"name": "Alice", "age": 30, "scores": [95, 87, 92]}),
|
||||
("simple_string", "Hello, World!"),
|
||||
("number", 42),
|
||||
("list_data", [1, 2, 3, "four", 5.0]),
|
||||
@@ -301,8 +300,9 @@ def run_multiprocess_example():
|
||||
# initialize lock for reader processes
|
||||
handle.reader_lock = Lock()
|
||||
for i in range(storage.n_readers):
|
||||
p = multiprocessing.Process(target=reader_process,
|
||||
args=(i, handle, stored_items))
|
||||
p = multiprocessing.Process(
|
||||
target=reader_process, args=(i, handle, stored_items)
|
||||
)
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
|
||||
@@ -14,11 +14,12 @@ import vllm.envs as envs
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
|
||||
from vllm.distributed.device_communicators.cuda_communicator import (
|
||||
CudaCommunicator)
|
||||
from vllm.distributed.parallel_state import (get_tp_group,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
|
||||
from vllm.distributed.parallel_state import (
|
||||
get_tp_group,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.platforms import current_platform
|
||||
@@ -32,8 +33,7 @@ test_size_elements = 1024 * 1024
|
||||
|
||||
def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
|
||||
monkeypatch = pytest.MonkeyPatch()
|
||||
config = VllmConfig(parallel_config=ParallelConfig(
|
||||
tensor_parallel_size=world_size))
|
||||
config = VllmConfig(parallel_config=ParallelConfig(tensor_parallel_size=world_size))
|
||||
|
||||
with monkeypatch.context() as m, set_current_vllm_config(config):
|
||||
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
|
||||
@@ -42,34 +42,34 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
|
||||
torch.cuda.set_device(device)
|
||||
torch.set_default_device(device)
|
||||
torch.set_default_dtype(dtype)
|
||||
update_environment_variables({
|
||||
'RANK': str(local_rank),
|
||||
'LOCAL_RANK': str(local_rank),
|
||||
'WORLD_SIZE': str(world_size),
|
||||
'MASTER_ADDR': 'localhost',
|
||||
'MASTER_PORT': '12345',
|
||||
})
|
||||
update_environment_variables(
|
||||
{
|
||||
"RANK": str(local_rank),
|
||||
"LOCAL_RANK": str(local_rank),
|
||||
"WORLD_SIZE": str(world_size),
|
||||
"MASTER_ADDR": "localhost",
|
||||
"MASTER_PORT": "12345",
|
||||
}
|
||||
)
|
||||
|
||||
init_distributed_environment()
|
||||
initialize_model_parallel(tensor_model_parallel_size=world_size)
|
||||
|
||||
cuda_communicator = typing.cast(CudaCommunicator,
|
||||
get_tp_group().device_communicator)
|
||||
cuda_communicator = typing.cast(
|
||||
CudaCommunicator, get_tp_group().device_communicator
|
||||
)
|
||||
symm_mem_comm = cuda_communicator.symm_mem_comm
|
||||
if symm_mem_comm is None or symm_mem_comm.disabled:
|
||||
# can't use skip under multiprocessing
|
||||
q.put("SymmMemCommunicator is not available or disabled.")
|
||||
return
|
||||
|
||||
inp_direct_symm_mem = torch.randint(1,
|
||||
23, (test_size_elements, ),
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
inp_direct_symm_mem = torch.randint(
|
||||
1, 23, (test_size_elements,), dtype=dtype, device=device
|
||||
)
|
||||
if not symm_mem_comm.should_use_symm_mem(inp_direct_symm_mem):
|
||||
# can't use skip under multiprocessing
|
||||
q.put(
|
||||
"SymmMemCommunicator isn't used for this world and input size."
|
||||
)
|
||||
q.put("SymmMemCommunicator isn't used for this world and input size.")
|
||||
return
|
||||
|
||||
original_inp_direct_symm_mem = inp_direct_symm_mem.clone()
|
||||
@@ -78,42 +78,37 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
|
||||
|
||||
group = get_tp_group().device_group
|
||||
dist.all_reduce(original_inp_direct_symm_mem, group=group)
|
||||
torch.testing.assert_close(out_direct_symm_mem,
|
||||
original_inp_direct_symm_mem,
|
||||
atol=2.5,
|
||||
rtol=0.1)
|
||||
torch.testing.assert_close(
|
||||
out_direct_symm_mem, original_inp_direct_symm_mem, atol=2.5, rtol=0.1
|
||||
)
|
||||
|
||||
# Test tensor_model_parallel_all_reduce which should use symm_mem
|
||||
inp_tensor_parallel = torch.randint(-23,
|
||||
1, (test_size_elements, ),
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
inp_tensor_parallel = torch.randint(
|
||||
-23, 1, (test_size_elements,), dtype=dtype, device=device
|
||||
)
|
||||
original_inp_tensor_parallel = inp_tensor_parallel.clone()
|
||||
out_tensor_parallel = tensor_model_parallel_all_reduce(
|
||||
inp_tensor_parallel)
|
||||
out_tensor_parallel = tensor_model_parallel_all_reduce(inp_tensor_parallel)
|
||||
dist.all_reduce(original_inp_tensor_parallel, group=group)
|
||||
torch.testing.assert_close(out_tensor_parallel,
|
||||
original_inp_tensor_parallel,
|
||||
atol=2.5,
|
||||
rtol=0.1)
|
||||
torch.testing.assert_close(
|
||||
out_tensor_parallel, original_inp_tensor_parallel, atol=2.5, rtol=0.1
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda(),
|
||||
reason="SymmMemAllreduce is only available for CUDA platforms.")
|
||||
reason="SymmMemAllreduce is only available for CUDA platforms.",
|
||||
)
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("pipeline_parallel_size", [1])
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
|
||||
reason="Only test on CUDA")
|
||||
def test_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
|
||||
pipeline_parallel_size):
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
|
||||
def test_symm_mem_allreduce(
|
||||
monkeypatch: pytest.MonkeyPatch, tp_size, pipeline_parallel_size
|
||||
):
|
||||
world_size = tp_size * pipeline_parallel_size
|
||||
if world_size > torch.cuda.device_count():
|
||||
pytest.skip("Not enough GPUs to run the test.")
|
||||
q = mp.get_context('spawn').Queue()
|
||||
mp.spawn(symm_mem_allreduce_worker,
|
||||
args=(world_size, q),
|
||||
nprocs=world_size)
|
||||
q = mp.get_context("spawn").Queue()
|
||||
mp.spawn(symm_mem_allreduce_worker, args=(world_size, q), nprocs=world_size)
|
||||
try:
|
||||
val = q.get(timeout=1)
|
||||
except queue.Empty:
|
||||
@@ -126,18 +121,20 @@ def test_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda(),
|
||||
reason="SymmMemAllreduce is only available for CUDA platforms.")
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
|
||||
reason="Only test on CUDA")
|
||||
reason="SymmMemAllreduce is only available for CUDA platforms.",
|
||||
)
|
||||
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
|
||||
def test_dp_with_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch):
|
||||
world_size = 4
|
||||
if world_size > torch.cuda.device_count():
|
||||
pytest.skip("Not enough GPUs to run the test.")
|
||||
# Verify that the DataParallel runs without error
|
||||
engine_args = EngineArgs(model="distilbert/distilgpt2",
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=True,
|
||||
data_parallel_size=2,
|
||||
tensor_parallel_size=2,
|
||||
data_parallel_backend="mp")
|
||||
engine_args = EngineArgs(
|
||||
model="distilbert/distilgpt2",
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=True,
|
||||
data_parallel_size=2,
|
||||
tensor_parallel_size=2,
|
||||
data_parallel_backend="mp",
|
||||
)
|
||||
LLMEngine.from_engine_args(engine_args)
|
||||
|
||||
@@ -24,13 +24,15 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# set different `gpu_memory_utilization` and `swap_space` for different ranks,
|
||||
# to test if all ranks agree on the same kv cache configuration.
|
||||
llm = LLM(model="facebook/opt-125m",
|
||||
tensor_parallel_size=2,
|
||||
pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
|
||||
distributed_executor_backend="external_launcher",
|
||||
gpu_memory_utilization=random.uniform(0.7, 0.9),
|
||||
swap_space=random.randint(1, 4),
|
||||
seed=0)
|
||||
llm = LLM(
|
||||
model="facebook/opt-125m",
|
||||
tensor_parallel_size=2,
|
||||
pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
|
||||
distributed_executor_backend="external_launcher",
|
||||
gpu_memory_utilization=random.uniform(0.7, 0.9),
|
||||
swap_space=random.randint(1, 4),
|
||||
seed=0,
|
||||
)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@@ -48,15 +50,14 @@ def test_consistent_across_ranks(obj):
|
||||
assert container[0] == obj
|
||||
|
||||
|
||||
test_consistent_across_ranks(
|
||||
llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
|
||||
test_consistent_across_ranks(
|
||||
llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
|
||||
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
|
||||
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
|
||||
|
||||
# make sure we can access the model parameters from the calling process
|
||||
# of the `LLM` instance.
|
||||
params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner.
|
||||
model.parameters())
|
||||
params = list(
|
||||
llm.llm_engine.model_executor.driver_worker.worker.model_runner.model.parameters()
|
||||
)
|
||||
test_consistent_across_ranks(len(params))
|
||||
|
||||
# all ranks should have the same outputs
|
||||
@@ -65,5 +66,4 @@ for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
test_consistent_across_ranks(prompt)
|
||||
test_consistent_across_ranks(generated_text)
|
||||
print(f"Rank {torch_rank}, Prompt: {prompt!r}, "
|
||||
f"Generated text: {generated_text!r}")
|
||||
print(f"Rank {torch_rank}, Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
@@ -24,23 +24,22 @@ dp_rank = int(os.getenv("DP_RANK", "0"))
|
||||
|
||||
if dp_size > 1:
|
||||
# distribute the prompts across the data parallel ranks
|
||||
prompts = [
|
||||
prompt for idx, prompt in enumerate(prompts)
|
||||
if idx % dp_size == dp_rank
|
||||
]
|
||||
prompts = [prompt for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank]
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# set different `gpu_memory_utilization` and `swap_space` for different ranks,
|
||||
# to test if all ranks agree on the same kv cache configuration.
|
||||
llm = LLM(model="microsoft/Phi-mini-MoE-instruct",
|
||||
tensor_parallel_size=int(os.getenv("TP_SIZE", "1")),
|
||||
pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
|
||||
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
|
||||
distributed_executor_backend="external_launcher",
|
||||
gpu_memory_utilization=random.uniform(0.7, 0.9),
|
||||
swap_space=random.randint(1, 4),
|
||||
seed=0)
|
||||
llm = LLM(
|
||||
model="microsoft/Phi-mini-MoE-instruct",
|
||||
tensor_parallel_size=int(os.getenv("TP_SIZE", "1")),
|
||||
pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
|
||||
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
|
||||
distributed_executor_backend="external_launcher",
|
||||
gpu_memory_utilization=random.uniform(0.7, 0.9),
|
||||
swap_space=random.randint(1, 4),
|
||||
seed=0,
|
||||
)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
@@ -54,21 +53,18 @@ def test_consistent_across_ranks(obj):
|
||||
dist.broadcast_object_list([obj], src=group.ranks[0], group=cpu_group)
|
||||
else:
|
||||
container = [None]
|
||||
dist.broadcast_object_list(container,
|
||||
src=group.ranks[0],
|
||||
group=cpu_group)
|
||||
dist.broadcast_object_list(container, src=group.ranks[0], group=cpu_group)
|
||||
assert container[0] == obj
|
||||
|
||||
|
||||
test_consistent_across_ranks(
|
||||
llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
|
||||
test_consistent_across_ranks(
|
||||
llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
|
||||
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
|
||||
test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
|
||||
|
||||
# make sure we can access the model parameters from the calling process
|
||||
# of the `LLM` instance.
|
||||
params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner.
|
||||
model.parameters())
|
||||
params = list(
|
||||
llm.llm_engine.model_executor.driver_worker.worker.model_runner.model.parameters()
|
||||
)
|
||||
test_consistent_across_ranks(len(params))
|
||||
|
||||
# all ranks should have the same outputs
|
||||
@@ -77,5 +73,4 @@ for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
test_consistent_across_ranks(prompt)
|
||||
test_consistent_across_ranks(generated_text)
|
||||
print(f"Rank {group_rank}, Prompt: {prompt!r}, "
|
||||
f"Generated text: {generated_text!r}")
|
||||
print(f"Rank {group_rank}, Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
@@ -10,21 +10,22 @@ import torch
|
||||
import vllm.envs as envs
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.distributed.utils import StatelessProcessGroup
|
||||
from vllm.utils import (cuda_device_count_stateless, get_open_port,
|
||||
update_environment_variables)
|
||||
from vllm.utils import (
|
||||
cuda_device_count_stateless,
|
||||
get_open_port,
|
||||
update_environment_variables,
|
||||
)
|
||||
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
|
||||
@ray.remote
|
||||
class _CUDADeviceCountStatelessTestActor:
|
||||
|
||||
def get_count(self):
|
||||
return cuda_device_count_stateless()
|
||||
|
||||
def set_cuda_visible_devices(self, cuda_visible_devices: str):
|
||||
update_environment_variables(
|
||||
{"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
|
||||
update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
|
||||
|
||||
def get_cuda_visible_devices(self):
|
||||
return envs.CUDA_VISIBLE_DEVICES
|
||||
@@ -34,10 +35,9 @@ def test_cuda_device_count_stateless():
|
||||
"""Test that cuda_device_count_stateless changes return value if
|
||||
CUDA_VISIBLE_DEVICES is changed."""
|
||||
actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore
|
||||
num_gpus=2).remote()
|
||||
assert len(
|
||||
sorted(ray.get(
|
||||
actor.get_cuda_visible_devices.remote()).split(","))) == 2
|
||||
num_gpus=2
|
||||
).remote()
|
||||
assert len(sorted(ray.get(actor.get_cuda_visible_devices.remote()).split(","))) == 2
|
||||
assert ray.get(actor.get_count.remote()) == 2
|
||||
ray.get(actor.set_cuda_visible_devices.remote("0"))
|
||||
assert ray.get(actor.get_count.remote()) == 1
|
||||
@@ -46,15 +46,13 @@ def test_cuda_device_count_stateless():
|
||||
|
||||
|
||||
def cpu_worker(rank, WORLD_SIZE, port1, port2):
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
pg1 = StatelessProcessGroup.create(
|
||||
host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
|
||||
)
|
||||
if rank <= 2:
|
||||
pg2 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port2,
|
||||
rank=rank,
|
||||
world_size=3)
|
||||
pg2 = StatelessProcessGroup.create(
|
||||
host="127.0.0.1", port=port2, rank=rank, world_size=3
|
||||
)
|
||||
data = torch.tensor([rank])
|
||||
data = pg1.broadcast_obj(data, src=2)
|
||||
assert data.item() == 2
|
||||
@@ -68,16 +66,14 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2):
|
||||
|
||||
def gpu_worker(rank, WORLD_SIZE, port1, port2):
|
||||
torch.cuda.set_device(rank)
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
pg1 = StatelessProcessGroup.create(
|
||||
host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
|
||||
)
|
||||
pynccl1 = PyNcclCommunicator(pg1, device=rank)
|
||||
if rank <= 2:
|
||||
pg2 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port2,
|
||||
rank=rank,
|
||||
world_size=3)
|
||||
pg2 = StatelessProcessGroup.create(
|
||||
host="127.0.0.1", port=port2, rank=rank, world_size=3
|
||||
)
|
||||
pynccl2 = PyNcclCommunicator(pg2, device=rank)
|
||||
data = torch.tensor([rank]).cuda()
|
||||
pynccl1.all_reduce(data)
|
||||
@@ -96,10 +92,9 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2):
|
||||
|
||||
|
||||
def broadcast_worker(rank, WORLD_SIZE, port1, port2):
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
pg1 = StatelessProcessGroup.create(
|
||||
host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
|
||||
)
|
||||
if rank == 2:
|
||||
pg1.broadcast_obj("secret", src=2)
|
||||
else:
|
||||
@@ -109,10 +104,9 @@ def broadcast_worker(rank, WORLD_SIZE, port1, port2):
|
||||
|
||||
|
||||
def allgather_worker(rank, WORLD_SIZE, port1, port2):
|
||||
pg1 = StatelessProcessGroup.create(host="127.0.0.1",
|
||||
port=port1,
|
||||
rank=rank,
|
||||
world_size=WORLD_SIZE)
|
||||
pg1 = StatelessProcessGroup.create(
|
||||
host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
|
||||
)
|
||||
data = pg1.all_gather_obj(rank)
|
||||
assert data == list(range(WORLD_SIZE))
|
||||
pg1.barrier()
|
||||
@@ -121,7 +115,8 @@ def allgather_worker(rank, WORLD_SIZE, port1, port2):
|
||||
@pytest.mark.skip(reason="This test is flaky and prone to hang.")
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@pytest.mark.parametrize(
|
||||
"worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker])
|
||||
"worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]
|
||||
)
|
||||
def test_stateless_process_group(worker):
|
||||
port1 = get_open_port()
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
@@ -129,12 +124,14 @@ def test_stateless_process_group(worker):
|
||||
port2 = get_open_port()
|
||||
WORLD_SIZE = 4
|
||||
from multiprocessing import get_context
|
||||
|
||||
ctx = get_context("fork")
|
||||
processes = []
|
||||
for i in range(WORLD_SIZE):
|
||||
rank = i
|
||||
processes.append(
|
||||
ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2)))
|
||||
ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2))
|
||||
)
|
||||
for p in processes:
|
||||
p.start()
|
||||
for p in processes:
|
||||
|
||||
Reference in New Issue
Block a user