[1/N] Elastic EP Milestone 2 (#34861)
Signed-off-by: Yongji Wu <wuyongji317@gmail.com>
Signed-off-by: Itay Alroy <ialroy@nvidia.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: Ron Tourgeman <rtourgeman@nvidia.com>
Co-authored-by: Yongji Wu <wuyongji317@gmail.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Ron Tourgeman <rtourgeman@nvidia.com>
This commit is contained in:
@@ -7,6 +7,7 @@ import random
|
||||
import torch
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed.parallel_state import (
|
||||
init_distributed_environment,
|
||||
)
|
||||
@@ -42,7 +43,11 @@ def set_env_vars_and_device(env: dict[str, str]) -> None:
|
||||
local_rank = os.environ["LOCAL_RANK"]
|
||||
device = torch.device(f"cuda:{local_rank}")
|
||||
torch.cuda.set_device(device)
|
||||
init_distributed_environment()
|
||||
|
||||
# Create a minimal vllm config for init_distributed_environment
|
||||
vllm_config = VllmConfig()
|
||||
with set_current_vllm_config(vllm_config):
|
||||
init_distributed_environment()
|
||||
|
||||
# Ensure each worker process has the same random seed
|
||||
random.seed(42)
|
||||
|
||||
202
tests/distributed/test_elastic_ep.py
Normal file
202
tests/distributed/test_elastic_ep.py
Normal file
@@ -0,0 +1,202 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from ..evals.gsm8k.gsm8k_eval import evaluate_gsm8k
|
||||
from ..utils import RemoteOpenAIServer, multi_gpu_test
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def cleanup_ray_between_tests():
    """Force-stop any lingering Ray processes before each test runs.

    Autouse fixture: invokes ``ray stop --force`` (output captured, 30 second
    timeout on the command), then waits 5 seconds before yielding to the test.
    Nothing is executed after the ``yield``.
    """
    stop_command = ["ray", "stop", "--force"]
    # check=False is the default: a non-zero exit status is not treated as an error.
    subprocess.run(stop_command, capture_output=True, timeout=30, check=False)
    # NOTE(review): presumably gives Ray time to finish shutting down -- confirm.
    time.sleep(5)
    yield
|
||||
|
||||
|
||||
# Model passed to RemoteOpenAIServer by the elastic EP tests.
MODEL_NAME: str = "deepseek-ai/DeepSeek-V2-Lite-Chat"

# Number of questions handed to evaluate_gsm8k for every evaluation run.
NUM_GSM8K_QUESTIONS: int = 256
# Minimum accuracy that _run_gsm8k_eval accepts for any single run.
EXPECTED_ACCURACY: float = 0.58
# Largest accepted accuracy drop after scaling, relative to the initial run.
ACCURACY_TOL: float = 0.08
# Value forwarded to the server as --max-num-seqs.
MAX_NUM_SEQS: int = 32
|
||||
|
||||
|
||||
def _send_scale_command(server: RemoteOpenAIServer, new_dp_size: int) -> bool:
    """POST a scale request to the server's ``scale_elastic_ep`` endpoint.

    Args:
        server: Running server; its ``url_for`` resolves the endpoint URL.
        new_dp_size: Value sent as ``new_data_parallel_size`` in the JSON body.

    Returns:
        True only when the server answers with HTTP 200. A request failure or
        any other status code yields False; the reason is printed first so a
        failing ``assert _send_scale_command(...)`` can be diagnosed from the
        captured test output instead of failing silently.
    """
    url = server.url_for("scale_elastic_ep")
    payload = {"new_data_parallel_size": new_dp_size}
    headers = {"Content-Type": "application/json"}

    try:
        # The request is allowed up to 300 seconds before it times out.
        response = requests.post(url, json=payload, headers=headers, timeout=300)
    except requests.exceptions.RequestException as exc:
        print(f"Scale request to {url} (new_dp_size={new_dp_size}) failed: {exc!r}")
        return False

    if response.status_code != 200:
        # Only the start of the body is shown to keep the test log readable.
        print(
            f"Scale request to {url} (new_dp_size={new_dp_size}) returned "
            f"HTTP {response.status_code}: {response.text[:500]}"
        )
        return False
    return True
|
||||
|
||||
|
||||
def _run_gsm8k_eval(server: RemoteOpenAIServer, stage: str) -> float:
    """Evaluate GSM8K against the server and enforce the accuracy floor.

    Args:
        server: Running server; its ``host`` and ``port`` are passed to
            ``evaluate_gsm8k``.
        stage: Label used as a prefix in the printed and assertion messages.

    Returns:
        The ``"accuracy"`` entry of the evaluation result.

    Raises:
        AssertionError: If the server has no port, or the accuracy is below
            ``EXPECTED_ACCURACY``.
    """
    assert server.port is not None
    eval_result = evaluate_gsm8k(
        num_questions=NUM_GSM8K_QUESTIONS,
        host=f"http://{server.host}",
        port=server.port,
    )
    score = eval_result["accuracy"]
    question_count = eval_result["num_questions"]
    print(f"[{stage}] GSM8K accuracy: {score:.3f} ({question_count} questions)")

    below_floor_message = (
        f"[{stage}] GSM8K accuracy {score:.3f} is below "
        f"expected threshold {EXPECTED_ACCURACY}"
    )
    assert score >= EXPECTED_ACCURACY, below_floor_message
    return score
|
||||
|
||||
|
||||
# Data-parallel size the server starts with and is scaled back down to.
_BASE_DP_SIZE = 2


def _elastic_ep_serve_args() -> list[str]:
    """Build the server CLI arguments shared by the elastic EP scaling tests."""
    serve_args = [
        "--trust-remote-code",
        "--tensor-parallel-size",
        "1",
        "--gpu-memory-utilization",
        "0.8",
        "--max-model-len",
        "4096",
        "--max-num-seqs",
        str(MAX_NUM_SEQS),
        "--enable-expert-parallel",
        "--all2all-backend",
        "allgather_reducescatter",
        "--enable-elastic-ep",
        "--enable-eplb",
        "--eplb-config.num_redundant_experts",
        "0",
        "--data-parallel-backend",
        "ray",
        "--data-parallel-size",
        str(_BASE_DP_SIZE),
        "--api-server-count",
        "1",
    ]

    # The data-parallel address is only passed when LEADER_ADDRESS is set.
    leader_address = os.environ.get("LEADER_ADDRESS")
    if leader_address:
        serve_args.extend(["--data-parallel-address", leader_address])
    return serve_args


def _assert_within_tolerance(label: str, accuracy: float, baseline: float) -> None:
    """Assert that ``accuracy`` is at most ``ACCURACY_TOL`` below ``baseline``."""
    assert accuracy >= baseline - ACCURACY_TOL, (
        f"{label} accuracy {accuracy:.3f} dropped more than "
        f"{ACCURACY_TOL} below initial accuracy {baseline:.3f}"
    )


def _run_scale_round_trip(scaled_dp_size: int, summary_title: str) -> None:
    """Start a server, scale it up to ``scaled_dp_size`` and back down again.

    GSM8K accuracy is measured before scaling, after the scale up and after
    the scale down; each post-scaling result must stay within ``ACCURACY_TOL``
    of the initial accuracy. A summary headed by ``summary_title`` is printed
    at the end.
    """
    with RemoteOpenAIServer(
        MODEL_NAME, _elastic_ep_serve_args(), env_dict={}, max_wait_seconds=1200
    ) as server:
        initial_accuracy = _run_gsm8k_eval(server, f"Initial ({_BASE_DP_SIZE} GPUs)")

        assert _send_scale_command(server, scaled_dp_size)
        time.sleep(10)
        scale_up_accuracy = _run_gsm8k_eval(
            server, f"After scale up ({scaled_dp_size} GPUs)"
        )
        _assert_within_tolerance("Scale up", scale_up_accuracy, initial_accuracy)

        assert _send_scale_command(server, _BASE_DP_SIZE)
        time.sleep(5)
        scale_down_accuracy = _run_gsm8k_eval(
            server, f"After scale down ({_BASE_DP_SIZE} GPUs)"
        )
        _assert_within_tolerance("Scale down", scale_down_accuracy, initial_accuracy)

        print(summary_title)
        print(f" Initial: {initial_accuracy:.3f}")
        print(
            f" Scale up: {scale_up_accuracy:.3f} "
            f"(diff: {scale_up_accuracy - initial_accuracy:+.3f})"
        )
        print(
            f" Scale down: {scale_down_accuracy:.3f} "
            f"(diff: {scale_down_accuracy - initial_accuracy:+.3f})"
        )
        print(f" Tolerance: {ACCURACY_TOL:.3f}")


@multi_gpu_test(num_gpus=4)
def test_elastic_ep_scaling():
    """Test scaling 2 -> 4 -> 2 while GSM8K accuracy stays within tolerance."""
    _run_scale_round_trip(4, "\nAccuracy Summary:")


@multi_gpu_test(num_gpus=4)
def test_elastic_ep_scaling_uneven():
    """Test scale up with uneven worker distribution.

    This tests the case where num_new_workers % old_dp_size != 0,
    specifically 2 -> 3 where remainder = 1 % 2 = 1.
    This exercises the remainder handling in sender-receiver pairing.
    """
    # Scale 2 -> 3 (remainder = 1 % 2 = 1), then back down to 2.
    _run_scale_round_trip(3, "\nAccuracy Summary (Uneven Scaling):")
|
||||
@@ -8,6 +8,7 @@ import pytest
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed.eplb.rebalance_execute import (
|
||||
move_from_buffer,
|
||||
rearrange_expert_weights_inplace,
|
||||
@@ -244,90 +245,95 @@ def _test_async_transfer_layer_without_mtp_worker(
|
||||
num_logical_experts: int,
|
||||
) -> None:
|
||||
set_env_vars_and_device(env)
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
tp_group = get_tp_group()
|
||||
ep_group = tp_group.device_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
device = torch.device(f"cuda:{ep_rank}")
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.parallel_config.tensor_parallel_size = world_size
|
||||
|
||||
total_physical_experts = world_size * num_local_experts
|
||||
hidden_sizes = [16, 32]
|
||||
with set_current_vllm_config(vllm_config):
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
redundancy_config = create_redundancy_config(
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
)
|
||||
old_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
redundancy_config,
|
||||
)
|
||||
tp_group = get_tp_group()
|
||||
ep_group = tp_group.device_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
device = torch.device(f"cuda:{ep_rank}")
|
||||
|
||||
new_redundancy_config = create_redundancy_config(
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
)
|
||||
new_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
new_redundancy_config,
|
||||
)
|
||||
total_physical_experts = world_size * num_local_experts
|
||||
hidden_sizes = [16, 32]
|
||||
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers,
|
||||
num_local_experts,
|
||||
hidden_sizes,
|
||||
ep_rank,
|
||||
device,
|
||||
old_indices,
|
||||
)
|
||||
old_indices_cpu = old_indices.cpu()
|
||||
new_indices_cpu = new_indices.cpu()
|
||||
redundancy_config = create_redundancy_config(
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
)
|
||||
old_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
redundancy_config,
|
||||
)
|
||||
|
||||
expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
|
||||
cuda_stream = torch.cuda.Stream(device=device)
|
||||
new_redundancy_config = create_redundancy_config(
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
)
|
||||
new_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
new_redundancy_config,
|
||||
)
|
||||
|
||||
for layer_idx in range(num_layers):
|
||||
is_unchanged, is_received_locally, recv_metadata = asyncio.run(
|
||||
transfer_layer(
|
||||
old_layer_indices=old_indices_cpu[layer_idx],
|
||||
new_layer_indices=new_indices_cpu[layer_idx],
|
||||
expert_weights=expert_weights[layer_idx],
|
||||
expert_weights_buffer=expert_buffer,
|
||||
ep_group=ep_group,
|
||||
cuda_stream=cuda_stream,
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers,
|
||||
num_local_experts,
|
||||
hidden_sizes,
|
||||
ep_rank,
|
||||
device,
|
||||
old_indices,
|
||||
)
|
||||
old_indices_cpu = old_indices.cpu()
|
||||
new_indices_cpu = new_indices.cpu()
|
||||
|
||||
expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
|
||||
cuda_stream = torch.cuda.Stream(device=device)
|
||||
|
||||
for layer_idx in range(num_layers):
|
||||
is_unchanged, is_received_locally, recv_metadata = asyncio.run(
|
||||
transfer_layer(
|
||||
old_layer_indices=old_indices_cpu[layer_idx],
|
||||
new_layer_indices=new_indices_cpu[layer_idx],
|
||||
expert_weights=expert_weights[layer_idx],
|
||||
expert_weights_buffer=expert_buffer,
|
||||
ep_group=ep_group,
|
||||
cuda_stream=cuda_stream,
|
||||
)
|
||||
)
|
||||
cuda_stream.synchronize()
|
||||
move_from_buffer(
|
||||
expert_weights=expert_weights[layer_idx],
|
||||
expert_weights_buffers=expert_buffer,
|
||||
is_unchanged=is_unchanged,
|
||||
is_received_locally=is_received_locally,
|
||||
recv_metadata=recv_metadata,
|
||||
new_indices=new_indices_cpu[layer_idx].numpy(),
|
||||
ep_rank=ep_rank,
|
||||
)
|
||||
)
|
||||
cuda_stream.synchronize()
|
||||
move_from_buffer(
|
||||
expert_weights=expert_weights[layer_idx],
|
||||
expert_weights_buffers=expert_buffer,
|
||||
is_unchanged=is_unchanged,
|
||||
is_received_locally=is_received_locally,
|
||||
recv_metadata=recv_metadata,
|
||||
new_indices=new_indices_cpu[layer_idx].numpy(),
|
||||
ep_rank=ep_rank,
|
||||
)
|
||||
|
||||
verify_expert_weights_after_shuffle(
|
||||
expert_weights,
|
||||
new_indices,
|
||||
hidden_sizes,
|
||||
ep_rank,
|
||||
num_local_experts,
|
||||
)
|
||||
verify_redundant_experts_have_same_weights(
|
||||
expert_weights,
|
||||
new_indices,
|
||||
hidden_sizes,
|
||||
world_size,
|
||||
num_local_experts,
|
||||
)
|
||||
verify_expert_weights_after_shuffle(
|
||||
expert_weights,
|
||||
new_indices,
|
||||
hidden_sizes,
|
||||
ep_rank,
|
||||
num_local_experts,
|
||||
)
|
||||
verify_redundant_experts_have_same_weights(
|
||||
expert_weights,
|
||||
new_indices,
|
||||
hidden_sizes,
|
||||
world_size,
|
||||
num_local_experts,
|
||||
)
|
||||
|
||||
|
||||
def _test_rearrange_expert_weights_with_redundancy(
|
||||
@@ -336,71 +342,76 @@ def _test_rearrange_expert_weights_with_redundancy(
|
||||
# Initialize model parallel (using tensor parallel as an entrypoint
|
||||
# to expert parallel)
|
||||
set_env_vars_and_device(env)
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
ep_group = get_tp_group().cpu_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
device = torch.device(f"cuda:{ep_rank}")
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.parallel_config.tensor_parallel_size = world_size
|
||||
|
||||
# Test parameters
|
||||
total_physical_experts = world_size * num_local_experts
|
||||
hidden_sizes = [32, 64] # Two different weight matrices
|
||||
with set_current_vllm_config(vllm_config):
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
# Create old expert indices (with redundancy)
|
||||
redundancy_config = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
ep_group = get_tp_group().cpu_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
device = torch.device(f"cuda:{ep_rank}")
|
||||
|
||||
old_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
redundancy_config,
|
||||
)
|
||||
# Test parameters
|
||||
total_physical_experts = world_size * num_local_experts
|
||||
hidden_sizes = [32, 64] # Two different weight matrices
|
||||
|
||||
# Create new expert indices (with redundancy)
|
||||
new_redundancy_config = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
new_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
new_redundancy_config,
|
||||
)
|
||||
# Create old expert indices (with redundancy)
|
||||
redundancy_config = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
|
||||
# Create expert weights
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
|
||||
)
|
||||
old_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
redundancy_config,
|
||||
)
|
||||
|
||||
# Execute weight rearrangement
|
||||
rearrange_expert_weights_inplace(
|
||||
old_indices,
|
||||
new_indices,
|
||||
expert_weights,
|
||||
ep_group,
|
||||
is_profile=False,
|
||||
)
|
||||
# Create new expert indices (with redundancy)
|
||||
new_redundancy_config = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
new_indices = create_expert_indices_with_redundancy(
|
||||
num_layers,
|
||||
num_logical_experts,
|
||||
total_physical_experts,
|
||||
new_redundancy_config,
|
||||
)
|
||||
|
||||
# Verify the rearrangement result
|
||||
verify_expert_weights_after_shuffle(
|
||||
expert_weights,
|
||||
new_indices,
|
||||
hidden_sizes,
|
||||
ep_rank,
|
||||
num_local_experts,
|
||||
)
|
||||
# Create expert weights
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
|
||||
)
|
||||
|
||||
verify_redundant_experts_have_same_weights(
|
||||
expert_weights,
|
||||
new_indices,
|
||||
hidden_sizes,
|
||||
world_size,
|
||||
num_local_experts,
|
||||
)
|
||||
# Execute weight rearrangement
|
||||
rearrange_expert_weights_inplace(
|
||||
old_indices,
|
||||
new_indices,
|
||||
expert_weights,
|
||||
ep_group,
|
||||
is_profile=False,
|
||||
)
|
||||
|
||||
# Verify the rearrangement result
|
||||
verify_expert_weights_after_shuffle(
|
||||
expert_weights,
|
||||
new_indices,
|
||||
hidden_sizes,
|
||||
ep_rank,
|
||||
num_local_experts,
|
||||
)
|
||||
|
||||
verify_redundant_experts_have_same_weights(
|
||||
expert_weights,
|
||||
new_indices,
|
||||
hidden_sizes,
|
||||
world_size,
|
||||
num_local_experts,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -444,58 +455,63 @@ def test_rearrange_expert_weights_with_redundancy(
|
||||
|
||||
def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
|
||||
set_env_vars_and_device(env)
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
ep_group = get_tp_group().cpu_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
device = torch.device(f"cuda:{ep_rank}")
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.parallel_config.tensor_parallel_size = world_size
|
||||
|
||||
num_layers = 2
|
||||
num_local_experts = 2
|
||||
total_physical_experts = world_size * num_local_experts
|
||||
num_logical_experts = total_physical_experts // 2 # Some redundancy
|
||||
hidden_sizes = [32, 64]
|
||||
with set_current_vllm_config(vllm_config):
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
# Create redundancy configuration
|
||||
redundancy_config = [2] * num_logical_experts
|
||||
ep_group = get_tp_group().cpu_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
device = torch.device(f"cuda:{ep_rank}")
|
||||
|
||||
# Same indices - no change
|
||||
indices = create_expert_indices_with_redundancy(
|
||||
num_layers, num_logical_experts, total_physical_experts, redundancy_config
|
||||
)
|
||||
num_layers = 2
|
||||
num_local_experts = 2
|
||||
total_physical_experts = world_size * num_local_experts
|
||||
num_logical_experts = total_physical_experts // 2 # Some redundancy
|
||||
hidden_sizes = [32, 64]
|
||||
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
|
||||
)
|
||||
# Create redundancy configuration
|
||||
redundancy_config = [2] * num_logical_experts
|
||||
|
||||
# Save original weights
|
||||
original_weights = []
|
||||
for layer_weights in expert_weights:
|
||||
layer_copy = []
|
||||
for weight in layer_weights:
|
||||
layer_copy.append(weight.clone())
|
||||
original_weights.append(layer_copy)
|
||||
# Same indices - no change
|
||||
indices = create_expert_indices_with_redundancy(
|
||||
num_layers, num_logical_experts, total_physical_experts, redundancy_config
|
||||
)
|
||||
|
||||
# Execute rearrangement (should be no change)
|
||||
rearrange_expert_weights_inplace(
|
||||
indices,
|
||||
indices, # Same indices
|
||||
expert_weights,
|
||||
ep_group,
|
||||
is_profile=False,
|
||||
)
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
|
||||
)
|
||||
|
||||
# Verify that the weights have not changed
|
||||
for layer in range(num_layers):
|
||||
for weight_idx in range(len(hidden_sizes)):
|
||||
torch.testing.assert_close(
|
||||
expert_weights[layer][weight_idx],
|
||||
original_weights[layer][weight_idx],
|
||||
msg=f"""Layer {layer}, weight {weight_idx}
|
||||
# Save original weights
|
||||
original_weights = []
|
||||
for layer_weights in expert_weights:
|
||||
layer_copy = []
|
||||
for weight in layer_weights:
|
||||
layer_copy.append(weight.clone())
|
||||
original_weights.append(layer_copy)
|
||||
|
||||
# Execute rearrangement (should be no change)
|
||||
rearrange_expert_weights_inplace(
|
||||
indices,
|
||||
indices, # Same indices
|
||||
expert_weights,
|
||||
ep_group,
|
||||
is_profile=False,
|
||||
)
|
||||
|
||||
# Verify that the weights have not changed
|
||||
for layer in range(num_layers):
|
||||
for weight_idx in range(len(hidden_sizes)):
|
||||
torch.testing.assert_close(
|
||||
expert_weights[layer][weight_idx],
|
||||
original_weights[layer][weight_idx],
|
||||
msg=f"""Layer {layer}, weight {weight_idx}
|
||||
should remain unchanged""",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -538,64 +554,69 @@ def test_rearrange_expert_weights_no_change(world_size):
|
||||
|
||||
def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None:
|
||||
set_env_vars_and_device(env)
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
ep_group = get_tp_group().cpu_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
device = torch.device(f"cuda:{ep_rank}")
|
||||
vllm_config = VllmConfig()
|
||||
vllm_config.parallel_config.tensor_parallel_size = world_size
|
||||
|
||||
num_layers = 1
|
||||
num_local_experts = 2
|
||||
total_physical_experts = world_size * num_local_experts
|
||||
num_logical_experts = total_physical_experts // 2
|
||||
hidden_sizes = [32]
|
||||
with set_current_vllm_config(vllm_config):
|
||||
ensure_model_parallel_initialized(
|
||||
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
|
||||
)
|
||||
|
||||
# Create different index distributions
|
||||
old_redundancy = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
new_redundancy = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
ep_group = get_tp_group().cpu_group
|
||||
ep_rank = torch.distributed.get_rank()
|
||||
device = torch.device(f"cuda:{ep_rank}")
|
||||
|
||||
old_indices = create_expert_indices_with_redundancy(
|
||||
num_layers, num_logical_experts, total_physical_experts, old_redundancy
|
||||
)
|
||||
new_indices = create_expert_indices_with_redundancy(
|
||||
num_layers, num_logical_experts, total_physical_experts, new_redundancy
|
||||
)
|
||||
num_layers = 1
|
||||
num_local_experts = 2
|
||||
total_physical_experts = world_size * num_local_experts
|
||||
num_logical_experts = total_physical_experts // 2
|
||||
hidden_sizes = [32]
|
||||
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
|
||||
)
|
||||
# Create different index distributions
|
||||
old_redundancy = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
new_redundancy = create_redundancy_config(
|
||||
num_logical_experts, total_physical_experts
|
||||
)
|
||||
|
||||
# Save original weights
|
||||
original_weights = []
|
||||
for layer_weights in expert_weights:
|
||||
layer_copy = []
|
||||
for weight in layer_weights:
|
||||
layer_copy.append(weight.clone())
|
||||
original_weights.append(layer_copy)
|
||||
old_indices = create_expert_indices_with_redundancy(
|
||||
num_layers, num_logical_experts, total_physical_experts, old_redundancy
|
||||
)
|
||||
new_indices = create_expert_indices_with_redundancy(
|
||||
num_layers, num_logical_experts, total_physical_experts, new_redundancy
|
||||
)
|
||||
|
||||
# Execute profile mode rearrangement
|
||||
rearrange_expert_weights_inplace(
|
||||
old_indices,
|
||||
new_indices,
|
||||
expert_weights,
|
||||
ep_group,
|
||||
is_profile=True, # Profile mode
|
||||
)
|
||||
expert_weights = create_expert_weights(
|
||||
num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
|
||||
)
|
||||
|
||||
# In profile mode, the weights should remain unchanged
|
||||
for layer in range(num_layers):
|
||||
for weight_idx in range(len(hidden_sizes)):
|
||||
torch.testing.assert_close(
|
||||
expert_weights[layer][weight_idx],
|
||||
original_weights[layer][weight_idx],
|
||||
msg="In profile mode, the weights should remain unchanged",
|
||||
)
|
||||
# Save original weights
|
||||
original_weights = []
|
||||
for layer_weights in expert_weights:
|
||||
layer_copy = []
|
||||
for weight in layer_weights:
|
||||
layer_copy.append(weight.clone())
|
||||
original_weights.append(layer_copy)
|
||||
|
||||
# Execute profile mode rearrangement
|
||||
rearrange_expert_weights_inplace(
|
||||
old_indices,
|
||||
new_indices,
|
||||
expert_weights,
|
||||
ep_group,
|
||||
is_profile=True, # Profile mode
|
||||
)
|
||||
|
||||
# In profile mode, the weights should remain unchanged
|
||||
for layer in range(num_layers):
|
||||
for weight_idx in range(len(hidden_sizes)):
|
||||
torch.testing.assert_close(
|
||||
expert_weights[layer][weight_idx],
|
||||
original_weights[layer][weight_idx],
|
||||
msg="In profile mode, the weights should remain unchanged",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("world_size", [2, 4])
|
||||
|
||||
@@ -10,6 +10,7 @@ import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
import vllm.envs as envs
|
||||
from tests.utils import ensure_current_vllm_config
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
|
||||
from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric_ops
|
||||
@@ -51,7 +52,8 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
|
||||
)
|
||||
|
||||
init_distributed_environment()
|
||||
initialize_model_parallel(tensor_model_parallel_size=world_size)
|
||||
with ensure_current_vllm_config():
|
||||
initialize_model_parallel(tensor_model_parallel_size=world_size)
|
||||
|
||||
cuda_communicator = typing.cast(
|
||||
CudaCommunicator, get_tp_group().device_communicator
|
||||
|
||||
@@ -9,6 +9,7 @@ import pytest
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
from tests.utils import ensure_current_vllm_config
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce # noqa
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
|
||||
@@ -112,7 +113,8 @@ def test_pynccl_multiple_allreduce():
|
||||
@worker_fn_wrapper
|
||||
def multiple_allreduce_with_vllm_worker_fn():
|
||||
device = torch.device(f"cuda:{torch.distributed.get_rank()}")
|
||||
ensure_model_parallel_initialized(2, 2)
|
||||
with ensure_current_vllm_config():
|
||||
ensure_model_parallel_initialized(2, 2)
|
||||
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
|
||||
with graph_capture(device=device):
|
||||
# two tp groups can communicate independently
|
||||
|
||||
Reference in New Issue
Block a user