[Feature] Expert Parallelism Load Balancer (EPLB) (#18343)

Signed-off-by: Bowen Wang <abmfy@icloud.com>
2025-06-26 15:30:21 -07:00
parent 07b8fae219
commit e9fd658a73
24 changed files with 2446 additions and 54 deletions
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@@ -0,0 +1,504 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import multiprocessing
+import os
+import random
+
+import pytest
+import torch
+import torch.distributed
+
+from vllm.distributed.eplb.rebalance_execute import (
+    rearrange_expert_weights_inplace)
+from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             get_tp_group,
+                                             init_distributed_environment)
+from vllm.utils import update_environment_variables
+
+
+def distributed_run(fn, world_size):
+    number_of_processes = world_size
+    processes: list[multiprocessing.Process] = []
+    for i in range(number_of_processes):
+        env: dict[str, str] = {}
+        env['RANK'] = str(i)
+        env['LOCAL_RANK'] = str(i)
+        env['WORLD_SIZE'] = str(number_of_processes)
+        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
+        env['MASTER_ADDR'] = 'localhost'
+        env['MASTER_PORT'] = '12345'
+        p = multiprocessing.Process(target=fn, args=(env, ))
+        processes.append(p)
+        p.start()
+
+    for p in processes:
+        p.join()
+
+    for p in processes:
+        assert p.exitcode == 0
+
+
+def worker_fn_wrapper(fn):
+    # `multiprocessing.Process` cannot accept environment variables directly
+    # so we need to pass the environment variables as arguments
+    # and update the environment variables in the function
+    def wrapped_fn(env):
+        update_environment_variables(env)
+        local_rank = os.environ['LOCAL_RANK']
+        device = torch.device(f"cuda:{local_rank}")
+        torch.cuda.set_device(device)
+        init_distributed_environment()
+
+        # Ensure each worker process has the same random seed
+        random.seed(42)
+        torch.manual_seed(42)
+
+        fn()
+
+    return wrapped_fn
+
+
+def create_expert_indices_with_redundancy(
+        num_layers: int,
+        num_logical_experts: int,
+        total_physical_experts: int,
+        redundancy_config: list[int],  # redundancy for each logical expert
+) -> torch.Tensor:
+    """
+    Create expert indices with redundancy.
+    
+    Args:
+        num_layers: number of layers
+        num_logical_experts: number of logical experts
+        total_physical_experts: total number of physical experts
+        redundancy_config: redundancy for each logical expert
+    
+    Returns:
+        indices: Shape (num_layers, total_physical_experts)
+    """
+    assert sum(redundancy_config) == total_physical_experts
+    assert len(redundancy_config) == num_logical_experts
+
+    indices = torch.zeros(num_layers, total_physical_experts, dtype=torch.long)
+
+    for layer in range(num_layers):
+        physical_pos = 0
+        for logical_expert_id, redundancy in enumerate(redundancy_config):
+            for _ in range(redundancy):
+                indices[layer, physical_pos] = logical_expert_id
+                physical_pos += 1
+
+    # Shuffle the indices at dim 1
+    for layer in range(num_layers):
+        indices[layer] = indices[layer][torch.randperm(indices.shape[1])]
+
+    return indices
+
+
+def create_expert_weights(
+    num_layers: int,
+    num_local_experts: int,
+    hidden_sizes: list[int],
+    rank: int,
+    device: torch.device,
+    physical_to_logical_mapping: torch.Tensor,
+) -> list[list[torch.Tensor]]:
+    """
+    Create fake expert weights tensor for testing.
+    
+    Use `arange` to generate predictable weights values, based on logical
+    expert ID.
+    All replicas of the same logical expert should have the same weights.
+    
+    Args:
+        physical_to_logical_mapping: Shape (num_layers, num_local_experts)
+            mapping[layer, physical_pos] = logical_expert_id
+    """
+    expert_weights = []
+
+    for layer in range(num_layers):
+        layer_weights = []
+        for weight_idx, hidden_size in enumerate(hidden_sizes):
+            weight_tensor = torch.zeros(num_local_experts,
+                                        hidden_size,
+                                        device=device,
+                                        dtype=torch.float32)
+
+            for local_expert in range(num_local_experts):
+                # Get the logical expert ID for this physical expert
+                global_pos = rank * num_local_experts + local_expert
+                logical_expert_id = physical_to_logical_mapping[
+                    layer, global_pos].item()
+
+                # Generate weights based on logical expert ID
+                # (so that all replicas of the same logical expert have the
+                # same weights)
+                base_value = (logical_expert_id * 1000 + layer * 100 +
+                              weight_idx * 10)
+                weight_tensor[local_expert] = torch.arange(base_value,
+                                                           base_value +
+                                                           hidden_size,
+                                                           device=device,
+                                                           dtype=torch.float32)
+
+            layer_weights.append(weight_tensor)
+        expert_weights.append(layer_weights)
+
+    return expert_weights
+
+
+def create_redundancy_config(
+    num_logical_experts: int,
+    num_physical_experts: int,
+) -> list[int]:
+    """Create a redundancy configuration."""
+    redundancy_config = [1] * num_logical_experts
+    remaining = num_physical_experts - num_logical_experts
+    # Randomly assign the remaining physical experts to the logical experts
+    for _ in range(remaining):
+        redundancy_config[random.choice(range(num_logical_experts))] += 1
+    return redundancy_config
+
+
+def verify_expert_weights_after_shuffle(
+    expert_weights: list[list[torch.Tensor]],
+    new_indices: torch.Tensor,
+    hidden_sizes: list[int],
+    ep_rank: int,
+    num_local_experts: int,
+):
+    """Verify the weights after shuffling are correct."""
+    num_layers = len(expert_weights)
+
+    for layer in range(num_layers):
+        for weight_idx, hidden_size in enumerate(hidden_sizes):
+            weight_tensor = expert_weights[layer][weight_idx]
+
+            for local_expert in range(num_local_experts):
+                # Calculate the global expert ID for this local expert
+                global_pos = ep_rank * num_local_experts + local_expert
+                expected_logical_expert = new_indices[layer, global_pos].item()
+
+                # Check if the weights are correct
+                actual_weights = weight_tensor[local_expert]
+                expected_base = (expected_logical_expert * 1000 + layer * 100 +
+                                 weight_idx * 10)
+                expected_weights = torch.arange(expected_base,
+                                                expected_base + hidden_size,
+                                                device=actual_weights.device,
+                                                dtype=actual_weights.dtype)
+
+                torch.testing.assert_close(
+                    actual_weights,
+                    expected_weights,
+                    msg=f"Layer {layer}, weight {weight_idx},"
+                    f"local expert {local_expert}: "
+                    f"weights do not match. "
+                    f"Expected logical expert {expected_logical_expert}")
+
+
+def verify_redundant_experts_have_same_weights(
+    expert_weights: list[list[torch.Tensor]],
+    indices: torch.Tensor,
+    hidden_sizes: list[int],
+    world_size: int,
+    num_local_experts: int,
+):
+    """
+    Verify that all replicas of the same logical expert have the same weights.
+    """
+    num_layers = len(expert_weights)
+    total_physical_experts = world_size * num_local_experts
+
+    for layer in range(num_layers):
+        # Collect weights for all physical experts for each weight matrix
+        all_weights: list[torch.Tensor] = []
+
+        for weight_idx, hidden_size in enumerate(hidden_sizes):
+            # Create tensor to store all expert weights
+            # Shape: [total_physical_experts, hidden_size]
+            gathered_weights = torch.zeros(
+                total_physical_experts,
+                hidden_size,
+                device=expert_weights[layer][weight_idx].device,
+                dtype=expert_weights[layer][weight_idx].dtype)
+
+            # Use all_gather to collect expert weights from current node
+            # expert_weights[layer][weight_idx] shape:
+            # [num_local_experts, hidden_size]
+            local_weights = expert_weights[layer][
+                weight_idx]  # [num_local_experts, hidden_size]
+
+            # Split tensor along dim 0 into a list for all_gather
+            gathered_weights_list = torch.chunk(gathered_weights,
+                                                world_size,
+                                                dim=0)
+
+            torch.distributed.all_gather(
+                # Output list: each element corresponds to one rank's weights
+                list(gathered_weights_list),
+                local_weights  # Input: current rank's local weights
+            )
+
+            all_weights.append(gathered_weights)
+
+        # Verify that all replicas of the same logical expert have the same
+        # weights
+        logical_expert_weights: dict[int, dict[int, torch.Tensor]] = {}
+
+        for physical_pos in range(total_physical_experts):
+            logical_expert_id = int(indices[layer, physical_pos].item())
+
+            if logical_expert_id not in logical_expert_weights:
+                # First time encountering this logical expert, save its weights
+                logical_expert_weights[logical_expert_id] = {
+                    weight_idx: all_weights[weight_idx][physical_pos]
+                    for weight_idx in range(len(hidden_sizes))
+                }
+            else:
+                # Verify that current physical expert's weights match the
+                # previously saved logical expert weights
+                for weight_idx in range(len(hidden_sizes)):
+                    torch.testing.assert_close(
+                        all_weights[weight_idx][physical_pos],
+                        logical_expert_weights[logical_expert_id][weight_idx],
+                        msg=f"Layer {layer}, weight {weight_idx},"
+                        f"logical expert {logical_expert_id}: "
+                        f"Physical expert {physical_pos} has different weights"
+                        f"than expected")
+
+
+@pytest.mark.parametrize(
+    "world_size,num_layers,num_local_experts,num_logical_experts",
+    [
+        # 2 GPU, 2 experts per GPU
+        # 3 logical experts, 4 physical experts, 1 redundant experts
+        (2, 1, 2, 3),
+        # 2 GPU, 3 experts per GPU
+        # 4 logical experts, 6 physical experts, 2 redundant experts
+        (2, 2, 3, 4),
+        # 2 GPU, 8 experts per GPU
+        # 16 logical experts, 16 physical experts, 0 redundant experts
+        (2, 4, 8, 16),
+        # 4 GPU, 2 experts per GPU
+        # 6 logical experts, 8 physical experts, 2 redundant experts
+        (4, 1, 2, 6),
+        # 4 GPU, 2 experts per GPU
+        # 5 logical experts, 8 physical experts, 3 redundant experts
+        (4, 2, 2, 5),
+        # 4 GPU, 8 experts per GPU
+        # 16 logical experts, 32 physical experts, 16 redundant experts
+        (4, 8, 8, 16),
+    ])
+def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
+                                                  num_local_experts,
+                                                  num_logical_experts):
+    """Test the functionality of rearranging expert weights with redundancy."""
+
+    if torch.cuda.device_count() < world_size:
+        pytest.skip(f"Need at least {world_size} GPUs to run the test")
+
+    @worker_fn_wrapper
+    def worker_fn():
+        # Initialize model parallel (using tensor parallel as an entrypoint
+        # to expert parallel)
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size,
+            pipeline_model_parallel_size=1)
+
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")
+
+        # Test parameters
+        total_physical_experts = world_size * num_local_experts
+        hidden_sizes = [32, 64]  # Two different weight matrices
+
+        # Create old expert indices (with redundancy)
+        redundancy_config = create_redundancy_config(num_logical_experts,
+                                                     total_physical_experts)
+
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            redundancy_config,
+        )
+
+        # Create new expert indices (with redundancy)
+        new_redundancy_config = create_redundancy_config(
+            num_logical_experts, total_physical_experts)
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            new_redundancy_config,
+        )
+
+        # Create expert weights
+        expert_weights = create_expert_weights(num_layers, num_local_experts,
+                                               hidden_sizes, ep_rank, device,
+                                               old_indices)
+
+        # Execute weight rearrangement
+        rearrange_expert_weights_inplace(
+            old_indices,
+            new_indices,
+            expert_weights,
+            ep_group,
+            is_profile=False,
+        )
+
+        # Verify the rearrangement result
+        verify_expert_weights_after_shuffle(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            ep_rank,
+            num_local_experts,
+        )
+
+        verify_redundant_experts_have_same_weights(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            world_size,
+            num_local_experts,
+        )
+
+    distributed_run(worker_fn, world_size)
+
+
+@pytest.mark.parametrize("world_size", [2, 4])
+def test_rearrange_expert_weights_no_change(world_size):
+    """
+    Test that when the indices do not change, the weights should remain
+    unchanged.
+    """
+
+    if torch.cuda.device_count() < world_size:
+        pytest.skip(f"Need at least {world_size} GPUs to run the test")
+
+    @worker_fn_wrapper
+    def worker_fn():
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size,
+            pipeline_model_parallel_size=1)
+
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")
+
+        num_layers = 2
+        num_local_experts = 2
+        total_physical_experts = world_size * num_local_experts
+        num_logical_experts = total_physical_experts // 2  # Some redundancy
+        hidden_sizes = [32, 64]
+
+        # Create redundancy configuration
+        redundancy_config = [2] * num_logical_experts
+
+        # Same indices - no change
+        indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts,
+            redundancy_config)
+
+        expert_weights = create_expert_weights(num_layers, num_local_experts,
+                                               hidden_sizes, ep_rank, device,
+                                               indices)
+
+        # Save original weights
+        original_weights = []
+        for layer_weights in expert_weights:
+            layer_copy = []
+            for weight in layer_weights:
+                layer_copy.append(weight.clone())
+            original_weights.append(layer_copy)
+
+        # Execute rearrangement (should be no change)
+        rearrange_expert_weights_inplace(
+            indices,
+            indices,  # Same indices
+            expert_weights,
+            ep_group,
+            is_profile=False)
+
+        # Verify that the weights have not changed
+        for layer in range(num_layers):
+            for weight_idx in range(len(hidden_sizes)):
+                torch.testing.assert_close(
+                    expert_weights[layer][weight_idx],
+                    original_weights[layer][weight_idx],
+                    msg=f"Layer {layer}, weight {weight_idx} should remain "
+                    f"unchanged")
+
+    distributed_run(worker_fn, world_size)
+
+
+@pytest.mark.parametrize("world_size", [2, 4])
+def test_rearrange_expert_weights_profile_mode(world_size):
+    """Test profile mode (should not copy actual weights)"""
+
+    if torch.cuda.device_count() < world_size:
+        pytest.skip(f"Need at least {world_size} GPUs to run the test")
+
+    @worker_fn_wrapper
+    def worker_fn():
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size,
+            pipeline_model_parallel_size=1)
+
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")
+
+        num_layers = 1
+        num_local_experts = 2
+        total_physical_experts = world_size * num_local_experts
+        num_logical_experts = total_physical_experts // 2
+        hidden_sizes = [32]
+
+        # Create different index distributions
+        old_redundancy = create_redundancy_config(num_logical_experts,
+                                                  total_physical_experts)
+        new_redundancy = create_redundancy_config(num_logical_experts,
+                                                  total_physical_experts)
+
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts,
+            old_redundancy)
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts,
+            new_redundancy)
+
+        expert_weights = create_expert_weights(num_layers, num_local_experts,
+                                               hidden_sizes, ep_rank, device,
+                                               old_indices)
+
+        # Save original weights
+        original_weights = []
+        for layer_weights in expert_weights:
+            layer_copy = []
+            for weight in layer_weights:
+                layer_copy.append(weight.clone())
+            original_weights.append(layer_copy)
+
+        # Execute profile mode rearrangement
+        rearrange_expert_weights_inplace(
+            old_indices,
+            new_indices,
+            expert_weights,
+            ep_group,
+            is_profile=True  # Profile mode
+        )
+
+        # In profile mode, the weights should remain unchanged
+        for layer in range(num_layers):
+            for weight_idx in range(len(hidden_sizes)):
+                torch.testing.assert_close(
+                    expert_weights[layer][weight_idx],
+                    original_weights[layer][weight_idx],
+                    msg="In profile mode, the weights should remain unchanged")
+
+    distributed_run(worker_fn, world_size)