[Distributed] Add enable_expert_parallel arg (#14305)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Tyler Michael Smith authored on 2025-03-06 13:54:45 -05:00; committed by GitHub
parent cd579352bf
commit cc2f9b32c8
5 changed files with 27 additions and 21 deletions


@@ -754,7 +754,7 @@ class ModelConfig:
                 " must be divisible by tensor parallel size "
                 f"({tensor_parallel_size}).")
 
-        if envs.VLLM_TEST_ENABLE_EP:
+        if parallel_config.enable_expert_parallel:
             self._verify_with_expert_parallelism()
 
         pipeline_parallel_size = parallel_config.pipeline_parallel_size
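
The body of `_verify_with_expert_parallelism()` is not shown in this hunk; the change only swaps its trigger from the test-only `VLLM_TEST_ENABLE_EP` env var to the new config field. As an illustration only (not vLLM's actual implementation), a verifier of this kind would confirm the model actually has MoE experts before expert parallelism is enabled; the attribute names below are assumptions.

# Illustrative sketch -- NOT vLLM's _verify_with_expert_parallelism().
# Checks that the HF config exposes at least one MoE expert; the
# attribute names (num_local_experts / num_experts) are assumptions.
from types import SimpleNamespace


def verify_expert_parallelism(hf_config) -> None:
    num_experts = getattr(
        hf_config, "num_local_experts",
        getattr(hf_config, "num_experts", 0))
    if num_experts < 1:
        raise ValueError(
            "enable_expert_parallel=True requires a Mixture-of-Experts "
            "model with at least one expert.")


# Example: a Mixtral-style config with 8 experts passes the check.
verify_expert_parallelism(SimpleNamespace(num_local_experts=8))
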
@@ -1334,6 +1334,7 @@ class ParallelConfig:
     # IP of the data parallel master.
     data_parallel_master_ip: str = "127.0.0.1"
     data_parallel_master_port: int = 29500  # Port of the data parallel master.
+    enable_expert_parallel: bool = False  # Use EP instead of TP for MoE layers.
 
     # Maximum number of multiple batches
     # when load model sequentially. To avoid RAM OOM when using tensor
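
For reference, a minimal usage sketch of the new flag. It assumes the remaining changed files plumb `enable_expert_parallel` from `ParallelConfig` through `EngineArgs` to the `LLM` entry point (plausible for a five-file change, but not shown in these hunks); the model name and parallel sizes are illustrative.

# Hedged sketch: enabling expert parallelism for an MoE model. Assumes
# enable_expert_parallel is exposed as an LLM/EngineArgs keyword.
from vllm import LLM, SamplingParams

llm = LLM(
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",  # an MoE model
    tensor_parallel_size=2,
    enable_expert_parallel=True,  # shard MoE layers with EP instead of TP
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
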