[Feature] Expert Parallelism Load Balancer (EPLB) (#18343)

Signed-off-by: Bowen Wang <abmfy@icloud.com>
2025-06-26 15:30:21 -07:00
parent 07b8fae219
commit e9fd658a73
24 changed files with 2446 additions and 54 deletions
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -482,7 +482,15 @@ class AWQMoEMethod(FusedMoEMethodBase):
        e_score_correction_bias: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
+        if enable_eplb:
+            raise NotImplementedError(
+                "EPLB not supported for `AWQMoEMethod` yet.")
+
        assert activation == "silu", "Only SiLU activation is supported."

        if apply_router_weight_on_input: