[Feature] Expert Parallelism Load Balancer (EPLB) (#18343)

Signed-off-by: Bowen Wang <abmfy@icloud.com>

Author:  Bowen Wang (committed by GitHub)
Date:    2025-06-26 15:30:21 -07:00
Parent:  07b8fae219
Commit:  e9fd658a73
24 changed files with 2446 additions and 54 deletions


@@ -31,12 +31,20 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
     text_config = hf_config.get_text_config()
+    # Ensure at least 2 experts per group
+    # Since `grouped_topk` assumes top-2
+    num_experts = getattr(text_config, 'n_group', 1) * 2
     text_config.update({
         "num_layers": 1,
         "num_hidden_layers": 1,
-        "num_experts": 2,
+        "num_experts": num_experts,
         "num_experts_per_tok": 2,
-        "num_local_experts": 2,
+        "num_local_experts": num_experts,
+        # Otherwise there will not be any expert layers
+        "first_k_dense_replace": 0,
+        # To avoid OOM on DeepSeek-V3
+        "n_routed_experts": num_experts,
     })
     if hasattr(hf_config, "vision_config"):
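
Note on the change above: `grouped_topk` (DeepSeek-style routing) first selects expert groups and then takes the top-2 experts within the kept groups, so each of the `n_group` groups needs at least 2 experts; hence `num_experts = n_group * 2`. The sketch below illustrates that selection logic only; it is not vLLM's actual `grouped_topk` implementation, and the function name `grouped_topk_sketch`, the max-logit group scoring, and `topk_group=1` are assumptions made for the example.

    # Minimal sketch (not vLLM's implementation) of grouped top-k routing,
    # showing why the test needs at least 2 experts per group.
    import torch

    def grouped_topk_sketch(logits: torch.Tensor, n_group: int, topk: int = 2,
                            topk_group: int = 1) -> torch.Tensor:
        # logits: (num_tokens, num_experts); experts split evenly into groups.
        num_tokens, num_experts = logits.shape
        assert num_experts % n_group == 0
        # Score each group by its best expert logit (assumed scoring rule).
        group_scores = logits.view(num_tokens, n_group, -1).max(dim=-1).values
        # Keep the top `topk_group` groups; mask experts in all other groups.
        group_idx = group_scores.topk(topk_group, dim=-1).indices
        group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1.0)
        expert_mask = group_mask.unsqueeze(-1).expand(
            num_tokens, n_group, num_experts // n_group).reshape(num_tokens, -1)
        masked_logits = logits.masked_fill(expert_mask == 0, float("-inf"))
        # Top-2 selection is only well defined if each kept group has >= 2 experts.
        return masked_logits.topk(topk, dim=-1).indices

    logits = torch.randn(4, 8)  # 4 tokens, n_group=4 groups of 2 experts each
    print(grouped_topk_sketch(logits, n_group=4))

If each group held only one expert, the final top-2 selection would run out of finite scores, which is exactly what the test's `num_experts` computation guards against.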