[BugFix][Router Replay] Capture Logical Experts with EPLB (#33013)

Signed-off-by: Hollow Man <hollowman@opensuse.org>
2026-01-31 17:12:17 +02:00
parent 15f40b20aa
commit 13b842f271
4 changed files with 185 additions and 21 deletions
--- a/tests/model_executor/test_routed_experts_capture.py
+++ b/tests/model_executor/test_routed_experts_capture.py
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import types
+
+import pytest
+import torch
+
+from vllm.distributed.eplb.eplb_state import EplbLayerState
+from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
+from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter
+
+pytestmark = pytest.mark.cpu_test
+
+
+class DummyRouter(BaseRouter):
+    """Minimal BaseRouter subclass with fixed, easy-to-assert outputs.
+
+    Routing always yields expert ids ``[[1, 2], [3, 4]]`` with unit weights,
+    and the EPLB mapping hook shifts every id by 10 so that tests can tell
+    pre-mapping ids apart from post-mapping ids.
+    """
+
+    @property
+    def routing_method_type(self) -> RoutingMethodType:
+        """Return ``RoutingMethodType.FUSED_TOPK``."""
+        return RoutingMethodType.FUSED_TOPK
+
+    def _compute_routing(self, hidden_states, router_logits, indices_type):
+        """Return ``(weights, ids)``; all three arguments are ignored."""
+        expert_ids = torch.tensor([[1, 2], [3, 4]], dtype=torch.int64)
+        unit_weights = torch.ones_like(expert_ids, dtype=torch.float32)
+        return unit_weights, expert_ids
+
+    def _apply_eplb_mapping(self, topk_ids: torch.Tensor) -> torch.Tensor:
+        """Offset every id by 10 instead of consulting any EPLB tables."""
+        # A plain offset keeps the mapping step observable without requiring
+        # the CUDA EPLB path.
+        return topk_ids + 10
+
+
+def _make_router() -> DummyRouter:
+    """Build a DummyRouter with top_k=2, 16 global experts and EPLB off."""
+    router_kwargs = dict(
+        top_k=2,
+        global_num_experts=16,
+        eplb_state=EplbLayerState(),
+        enable_eplb=False,
+        indices_type_getter=None,
+    )
+    return DummyRouter(**router_kwargs)
+
+
+def test_base_router_capture_pre_eplb_mapping():
+    """The capture hook receives the routed ids, not the EPLB-mapped ids."""
+    router = _make_router()
+    seen_ids = []
+    # Record a clone of whatever tensor the router hands to the hook.
+    router.set_capture_fn(lambda ids: seen_ids.append(ids.clone()))
+
+    weights, mapped_ids = router.select_experts(
+        hidden_states=torch.empty(1),
+        router_logits=torch.empty(1),
+    )
+
+    routed = torch.tensor([[1, 2], [3, 4]])
+    assert weights.shape == mapped_ids.shape
+    assert len(seen_ids) == 1
+    assert torch.equal(seen_ids[0], routed)
+    # DummyRouter's mapping hook adds 10 to every id.
+    assert torch.equal(mapped_ids, routed + 10)
+
+
+def test_base_router_capture_with_eplb_enabled():
+    """With ``enable_eplb`` set, capture still receives the pre-mapping ids."""
+    router = _make_router()
+    router.enable_eplb = True
+    # Fill the EPLB state fields with 32-entry tensors.
+    eplb = router.eplb_state
+    eplb.expert_load_view = torch.zeros(32, dtype=torch.int64)
+    eplb.logical_to_physical_map = torch.arange(32).view(32, 1)
+    eplb.logical_replica_count = torch.ones(32, dtype=torch.int64)
+
+    seen_ids = []
+    router.set_capture_fn(lambda ids: seen_ids.append(ids.clone()))
+
+    _, mapped_ids = router.select_experts(
+        hidden_states=torch.empty(1),
+        router_logits=torch.empty(1),
+    )
+
+    routed = torch.tensor([[1, 2], [3, 4]])
+    assert len(seen_ids) == 1
+    # Capture should see logical ids pre-EPLB mapping.
+    assert torch.equal(seen_ids[0], routed)
+    # DummyRouter's mapping hook adds 10 to every id.
+    assert torch.equal(mapped_ids, routed + 10)
+
+
+def test_gpu_model_runner_binds_router_capture(monkeypatch):
+    """Binding forwards a router's captured ids to the capturer by layer id."""
+    from vllm.v1.worker import gpu_model_runner as gmr
+
+    import vllm.model_executor.layers.fused_moe.layer as fused_moe_layer
+
+    class DummyFusedMoE:
+        """Stand-in for FusedMoE exposing only ``layer_id`` and ``router``."""
+
+        def __init__(self):
+            self.router = _make_router()
+            self.layer_id = 7
+
+    class DummyCapturer:
+        """Records each ``capture`` call as a ``(layer_id, topk_ids)`` pair."""
+
+        def __init__(self):
+            self.calls = []
+
+        def capture(self, layer_id, topk_ids):
+            self.calls.append((layer_id, topk_ids))
+
+    # Patch the runtime import inside _bind_routed_experts_capturer so that
+    # it resolves FusedMoE to the stand-in class.
+    monkeypatch.setattr(fused_moe_layer, "FusedMoE", DummyFusedMoE)
+
+    moe_layer = DummyFusedMoE()
+    forward_context = {"dummy": moe_layer}
+    # Only the attribute chain populated here is provided on the fake runner.
+    runner_stub = types.SimpleNamespace(
+        compilation_config=types.SimpleNamespace(
+            static_forward_context=forward_context
+        )
+    )
+
+    capturer = DummyCapturer()
+    gmr.GPUModelRunner._bind_routed_experts_capturer(runner_stub, capturer)
+
+    hook = moe_layer.router.capture_fn
+    assert hook is not None
+    hook(torch.tensor([[5, 6]]))
+
+    assert len(capturer.calls) == 1
+    got_layer_id, got_ids = capturer.calls[0]
+    assert got_layer_id == 7
+    assert torch.equal(got_ids, torch.tensor([[5, 6]]))
+
+
+def test_gpu_model_runner_binding_stage(monkeypatch):
+    """The router has no capture hook until the capturer is bound."""
+    from vllm.v1.worker import gpu_model_runner as gmr
+
+    import vllm.model_executor.layers.fused_moe.layer as fused_moe_layer
+
+    class DummyFusedMoE:
+        """Stand-in for FusedMoE exposing only ``layer_id`` and ``router``."""
+
+        def __init__(self):
+            self.router = _make_router()
+            self.layer_id = 11
+
+    class DummyCapturer:
+        """Records each ``capture`` call as a ``(layer_id, topk_ids)`` pair."""
+
+        def __init__(self):
+            self.calls = []
+
+        def capture(self, layer_id, topk_ids):
+            self.calls.append((layer_id, topk_ids))
+
+    # Swap FusedMoE on its defining module for the stand-in class.
+    monkeypatch.setattr(fused_moe_layer, "FusedMoE", DummyFusedMoE)
+
+    moe_layer = DummyFusedMoE()
+    runner_stub = types.SimpleNamespace(
+        compilation_config=types.SimpleNamespace(
+            static_forward_context={"dummy": moe_layer}
+        )
+    )
+    router = moe_layer.router
+
+    # Before binding, no capture hook.
+    assert router.capture_fn is None
+
+    capturer = DummyCapturer()
+    gmr.GPUModelRunner._bind_routed_experts_capturer(runner_stub, capturer)
+
+    # After binding, hook should exist and be callable.
+    assert callable(router.capture_fn)
+    router.capture_fn(torch.tensor([[9, 10]]))
+    assert len(capturer.calls) == 1