[MoE Refactor][9/N] Use modular kernel for unquantized Triton MoE (#31052)

Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Yongye Zhu
2025-12-22 09:34:19 -08:00
committed by GitHub
parent ab3a85fd68
commit 7b926e8901
2 changed files with 22 additions and 7 deletions


@@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
 from vllm.model_executor.models.mixtral import MixtralMoE
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
+from vllm.v1.worker.workspace import init_workspace_manager

 NUM_EXPERTS = [8, 64, 192]
 EP_SIZE = [1, 4]
@@ -487,6 +488,7 @@ def test_mixtral_moe(
     monkeypatch.setenv("MASTER_ADDR", "localhost")
     monkeypatch.setenv("MASTER_PORT", "12345")
     init_distributed_environment()
+    init_workspace_manager(torch.cuda.current_device())

     # Instantiate our and huggingface's MoE blocks
     vllm_config.compilation_config.static_forward_context = dict()
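For orientation, a minimal sketch of the setup order this hunk establishes: the distributed environment comes up first, then the per-device workspace manager that the modular kernel path is expected to draw its scratch buffers from (that rationale is an assumption, not stated in the diff). Import paths follow the lines above; the env-var handling from the test is omitted here.

import torch
from vllm.distributed import init_distributed_environment
from vllm.v1.worker.workspace import init_workspace_manager

def init_moe_test_runtime() -> None:
    # Single-process "distributed" setup, as in the test above
    # (the test sets MASTER_ADDR/MASTER_PORT beforehand).
    init_distributed_environment()
    # Bind the workspace manager to the current CUDA device; the modular
    # Triton MoE kernel presumably allocates its intermediates from it
    # (assumed rationale).
    init_workspace_manager(torch.cuda.current_device())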
@@ -533,6 +535,11 @@ def test_mixtral_moe(
     torch.cuda.synchronize()
     torch.cuda.empty_cache()
+
+    # FIXME (zyongye) fix this after we move self.kernel
+    # assignment in FusedMoE.__init__
+    vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts)
+
     # Run forward passes for both MoE blocks
     hf_states, _ = hf_moe.forward(hf_inputs)
     vllm_states = vllm_moe.forward(vllm_inputs)
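To close the loop, a sketch of how this part of the test typically finishes: weights are finalized after loading, both implementations run on equivalent inputs, and the outputs are compared. The assert_close call and its tolerances are illustrative assumptions, not part of this diff.

import torch

def check_moe_parity(vllm_moe, hf_moe, vllm_inputs, hf_inputs):
    # Finalize vLLM expert weights post-load (e.g. kernel-specific layout).
    vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts)
    # Run forward passes for both MoE blocks.
    hf_states, _ = hf_moe.forward(hf_inputs)
    vllm_states = vllm_moe.forward(vllm_inputs)
    # Outputs should agree within a loose numerical tolerance (assumed values).
    torch.testing.assert_close(hf_states, vllm_states, rtol=1e-2, atol=1e-2)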