[MoE Refactor] Migrate Unquantized to Full Oracle Flow (#36286)

Signed-off-by: Yifan Zong <yzong@redhat.com>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: yzong-rh <yzong@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-03-31 15:43:33 -04:00
parent 598190aac3
commit d9b90a07ac
11 changed files with 618 additions and 514 deletions
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -1664,7 +1664,7 @@ def test_unquantized_bf16_flashinfer_trtllm_backend(
        intermediate_size_per_partition=n,
        num_local_experts=e,
        num_logical_experts=e,
-        activation="silu",
+        activation=MoEActivation.SILU,
        device="cuda",
        moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
        in_dtype=dtype,
@@ -1695,13 +1695,25 @@ def test_unquantized_bf16_flashinfer_trtllm_backend(
        layer.topk_group = 1
        layer.intermediate_size_per_partition = n
        layer.ep_rank = 0
-        layer.activation = "silu"
+        layer.activation = MoEActivation.SILU
        layer.e_score_correction_bias = None
        layer.routing_method_type = RoutingMethodType.Renormalize
+        layer.expert_map = None
+        layer.apply_router_weight_on_input = False
+        layer.routed_scaling_factor = None
+        layer.shared_experts = None
+        layer._maybe_init_expert_routing_tables = lambda: None

        quant_method.process_weights_after_loading(layer)

-        trtllm_output = quant_method.forward_monolithic_cuda(
+        assert quant_method.moe_kernel is not None, (
+            "moe_kernel should be set after process_weights_after_loading"
+        )
+        assert quant_method.supports_internal_mk, (
+            "supports_internal_mk should be True after setup"
+        )
+
+        trtllm_output = quant_method.apply_monolithic(
            layer=layer,
            x=a,
            router_logits=router_logits,
--- a/tests/kernels/moe/test_unquantized_backend_selection.py
+++ b/tests/kernels/moe/test_unquantized_backend_selection.py
@@ -24,7 +24,7 @@ from vllm.platforms import current_platform
    ],
 )
@patch(
-    "vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer",
+    "vllm.utils.flashinfer.has_flashinfer",
    return_value=False,
 )
@patch(
@@ -54,13 +54,29 @@ def test_select_default_backend_by_platform(
        # Set only the specified platform to True
        getattr(mock_platform, platform_method).return_value = True

+    with (
+        patch.object(current_platform, "is_cuda", return_value=False),
+        patch.object(current_platform, "is_rocm", return_value=False),
+        patch.object(current_platform, "is_cpu", return_value=False),
+        patch.object(current_platform, "is_xpu", return_value=False),
+        patch.object(current_platform, "is_tpu", return_value=False),
+        patch.object(current_platform, "is_out_of_tree", return_value=False),
+        patch.object(current_platform, platform_method, return_value=True),
+    ):
        moe_config = make_dummy_moe_config()
-        selected_backend = select_unquantized_moe_backend(
-            moe_config=moe_config,
-            use_dp=False,
+        selected_backend, expert_cls = select_unquantized_moe_backend(
+            moe_config=moe_config
        )

        assert selected_backend == expected_backend
+        if expected_backend in [
+            UnquantizedMoeBackend.CPU,
+            UnquantizedMoeBackend.OOT,
+            UnquantizedMoeBackend.TPU,
+        ]:
+            assert expert_cls is None
+        else:
+            assert expert_cls is not None


@patch(
@@ -87,88 +103,90 @@ def test_select_rocm_aiter_backend(mock_aiter_enabled, mock_has_flashinfer):
        mock_platform.is_out_of_tree.return_value = False

        moe_config = make_dummy_moe_config()
-        selected_backend = select_unquantized_moe_backend(
+        selected_backend, expert_cls = select_unquantized_moe_backend(
            moe_config=moe_config,
-            use_dp=False,
        )

        assert selected_backend == UnquantizedMoeBackend.AITER
+        assert expert_cls is not None


@patch(
-    "vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer",
-    return_value=True,
-)
-@patch(
-    "vllm.model_executor.layers.fused_moe.oracle.unquantized.is_supported_config_trtllm_bf16",
+    "vllm.model_executor.layers.fused_moe.experts.trtllm_bf16_moe.TrtLlmBf16Experts.is_supported_config",
    return_value=(True, None),
 )
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="Only supported on NVIDIA platforms."
 )
-def test_select_cuda_flashinfer_trtllm_backend(
-    mock_has_flashinfer, mock_is_supported_trtllm, monkeypatch
-):
+def test_select_cuda_flashinfer_trtllm_backend(mock_is_supported_trtllm, monkeypatch):
    """Test CUDA backend selection when FlashInfer TRTLLM is available and enabled."""
-    with patch(
-        "vllm.model_executor.layers.fused_moe.oracle.unquantized.current_platform"
-    ) as mock_platform:
-        # Set as CUDA platform
-        mock_platform.is_cuda.return_value = True
-        mock_platform.is_rocm.return_value = False
-        mock_platform.is_cpu.return_value = False
-        mock_platform.is_xpu.return_value = False
-        mock_platform.is_tpu.return_value = False
-        mock_platform.is_out_of_tree.return_value = False
-
+    with (
+        patch.object(current_platform, "is_cuda", return_value=True),
+        patch.object(current_platform, "is_rocm", return_value=False),
+        patch.object(current_platform, "is_cpu", return_value=False),
+        patch.object(current_platform, "is_xpu", return_value=False),
+        patch.object(current_platform, "is_tpu", return_value=False),
+        patch.object(current_platform, "is_out_of_tree", return_value=False),
+        patch.object(current_platform, "has_device_capability", return_value=True),
+    ):
        monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")

        moe_config = make_dummy_moe_config()
+        # TRTLLM requires EP and does not support DP
+        moe_config.moe_parallel_config.use_ep = True
+        moe_config.moe_parallel_config.use_dp = False

-        selected_backend = select_unquantized_moe_backend(
-            moe_config=moe_config,
-            use_dp=False,
+        selected_backend, experts_cls = select_unquantized_moe_backend(
+            moe_config=moe_config
        )

        assert selected_backend == UnquantizedMoeBackend.FLASHINFER_TRTLLM
+        assert experts_cls is not None


@patch(
-    "vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer",
+    "vllm.utils.flashinfer.has_flashinfer",
    return_value=True,
 )
@patch(
-    "vllm.model_executor.layers.fused_moe.oracle.unquantized.is_supported_config_trtllm_bf16",
+    "vllm.model_executor.layers.fused_moe.experts.trtllm_bf16_moe.TrtLlmBf16Experts.is_supported_config",
    return_value=(False, None),
 )
+@patch(
+    "vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts.is_supported_config",
+    return_value=(True, None),
+)
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="Only supported on NVIDIA platforms."
 )
 def test_select_cuda_flashinfer_cutlass_backend(
-    mock_has_flashinfer, mock_is_supported_trtllm, monkeypatch
+    mock_has_flashinfer,
+    mock_is_supported_trtllm,
+    mock_is_supported_cutlass,
+    monkeypatch,
 ):
    """Test CUDA backend selection when FlashInfer TRTLLM is not available
    and FlashInfer CUTLASS is available."""
-    with patch(
-        "vllm.model_executor.layers.fused_moe.oracle.unquantized.current_platform"
-    ) as mock_platform:
-        # Set as CUDA platform with Hopper capability
-        mock_platform.is_cuda.return_value = True
-        mock_platform.is_rocm.return_value = False
-        mock_platform.is_cpu.return_value = False
-        mock_platform.is_xpu.return_value = False
-        mock_platform.is_tpu.return_value = False
-        mock_platform.is_out_of_tree.return_value = False
-        mock_platform.has_device_capability.return_value = True  # SM90+
-
+    with (
+        patch.object(current_platform, "is_cuda", return_value=True),
+        patch.object(current_platform, "is_rocm", return_value=False),
+        patch.object(current_platform, "is_cpu", return_value=False),
+        patch.object(current_platform, "is_xpu", return_value=False),
+        patch.object(current_platform, "is_tpu", return_value=False),
+        patch.object(current_platform, "is_out_of_tree", return_value=False),
+        patch.object(current_platform, "has_device_capability", return_value=True),
+    ):
        # Enable FlashInfer via env var
        monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")

        moe_config = make_dummy_moe_config()
+        # CUTLASS requires EP and does not support DP
+        moe_config.moe_parallel_config.use_ep = True
+        moe_config.moe_parallel_config.use_dp = False

-        selected_backend = select_unquantized_moe_backend(
-            moe_config=moe_config,
-            use_dp=False,  # CUTLASS doesn't support DP
+        selected_backend, experts_cls = select_unquantized_moe_backend(
+            moe_config=moe_config
        )

        assert selected_backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS
+        assert experts_cls is not None