[MoE Refactor] Oracle Select FP8+NVFP4 Kernels In Priority (#32414)

2026-01-21 08:22:33 -05:00
parent e14467be43
commit 42135d6898
82 changed files with 2710 additions and 1563 deletions
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@@ -8,6 +8,7 @@ import pytest
 import torch

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
@@ -193,16 +194,18 @@ def run_with_expert_maps(

    out_tensor = torch.zeros_like(cutlass_moe_kwargs["hidden_states"])
    for kwargs, new_quant_config in slice_experts():
+        w2 = kwargs["w2"]
+        a = kwargs["hidden_states"]
        kernel = mk.FusedMoEModularKernel(
            MoEPrepareAndFinalizeNoEP(),
            CutlassExpertsFp8(
-                out_dtype=kwargs["hidden_states"].dtype,
-                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
-                e=kwargs["w2"].shape[0],  # type: ignore[union-attr]
-                n=kwargs["w2"].shape[2],  # type: ignore[union-attr]
-                k=kwargs["w2"].shape[1],  # type: ignore[union-attr]
+                moe_config=make_dummy_moe_config(
+                    num_experts=w2.shape[0],
+                    hidden_dim=w2.shape[1],
+                    intermediate_size_per_partition=w2.shape[2],
+                    in_dtype=a.dtype,
+                ),
                quant_config=new_quant_config,
-                device="cuda",
            ),
        )
        out_tensor = out_tensor + kernel(**kwargs)
@@ -249,19 +252,19 @@ def run_8_bit(
        "topk_ids": topk_ids,
    }

-    num_experts = moe_tensors.w1.size(0)
+    num_experts = moe_tensors.w1.size(0)  # type: ignore[attr-defined]
    with_ep = num_local_experts is not None or num_local_experts == num_experts
    if not with_ep:
        kernel = mk.FusedMoEModularKernel(
            MoEPrepareAndFinalizeNoEP(),
            CutlassExpertsFp8(
-                out_dtype=moe_tensors.a.dtype,
-                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
-                e=moe_tensors.w2_q.shape[0],  # type: ignore[union-attr]
-                n=moe_tensors.w2_q.shape[2],  # type: ignore[union-attr]
-                k=moe_tensors.w2_q.shape[1],  # type: ignore[union-attr]
+                moe_config=make_dummy_moe_config(
+                    num_experts=moe_tensors.w2_q.shape[0],  # type: ignore[union-attr]
+                    hidden_dim=moe_tensors.w2_q.shape[1],  # type: ignore[union-attr]
+                    intermediate_size_per_partition=moe_tensors.w2_q.shape[2],  # type: ignore[union-attr]
+                    in_dtype=moe_tensors.a.dtype,
+                ),
                quant_config=quant_config,
-                device="cuda",
            ),
        )
        return kernel(**kwargs)