[MoE Refactor][15/N] Apply Refactor to Fp8 (#31415)

Author: Robert Shaw
Date: 2026-01-07 19:42:33 -05:00
Committed by: GitHub
Parent: ffc0a2798b
Commit: 5dcd7ef1f2
38 changed files with 1439 additions and 1528 deletions


@@ -11,12 +11,17 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     fp8_w8a8_moe_quant_config,
 )
+from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+    FlashInferExperts,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    apply_flashinfer_per_tensor_scale_fp8,
-    flashinfer_cutlass_moe_fp8,
+    apply_fi_trtllm_fp8_per_tensor_moe,
     register_scales_for_trtllm_fp8_per_tensor_moe,
-    rotate_flashinfer_fp8_moe_weights,
+    rotate_weights_for_fi_trtllm_fp8_per_tensor_moe,
     swap_w13_to_w31,
 )
 from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8
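
In short, this first hunk renames the TRT-LLM per-tensor helpers (apply_flashinfer_per_tensor_scale_fp8 becomes apply_fi_trtllm_fp8_per_tensor_moe, rotate_flashinfer_fp8_moe_weights becomes rotate_weights_for_fi_trtllm_fp8_per_tensor_moe) and drops the monolithic flashinfer_cutlass_moe_fp8 wrapper; the CUTLASS path is instead composed from MoEPrepareAndFinalizeNoEP and FlashInferExperts via mk.FusedMoEModularKernel, as sketched after the last hunk below.
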
@@ -103,6 +108,7 @@ class TestData:
 w2_quantized, w2_weight_scale = quant_fp8_per_tensor_batches(w2)
 layer = torch.nn.Module()
+layer.orig_dtype = torch.bfloat16
 layer.w13_weight = w13_quantized.clone()
 layer.w2_weight = w2_quantized.clone()
 layer.w13_input_scale = a1_scale
@@ -115,10 +121,10 @@ class TestData:
     pcp_size=1,
     dp_size=1,
     ep_size=1,
-    tp_rank=1,
-    pcp_rank=1,
-    dp_rank=1,
-    ep_rank=1,
+    tp_rank=0,
+    pcp_rank=0,
+    dp_rank=0,
+    ep_rank=0,
     use_ep=False,
     all2all_backend="naive",
 )
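
This hunk fixes the single-rank parallel configuration used by the test: with every *_size equal to 1, the corresponding *_rank must be 0 (ranks are zero-indexed and must be strictly less than their size). A minimal sketch of the resulting config, assuming the object being built here is FusedMoEParallelConfig from vllm.model_executor.layers.fused_moe.config and that tp_size=1 sits in the unchanged context above the hunk:

    from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig

    # Single-GPU test setup: every parallel group has size 1, so every rank is 0.
    moe_parallel_config = FusedMoEParallelConfig(
        tp_size=1,  # assumed; not visible in the hunk
        pcp_size=1,
        dp_size=1,
        ep_size=1,
        tp_rank=0,
        pcp_rank=0,
        dp_rank=0,
        ep_rank=0,
        use_ep=False,
        all2all_backend="naive",
    )
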
@@ -126,7 +132,9 @@ class TestData:
 # flashinfer expects swapped rows for w13
 layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data)
 if is_trtllm:
-    rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight)
+    rotate_weights_for_fi_trtllm_fp8_per_tensor_moe(
+        layer.w13_weight, layer.w2_weight
+    )
     register_scales_for_trtllm_fp8_per_tensor_moe(
         layer,
         layer.w13_weight_scale,
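
For readers unfamiliar with the layout requirement the comment refers to: w13_weight fuses the gate and up projections, and FlashInfer expects the two halves in the opposite order from vLLM's native layout. A rough, illustrative sketch of what a swap like swap_w13_to_w31 amounts to (not vLLM's actual helper):

    import torch

    def swap_w13_to_w31_sketch(w13: torch.Tensor) -> torch.Tensor:
        # w13 has shape (num_experts, 2 * intermediate_size, hidden_size);
        # split the fused projection into its two halves along the row dim
        # and concatenate them back in reversed order.
        w1, w3 = w13.chunk(2, dim=-2)
        return torch.cat([w3, w1], dim=-2)
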
@@ -199,7 +207,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
     quant_config=quant_config,
 )
-flashinfer_output = apply_flashinfer_per_tensor_scale_fp8(
+flashinfer_output = apply_fi_trtllm_fp8_per_tensor_moe(
     layer=td.layer,
     hidden_states=td.hidden_states,
     router_logits=score,
@@ -277,17 +285,34 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
 td.layer.get_fused_moe_quant_config = get_fused_moe_quant_config
 td.layer.quant_method = td.layer
-flashinfer_cutlass_output = flashinfer_cutlass_moe_fp8(
+kernel = mk.FusedMoEModularKernel(
+    MoEPrepareAndFinalizeNoEP(
+        defer_input_quant=quant_config.is_block_quantized
+    ),
+    FlashInferExperts(
+        out_dtype=td.layer.orig_dtype,
+        quant_config=quant_config,
+        ep_rank=td.layer.moe_parallel_config.ep_rank,
+        ep_size=td.layer.moe_parallel_config.ep_size,
+        tp_rank=td.layer.moe_parallel_config.tp_rank,
+        tp_size=td.layer.moe_parallel_config.tp_size,
+        use_dp=False,
+        use_deepseek_fp8_block_scale=False,
+    ),
+)
+flashinfer_cutlass_output = kernel(
     td.hidden_states,
-    td.layer,
+    td.layer.w13_weight,
+    td.layer.w2_weight,
     topk_weights,
     topk_ids,
     inplace=False,
     activation=activation,
     global_num_experts=e,
     expert_map=None,
     apply_router_weight_on_input=True,
 )
 torch.testing.assert_close(
     output, flashinfer_cutlass_output, atol=5.5e-2, rtol=1e-2
 )
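
The larger change in this last hunk is the move from the monolithic flashinfer_cutlass_moe_fp8 wrapper to the modular MoE kernel: a FusedMoEModularKernel is assembled from a prepare/finalize stage (input quantization and, under expert parallelism, token dispatch/combine; MoEPrepareAndFinalizeNoEP is the single-rank variant) and an experts implementation that runs the FlashInfer CUTLASS grouped GEMMs. A minimal sketch of the composition, assuming mk refers to vllm.model_executor.layers.fused_moe.modular_kernel and reusing the layer/quant_config objects the test builds earlier:

    import vllm.model_executor.layers.fused_moe.modular_kernel as mk
    from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import FlashInferExperts
    from vllm.model_executor.layers.fused_moe.prepare_finalize import MoEPrepareAndFinalizeNoEP

    def build_flashinfer_cutlass_kernel(layer, quant_config):
        # Stage 1: pre/post processing around the expert computation. With no EP
        # there is nothing to dispatch, so this only handles input quantization.
        prepare_finalize = MoEPrepareAndFinalizeNoEP(
            defer_input_quant=quant_config.is_block_quantized
        )
        # Stage 2: the experts object that actually launches the FlashInfer
        # CUTLASS grouped GEMMs for the fp8 per-tensor path.
        experts = FlashInferExperts(
            out_dtype=layer.orig_dtype,
            quant_config=quant_config,
            ep_rank=0, ep_size=1, tp_rank=0, tp_size=1,  # single-rank test setup
            use_dp=False,
            use_deepseek_fp8_block_scale=False,
        )
        # The composed kernel is then invoked like fused_experts:
        # kernel(hidden_states, w13_weight, w2_weight, topk_weights, topk_ids, ...)
        return mk.FusedMoEModularKernel(prepare_finalize, experts)

The same prepare/finalize-plus-experts composition recurs throughout this refactor series, which is why the one-off wrapper could be removed from flashinfer_utils.
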