[Perf][Kernel] Fused SiLU+Mul+Quant kernel for NVFP4 cutlass_moe (#31832)

Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Michael Goin <mgoin64@gmail.com>
2026-01-09 09:40:33 -05:00
parent 8e27663b6a
commit 34cd32fe30
8 changed files with 278 additions and 95 deletions
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -549,7 +549,8 @@ def run_cutlass_moe_fp4(
        num_topk,
    )
    c1 = _resize_cache(workspace13, (m * topk, n * 2))
-    c2 = _resize_cache(workspace2, (m * topk, n))
+    # Note: c2 workspace is no longer needed since SiLU is fused with quantization.
+    # c3 reuses workspace13 after c1 is consumed.
    c3 = _resize_cache(workspace13, (m * topk, k))
    ops.cutlass_fp4_moe_mm(
        c1,
@@ -563,9 +564,9 @@ def run_cutlass_moe_fp4(
        blockscale_offsets[:-1],
    )
    del rep_a_fp4, rep_a_blockscale
-    torch.ops._C.silu_and_mul(c2, c1)
-    int_fp4, int_blockscale = ops.scaled_fp4_experts_quant(
-        c2, a2_gscale, expert_offsets, blockscale_offsets, num_topk
+    # Fused SiLU+Mul+NVFP4 quantization
+    int_fp4, int_blockscale = ops.silu_and_mul_scaled_fp4_experts_quant(
+        c1, a2_gscale, expert_offsets, blockscale_offsets, num_topk
    )

    ops.cutlass_fp4_moe_mm(