[Kernel] CUTLASS MoE FP8: Integrate cuda moe permute/unpermute (#23045)

Signed-off-by: Shixian Cui <shixian@amazon.com>
2025-08-20 07:35:26 -07:00
parent 4449235843
commit b17109beea
15 changed files with 369 additions and 121 deletions
--- a/tests/kernels/quantization/test_cutlass_scaled_mm.py
+++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py
@@ -535,7 +535,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,

    expert_offsets = torch.zeros((num_experts + 1),
                                 device=device,
-                                 dtype=torch.int32)
+                                 dtype=torch.int64)

    problem_sizes = torch.zeros((num_experts, 3),
                                device=device,