[Kernels][Bugfix] Use torch op for all kernels in FusedMoE forward. Add additional testing for cudagraphs. (#19717)

Signed-off-by: Bill Nell <bnell@redhat.com>
This commit is contained in:
bnellnm
2025-06-25 02:22:58 -04:00
committed by GitHub
parent f59fc60fb3
commit 015fab8c2f
14 changed files with 379 additions and 238 deletions

View File

@@ -1054,12 +1054,21 @@ def compute_max_diff(output, output_ref):
torch.abs(output_ref))
def torch_moe(a, w1, w2, score, topk, expert_map):
def torch_experts(a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weight: torch.Tensor,
topk_ids: torch.Tensor,
global_num_experts: int = -1,
expert_map: Optional[torch.Tensor] = None) -> torch.Tensor:
assert (global_num_experts == -1
or (global_num_experts == w1.shape[0] and expert_map is None)
or (expert_map is not None
and global_num_experts == expert_map.shape[0]))
topk = topk_ids.shape[1]
B, D = a.shape
a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
score = torch.softmax(score, dim=-1, dtype=torch.float32)
topk_weight, topk_ids = torch.topk(score, topk)
topk_weight = topk_weight.view(-1)
topk_ids = topk_ids.view(-1)
if expert_map is not None:
@@ -1073,6 +1082,19 @@ def torch_moe(a, w1, w2, score, topk, expert_map):
topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
def torch_moe(a: torch.Tensor,
              w1: torch.Tensor,
              w2: torch.Tensor,
              score: torch.Tensor,
              topk: int,
              global_num_experts: int = -1,
              expert_map: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Reference MoE forward: derive routing from raw router scores.

    Normalizes `score` with a float32 softmax, selects the `topk`
    highest-probability experts per token, and delegates the actual
    expert computation to torch_experts.
    """
    # Router probabilities are computed in float32 for numerical parity
    # with the fused kernels under test.
    routing_probs = score.float().softmax(dim=-1)
    topk_weight, topk_ids = routing_probs.topk(topk, dim=-1)
    return torch_experts(a, w1, w2, topk_weight, topk_ids,
                         global_num_experts, expert_map)
def torch_moe_single(a, w, score, topk):
B, D = a.shape
a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)