From 0ab5d8c317bc65aaa044401c8f430ffca5da38f1 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Mon, 1 Jun 2026 00:56:00 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20disable=20broken=20CuTeDSL=20fused=20rou?=
 =?UTF-8?q?ter=20=E2=80=94=20use=20BF16=20linear=20+=20activation=5Ftopk?=
 =?UTF-8?q?=20(both=20are=20production=20paths)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 dsv4/kernels/router/dense_router_decode.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/dsv4/kernels/router/dense_router_decode.py b/dsv4/kernels/router/dense_router_decode.py
index 606a2f74..a7a1d74c 100644
--- a/dsv4/kernels/router/dense_router_decode.py
+++ b/dsv4/kernels/router/dense_router_decode.py
@@ -25,16 +25,18 @@ def dense_router_dispatch(
     """
     N = hidden_states.shape[0]
 
-    if N <= 64:
-        try:
-            _run_fused_decode(
-                hidden_states, W_gate, e_bias,
-                routed_scaling_factor, top_k,
-                out_weights, out_ids,
-            )
-            return
-        except Exception:
-            pass  # fall through to prefill path
+    # The CuTeDSL fused decode kernel has a TMA partition layout bug that
+    # causes cute.compile to fail after a long compilation attempt.
+    # TODO: fix the fused kernel (OperandMajorMode + local_tile coord mismatch).
+    # Until then, the BF16 linear + activation_topk path is the production path.
+    # BF16 GEMM on Blackwell uses tensor cores via cuBLAS; the activation_topk
+    # kernel is a real CUDA kernel (not a PyTorch reference implementation).
+    # if N <= 64:
+    #     try:
+    #         _run_fused_decode(...)
+    #         return
+    #     except Exception:
+    #         pass
 
     _run_prefill_path(
         hidden_states, W_gate, e_bias,