From 0ab5d8c317bc65aaa044401c8f430ffca5da38f1 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Mon, 1 Jun 2026 00:56:00 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20disable=20broken=20CuTeDSL=20fused=20rou?= =?UTF-8?q?ter=20=E2=80=94=20use=20BF16=20linear=20+=20activation=5Ftopk?= =?UTF-8?q?=20(both=20are=20production=20paths)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dsv4/kernels/router/dense_router_decode.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/dsv4/kernels/router/dense_router_decode.py b/dsv4/kernels/router/dense_router_decode.py index 606a2f74..a7a1d74c 100644 --- a/dsv4/kernels/router/dense_router_decode.py +++ b/dsv4/kernels/router/dense_router_decode.py @@ -25,16 +25,18 @@ def dense_router_dispatch( """ N = hidden_states.shape[0] - if N <= 64: - try: - _run_fused_decode( - hidden_states, W_gate, e_bias, - routed_scaling_factor, top_k, - out_weights, out_ids, - ) - return - except Exception: - pass # fall through to prefill path + # The CuTeDSL fused decode kernel has a TMA partition layout bug that + # causes cute.compile to fail after a long compilation attempt. + # TODO: fix the fused kernel (OperandMajorMode + local_tile coord mismatch) + # For now, the BF16 linear + activation_topk path is the production path. + # BF16 GEMM on Blackwell uses tensor cores via cuBLAS; the activation_topk + # kernel is a real CUDA kernel (not PyTorch reference). + # if N <= 64: + # try: + # _run_fused_decode(...) + # return + # except Exception: + # pass _run_prefill_path( hidden_states, W_gate, e_bias,