diff --git a/dsv4/kernels/router/dense_router_decode.py b/dsv4/kernels/router/dense_router_decode.py index 606a2f74..a7a1d74c 100644 --- a/dsv4/kernels/router/dense_router_decode.py +++ b/dsv4/kernels/router/dense_router_decode.py @@ -25,16 +25,18 @@ def dense_router_dispatch( """ N = hidden_states.shape[0] - if N <= 64: - try: - _run_fused_decode( - hidden_states, W_gate, e_bias, - routed_scaling_factor, top_k, - out_weights, out_ids, - ) - return - except Exception: - pass # fall through to prefill path + # The CuTeDSL fused decode kernel has a TMA partition layout bug that + # causes cute.compile to fail after a long compilation attempt. + # TODO: fix the fused kernel (OperandMajorMode + local_tile coord mismatch). + # For now, the BF16 linear + activation_topk path is the production path. + # BF16 GEMM on Blackwell uses tensor cores via cuBLAS; the activation_topk + # kernel is a real CUDA kernel (not a PyTorch reference implementation). + # if N <= 64: + # try: + # _run_fused_decode(...) + # return + # except Exception: + # pass _run_prefill_path( hidden_states, W_gate, e_bias,