From 79281b6fdadb8ca3aedec8d57508035c9669d849 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 16 May 2026 20:00:35 +0000 Subject: [PATCH] fix: compute K_packed/N_packed before passing to _get_compiled_kernel --- cutedsl/bridge.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cutedsl/bridge.py b/cutedsl/bridge.py index 08792612..7f54d407 100644 --- a/cutedsl/bridge.py +++ b/cutedsl/bridge.py @@ -387,7 +387,9 @@ def run_nvfp4_grouped_gemm( no cute.compile() in the forward path. """ num_experts = mat_b.shape[0] - n_dim = mat_b.shape[2] # N dimension (logical, not packed — float4_e2m1fn_x2 packs along K, not N) + K_packed = mat_a.shape[1] + N_packed = mat_b.shape[2] # N dimension (logical, not packed — float4_e2m1fn_x2 packs along K, not N) + n_dim = N_packed tokens_sum = mat_a.shape[0] device = mat_a.device