From 647c03b2ee143e8880c81fa0b92af52d37609d02 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 16 May 2026 20:19:21 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20make=5Fb=5Fk=5Fmajor=20must=20preserve?= =?UTF-8?q?=20shape=20=E2=80=94=20use=20double-permute=20trick?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit permute(0, 2, 1).contiguous().permute(0, 2, 1) gives the same (E, K, N) shape but with K-contiguous memory. A single permute changes the shape. --- cutedsl/bridge.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cutedsl/bridge.py b/cutedsl/bridge.py index e79cc5b7..57d2a52e 100644 --- a/cutedsl/bridge.py +++ b/cutedsl/bridge.py @@ -228,16 +228,15 @@ def quantize_weight_to_nvfp4(w_bf16, block_size=SF_VEC_SIZE): # ── Tensor Layout Conversion ─────────────────────────────────────────── def make_b_k_major(b_tensor): - """Convert B tensor from N-major to K-major (required by kernel). + """Convert B tensor from N-major to K-major layout. - Input: (E, K_packed, N) — may be N-major or K-major - Output: (E, K_packed, N) contiguous in K-major order (stride of K_packed dim == 1) + The kernel expects B with stride (K*N, 1, K) — K is contiguous. + torch.stack produces stride (K*N, N, 1) — N is contiguous. - For shape (E, K, N): K-major means stride[1]==1, N-major means stride[2]==1. + double-permute trick: transpose, make contiguous, transpose back. + Same shape, but K-contiguous memory layout. """ - if b_tensor.dim() == 3 and b_tensor.stride(1) == 1: - return b_tensor.contiguous() - return b_tensor.permute(0, 2, 1).contiguous() + return b_tensor.permute(0, 2, 1).contiguous().permute(0, 2, 1) def compute_expert_offsets(tokens_per_expert, num_experts, device="cuda"):