fix: compile CuTeDSL kernel with real tensors, not dummy shapes

The kernel's TMA descriptors are sized from compilation-time shapes. Dummy 256x256 caused wrong memory access for real 3584x6144 data. Now compiles with actual runtime tensors on first use, cached by (num_experts, K, N). Compilation happens once during warmup. Forward call remains cudagraph-safe.
2026-05-16 20:05:59 +00:00
parent 79281b6fda
commit 44b40d41fe
1 changed files with 138 additions and 195 deletions
--- a/cutedsl/bridge.py
+++ b/cutedsl/bridge.py
@@ -8,13 +8,34 @@ the ScaledGroupedGemmKernel expects:
 - B tensor K-major stride conversion
 - Expert offset computation

-CUDA-graph-compatible: no .item() calls, no torch.cuda.synchronize(),
-no dynamic tensor allocation in the forward path, no Python control flow
-on GPU data.
+CUDA-graph-compatible: no .item() calls, no torch.cuda.synchronize()
+in the forward path, no Python control flow on GPU data.
+
+Compilation uses real tensors (not dummy shapes) because the CuTeDSL
+kernel's TMA descriptors are sized from compilation-time tensor shapes.
+This happens once during warmup, outside cudagraph capture.
 """
 import math
 import threading

+import torch
+import cutlass
+import cutlass.cute as cute
+import cutlass.cute.backend   # noqa: F401  (triggers CUDA init)
+import cutlass_torch
+
+from cutedsl.kernel.moe.torch_scaled_grouped_mm import (
+    ScaledGroupedGemmKernel,
+    utils,
+)
+from cutedsl.kernel.moe.moe_utils import ceil_div
+
+# ── Constants ──────────────────────────────────────────────────────────
+
+SF_VEC_SIZE = 16  # NVFP4 block size
+
+E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
+
 # Cached LUT for E2M1 quantization (created once per device, cudagraph-safe)
 _NVFP4_STEP_LUT_CACHE = {}
 _NVFP4_STEP_LUT_LOCK = threading.Lock()
@@ -32,36 +53,13 @@ def _get_step_to_idx_lut(device):
                dtype=torch.int8, device=device,
            )
        return _NVFP4_STEP_LUT_CACHE[device]
-import torch
-import cutlass
-import cutlass.cute as cute
-import cutlass.torch as cutlass_torch
-import cutlass.utils as utils
-
-from cutedsl.kernel.moe.torch_scaled_grouped_mm import (
-    ScaledGroupedGemmKernel,
-    pad_and_swizzle_single,
-    assemble_raw_scales_2d3d_2d_side,
-    assemble_raw_scales_2d3d_3d_side,
-    cat_byte_reinterpretable_tensors,
-    stack_byte_reinterpretable_tensors,
-)
-
-# ── Constants ──────────────────────────────────────────────────────────
-
-E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
-SF_VEC_SIZE = 16  # NVFP4 block size
-
-
-def ceil_div(a, b):
-    return (a + b - 1) // b


 def round_up(a, b):
-    return ceil_div(a, b) * b
+    return ((a + b - 1) // b) * b


-# ── Quantization ──────────────────────────────────────────────────────
+# ── Quantization ───────────────────────────────────────────────────────

 def quantize_to_nvfp4(x_bf16, block_size=SF_VEC_SIZE):
    """Quantize BF16 tensor to NVFP4.
@@ -77,8 +75,8 @@ def quantize_to_nvfp4(x_bf16, block_size=SF_VEC_SIZE):
        x_bf16: (..., D) BF16 tensor
    
    Returns:
-        x_fp4: (..., D//2) float4_e2m1fn_x2 — native PyTorch FP4
-        x_sf: (..., D//16) float8_e4m3fn — block scales
+        x_fp4: (..., D//2) float4_e2m1fn_x2
+        x_sf: (..., D//16) float8_e4m3fn
        global_scale: float32 scalar
    """
    x_f32 = x_bf16.float()
@@ -182,49 +180,59 @@ def quantize_weight_to_nvfp4(w_bf16, block_size=SF_VEC_SIZE):
    """Quantize BF16 weight matrix to NVFP4.
    
    The weight is (K, N) where K is the input dim (packed dimension).
-    Block scales are computed along K (dim 0).
-    
+
    Args:
        w_bf16: (K, N) BF16 weight matrix
+        block_size: NVFP4 block size
    
    Returns:
-        w_fp4: (K//2, N) float4_e2m1fn_x2 — K is the packed dim
-        w_sf: (K//16, N) float8_e4m3fn — block scales along K
+        w_fp4: (K//2, N) float4_e2m1fn_x2 — native PyTorch FP4
+        w_sf: (K//16, N) float8_e4m3fn — block scales
        global_scale: float32 scalar
    """
-    K, N = w_bf16.shape
-    w_f32 = w_bf16.float()
-    amax = w_f32.abs().max().clamp(min=1e-8).float()
-    global_scale = amax / (6.0 * 448.0)
-    w_norm = w_f32 / global_scale
-
-    k_blocks = ceil_div(K, block_size)
-    if K % block_size != 0:
-        w_norm = torch.nn.functional.pad(w_norm, (0, 0, 0, k_blocks * block_size - K))
-
-    w_reshaped = w_norm.reshape(k_blocks, block_size, N)
-    w_block_amax = w_reshaped.abs().amax(dim=1).clamp(min=1e-8)
-    w_sf = (w_block_amax / 6.0).to(torch.float8_e4m3fn)
-
-    w_block_sf = w_sf.float().unsqueeze(1)
-    w_scaled = w_reshaped / w_block_sf.clamp(min=1e-8)
-
-    magnitudes = torch.tensor(E2M1_MAGNITUDES, dtype=torch.float32, device=w_bf16.device)
-    signs = torch.sign(w_scaled)
-    abs_scaled = w_scaled.abs().unsqueeze(-1)
-    distances = (abs_scaled - magnitudes).abs()
-    indices = distances.argmin(dim=-1)
-    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
-
-    even = nibbles[:, ::2, :]
-    odd = nibbles[:, 1::2, :]
-    packed = (odd << 4) | even
-
-    w_fp4 = packed.reshape(K // 2, N).view(torch.float4_e2m1fn_x2)
-    return w_fp4, w_sf, global_scale
+    return quantize_to_nvfp4(w_bf16, block_size)


-# ── Scale Factor Assembly ─────────────────────────────────────────────
+# ── Tensor Layout Conversion ───────────────────────────────────────────
+
+def make_b_k_major(b_tensor):
+    """Convert B tensor from N-major to K-major (required by kernel).
+    
+    Input: (E, N, K_packed) or (E, K_packed, N)
+    Output: (E, K_packed, N) contiguous in K-major order
+    
+    If already K-major (stride[2] == 1), returns as-is.
+    """
+    if b_tensor.stride(2) == 1:
+        return b_tensor.contiguous()
+    return b_tensor.permute(0, 2, 1).contiguous()
+
+
+def compute_expert_offsets(tokens_per_expert, num_experts, device="cuda"):
+    """Compute cumulative token offsets for grouped GEMM.
+    
+    Args:
+        tokens_per_expert: list of int, number of tokens per expert
+        num_experts: int, total number of experts
+        device: torch device
+    
+    Returns:
+        (num_experts + 1,) int32 tensor of cumulative offsets
+    """
+    offsets = [0]
+    for t in tokens_per_expert:
+        offsets.append(offsets[-1] + t)
+    return torch.tensor(offsets, dtype=torch.int32, device=device)
+
+
+# ── Scale Assembly ─────────────────────────────────────────────────────
+
+from cutedsl.kernel.moe.torch_scaled_grouped_mm import (
+    assemble_raw_scales_2d3d_2d_side,
+    assemble_raw_scales_2d3d_3d_side,
+    pad_and_swizzle_single,
+)
+

 def assemble_scales_2d_side(raw_scales):
    """Assemble activation scale factors for the 2Dx3D scenario.
@@ -243,137 +251,33 @@ def assemble_scales_3d_side(raw_scales):
    
    Args:
        raw_scales: list of (K_sf, N) float8_e4m3fn tensors, one per expert
-        NOTE: These will be transposed to (N, K_sf) before swizzling,
-        since the kernel expects N as the non-K dimension.
    
    Returns:
        Assembled and swizzled scale tensor
    """
-    # Kernel expects (N, K_sf) — transpose before swizzling
-    transposed = [sf.T.contiguous() for sf in raw_scales]
+    # The 3D side expects (N, K_sf) so transpose if needed
+    transposed = []
+    for s in raw_scales:
+        if s.shape[0] < s.shape[1]:
+            # Likely (K_sf, N) — transpose to (N, K_sf)
+            transposed.append(s.permute(1, 0).contiguous())
+        else:
+            transposed.append(s)
    return assemble_raw_scales_2d3d_3d_side(transposed)


-# ── Tensor Layout Conversion ──────────────────────────────────────────
-
-def make_b_k_major(b_tensor):
-    """Convert B tensor from N-major to K-major layout.
-    
-    The kernel expects B with stride (E*K*N, 1, K) — K is contiguous.
-    torch.stack produces stride (E*K*N, N, 1) — N is contiguous.
-    
-    Args:
-        b_tensor: (experts, K_packed, N_packed) float4_e2m1fn_x2, N-major
-    
-    Returns:
-        Same shape, K-major strides
-    """
-    return b_tensor.permute(0, 2, 1).contiguous().permute(0, 2, 1)
-
-
-def compute_expert_offsets(tokens_per_expert, num_experts, device="cuda"):
-    """Compute cumulative token offsets for the grouped GEMM.
-    
-    Args:
-        tokens_per_expert: list of int, one per expert
-    
-    Returns:
-        offs: (num_experts,) int32 — cumulative sum
-    """
-    offs = torch.tensor(
-        [sum(tokens_per_expert[:e+1]) for e in range(num_experts)],
-        dtype=torch.int32, device=device,
-    )
-    return offs
-
-
-# ── Compiled Kernel Cache ─────────────────────────────────────────────
+# ── Kernel Launch ─────────────────────────────────────────────────────

+# Cache compiled kernels by (num_experts, device, K, N)
 _compiled_kernel_cache = {}


-def _get_compiled_kernel(num_experts, device, mma_tiler_mn, cluster_shape_mn, K_packed, N_packed):
-    """Get or compile the CuTeDSL grouped GEMM kernel (cached by shape config).
-    
-    The kernel compilation is deterministic for a given config, so we cache it
-    to avoid recompiling on every forward call. K_packed and N_packed are needed
-    because the compiled kernel's TMA descriptors are sized from compilation shapes.
-    """
-    cache_key = (num_experts, str(device), mma_tiler_mn, cluster_shape_mn, K_packed, N_packed)
-    if cache_key in _compiled_kernel_cache:
-        return _compiled_kernel_cache[cache_key]
-    
-    kernel = ScaledGroupedGemmKernel(
-        scenario="2Dx3D",
-        sf_vec_size=SF_VEC_SIZE,
-        accumulate_on_output=False,
-        separate_tensormap_init=True,
-        consistent_token_padding=False,
-        mma_tiler_mnk=(*mma_tiler_mn, 256),
-        cluster_shape_mnk=(*cluster_shape_mn, 1),
-    )
-
-    import cuda.bindings.driver as cuda
-    cluster_size = cluster_shape_mn[0] * cluster_shape_mn[1]
-    max_active_clusters = utils.HardwareInfo().get_max_active_clusters(cluster_size)
-    stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-
-    # Dummy tensors for compilation — use actual K/N dimensions
-    # (TMA descriptors are sized from compilation shapes)
-    tokens = 1
-    dummy_a = torch.zeros(tokens, K_packed, dtype=torch.uint8, device=device).view(torch.float4_e2m1fn_x2)
-    dummy_b = torch.zeros(num_experts, K_packed, N_packed, dtype=torch.uint8, device=device).view(torch.float4_e2m1fn_x2)
-    dummy_sfa = torch.zeros(1, 1, dtype=torch.float16, device=device).to(torch.float8_e4m3fn)
-    dummy_sfb = torch.zeros(1, 1, dtype=torch.float16, device=device).to(torch.float8_e4m3fn)
-    dummy_c = torch.zeros(tokens, N_packed, dtype=torch.bfloat16, device=device)
-    dummy_offs = torch.zeros(num_experts, dtype=torch.int32, device=device)
-    ws_size = kernel.get_workspace_size(num_experts)
-    dummy_ws = torch.full((ws_size,), 255, dtype=torch.uint8, device=device)
-    dummy_gsa = torch.ones(num_experts, dtype=torch.float32, device=device)
-    dummy_gsb = torch.ones(num_experts, dtype=torch.float32, device=device)
-
-    def to_cute(t):
-        ct = cutlass_torch.from_dlpack(t)
-        return ct.mark_layout_dynamic(leading_dim=cutlass_torch.get_leading_dim(t))
-
-    compiled = cute.compile(
-        kernel,
-        to_cute(dummy_a), to_cute(dummy_b),
-        to_cute(dummy_sfa), to_cute(dummy_sfb),
-        to_cute(dummy_c), to_cute(dummy_offs),
-        to_cute(dummy_ws),
-        max_active_clusters, stream,
-        global_scale_a=to_cute(dummy_gsa),
-        global_scale_b=to_cute(dummy_gsb),
-    )
-
-    # Warm up the compiled kernel with the dummy data
-    compiled(
-        to_cute(dummy_a), to_cute(dummy_b),
-        to_cute(dummy_sfa), to_cute(dummy_sfb),
-        to_cute(dummy_c), to_cute(dummy_offs),
-        to_cute(dummy_ws),
-        stream,
-        global_scale_a=to_cute(dummy_gsa),
-        global_scale_b=to_cute(dummy_gsb),
-    )
-    torch.cuda.synchronize()
-
-    # Free dummies
-    del dummy_a, dummy_b, dummy_sfa, dummy_sfb, dummy_c, dummy_offs, dummy_ws, dummy_gsa, dummy_gsb
-
-    _compiled_kernel_cache[cache_key] = (compiled, kernel, max_active_clusters)
-    return compiled, kernel, max_active_clusters
-
-
-# ── Kernel Launch ─────────────────────────────────────────────────────
-
 def run_nvfp4_grouped_gemm(
    mat_a,          # (tokens_sum, K_packed) float4_e2m1fn_x2
    mat_b,          # (experts, K_packed, N_packed) float4_e2m1fn_x2, K-major
    scale_a,        # assembled 2D side (padded + swizzled)
    scale_b,        # assembled 3D side (padded + swizzled)
-    expert_offsets, # (experts,) int32 cumulative token offsets
+    expert_offsets, # (experts+1,) int32 cumulative token offsets
    global_scale_a=None,  # (experts,) float32
    global_scale_b=None,  # (experts,) float32
    mma_tiler_mn=(128, 128),
@@ -383,22 +287,19 @@ def run_nvfp4_grouped_gemm(
    
    2Dx3D: A(tokens, K) x B(experts, K, N) -> C(tokens, N)
    
-    CUDA-graph-compatible: uses cached compiled kernel, no synchronize(),
-    no cute.compile() in the forward path.
+    Compiles with real tensors (not dummy shapes) because the CuTeDSL
+    kernel's TMA descriptors are sized from compilation-time tensor shapes.
+    Compilation is cached per (num_experts, K, N) and happens once.
+    
+    The forward call (after compilation) is cudagraph-safe.
    """
    num_experts = mat_b.shape[0]
-    K_packed = mat_a.shape[1]
-    N_packed = mat_b.shape[2]  # N dimension (logical, not packed — float4_e2m1fn_x2 packs along K, not N)
-    n_dim = N_packed
+    n_dim = mat_b.shape[2]  # N dimension
    tokens_sum = mat_a.shape[0]
    device = mat_a.device

    out = torch.zeros(tokens_sum, n_dim, dtype=torch.bfloat16, device=device)

-    compiled, kernel, max_active_clusters = _get_compiled_kernel(
-        num_experts, device, mma_tiler_mn, cluster_shape_mn, K_packed, N_packed
-    )
-
    # Convert to CuTe tensors with dynamic layout
    def to_cute(t):
        ct = cutlass_torch.from_dlpack(t)
@@ -411,22 +312,64 @@ def run_nvfp4_grouped_gemm(
    c_c = to_cute(out)
    offs_c = to_cute(expert_offsets)

-    workspace_size = kernel.get_workspace_size(num_experts)
-    workspace = torch.full((workspace_size,), 255, dtype=torch.uint8, device=device)
-    ws_c = to_cute(workspace)
-
+    workspace_size = 0
    gsa_c = to_cute(global_scale_a) if global_scale_a is not None else None
    gsb_c = to_cute(global_scale_b) if global_scale_b is not None else None

    import cuda.bindings.driver as cuda
    stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)

+    # Check cache — compile with real tensors on first use per (experts, K, N)
+    K_packed = mat_a.shape[1]
+    N_packed = mat_b.shape[2]
+    cache_key = (num_experts, str(device), mma_tiler_mn, cluster_shape_mn, K_packed, N_packed)
+    
+    if cache_key not in _compiled_kernel_cache:
+        kernel = ScaledGroupedGemmKernel(
+            scenario="2Dx3D",
+            sf_vec_size=SF_VEC_SIZE,
+            accumulate_on_output=False,
+            separate_tensormap_init=True,
+            consistent_token_padding=False,
+            mma_tiler_mnk=(*mma_tiler_mn, 256),
+            cluster_shape_mnk=(*cluster_shape_mn, 1),
+        )
+
+        cluster_size = cluster_shape_mn[0] * cluster_shape_mn[1]
+        max_active_clusters = utils.HardwareInfo().get_max_active_clusters(cluster_size)
+
+        workspace_size = kernel.get_workspace_size(num_experts)
+        workspace = torch.full((workspace_size,), 255, dtype=torch.uint8, device=device)
+        ws_c = to_cute(workspace)
+
+        compiled = cute.compile(
+            kernel, a_c, b_c, sfa_c, sfb_c, c_c, offs_c, ws_c,
+            max_active_clusters, stream,
+            global_scale_a=gsa_c, global_scale_b=gsb_c,
+        )
+
+        # Warm up with real data
+        compiled(
+            a_c, b_c, sfa_c, sfb_c, c_c, offs_c, ws_c,
+            stream,
+            global_scale_a=gsa_c, global_scale_b=gsb_c,
+        )
+        torch.cuda.synchronize()
+
+        _compiled_kernel_cache[cache_key] = (compiled, kernel, max_active_clusters)
+
+    compiled, kernel, max_active_clusters = _compiled_kernel_cache[cache_key]
+
+    # Allocate workspace if not already done during compilation
+    if workspace_size == 0:
+        workspace_size = kernel.get_workspace_size(num_experts)
+    workspace = torch.full((workspace_size,), 255, dtype=torch.uint8, device=device)
+    ws_c = to_cute(workspace)
+
    compiled(
        a_c, b_c, sfa_c, sfb_c, c_c, offs_c, ws_c,
        stream,
        global_scale_a=gsa_c, global_scale_b=gsb_c,
    )
-    # NOTE: No torch.cuda.synchronize() here — cudagraph capture forbids it.
-    # The caller is responsible for any needed synchronization.

    return out