# nvfp4-megamoe-kernel/tests/unit/test_smem_acc.py

"""
Test SMEM accumulator FMHA kernel: multi-KV-tile with in-kernel O accumulation.
No Python KV merge needed — the kernel handles acc_scale internally.
"""
import torch, math, sys
import cutlass.cute as cute
import cutlass.torch as ct
import cuda.bindings.driver as cuda
from dsv4.kernels.attention.fmha_smem_acc import FmhaKernel


def test_smem_acc(hd=64, s_k=256, use_smem_p=False, normalize=False):
    """Run the SMEM-accumulator FMHA kernel for one configuration and score it.

    Random bfloat16 Q (128 x hd), K (s_k x hd) and V (s_k x hd) are generated
    from a fixed seed. The kernel is compiled once and then launched
    ``kernel.n_pv_tiles`` times, each launch handling a ``kernel.pv_n_tile``
    wide column slice of V and writing the matching column slice of the
    output. The assembled output is compared with an FP32 PyTorch softmax
    attention reference using cosine similarity.

    This function only *reports* PASS/FAIL on stdout; it never raises on a
    low score, so callers that need a hard failure must check the return
    value themselves. The ``lse`` and ``row_sums`` buffers are passed to the
    kernel but their contents are not verified here.

    Args:
        hd: Head dimension, forwarded to ``FmhaKernel(head_dim=...)``.
        s_k: Number of key/value rows, forwarded to ``FmhaKernel(s_k=...)``.
        use_smem_p: Forwarded unchanged to ``FmhaKernel``.
        normalize: Forwarded to ``FmhaKernel``; also selects the reference:
            ``softmax(QK^T * scale) @ V`` when true, otherwise the
            unnormalized ``exp(QK^T * scale - rowmax) @ V``.

    Returns:
        The cosine similarity (Python float) between the flattened kernel
        output and the flattened reference. PASS is printed when it is
        ``>= 0.99``.
    """
    m = 128
    n_kv_tiles = s_k // 128  # only used in the progress messages below
    torch.manual_seed(42)

    # Keep this allocation order: it fixes which random values q, k and v get.
    q = torch.randn(m, hd, 1, dtype=torch.bfloat16, device='cuda')
    k = torch.randn(s_k, hd, 1, dtype=torch.bfloat16, device='cuda')
    v = torch.randn(s_k, hd, dtype=torch.bfloat16, device='cuda')
    c = torch.zeros(m, hd, 1, dtype=torch.bfloat16, device='cuda')

    # FP32 reference. The scaled scores are computed once and reused for both
    # the row maximum and the exponentials.
    scale = 1.0 / math.sqrt(hd)
    scores = q[:, :, 0].float() @ k[:, :, 0].float().T * scale
    attn_exp = torch.exp(scores - scores.max(dim=-1, keepdim=True)[0])
    if normalize:
        attn_sum = attn_exp.sum(dim=-1, keepdim=True)
        expected = (attn_exp / attn_sum) @ v.float()
    else:
        expected = attn_exp @ v.float()

    lse_tensor = torch.zeros(m, 1, 1, dtype=torch.float32, device='cuda')
    row_sums_tensor = torch.zeros(m, 1, 1, dtype=torch.float32, device='cuda')

    kernel = FmhaKernel(head_dim=hd, s_k=s_k, use_smem_p=use_smem_p, normalize=normalize)
    pv_n_tile = kernel.pv_n_tile
    n_pv_tiles = kernel.n_pv_tiles

    stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)

    def as_dynamic(t):
        """Wrap a torch tensor for the kernel with a dynamic layout."""
        return ct.from_dlpack(t).mark_layout_dynamic(leading_dim=ct.get_leading_dim(t))

    # Compile once, using the first V column slice as the representative
    # argument set.
    v_kernel = v[:, 0:pv_n_tile].contiguous().unsqueeze(-1)
    c_tile = torch.zeros(m, pv_n_tile, 1, dtype=torch.bfloat16, device='cuda')

    # Simple GMEM tensor (non-dynamic-layout) for SMEM accumulator TMA store
    c_simple_tensor = c_tile.clone()
    mCSimple = ct.from_dlpack(c_simple_tensor)  # No mark_layout_dynamic!

    print(f'  hd={hd}, s_k={s_k} ({n_kv_tiles} KV tiles, pv_n_tile={pv_n_tile}, n_pv_tiles={n_pv_tiles}): Compiling...', flush=True)
    compiled = cute.compile(
        kernel,
        as_dynamic(q),
        as_dynamic(k),
        as_dynamic(v_kernel),
        as_dynamic(c_tile),
        stream,
        lse=as_dynamic(lse_tensor),
        row_sums=as_dynamic(row_sums_tensor),
        c_simple=mCSimple,
    )

    # One launch per V column slice; each launch gets a fresh zeroed output
    # tile and freshly zeroed lse / row_sums buffers.
    for nt in range(n_pv_tiles):
        v_start = nt * pv_n_tile
        v_end = v_start + pv_n_tile
        v_kernel = v[:, v_start:v_end].contiguous().unsqueeze(-1)
        c_tile = torch.zeros(m, pv_n_tile, 1, dtype=torch.bfloat16, device='cuda')
        lse_tensor.zero_()
        row_sums_tensor.zero_()

        mQ = as_dynamic(q)
        mK = as_dynamic(k)
        mV = as_dynamic(v_kernel)
        mC = as_dynamic(c_tile)
        mLSE = as_dynamic(lse_tensor)
        mRS = as_dynamic(row_sums_tensor)

        # At run time c_simple wraps the same buffer as mC (c_tile), just
        # without the dynamic-layout marking.
        mCSimple = ct.from_dlpack(c_tile)  # No mark_layout_dynamic!

        compiled(mQ, mK, mV, mC, stream, lse=mLSE, row_sums=mRS, c_simple=mCSimple)
        torch.cuda.synchronize()

        c[:, v_start:v_end, :] = c_tile

    out = c[:, :, 0].float()

    cos = torch.nn.functional.cosine_similarity(
        out.flatten().unsqueeze(0), expected.flatten().unsqueeze(0)
    ).item()

    # A NaN similarity compares false here and is therefore reported as FAIL.
    status = "PASS" if cos >= 0.99 else "FAIL"
    print(f'  hd={hd}, s_k={s_k} ({n_kv_tiles} tiles): cos {cos:.6f}  {status}')
    return cos


def test():
    """Run the full SMEM-accumulator FMHA sweep and fail if any case regresses.

    Every configuration is executed (so the complete PASS/FAIL table is
    printed) before the results are checked; a failing configuration does not
    prevent the remaining ones from running.

    Raises:
        AssertionError: If any configuration returns a cosine similarity that
            is below 0.99 or is NaN.
    """
    # Same threshold test_smem_acc uses when it prints PASS / FAIL.
    min_cos = 0.99
    results = []

    def run_all(configs):
        """Run each (hd, s_k) pair and record its cosine similarity."""
        for hd, s_k in configs:
            results.append((hd, s_k, test_smem_acc(hd, s_k)))

    print("=== SMEM Accumulator FMHA: In-Kernel Multi-KV-Tile O Accumulation ===\n")

    # Single KV tile (s_k=128): should work like fmha.py
    print("--- Single KV tile (s_k=128) ---")
    run_all([(64, 128), (128, 128)])

    # Multi KV tile: the SMEM accumulator approach should handle this correctly
    print("\n--- Multi KV tile (s_k=256+) ---")
    run_all([(64, 256), (64, 384), (64, 512), (128, 256)])

    # `not cos >= min_cos` (rather than `cos < min_cos`) also flags NaN.
    failures = [
        f"hd={hd}, s_k={s_k}: cos {cos:.6f}"
        for hd, s_k, cos in results
        if not cos >= min_cos
    ]
    if failures:
        raise AssertionError(
            f"{len(failures)} of {len(results)} configurations below "
            f"cos {min_cos}: " + "; ".join(failures)
        )


# Script entry point: run the full configuration sweep when executed directly.
if __name__ == "__main__":
    test()