test: standalone CUDA test for FMHA SM100 (no PyTorch needed)

This commit is contained in:
2026-05-28 05:31:03 +00:00
parent 77fa34a9a6
commit 99b35eb2de
2 changed files with 209 additions and 58 deletions

View File

@@ -1,15 +1,10 @@
"""
Test: Compile FMHA SM100 kernel with nvcc, load as shared library, test correctness.
This uses ctypes to load the compiled .so instead of torch.utils.cpp_extension
(which adds -D__CUDA_NO_BFLOAT16_CONVERSIONS__ causing ICE with __bf16).
Test: Compile and run standalone FMHA SM100 test via nvcc.
No PyTorch needed — pure CUDA runtime test.
"""
import subprocess
import sys
import os
import torch
import math
import ctypes
def get_repo_root():
d = os.path.dirname(os.path.abspath(__file__))
@@ -19,62 +14,46 @@ def get_repo_root():
return None
REPO = get_repo_root()
CUTLASS = "/root/cutlass"
CUDA = "/usr/local/cuda-13.2"
OUT = "/tmp/fmha_sm100_test"
def compile_kernel():
"""Compile the kernel as a shared library using nvcc directly."""
src = f"{REPO}/dsv4/kernels/attention/fmha_sm100_launch.cu"
out = f"{OUT}.so"
# Step 1: Compile standalone test
print("=" * 60)
print("Compiling standalone FMHA SM100 test...")
print("=" * 60)
cmd = [
f"{CUDA}/bin/nvcc",
"--std=c++20",
"-shared",
"-Xcompiler", "-fPIC",
f"-gencode=arch=compute_100a,code=sm_100a",
f"-I{REPO}",
f"-I{CUTLASS}/include",
f"-I{CUDA}/include",
"-I/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/include",
"-I/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/include/torch/csrc/api/include",
"-I/usr/include/python3.12",
"-DGOOGLE_CUDA=1",
"--expt-relaxed-constexpr",
src,
"-o", out,
"-L/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/lib",
"-lc10_cuda", "-ltorch_cuda", "-ltorch", "-lc10",
"-lcudart",
]
src = f"{REPO}/tests/unit/test_fmha_sm100_standalone.cu"
out = "/tmp/fmha_sm100_standalone"
print(f"Compiling: {' '.join(cmd[:5])}...")
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
if result.returncode != 0:
print(f"❌ Compilation FAILED:\n{result.stderr[-1000:]}")
return False
print(f"✅ Compiled: {out}")
return True
cmd = [
f"{CUDA}/bin/nvcc",
"--std=c++20",
f"-gencode=arch=compute_100a,code=sm_100a",
f"-I{REPO}",
"--expt-relaxed-constexpr",
src,
"-o", out,
"-lcudart",
]
print(f"nvcc: {' '.join(cmd[:4])}...")
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
if result.returncode != 0:
print(f"❌ Compilation FAILED:\n{result.stderr[-2000:]}")
sys.exit(1)
print(f"✅ Compiled: {out}")
def test_correctness():
"""Test FMHA output against PyTorch reference using ctypes."""
# Load the shared library
lib = ctypes.CDLL(f"{OUT}.so")
# We'd need to call the function via ctypes, but the function
# returns torch tensors which complicates ctypes.
# Instead, let's write a simple Python test script that uses
# torch.ops to call the custom op.
print("Shared library loaded. Need proper test harness for correctness.")
print("For now, verify kernel compiles and loads successfully.")
return True
# Step 2: Run the test
print("\n" + "=" * 60)
print("Running standalone FMHA SM100 test...")
print("=" * 60)
result = subprocess.run([out], capture_output=True, text=True, timeout=30)
print(result.stdout)
if result.stderr:
print(f"STDERR: {result.stderr[-500:]}")
print(f"Exit code: {result.returncode}")
if __name__ == "__main__":
print("=" * 60)
print("FMHA SM100 — nvcc direct compilation + correctness test")
print("=" * 60)
if compile_kernel():
test_correctness()
if result.returncode == 0:
print("\n✅ ALL TESTS PASSED!")
else:
print("\n❌ TEST FAILED")

View File

@@ -0,0 +1,172 @@
/**
* Standalone CUDA test for FMHA SM100 decode kernel.
* Launches the kernel directly via CUDA runtime, compares against CPU reference.
* No PyTorch or pybind11 needed — just nvcc + CUDA runtime.
*/
#include "fmha_sm100.cuh"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
using namespace dsv4::kernels::attention;
// CPU reference: simple attention
void attention_ref_cpu(
const float* q, const float* k, const float* v,
float* o, float* lse,
int B, int H, int sk, int HD, float scale
) {
for (int b = 0; b < B; b++) {
for (int h = 0; h < H; h++) {
const float* qh = q + (b * H + h) * HD;
const float* kb = k + b * sk * HD;
const float* vb = v + b * HD * sk;
float* oh = o + (b * H + h) * HD;
// S = Q @ K^T * scale
float* s = (float*)malloc(sk * sizeof(float));
float s_max = -FLT_MAX;
for (int c = 0; c < sk; c++) {
float dot = 0.0f;
for (int d = 0; d < HD; d++) dot += qh[d] * kb[c * HD + d];
s[c] = dot * scale;
s_max = fmaxf(s_max, s[c]);
}
// Softmax
float sum = 0.0f;
for (int c = 0; c < sk; c++) {
s[c] = expf(s[c] - s_max);
sum += s[c];
}
for (int c = 0; c < sk; c++) s[c] /= sum;
// O = S @ V
for (int d = 0; d < HD; d++) {
oh[d] = 0.0f;
for (int c = 0; c < sk; c++) {
oh[d] += s[c] * vb[d * sk + c];
}
}
if (lse) lse[b * H + h] = logf(sum) + s_max;
free(s);
}
}
}
// BF16 conversion helpers for CPU
uint16_t f32_to_bf16_cpu(float f) {
uint32_t u;
memcpy(&u, &f, 4);
uint16_t h = (uint16_t)(u >> 16);
return h;
}
float bf16_to_f32_cpu(uint16_t h) {
uint32_t u = ((uint32_t)h) << 16;
float f;
memcpy(&f, &u, 4);
return f;
}
int main() {
printf("=== FMHA SM100 Decode Kernel Test ===\n");
const int B = 1, H = 1, HD = 64, sk = 128;
const float scale = 1.0f / sqrtf((float)HD);
const int smem = 128 * HD * 2 * sizeof(uint16_t) + 1024; // K + V + slack
// Allocate host memory
float *hq = (float*)malloc(B * H * HD * sizeof(float));
float *hk = (float*)malloc(B * sk * HD * sizeof(float));
float *hv = (float*)malloc(B * HD * sk * sizeof(float));
float *ho_ref = (float*)malloc(B * H * HD * sizeof(float));
// Init with random data
srand(42);
for (int i = 0; i < B * H * HD; i++) hq[i] = (float)rand() / RAND_MAX - 0.5f;
for (int i = 0; i < B * sk * HD; i++) hk[i] = (float)rand() / RAND_MAX - 0.5f;
for (int i = 0; i < B * HD * sk; i++) hv[i] = (float)rand() / RAND_MAX - 0.5f;
// CPU reference
attention_ref_cpu(hq, hk, hv, ho_ref, NULL, B, H, sk, HD, scale);
// Convert to BF16
uint16_t *hqb = (uint16_t*)malloc(B * H * HD * sizeof(uint16_t));
uint16_t *hkb = (uint16_t*)malloc(B * sk * HD * sizeof(uint16_t));
uint16_t *hvb = (uint16_t*)malloc(B * HD * sk * sizeof(uint16_t));
uint16_t *hob = (uint16_t*)malloc(B * H * HD * sizeof(uint16_t));
for (int i = 0; i < B * H * HD; i++) hqb[i] = f32_to_bf16_cpu(hq[i]);
for (int i = 0; i < B * sk * HD; i++) hkb[i] = f32_to_bf16_cpu(hk[i]);
for (int i = 0; i < B * HD * sk; i++) hvb[i] = f32_to_bf16_cpu(hv[i]);
// Allocate GPU memory
uint16_t *dq, *dk, *dv, *do_;
float *d_lse;
cudaMalloc(&dq, B * H * HD * sizeof(uint16_t));
cudaMalloc(&dk, B * sk * HD * sizeof(uint16_t));
cudaMalloc(&dv, B * HD * sk * sizeof(uint16_t));
cudaMalloc(&do_, B * H * HD * sizeof(uint16_t));
cudaMalloc(&d_lse, B * H * sizeof(float));
// Copy to GPU
cudaMemcpy(dq, hqb, B * H * HD * sizeof(uint16_t), cudaMemcpyHostToDevice);
cudaMemcpy(dk, hkb, B * sk * HD * sizeof(uint16_t), cudaMemcpyHostToDevice);
cudaMemcpy(dv, hvb, B * HD * sk * sizeof(uint16_t), cudaMemcpyHostToDevice);
cudaMemset(do_, 0, B * H * HD * sizeof(uint16_t));
// Launch kernel
dim3 grid(1, H, B);
dim3 block(NTHREADS);
printf("Launching fmha_decode_ref<%d> <<<(%d,%d,%d), %d>>>...\n", HD, grid.x, grid.y, grid.z, block.x);
fmha_decode_ref<HD><<<grid, block, smem>>>(
dq, dk, dv, do_,
H * HD, sk * HD, H * HD,
sk, 0, 0, scale, NULL, d_lse
);
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
printf("❌ Kernel launch failed: %s\n", cudaGetErrorString(err));
return 1;
}
printf("✅ Kernel launched successfully!\n");
// Copy result back
cudaMemcpy(hob, do_, B * H * HD * sizeof(uint16_t), cudaMemcpyDeviceToHost);
// Compare with reference
float cos_sim = 0.0f, norm_a = 0.0f, norm_b = 0.0f;
for (int i = 0; i < B * H * HD; i++) {
float gpu_val = bf16_to_f32_cpu(hob[i]);
float ref_val = ho_ref[i];
cos_sim += gpu_val * ref_val;
norm_a += gpu_val * gpu_val;
norm_b += ref_val * ref_val;
}
float denom = sqrtf(norm_a) * sqrtf(norm_b);
if (denom > 0) cos_sim /= denom;
printf("\nhd=%d, s_k=%d: cos %.6f %s\n", HD, sk, cos_sim, cos_sim > 0.999f ? "✅ PASS" : "❌ FAIL");
if (cos_sim < 0.999f) {
printf("First 8 values (GPU vs Ref):\n");
for (int i = 0; i < 8; i++) {
printf(" [%d] GPU=%f Ref=%f\n", i, bf16_to_f32_cpu(hob[i]), ho_ref[i]);
}
}
// Cleanup
cudaFree(dq); cudaFree(dk); cudaFree(dv); cudaFree(do_); cudaFree(d_lse);
free(hq); free(hk); free(hv); free(ho_ref);
free(hqb); free(hkb); free(hvb); free(hob);
return cos_sim > 0.999f ? 0 : 1;
}