diff --git a/tests/unit/test_fmha_sm100.py b/tests/unit/test_fmha_sm100.py index 90eb35c7..96a0bfe4 100644 --- a/tests/unit/test_fmha_sm100.py +++ b/tests/unit/test_fmha_sm100.py @@ -1,15 +1,10 @@ """ -Test: Compile FMHA SM100 kernel with nvcc, load as shared library, test correctness. - -This uses ctypes to load the compiled .so instead of torch.utils.cpp_extension -(which adds -D__CUDA_NO_BFLOAT16_CONVERSIONS__ causing ICE with __bf16). +Test: Compile and run standalone FMHA SM100 test via nvcc. +No PyTorch needed — pure CUDA runtime test. """ import subprocess import sys import os -import torch -import math -import ctypes def get_repo_root(): d = os.path.dirname(os.path.abspath(__file__)) @@ -19,62 +14,46 @@ def get_repo_root(): return None REPO = get_repo_root() -CUTLASS = "/root/cutlass" CUDA = "/usr/local/cuda-13.2" -OUT = "/tmp/fmha_sm100_test" -def compile_kernel(): - """Compile the kernel as a shared library using nvcc directly.""" - src = f"{REPO}/dsv4/kernels/attention/fmha_sm100_launch.cu" - out = f"{OUT}.so" +# Step 1: Compile standalone test +print("=" * 60) +print("Compiling standalone FMHA SM100 test...") +print("=" * 60) - cmd = [ - f"{CUDA}/bin/nvcc", - "--std=c++20", - "-shared", - "-Xcompiler", "-fPIC", - f"-gencode=arch=compute_100a,code=sm_100a", - f"-I{REPO}", - f"-I{CUTLASS}/include", - f"-I{CUDA}/include", - "-I/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/include", - "-I/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/include/torch/csrc/api/include", - "-I/usr/include/python3.12", - "-DGOOGLE_CUDA=1", - "--expt-relaxed-constexpr", - src, - "-o", out, - "-L/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/lib", - "-lc10_cuda", "-ltorch_cuda", "-ltorch", "-lc10", - "-lcudart", - ] +src = f"{REPO}/tests/unit/test_fmha_sm100_standalone.cu" +out = "/tmp/fmha_sm100_standalone" - print(f"Compiling: {' '.join(cmd[:5])}...") - result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) - if result.returncode != 0: - print(f"❌ Compilation FAILED:\n{result.stderr[-1000:]}") - return False - print(f"✅ Compiled: {out}") - return True +cmd = [ + f"{CUDA}/bin/nvcc", + "--std=c++20", + f"-gencode=arch=compute_100a,code=sm_100a", + f"-I{REPO}", + "--expt-relaxed-constexpr", + src, + "-o", out, + "-lcudart", +] +print(f"nvcc: {' '.join(cmd[:4])}...") +result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) +if result.returncode != 0: + print(f"❌ Compilation FAILED:\n{result.stderr[-2000:]}") + sys.exit(1) +print(f"✅ Compiled: {out}") -def test_correctness(): - """Test FMHA output against PyTorch reference using ctypes.""" - # Load the shared library - lib = ctypes.CDLL(f"{OUT}.so") - # We'd need to call the function via ctypes, but the function - # returns torch tensors which complicates ctypes. - # Instead, let's write a simple Python test script that uses - # torch.ops to call the custom op. - print("Shared library loaded. Need proper test harness for correctness.") - print("For now, verify kernel compiles and loads successfully.") - return True +# Step 2: Run the test +print("\n" + "=" * 60) +print("Running standalone FMHA SM100 test...") +print("=" * 60) +result = subprocess.run([out], capture_output=True, text=True, timeout=30) +print(result.stdout) +if result.stderr: + print(f"STDERR: {result.stderr[-500:]}") +print(f"Exit code: {result.returncode}") -if __name__ == "__main__": - print("=" * 60) - print("FMHA SM100 — nvcc direct compilation + correctness test") - print("=" * 60) - - if compile_kernel(): - test_correctness() +if result.returncode == 0: + print("\n✅ ALL TESTS PASSED!") +else: + print("\n❌ TEST FAILED") diff --git a/tests/unit/test_fmha_sm100_standalone.cu b/tests/unit/test_fmha_sm100_standalone.cu new file mode 100644 index 00000000..763d3c9f --- /dev/null +++ b/tests/unit/test_fmha_sm100_standalone.cu @@ -0,0 +1,172 @@ +/** + * Standalone CUDA test for FMHA SM100 decode kernel. + * Launches the kernel directly via CUDA runtime, compares against CPU reference. + * No PyTorch or pybind11 needed — just nvcc + CUDA runtime. + */ + +#include "fmha_sm100.cuh" +#include +#include +#include +#include + +using namespace dsv4::kernels::attention; + +// CPU reference: simple attention +void attention_ref_cpu( + const float* q, const float* k, const float* v, + float* o, float* lse, + int B, int H, int sk, int HD, float scale +) { + for (int b = 0; b < B; b++) { + for (int h = 0; h < H; h++) { + const float* qh = q + (b * H + h) * HD; + const float* kb = k + b * sk * HD; + const float* vb = v + b * HD * sk; + float* oh = o + (b * H + h) * HD; + + // S = Q @ K^T * scale + float* s = (float*)malloc(sk * sizeof(float)); + float s_max = -FLT_MAX; + for (int c = 0; c < sk; c++) { + float dot = 0.0f; + for (int d = 0; d < HD; d++) dot += qh[d] * kb[c * HD + d]; + s[c] = dot * scale; + s_max = fmaxf(s_max, s[c]); + } + + // Softmax + float sum = 0.0f; + for (int c = 0; c < sk; c++) { + s[c] = expf(s[c] - s_max); + sum += s[c]; + } + for (int c = 0; c < sk; c++) s[c] /= sum; + + // O = S @ V + for (int d = 0; d < HD; d++) { + oh[d] = 0.0f; + for (int c = 0; c < sk; c++) { + oh[d] += s[c] * vb[d * sk + c]; + } + } + + if (lse) lse[b * H + h] = logf(sum) + s_max; + + free(s); + } + } +} + +// BF16 conversion helpers for CPU +uint16_t f32_to_bf16_cpu(float f) { + uint32_t u; + memcpy(&u, &f, 4); + uint16_t h = (uint16_t)(u >> 16); + return h; +} + +float bf16_to_f32_cpu(uint16_t h) { + uint32_t u = ((uint32_t)h) << 16; + float f; + memcpy(&f, &u, 4); + return f; +} + +int main() { + printf("=== FMHA SM100 Decode Kernel Test ===\n"); + + const int B = 1, H = 1, HD = 64, sk = 128; + const float scale = 1.0f / sqrtf((float)HD); + const int smem = 128 * HD * 2 * sizeof(uint16_t) + 1024; // K + V + slack + + // Allocate host memory + float *hq = (float*)malloc(B * H * HD * sizeof(float)); + float *hk = (float*)malloc(B * sk * HD * sizeof(float)); + float *hv = (float*)malloc(B * HD * sk * sizeof(float)); + float *ho_ref = (float*)malloc(B * H * HD * sizeof(float)); + + // Init with random data + srand(42); + for (int i = 0; i < B * H * HD; i++) hq[i] = (float)rand() / RAND_MAX - 0.5f; + for (int i = 0; i < B * sk * HD; i++) hk[i] = (float)rand() / RAND_MAX - 0.5f; + for (int i = 0; i < B * HD * sk; i++) hv[i] = (float)rand() / RAND_MAX - 0.5f; + + // CPU reference + attention_ref_cpu(hq, hk, hv, ho_ref, NULL, B, H, sk, HD, scale); + + // Convert to BF16 + uint16_t *hqb = (uint16_t*)malloc(B * H * HD * sizeof(uint16_t)); + uint16_t *hkb = (uint16_t*)malloc(B * sk * HD * sizeof(uint16_t)); + uint16_t *hvb = (uint16_t*)malloc(B * HD * sk * sizeof(uint16_t)); + uint16_t *hob = (uint16_t*)malloc(B * H * HD * sizeof(uint16_t)); + + for (int i = 0; i < B * H * HD; i++) hqb[i] = f32_to_bf16_cpu(hq[i]); + for (int i = 0; i < B * sk * HD; i++) hkb[i] = f32_to_bf16_cpu(hk[i]); + for (int i = 0; i < B * HD * sk; i++) hvb[i] = f32_to_bf16_cpu(hv[i]); + + // Allocate GPU memory + uint16_t *dq, *dk, *dv, *do_; + float *d_lse; + cudaMalloc(&dq, B * H * HD * sizeof(uint16_t)); + cudaMalloc(&dk, B * sk * HD * sizeof(uint16_t)); + cudaMalloc(&dv, B * HD * sk * sizeof(uint16_t)); + cudaMalloc(&do_, B * H * HD * sizeof(uint16_t)); + cudaMalloc(&d_lse, B * H * sizeof(float)); + + // Copy to GPU + cudaMemcpy(dq, hqb, B * H * HD * sizeof(uint16_t), cudaMemcpyHostToDevice); + cudaMemcpy(dk, hkb, B * sk * HD * sizeof(uint16_t), cudaMemcpyHostToDevice); + cudaMemcpy(dv, hvb, B * HD * sk * sizeof(uint16_t), cudaMemcpyHostToDevice); + cudaMemset(do_, 0, B * H * HD * sizeof(uint16_t)); + + // Launch kernel + dim3 grid(1, H, B); + dim3 block(NTHREADS); + + printf("Launching fmha_decode_ref<%d> <<<(%d,%d,%d), %d>>>...\n", HD, grid.x, grid.y, grid.z, block.x); + + fmha_decode_ref<<>>( + dq, dk, dv, do_, + H * HD, sk * HD, H * HD, + sk, 0, 0, scale, NULL, d_lse + ); + + cudaError_t err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + printf("❌ Kernel launch failed: %s\n", cudaGetErrorString(err)); + return 1; + } + printf("✅ Kernel launched successfully!\n"); + + // Copy result back + cudaMemcpy(hob, do_, B * H * HD * sizeof(uint16_t), cudaMemcpyDeviceToHost); + + // Compare with reference + float cos_sim = 0.0f, norm_a = 0.0f, norm_b = 0.0f; + for (int i = 0; i < B * H * HD; i++) { + float gpu_val = bf16_to_f32_cpu(hob[i]); + float ref_val = ho_ref[i]; + cos_sim += gpu_val * ref_val; + norm_a += gpu_val * gpu_val; + norm_b += ref_val * ref_val; + } + float denom = sqrtf(norm_a) * sqrtf(norm_b); + if (denom > 0) cos_sim /= denom; + + printf("\nhd=%d, s_k=%d: cos %.6f %s\n", HD, sk, cos_sim, cos_sim > 0.999f ? "✅ PASS" : "❌ FAIL"); + + if (cos_sim < 0.999f) { + printf("First 8 values (GPU vs Ref):\n"); + for (int i = 0; i < 8; i++) { + printf(" [%d] GPU=%f Ref=%f\n", i, bf16_to_f32_cpu(hob[i]), ho_ref[i]); + } + } + + // Cleanup + cudaFree(dq); cudaFree(dk); cudaFree(dv); cudaFree(do_); cudaFree(d_lse); + free(hq); free(hk); free(hv); free(ho_ref); + free(hqb); free(hkb); free(hvb); free(hob); + + return cos_sim > 0.999f ? 0 : 1; +}