test: standalone CUDA test for FMHA SM100 (no PyTorch needed)
This commit is contained in:
@@ -1,15 +1,10 @@
|
||||
"""
|
||||
Test: Compile FMHA SM100 kernel with nvcc, load as shared library, test correctness.
|
||||
|
||||
This uses ctypes to load the compiled .so instead of torch.utils.cpp_extension
|
||||
(which adds -D__CUDA_NO_BFLOAT16_CONVERSIONS__ causing ICE with __bf16).
|
||||
Test: Compile and run standalone FMHA SM100 test via nvcc.
|
||||
No PyTorch needed — pure CUDA runtime test.
|
||||
"""
|
||||
import subprocess
|
||||
import sys
|
||||
import os
|
||||
import torch
|
||||
import math
|
||||
import ctypes
|
||||
|
||||
def get_repo_root():
|
||||
d = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -19,62 +14,46 @@ def get_repo_root():
|
||||
return None
|
||||
|
||||
REPO = get_repo_root()
|
||||
CUTLASS = "/root/cutlass"
|
||||
CUDA = "/usr/local/cuda-13.2"
|
||||
OUT = "/tmp/fmha_sm100_test"
|
||||
|
||||
def compile_kernel():
|
||||
"""Compile the kernel as a shared library using nvcc directly."""
|
||||
src = f"{REPO}/dsv4/kernels/attention/fmha_sm100_launch.cu"
|
||||
out = f"{OUT}.so"
|
||||
# Step 1: Compile standalone test
|
||||
print("=" * 60)
|
||||
print("Compiling standalone FMHA SM100 test...")
|
||||
print("=" * 60)
|
||||
|
||||
cmd = [
|
||||
f"{CUDA}/bin/nvcc",
|
||||
"--std=c++20",
|
||||
"-shared",
|
||||
"-Xcompiler", "-fPIC",
|
||||
f"-gencode=arch=compute_100a,code=sm_100a",
|
||||
f"-I{REPO}",
|
||||
f"-I{CUTLASS}/include",
|
||||
f"-I{CUDA}/include",
|
||||
"-I/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/include",
|
||||
"-I/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/include/torch/csrc/api/include",
|
||||
"-I/usr/include/python3.12",
|
||||
"-DGOOGLE_CUDA=1",
|
||||
"--expt-relaxed-constexpr",
|
||||
src,
|
||||
"-o", out,
|
||||
"-L/root/dsv4-nvfp4-workspace/venv/lib/python3.12/site-packages/torch/lib",
|
||||
"-lc10_cuda", "-ltorch_cuda", "-ltorch", "-lc10",
|
||||
"-lcudart",
|
||||
]
|
||||
src = f"{REPO}/tests/unit/test_fmha_sm100_standalone.cu"
|
||||
out = "/tmp/fmha_sm100_standalone"
|
||||
|
||||
print(f"Compiling: {' '.join(cmd[:5])}...")
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
|
||||
if result.returncode != 0:
|
||||
print(f"❌ Compilation FAILED:\n{result.stderr[-1000:]}")
|
||||
return False
|
||||
print(f"✅ Compiled: {out}")
|
||||
return True
|
||||
cmd = [
|
||||
f"{CUDA}/bin/nvcc",
|
||||
"--std=c++20",
|
||||
f"-gencode=arch=compute_100a,code=sm_100a",
|
||||
f"-I{REPO}",
|
||||
"--expt-relaxed-constexpr",
|
||||
src,
|
||||
"-o", out,
|
||||
"-lcudart",
|
||||
]
|
||||
|
||||
print(f"nvcc: {' '.join(cmd[:4])}...")
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
|
||||
if result.returncode != 0:
|
||||
print(f"❌ Compilation FAILED:\n{result.stderr[-2000:]}")
|
||||
sys.exit(1)
|
||||
print(f"✅ Compiled: {out}")
|
||||
|
||||
def test_correctness():
|
||||
"""Test FMHA output against PyTorch reference using ctypes."""
|
||||
# Load the shared library
|
||||
lib = ctypes.CDLL(f"{OUT}.so")
|
||||
# We'd need to call the function via ctypes, but the function
|
||||
# returns torch tensors which complicates ctypes.
|
||||
# Instead, let's write a simple Python test script that uses
|
||||
# torch.ops to call the custom op.
|
||||
print("Shared library loaded. Need proper test harness for correctness.")
|
||||
print("For now, verify kernel compiles and loads successfully.")
|
||||
return True
|
||||
# Step 2: Run the test
|
||||
print("\n" + "=" * 60)
|
||||
print("Running standalone FMHA SM100 test...")
|
||||
print("=" * 60)
|
||||
|
||||
result = subprocess.run([out], capture_output=True, text=True, timeout=30)
|
||||
print(result.stdout)
|
||||
if result.stderr:
|
||||
print(f"STDERR: {result.stderr[-500:]}")
|
||||
print(f"Exit code: {result.returncode}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("=" * 60)
|
||||
print("FMHA SM100 — nvcc direct compilation + correctness test")
|
||||
print("=" * 60)
|
||||
|
||||
if compile_kernel():
|
||||
test_correctness()
|
||||
if result.returncode == 0:
|
||||
print("\n✅ ALL TESTS PASSED!")
|
||||
else:
|
||||
print("\n❌ TEST FAILED")
|
||||
|
||||
172
tests/unit/test_fmha_sm100_standalone.cu
Normal file
172
tests/unit/test_fmha_sm100_standalone.cu
Normal file
@@ -0,0 +1,172 @@
|
||||
/**
|
||||
* Standalone CUDA test for FMHA SM100 decode kernel.
|
||||
* Launches the kernel directly via CUDA runtime, compares against CPU reference.
|
||||
* No PyTorch or pybind11 needed — just nvcc + CUDA runtime.
|
||||
*/
|
||||
|
||||
#include "fmha_sm100.cuh"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
|
||||
using namespace dsv4::kernels::attention;
|
||||
|
||||
// CPU reference: simple attention
|
||||
void attention_ref_cpu(
|
||||
const float* q, const float* k, const float* v,
|
||||
float* o, float* lse,
|
||||
int B, int H, int sk, int HD, float scale
|
||||
) {
|
||||
for (int b = 0; b < B; b++) {
|
||||
for (int h = 0; h < H; h++) {
|
||||
const float* qh = q + (b * H + h) * HD;
|
||||
const float* kb = k + b * sk * HD;
|
||||
const float* vb = v + b * HD * sk;
|
||||
float* oh = o + (b * H + h) * HD;
|
||||
|
||||
// S = Q @ K^T * scale
|
||||
float* s = (float*)malloc(sk * sizeof(float));
|
||||
float s_max = -FLT_MAX;
|
||||
for (int c = 0; c < sk; c++) {
|
||||
float dot = 0.0f;
|
||||
for (int d = 0; d < HD; d++) dot += qh[d] * kb[c * HD + d];
|
||||
s[c] = dot * scale;
|
||||
s_max = fmaxf(s_max, s[c]);
|
||||
}
|
||||
|
||||
// Softmax
|
||||
float sum = 0.0f;
|
||||
for (int c = 0; c < sk; c++) {
|
||||
s[c] = expf(s[c] - s_max);
|
||||
sum += s[c];
|
||||
}
|
||||
for (int c = 0; c < sk; c++) s[c] /= sum;
|
||||
|
||||
// O = S @ V
|
||||
for (int d = 0; d < HD; d++) {
|
||||
oh[d] = 0.0f;
|
||||
for (int c = 0; c < sk; c++) {
|
||||
oh[d] += s[c] * vb[d * sk + c];
|
||||
}
|
||||
}
|
||||
|
||||
if (lse) lse[b * H + h] = logf(sum) + s_max;
|
||||
|
||||
free(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// BF16 conversion helpers for CPU
|
||||
uint16_t f32_to_bf16_cpu(float f) {
|
||||
uint32_t u;
|
||||
memcpy(&u, &f, 4);
|
||||
uint16_t h = (uint16_t)(u >> 16);
|
||||
return h;
|
||||
}
|
||||
|
||||
float bf16_to_f32_cpu(uint16_t h) {
|
||||
uint32_t u = ((uint32_t)h) << 16;
|
||||
float f;
|
||||
memcpy(&f, &u, 4);
|
||||
return f;
|
||||
}
|
||||
|
||||
int main() {
|
||||
printf("=== FMHA SM100 Decode Kernel Test ===\n");
|
||||
|
||||
const int B = 1, H = 1, HD = 64, sk = 128;
|
||||
const float scale = 1.0f / sqrtf((float)HD);
|
||||
const int smem = 128 * HD * 2 * sizeof(uint16_t) + 1024; // K + V + slack
|
||||
|
||||
// Allocate host memory
|
||||
float *hq = (float*)malloc(B * H * HD * sizeof(float));
|
||||
float *hk = (float*)malloc(B * sk * HD * sizeof(float));
|
||||
float *hv = (float*)malloc(B * HD * sk * sizeof(float));
|
||||
float *ho_ref = (float*)malloc(B * H * HD * sizeof(float));
|
||||
|
||||
// Init with random data
|
||||
srand(42);
|
||||
for (int i = 0; i < B * H * HD; i++) hq[i] = (float)rand() / RAND_MAX - 0.5f;
|
||||
for (int i = 0; i < B * sk * HD; i++) hk[i] = (float)rand() / RAND_MAX - 0.5f;
|
||||
for (int i = 0; i < B * HD * sk; i++) hv[i] = (float)rand() / RAND_MAX - 0.5f;
|
||||
|
||||
// CPU reference
|
||||
attention_ref_cpu(hq, hk, hv, ho_ref, NULL, B, H, sk, HD, scale);
|
||||
|
||||
// Convert to BF16
|
||||
uint16_t *hqb = (uint16_t*)malloc(B * H * HD * sizeof(uint16_t));
|
||||
uint16_t *hkb = (uint16_t*)malloc(B * sk * HD * sizeof(uint16_t));
|
||||
uint16_t *hvb = (uint16_t*)malloc(B * HD * sk * sizeof(uint16_t));
|
||||
uint16_t *hob = (uint16_t*)malloc(B * H * HD * sizeof(uint16_t));
|
||||
|
||||
for (int i = 0; i < B * H * HD; i++) hqb[i] = f32_to_bf16_cpu(hq[i]);
|
||||
for (int i = 0; i < B * sk * HD; i++) hkb[i] = f32_to_bf16_cpu(hk[i]);
|
||||
for (int i = 0; i < B * HD * sk; i++) hvb[i] = f32_to_bf16_cpu(hv[i]);
|
||||
|
||||
// Allocate GPU memory
|
||||
uint16_t *dq, *dk, *dv, *do_;
|
||||
float *d_lse;
|
||||
cudaMalloc(&dq, B * H * HD * sizeof(uint16_t));
|
||||
cudaMalloc(&dk, B * sk * HD * sizeof(uint16_t));
|
||||
cudaMalloc(&dv, B * HD * sk * sizeof(uint16_t));
|
||||
cudaMalloc(&do_, B * H * HD * sizeof(uint16_t));
|
||||
cudaMalloc(&d_lse, B * H * sizeof(float));
|
||||
|
||||
// Copy to GPU
|
||||
cudaMemcpy(dq, hqb, B * H * HD * sizeof(uint16_t), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(dk, hkb, B * sk * HD * sizeof(uint16_t), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(dv, hvb, B * HD * sk * sizeof(uint16_t), cudaMemcpyHostToDevice);
|
||||
cudaMemset(do_, 0, B * H * HD * sizeof(uint16_t));
|
||||
|
||||
// Launch kernel
|
||||
dim3 grid(1, H, B);
|
||||
dim3 block(NTHREADS);
|
||||
|
||||
printf("Launching fmha_decode_ref<%d> <<<(%d,%d,%d), %d>>>...\n", HD, grid.x, grid.y, grid.z, block.x);
|
||||
|
||||
fmha_decode_ref<HD><<<grid, block, smem>>>(
|
||||
dq, dk, dv, do_,
|
||||
H * HD, sk * HD, H * HD,
|
||||
sk, 0, 0, scale, NULL, d_lse
|
||||
);
|
||||
|
||||
cudaError_t err = cudaDeviceSynchronize();
|
||||
if (err != cudaSuccess) {
|
||||
printf("❌ Kernel launch failed: %s\n", cudaGetErrorString(err));
|
||||
return 1;
|
||||
}
|
||||
printf("✅ Kernel launched successfully!\n");
|
||||
|
||||
// Copy result back
|
||||
cudaMemcpy(hob, do_, B * H * HD * sizeof(uint16_t), cudaMemcpyDeviceToHost);
|
||||
|
||||
// Compare with reference
|
||||
float cos_sim = 0.0f, norm_a = 0.0f, norm_b = 0.0f;
|
||||
for (int i = 0; i < B * H * HD; i++) {
|
||||
float gpu_val = bf16_to_f32_cpu(hob[i]);
|
||||
float ref_val = ho_ref[i];
|
||||
cos_sim += gpu_val * ref_val;
|
||||
norm_a += gpu_val * gpu_val;
|
||||
norm_b += ref_val * ref_val;
|
||||
}
|
||||
float denom = sqrtf(norm_a) * sqrtf(norm_b);
|
||||
if (denom > 0) cos_sim /= denom;
|
||||
|
||||
printf("\nhd=%d, s_k=%d: cos %.6f %s\n", HD, sk, cos_sim, cos_sim > 0.999f ? "✅ PASS" : "❌ FAIL");
|
||||
|
||||
if (cos_sim < 0.999f) {
|
||||
printf("First 8 values (GPU vs Ref):\n");
|
||||
for (int i = 0; i < 8; i++) {
|
||||
printf(" [%d] GPU=%f Ref=%f\n", i, bf16_to_f32_cpu(hob[i]), ho_ref[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
cudaFree(dq); cudaFree(dk); cudaFree(dv); cudaFree(do_); cudaFree(d_lse);
|
||||
free(hq); free(hk); free(hv); free(ho_ref);
|
||||
free(hqb); free(hkb); free(hvb); free(hob);
|
||||
|
||||
return cos_sim > 0.999f ? 0 : 1;
|
||||
}
|
||||
Reference in New Issue
Block a user