P6: Fix constexpr and bf16 conversion in CUDA test

This commit is contained in:
2026-05-30 17:00:05 +00:00
parent 2833eb56e7
commit 1a87e054db

View File

@@ -34,7 +34,7 @@ int main() {
constexpr int N = 128;
constexpr int n_h = 4;
constexpr int batch = 1;
constexpr float scale = 1.0f / sqrtf((float)HD);
const float scale = 1.0f / sqrtf((float)HD);
// Allocate tensors
bf16_t *d_q, *d_k, *d_v;
@@ -151,10 +151,11 @@ int main() {
float* f_direct = new float[n_h * HD];
float* f_tma = new float[n_h * HD];
for (int i = 0; i < n_h * HD; i++) {
f_direct[i] = bf16_to_f32(h_o_direct[i]);
f_tma[i] = bf16_to_f32(h_o_tma[i]);
}
auto b2f = [](bf16_t h) -> float { float f; asm("cvt.f32.bf16 %0, %1;" : "=f"(f) : "h"(h)); return f; };
for (int i = 0; i < n_h * HD; i++) {
f_direct[i] = b2f(h_o_direct[i]);
f_tma[i] = b2f(h_o_tma[i]);
}
float cos = cosine_sim(f_direct, f_tma, n_h * HD);
printf("P6 TMA epilogue test (hd=%d, n_h=%d): cos=%.8f\n", HD, n_h, cos);