fix: include cuda_bf16.h unconditionally, add --expt-relaxed-constexpr

This commit is contained in:
2026-05-28 05:13:01 +00:00
parent c1266b5275
commit 6bd3356582
2 changed files with 2 additions and 9 deletions

View File

@@ -30,16 +30,8 @@
#pragma once
#include <cuda_runtime.h>
#include <cstdint>
// NOTE: cuda_bf16.h has a C++17 compatibility bug on CUDA 13.2.
// We define a minimal BF16 type using the __bf16 built-in (CUDA 13+).
// __bf16 is a built-in storage type; we wrap it for type safety.
#if defined(__CUDA_ARCH__)
// Device code: __nv_bfloat16 IS available via the built-in
#undef __BF16_COMPAT
#include <cuda_bf16.h>
#endif
#include <cstdint>
// CUTLASS C++ includes (CUDA device code only)
#if defined(__CUDA_ARCH__)

View File

@@ -42,6 +42,7 @@ nvcc_cmd = [
"--x", "cu",
"-o", "/tmp/fmha_sm100_test.o",
"--ptxas-options=-v",
"--expt-relaxed-constexpr",
]
print(f"nvcc command: {' '.join(nvcc_cmd)}")