fix: include cuda_bf16.h unconditionally, add --expt-relaxed-constexpr
This commit is contained in:
@@ -30,16 +30,8 @@
|
||||
#pragma once
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstdint>
|
||||
|
||||
// NOTE: cuda_bf16.h has a C++17 compatibility bug on CUDA 13.2.
|
||||
// We define a minimal BF16 type using the __bf16 built-in (CUDA 13+).
|
||||
// __bf16 is a built-in storage type; we wrap it for type safety.
|
||||
#if defined(__CUDA_ARCH__)
|
||||
// Device code: __nv_bfloat16 IS available via the built-in
|
||||
#undef __BF16_COMPAT
|
||||
#include <cuda_bf16.h>
|
||||
#endif
|
||||
#include <cstdint>
|
||||
|
||||
// CUTLASS C++ includes (CUDA device code only)
|
||||
#if defined(__CUDA_ARCH__)
|
||||
|
||||
@@ -42,6 +42,7 @@ nvcc_cmd = [
|
||||
"--x", "cu",
|
||||
"-o", "/tmp/fmha_sm100_test.o",
|
||||
"--ptxas-options=-v",
|
||||
"--expt-relaxed-constexpr",
|
||||
]
|
||||
|
||||
print(f"nvcc command: {' '.join(nvcc_cmd)}")
|
||||
|
||||
Reference in New Issue
Block a user