[Bugfix] Gate 256-bit instructions to CUDA 12.9+ (#34791)

Signed-off-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
Huy Do
2026-02-21 04:48:14 -08:00
committed by GitHub
parent f74f1572ca
commit 272b535ab3

View File

@@ -14,7 +14,8 @@ struct alignas(32) u32x8_t {
}; };
__device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) { __device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n" asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n"
: "=r"(val.u0), "=r"(val.u1), "=r"(val.u2), "=r"(val.u3), : "=r"(val.u0), "=r"(val.u1), "=r"(val.u2), "=r"(val.u3),
"=r"(val.u4), "=r"(val.u5), "=r"(val.u6), "=r"(val.u7) "=r"(val.u4), "=r"(val.u5), "=r"(val.u6), "=r"(val.u7)
@@ -35,7 +36,8 @@ __device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
} }
__device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) { __device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n" asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n"
: :
: "l"(ptr), "r"(val.u0), "r"(val.u1), "r"(val.u2), "r"(val.u3), : "l"(ptr), "r"(val.u0), "r"(val.u1), "r"(val.u2), "r"(val.u3),