[Bugfix] Gate 256-bit instructions to CUDA 12.9+ (#34791)
Signed-off-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -14,7 +14,8 @@ struct alignas(32) u32x8_t {
|
|||||||
};
|
};
|
||||||
|
|
||||||
__device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
|
__device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
|
||||||
|
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
|
||||||
asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n"
|
asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n"
|
||||||
: "=r"(val.u0), "=r"(val.u1), "=r"(val.u2), "=r"(val.u3),
|
: "=r"(val.u0), "=r"(val.u1), "=r"(val.u2), "=r"(val.u3),
|
||||||
"=r"(val.u4), "=r"(val.u5), "=r"(val.u6), "=r"(val.u7)
|
"=r"(val.u4), "=r"(val.u5), "=r"(val.u6), "=r"(val.u7)
|
||||||
@@ -35,7 +36,8 @@ __device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
__device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) {
|
__device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) {
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 && \
|
||||||
|
defined(CUDA_VERSION) && CUDA_VERSION >= 12090
|
||||||
asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n"
|
asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n"
|
||||||
:
|
:
|
||||||
: "l"(ptr), "r"(val.u0), "r"(val.u1), "r"(val.u2), "r"(val.u3),
|
: "l"(ptr), "r"(val.u0), "r"(val.u1), "r"(val.u2), "r"(val.u3),
|
||||||
|
|||||||
Reference in New Issue
Block a user