102 lines
3.4 KiB
Plaintext
102 lines
3.4 KiB
Plaintext
/**
 * TMEM 2-store test with a tcgen05.wait::st fence between the two stores.
 * The kernel is launched with 64 threads (2 warps); every TMEM operation is
 * guarded by wid == 0, like the working minimal test.
 */
|
|
|
|
#include <cuda_runtime.h>
|
|
#include <cstdio>
|
|
#include <cstdint>
|
|
#include <cstring>
|
|
|
|
// Threads per warp; the kernel divides threadIdx.x by this to get the warp id
// and takes the remainder as the lane index.
constexpr int WARP = 32;
|
|
|
|
// Issues tcgen05.alloc (cta_group::1) for `n` TMEM columns.
//   sp: 32-bit shared::cta address of the word that receives the allocation
//       result (the kernel reads that word back as its TMEM base address).
//   n:  number of columns to allocate (the kernel in this file passes 32).
// The instruction is .sync.aligned, so it is called under a warp-uniform guard.
// The "memory" clobber tells the compiler that the asm writes memory (the
// shared word at `sp`), so a later read of that word cannot be satisfied from
// a value the compiler cached before this statement.
__device__ void tmem_alloc(uint32_t sp, int n) {
    asm volatile(
        "tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%0], %1;"
        :
        : "r"(sp), "r"(n)
        : "memory");
}
|
|
// Issues tcgen05.dealloc (cta_group::1).
//   tp: TMEM address operand; the kernel passes the base it read back from
//       shared memory after tmem_alloc.
//   n:  column-count operand; the kernel passes the same 32 it allocated.
__device__ void tmem_dealloc(uint32_t tp, int n) {
    asm volatile(
        "tcgen05.dealloc.cta_group::1.sync.aligned.b32 "
        "%0, %1;"
        : /* no outputs */
        : "r"(tp), "r"(n));
}
|
|
// Issues one tcgen05.st with shape 16x256b.x1 and 32-bit elements.
//   c:        TMEM address operand (the kernel passes base + column offset).
//   r0 .. r3: the four 32-bit source registers supplied by the calling thread.
// No completion wait is issued here; the kernel follows its stores with
// tcgen05.wait::st.
__device__ void tmem_store(uint32_t c, uint32_t r0, uint32_t r1, uint32_t r2, uint32_t r3) {
    asm volatile(
        "tcgen05.st.sync.aligned.16x256b.x1.b32 "
        "[%0], {%1, %2, %3, %4};"
        : /* no outputs */
        : "r"(c),
          "r"(r0), "r"(r1), "r"(r2), "r"(r3));
}
|
|
// Issues one tcgen05.ld with shape 16x256b.x1 and 32-bit elements.
//   c:        TMEM address operand (the kernel passes base + column offset).
//   r0 .. r3: receive the four 32-bit destination registers of this thread.
// NOTE(review): the PTX ISA describes tcgen05.ld as asynchronous; this wrapper
// issues no tcgen05.wait::ld, so callers presumably must wait before consuming
// r0..r3 -- confirm against the PTX documentation.
__device__ void tmem_load(uint32_t c, uint32_t &r0, uint32_t &r1, uint32_t &r2, uint32_t &r3) {
    asm volatile(
        "tcgen05.ld.sync.aligned.16x256b.x1.b32 "
        "{%0, %1, %2, %3}, [%4];"
        : "=r"(r0), "=r"(r1), "=r"(r2), "=r"(r3)
        : "r"(c));
}
|
|
|
|
/**
 * Allocates 32 TMEM columns, stores one register set at base+0 and another at
 * base+1 (each followed by tcgen05.wait::st and a block barrier), loads both
 * back, and writes lane 0's first loaded register to out[0] and out[1].
 *
 * Launch layout: the host in this file launches <<<1, 64, 1024>>>. Only warp 0
 * issues alloc / st / ld / dealloc; every warp executes the wait::st fences and
 * the __syncthreads() barriers, which sit outside the wid == 0 guards.
 * Shared memory: the first 4 bytes of dynamic shared memory receive the TMEM
 * base address written by tcgen05.alloc.
 *
 * @param out device buffer with room for at least two floats.
 *
 * NOTE(review): with the 16x256b shape a single store presumably spans several
 * 32-bit columns, so the stores at base+0 and base+1 may overlap -- confirm
 * against the PTX ISA shape tables if non-overlapping columns are intended.
 */
__global__ void test_tmem_2store(float* out) {
    // Explicit alignment: the buffer is reinterpreted as uint32_t and is the
    // destination of the allocator's 32-bit write.
    extern __shared__ __align__(16) char sbuf[];
    uint32_t* sBase = reinterpret_cast<uint32_t*>(sbuf);

    const int tid  = threadIdx.x;
    const int lane = tid % WARP;
    const int wid  = tid / WARP;

    // Alloc 32 TMEM columns; the allocator writes the base address to sBase.
    if (wid == 0) {
        tmem_alloc(static_cast<uint32_t>(__cvta_generic_to_shared(sBase)), 32);
    }
    __syncthreads();
    const uint32_t tb = *sBase;

    // Store to column 0: lane L contributes the floats 4L+0 .. 4L+3.
    if (wid == 0) {
        tmem_store(tb + 0,
                   __float_as_uint(static_cast<float>(lane * 4 + 0)),
                   __float_as_uint(static_cast<float>(lane * 4 + 1)),
                   __float_as_uint(static_cast<float>(lane * 4 + 2)),
                   __float_as_uint(static_cast<float>(lane * 4 + 3)));
    }
    // Fence OUTSIDE the guard so that every warp executes it.
    asm volatile("tcgen05.wait::st.sync.aligned;" ::: "memory");
    __syncthreads();

    // Store to column 1: lane L contributes the floats 4L+100 .. 4L+103.
    if (wid == 0) {
        tmem_store(tb + 1,
                   __float_as_uint(static_cast<float>(lane * 4 + 100)),
                   __float_as_uint(static_cast<float>(lane * 4 + 101)),
                   __float_as_uint(static_cast<float>(lane * 4 + 102)),
                   __float_as_uint(static_cast<float>(lane * 4 + 103)));
    }
    asm volatile("tcgen05.wait::st.sync.aligned;" ::: "memory");
    __syncthreads();

    // Read back. tcgen05.ld is asynchronous, so each load is followed by
    // tcgen05.wait::ld before its destination registers are consumed.
    if (wid == 0) {
        uint32_t r0 = 0, r1 = 0, r2 = 0, r3 = 0;

        tmem_load(tb + 0, r0, r1, r2, r3);
        asm volatile("tcgen05.wait::ld.sync.aligned;" ::: "memory");
        if (lane == 0) out[0] = __uint_as_float(r0);

        tmem_load(tb + 1, r0, r1, r2, r3);
        asm volatile("tcgen05.wait::ld.sync.aligned;" ::: "memory");
        if (lane == 0) out[1] = __uint_as_float(r0);
    }
    __syncthreads();

    // Release the columns from the same warp that allocated them.
    if (wid == 0) tmem_dealloc(tb, 32);
}
|
|
|
|
/**
 * Host driver: launches test_tmem_2store with one block of 64 threads and
 * 1024 bytes of dynamic shared memory, copies the two result floats back and
 * prints them next to the expected values (0.0 and 100.0).
 *
 * Returns 0 only when every CUDA call succeeds and both values match;
 * returns 1 otherwise so that scripts can detect a failing run.
 */
int main() {
    printf("=== TMEM 2-Store Test ===\n");

    // Two floats fit on the stack: no heap allocation to check or to leak.
    float h_out[2] = {0.0f, 0.0f};
    float* d_out = nullptr;

    // Prints the runtime's message for a failed call; true means success.
    auto ok = [](cudaError_t err) -> bool {
        if (err == cudaSuccess) return true;
        printf("CUDA ERROR: %s\n", cudaGetErrorString(err));
        return false;
    };

    bool ran = ok(cudaMalloc(&d_out, sizeof(h_out))) &&
               ok(cudaMemset(d_out, 0, sizeof(h_out)));
    if (ran) {
        test_tmem_2store<<<1, 64, 1024>>>(d_out);
        // cudaGetLastError reports launch-configuration problems;
        // cudaDeviceSynchronize reports faults raised while the kernel ran.
        ran = ok(cudaGetLastError()) &&
              ok(cudaDeviceSynchronize()) &&
              ok(cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost));
    }

    // Single cleanup point for every path; cudaFree(nullptr) is a no-op.
    cudaFree(d_out);
    if (!ran) return 1;

    printf("col 0: %.1f (expected 0.0)\n", h_out[0]);
    printf("col 1: %.1f (expected 100.0)\n", h_out[1]);

    const bool passed = fabsf(h_out[0]) < 0.1f && fabsf(h_out[1] - 100.0f) < 0.1f;
    printf("Test %s\n", passed ? "PASSED" : "FAILED");
    return passed ? 0 : 1;
}
|