/**
 * Test: Softmax → PV (1 K-tile) via TS MMA.
 *
 * Write S (random data) to TMEM, softmax reads it, writes P to TMEM,
 * then TS MMA does PV for 1 K-tile.
 * This isolates the softmax→PV path from QK.
 *
 * Compile-time sizes used by the test: SK = 128, HD = 16, BLOCK_MN = 128.
 * The kernel is declared with __launch_bounds__(128).
 *
 * Pass criterion (from the tail of the host driver): the program prints
 * "PASSED" and exits with status 0 when cos_sim > 0.9f, otherwise prints
 * "FAILED" and exits with status 1. It also exits with status 1 if
 * cudaDeviceSynchronize() reports an error after the kernel launch.
 *
 * NOTE(review): this copy of the file appears to be a damaged extraction and
 * is not expected to build as-is. Restore it from the original source
 * before relying on it:
 *   - the first five #include directives have no header name;
 *   - the kernel loop `for (int j=0;j` runs directly into the launch
 *     arguments `>>>(d_s, d_v, ...)`, so the rest of the kernel body, the
 *     start of the host driver and the launch configuration are missing;
 *   - the result-printing loop `for(int d=0;d` runs directly into
 *     `0.9f ? "PASSED" : "FAILED"`, so the printing and the cos_sim
 *     computation are missing;
 *   - line breaks were collapsed, so the `//` comment after `s_input,`
 *     now extends over the remainder of that physical line.
 *
 * NOTE(review): the in-kernel comment says "128 cols for S/P, 32 cols for O"
 * while tmem_alloc is passed 256 and the O base is computed as tb + 128 —
 * confirm the intended column budget against the tmem_alloc helper.
 */
#include #include #include #include #include #include "dsv4/kernels/attention/fmha_common.cuh" #include "dsv4/kernels/attention/fmha_umma_desc.cuh" using namespace dsv4::kernels::attention; static bf16_t f32_to_bf16_host(float f) { uint32_t u; memcpy(&u,&f,4); return (uint16_t)(u>>16); } static float bf16_to_f32_host(bf16_t h) { uint32_t u=(uint32_t)h<<16; float f; memcpy(&f,&u,4); return f; } constexpr int SK = 128, HD = 16, BLOCK_MN = 128; __global__ void __launch_bounds__(128) test_softmax_pv(const float* __restrict__ s_input, // (SK,) — row 0 of S const bf16_t* __restrict__ v, // (HD, SK) float* __restrict__ o_mma, float* __restrict__ o_ref, float scale) { const int tid = threadIdx.x, wid = tid / 32, lane = tid % 32; extern __shared__ char sbuf[]; uint32_t* sTmemBase = (uint32_t*)sbuf; bf16_t* sV = (bf16_t*)(((uintptr_t)(sbuf + 4) + 15) & ~(uintptr_t)15); // Load V K-tile 0: (16, 16) for (int i = tid; i < 256; i += 128) sV[i] = 0; for (int d = tid; d < HD; d += 128) { for (int lr = 0; lr < 16; lr++) { int ck = d / 8, lc = d % 8; int tmn = lr / 8, llr = lr % 8; int dst_idx = ck * 2 * 64 + tmn * 64 + llr * 8 + lc; sV[dst_idx] = v[d * SK + lr]; // First 16 positions of V } } __syncthreads(); // TMEM alloc: 128 cols for S/P, 32 cols for O if (wid == 1) tmem_alloc(__cvta_generic_to_shared(sTmemBase), 256); __syncthreads(); uint32_t tb = *sTmemBase; uint32_t tb_o = tb + 128; // Write S row 0 to TMEM directly (bypass QK, use input data) // All 128 columns of S: for decode T=1, only row 0 has data if (wid == 0) { float s_vals[SK]; // Read S from GMEM (already scaled) if (lane == 0) for (int j=0;j>>(d_s, d_v, d_o_mma, d_o_ref, SCALE); cudaError_t err = cudaDeviceSynchronize(); if (err != cudaSuccess) { printf("CUDA ERROR: %s\n", cudaGetErrorString(err)); return 1; } 
// Tail of the host driver: copy both HD-float result buffers back to the host,
// print the verdict, release device and host buffers, and return the status.
cudaMemcpy(h_o_mma, d_o_mma, HD*sizeof(float), cudaMemcpyDeviceToHost); cudaMemcpy(h_o_ref, d_o_ref, HD*sizeof(float), cudaMemcpyDeviceToHost); printf("O[0..15] MMA: "); for(int d=0;d 0.9f ? "PASSED" : "FAILED"); cudaFree(d_s); cudaFree(d_v); cudaFree(d_o_mma); cudaFree(d_o_ref); free(h_s); free(h_v); free(h_o_mma); free(h_o_ref); return cos_sim > 0.9f ? 0 : 1; }