Fix smem size for prefill debug test

This commit is contained in:
2026-06-03 03:47:01 +00:00
parent 09384a637a
commit dd1cbe1faa

View File

@@ -530,7 +530,8 @@ int main() {
// Launch debug kernel
printf("\n=== GPU Kernel Execution ===\n");
-int smem_size = 256 * 1024; // generous
+int smem_size = 200 * 1024; // ~149KB needed, stay under 232KB limit
cudaFuncSetAttribute(prefill_t2_debug_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
prefill_t2_debug_kernel<<<dim3(1,1,1), 192, smem_size>>>(
d_q_nope_fp8, d_q_nope_scale, d_q_rope_bf16,
d_k_nope_fp8, d_k_nope_scale, d_k_rope_bf16,