diff --git a/tests/unit/test_prefill_t2_debug.cu b/tests/unit/test_prefill_t2_debug.cu index d34acebf..e8b55eb2 100644 --- a/tests/unit/test_prefill_t2_debug.cu +++ b/tests/unit/test_prefill_t2_debug.cu @@ -530,7 +530,8 @@ int main() { // Launch debug kernel printf("\n=== GPU Kernel Execution ===\n"); - int smem_size = 256 * 1024; // generous + int smem_size = 200 * 1024; // ~149KB needed, stay under 232KB limit + cudaFuncSetAttribute(prefill_t2_debug_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); prefill_t2_debug_kernel<<>>( d_q_nope_fp8, d_q_nope_scale, d_q_rope_bf16, d_k_nope_fp8, d_k_nope_scale, d_k_rope_bf16,