Fix smem size for prefill debug test
This commit is contained in:
@@ -530,7 +530,8 @@ int main() {
 
     // Launch debug kernel
     printf("\n=== GPU Kernel Execution ===\n");
-    int smem_size = 256 * 1024; // generous
+    int smem_size = 200 * 1024; // ~149KB needed, stay under 232KB limit
+    cudaFuncSetAttribute(prefill_t2_debug_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
     prefill_t2_debug_kernel<<<dim3(1,1,1), 192, smem_size>>>(
         d_q_nope_fp8, d_q_nope_scale, d_q_rope_bf16,
         d_k_nope_fp8, d_k_nope_scale, d_k_rope_bf16,
Reference in New Issue
Block a user