From 2dc5b4ec195d2d116b8b16a5f2ff4e11d87f1baa Mon Sep 17 00:00:00 2001 From: biondizzle Date: Mon, 1 Jun 2026 20:42:53 +0000 Subject: [PATCH] Fix sampler kernel stack overflow: reduce MAX_K from 256 to 128 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gsc[128] + gid[128] pair takes 128 * (sizeof(float) + sizeof(int)) = 1KB — within CUDA default stack limit. At 256 entries the same pair takes 256 * 8 = 2KB, which would overflow. Note: probs[128] adds a further 128 * sizeof(float) = 512B per thread (1.5KB across the three arrays) that this estimate does not count; TODO confirm the combined footprint still fits. --- dsv4/kernels/cuda/sampler.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dsv4/kernels/cuda/sampler.cu b/dsv4/kernels/cuda/sampler.cu index 565a2a7c..06cfeb88 100644 --- a/dsv4/kernels/cuda/sampler.cu +++ b/dsv4/kernels/cuda/sampler.cu @@ -99,10 +99,10 @@ __global__ void fused_sampler_kernel( if (tid == 0) { // Merge: find global top-k from BDIM * LK = 8192 candidates - int eff_k = min(top_k, 256); // kernel max - if (eff_k <= 0) eff_k = 256; + int eff_k = min(top_k, 128); // kernel max (stack limit: 128 * 8 = 1KB) + if (eff_k <= 0) eff_k = 128; - float gsc[256]; int gid[256]; int gn = 0; + float gsc[128]; int gid[128]; int gn = 0; for (int t = 0; t < BDIM; t++) { for (int i = 0; i < LK; i++) { float s = s_sc[t*LK+i]; @@ -115,7 +115,7 @@ __global__ void fused_sampler_kernel( // ---------- Phase 3: softmax + top-p + sample ---------- float mx = gsc[0]; // sorted desc, first is max - float probs[256]; float total = 0.0f; + float probs[128]; float total = 0.0f; for (int i = 0; i < gn; i++) { probs[i] = expf(gsc[i] - mx); total += probs[i];