Fix sampler kernel stack overflow: reduce MAX_K from 256 to 128

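With a cap of 128, the gsc and gid arrays together take 128 * (sizeof(float) + sizeof(int)) = 128 * 8 = 1024 bytes (1 KB), which is within the default CUDA per-thread stack limit.
With the previous cap of 256 they took 256 * 8 = 2048 bytes (2 KB), which would overflow that limit.
Note: this count covers gsc and gid only; the probs[128] array declared in the same scope adds another 128 * sizeof(float) = 512 bytes.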
This commit is contained in:
2026-06-01 20:42:53 +00:00
parent 360f76b970
commit 2dc5b4ec19

View File

@@ -99,10 +99,10 @@ __global__ void fused_sampler_kernel(
if (tid == 0) {
// Merge: find global top-k from BDIM * LK = 8192 candidates
int eff_k = min(top_k, 256); // kernel max
if (eff_k <= 0) eff_k = 256;
int eff_k = min(top_k, 128); // kernel max (stack limit: 128 * 8 = 1KB)
if (eff_k <= 0) eff_k = 128;
float gsc[256]; int gid[256]; int gn = 0;
float gsc[128]; int gid[128]; int gn = 0;
for (int t = 0; t < BDIM; t++) {
for (int i = 0; i < LK; i++) {
float s = s_sc[t*LK+i];
@@ -115,7 +115,7 @@ __global__ void fused_sampler_kernel(
// ---------- Phase 3: softmax + top-p + sample ----------
float mx = gsc[0]; // sorted desc, first is max
float probs[256]; float total = 0.0f;
float probs[128]; float total = 0.0f;
for (int i = 0; i < gn; i++) {
probs[i] = expf(gsc[i] - mx);
total += probs[i];