Fix sampler kernel stack overflow: reduce MAX_K from 256 to 128
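Each entry in the per-thread gsc/gid arrays costs sizeof(float) + sizeof(int) = 8 bytes, so 128 entries need 128 * 8 = 1 KB, which fits within CUDA's default per-thread stack limit; the previous 256 entries needed 256 * 8 = 2 KB and would overflow it.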
This commit is contained in:
@@ -99,10 +99,10 @@ __global__ void fused_sampler_kernel(
|
||||
|
||||
if (tid == 0) {
|
||||
// Merge: find global top-k from BDIM * LK = 8192 candidates
|
||||
int eff_k = min(top_k, 256); // kernel max
|
||||
if (eff_k <= 0) eff_k = 256;
|
||||
int eff_k = min(top_k, 128); // kernel max (stack limit: 128 * 8 = 1KB)
|
||||
if (eff_k <= 0) eff_k = 128;
|
||||
|
||||
float gsc[256]; int gid[256]; int gn = 0;
|
||||
float gsc[128]; int gid[128]; int gn = 0;
|
||||
for (int t = 0; t < BDIM; t++) {
|
||||
for (int i = 0; i < LK; i++) {
|
||||
float s = s_sc[t*LK+i];
|
||||
@@ -115,7 +115,7 @@ __global__ void fused_sampler_kernel(
|
||||
|
||||
// ---------- Phase 3: softmax + top-p + sample ----------
|
||||
float mx = gsc[0]; // sorted desc, first is max
|
||||
float probs[256]; float total = 0.0f;
|
||||
float probs[128]; float total = 0.0f;
|
||||
for (int i = 0; i < gn; i++) {
|
||||
probs[i] = expf(gsc[i] - mx);
|
||||
total += probs[i];
|
||||
|
||||
Reference in New Issue
Block a user