diff --git a/dsv4/kernels/cuda/sampler.cu b/dsv4/kernels/cuda/sampler.cu index 565a2a7c..06cfeb88 100644 --- a/dsv4/kernels/cuda/sampler.cu +++ b/dsv4/kernels/cuda/sampler.cu @@ -99,10 +99,10 @@ __global__ void fused_sampler_kernel( if (tid == 0) { // Merge: find global top-k from BDIM * LK = 8192 candidates - int eff_k = min(top_k, 256); // kernel max - if (eff_k <= 0) eff_k = 256; + int eff_k = min(top_k, 128); // kernel max (stack limit: 128 * 8 = 1KB) + if (eff_k <= 0) eff_k = 128; - float gsc[256]; int gid[256]; int gn = 0; + float gsc[128]; int gid[128]; int gn = 0; for (int t = 0; t < BDIM; t++) { for (int i = 0; i < LK; i++) { float s = s_sc[t*LK+i]; @@ -115,7 +115,7 @@ __global__ void fused_sampler_kernel( // ---------- Phase 3: softmax + top-p + sample ---------- float mx = gsc[0]; // sorted desc, first is max - float probs[256]; float total = 0.0f; + float probs[128]; float total = 0.0f; for (int i = 0; i < gn; i++) { probs[i] = expf(gsc[i] - mx); total += probs[i];