CRITICAL FIX: _l1_out_buf was 2x too narrow — caused GPU memory corruption

The L1 GEMM produces gate+up combined output with 2*intermediate_size
BF16 columns, but _l1_out_buf was only allocated with intermediate_size
columns. The GEMM wrote past the buffer boundary, corrupting GPU memory
and causing cudaErrorInvalidValue on subsequent operations.

This was the root cause of ALL the cudaErrorInvalidValue errors in the
shared expert and MoE L2 paths — the corrupted memory from the L1 buffer
overflow propagated downstream.

Fix: allocate _l1_out_buf with 2*intermediate_size columns instead of
intermediate_size. Applied to both shared_expert.py (max_rows rows) and
moe.py (max_num_tokens * top_k rows).

Also removed all DEBUG sync/print statements from quantize.py and
shared_expert.py — the bug was not in the quantize kernels; it was
the buffer overflow.
This commit is contained in:
2026-06-04 02:06:18 +00:00
parent 0890e578f4
commit 676a0448c0
3 changed files with 5 additions and 29 deletions

View File

@@ -166,9 +166,9 @@ class Nvfp4MoE:
self._l2_gsa_buf = torch.zeros(self.num_experts, dtype=torch.float32, device=self.device)
# Pre-allocated L1 GEMM output — avoids torch.zeros() in run_fused_swiglu_grouped_gemm
# Shape: (max_tokens * top_k, intermediate_size) — max possible L1 output
# Shape: (max_tokens * top_k, 2*intermediate_size) — gate+up combined
self._l1_out_buf = torch.zeros(
self.max_num_tokens * self.top_k, self.intermediate_size,
self.max_num_tokens * self.top_k, 2 * self.intermediate_size,
dtype=torch.bfloat16, device=self.device
)

View File

@@ -184,8 +184,9 @@ class Nvfp4SharedExpert:
self._l2_gsa_buf = torch.zeros(1, dtype=torch.float32, device=self.device)
# Pre-allocated L1 output buffer for graph capture
# L1 produces gate+up combined: 2 * intermediate_size BF16 columns
self._l1_out_buf = torch.zeros(
max_rows, self.intermediate_size,
max_rows, 2 * self.intermediate_size,
dtype=torch.bfloat16, device=self.device
)
@@ -365,25 +366,8 @@ class Nvfp4SharedExpert:
from dsv4.ops.quantize import quantize_nvfp4_gpu_fused
if not intermediate.is_contiguous():
intermediate = intermediate.contiguous()
# DEBUG: isolate async CUDA error
torch.cuda.synchronize() # catch any prior async error
x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
try:
torch.cuda.synchronize() # catch error from quantize kernels
except RuntimeError as e:
print(f" SE L2: quantize_nvfp4_gpu_fused FAILED after sync: {e}", flush=True)
print(f" intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device}", flush=True)
raise
# DEBUG: check gsa values before assignment
try:
gsa_first = gsa_l2_gpu[0].item() # DEBUG: read value
print(f" SE L2 gsa[0]={gsa_first:.6f} shape={tuple(gsa_l2_gpu.shape)} dev={gsa_l2_gpu.device} buf_dev={self._l2_gsa_buf.device} buf_shape={tuple(self._l2_gsa_buf.shape)}", flush=True)
# Try copy_ instead of scalar assign
self._l2_gsa_buf.copy_(gsa_l2_gpu[:1].contiguous())
print(f" SE L2 gsa copy_ succeeded", flush=True)
except RuntimeError as e:
print(f" SE L2: gsa assignment FAILED: {e}", flush=True)
raise
self._l2_gsa_buf[0] = gsa_l2_gpu[0] # scalar GPU→GPU, no sync, graph-capturable
else:
x_fp4, x_sf = quantize_activation_nvfp4(
intermediate, self._l2_activation_global_scale
@@ -425,15 +409,11 @@ class Nvfp4SharedExpert:
"""Actual implementation — called via custom autograd to be torch.compile-safe."""
self._ensure_initialized()
# DEBUG: check input
print(f" SE input: shape={tuple(hidden_states.shape)} |max|={hidden_states.abs().max().item():.6f} nan={torch.isnan(hidden_states).any().item()}", flush=True)
if self._fused_swiglu:
# P1: Fused L1 GEMM + SwiGLU + clamp in one kernel launch
intermediate = self._run_l1_fused(hidden_states)
else:
l1_out = self._run_l1(hidden_states)
print(f" SE L1 out: shape={tuple(l1_out.shape)} |max|={l1_out.abs().max().item() if l1_out.numel() > 0 else 'EMPTY'} nan={torch.isnan(l1_out).any().item() if l1_out.numel() > 0 else 'N/A'}", flush=True)
if l1_out.shape[1] < 2 * self.intermediate_size:
print(f" WARNING: l1_out shape {l1_out.shape} < expected (N, {2*self.intermediate_size})", flush=True)
@@ -447,8 +427,6 @@ class Nvfp4SharedExpert:
gate = gate.clamp(max=self.swiglu_limit)
up = up.clamp(min=-self.swiglu_limit, max=self.swiglu_limit)
intermediate = torch.nn.functional.silu(gate) * up
# DEBUG: check intermediate before L2
print(f" SE intermediate: shape={tuple(intermediate.shape)} |max|={intermediate.abs().max().item():.6f} nan={torch.isnan(intermediate).any().item()}", flush=True)
output = self._run_l2(intermediate)
return output

View File

@@ -334,8 +334,6 @@ def quantize_nvfp4_gpu_fused(x_bf16, divisor=6.0 * 448.0):
# For M=1: gsa_gpu is (1,) contiguous — zero allocation
quant_mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
x_fp4, x_sf = quant_mod.quantize_nvfp4_from_buffer(x_bf16, gsa_gpu)
# DEBUG: sync to catch async errors from the quantize kernels
torch.cuda.synchronize()
return x_fp4, x_sf, gsa_gpu