From 676a0448c0060320ac765e2b98d38496f7a8bbdc Mon Sep 17 00:00:00 2001 From: biondizzle Date: Thu, 4 Jun 2026 02:06:18 +0000 Subject: [PATCH] =?UTF-8?q?CRITICAL=20FIX:=20=5Fl1=5Fout=5Fbuf=20was=202x?= =?UTF-8?q?=20too=20narrow=20=E2=80=94=20caused=20GPU=20memory=20corruptio?= =?UTF-8?q?n?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The L1 GEMM produces gate+up combined output with 2*intermediate_size BF16 columns, but _l1_out_buf was only allocated with intermediate_size columns. The GEMM wrote past the buffer boundary, corrupting GPU memory and causing cudaErrorInvalidValue on subsequent operations. This was the root cause of ALL the cudaErrorInvalidValue errors in the shared expert and MoE L2 paths — the corrupted memory from the L1 buffer overflow propagated downstream. Fix: _l1_out_buf shape (max_rows, 2*intermediate_size) instead of (max_rows, intermediate_size). Applied to both shared_expert.py and moe.py. Also removed all DEBUG sync/print statements from quantize.py and shared_expert.py — the bug was not in the quantize kernels, it was the buffer overflow. --- dsv4/layers/moe.py | 4 ++-- dsv4/layers/shared_expert.py | 28 +++------------------------- dsv4/ops/quantize.py | 2 -- 3 files changed, 5 insertions(+), 29 deletions(-) diff --git a/dsv4/layers/moe.py b/dsv4/layers/moe.py index 0dc0e89e..8743f40c 100644 --- a/dsv4/layers/moe.py +++ b/dsv4/layers/moe.py @@ -166,9 +166,9 @@ class Nvfp4MoE: self._l2_gsa_buf = torch.zeros(self.num_experts, dtype=torch.float32, device=self.device) # Pre-allocated L1 GEMM output — avoids torch.zeros() in run_fused_swiglu_grouped_gemm - # Shape: (max_tokens * top_k, intermediate_size) — max possible L1 output + # Shape: (max_tokens * top_k, 2*intermediate_size) — gate+up combined self._l1_out_buf = torch.zeros( - self.max_num_tokens * self.top_k, self.intermediate_size, + self.max_num_tokens * self.top_k, 2 * self.intermediate_size, dtype=torch.bfloat16, device=self.device ) diff --git a/dsv4/layers/shared_expert.py b/dsv4/layers/shared_expert.py index af3951fa..7a02e58c 100644 --- a/dsv4/layers/shared_expert.py +++ b/dsv4/layers/shared_expert.py @@ -184,8 +184,9 @@ class Nvfp4SharedExpert: self._l2_gsa_buf = torch.zeros(1, dtype=torch.float32, device=self.device) # Pre-allocated L1 output buffer for graph capture + # L1 produces gate+up combined: 2 * intermediate_size BF16 columns self._l1_out_buf = torch.zeros( - max_rows, self.intermediate_size, + max_rows, 2 * self.intermediate_size, dtype=torch.bfloat16, device=self.device ) @@ -365,25 +366,8 @@ class Nvfp4SharedExpert: from dsv4.ops.quantize import quantize_nvfp4_gpu_fused if not intermediate.is_contiguous(): intermediate = intermediate.contiguous() - # DEBUG: isolate async CUDA error - torch.cuda.synchronize() # catch any prior async error x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate) - try: - torch.cuda.synchronize() # catch error from quantize kernels - except RuntimeError as e: - print(f" SE L2: quantize_nvfp4_gpu_fused FAILED after sync: {e}", flush=True) - print(f" intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device}", flush=True) - raise - # DEBUG: check gsa values before assignment - try: - gsa_first = gsa_l2_gpu[0].item() # DEBUG: read value - print(f" SE L2 gsa[0]={gsa_first:.6f} shape={tuple(gsa_l2_gpu.shape)} dev={gsa_l2_gpu.device} buf_dev={self._l2_gsa_buf.device} buf_shape={tuple(self._l2_gsa_buf.shape)}", flush=True) - # Try copy_ instead of scalar assign - self._l2_gsa_buf.copy_(gsa_l2_gpu[:1].contiguous()) - print(f" SE L2 gsa copy_ succeeded", flush=True) - except RuntimeError as e: - print(f" SE L2: gsa assignment FAILED: {e}", flush=True) - raise + self._l2_gsa_buf[0] = gsa_l2_gpu[0] # scalar GPU→GPU, no sync, graph-capturable else: x_fp4, x_sf = quantize_activation_nvfp4( intermediate, self._l2_activation_global_scale @@ -425,15 +409,11 @@ class Nvfp4SharedExpert: """Actual implementation — called via custom autograd to be torch.compile-safe.""" self._ensure_initialized() - # DEBUG: check input - print(f" SE input: shape={tuple(hidden_states.shape)} |max|={hidden_states.abs().max().item():.6f} nan={torch.isnan(hidden_states).any().item()}", flush=True) - if self._fused_swiglu: # P1: Fused L1 GEMM + SwiGLU + clamp in one kernel launch intermediate = self._run_l1_fused(hidden_states) else: l1_out = self._run_l1(hidden_states) - print(f" SE L1 out: shape={tuple(l1_out.shape)} |max|={l1_out.abs().max().item() if l1_out.numel() > 0 else 'EMPTY'} nan={torch.isnan(l1_out).any().item() if l1_out.numel() > 0 else 'N/A'}", flush=True) if l1_out.shape[1] < 2 * self.intermediate_size: print(f" WARNING: l1_out shape {l1_out.shape} < expected (N, {2*self.intermediate_size})", flush=True) @@ -447,8 +427,6 @@ class Nvfp4SharedExpert: gate = gate.clamp(max=self.swiglu_limit) up = up.clamp(min=-self.swiglu_limit, max=self.swiglu_limit) intermediate = torch.nn.functional.silu(gate) * up - # DEBUG: check intermediate before L2 - print(f" SE intermediate: shape={tuple(intermediate.shape)} |max|={intermediate.abs().max().item():.6f} nan={torch.isnan(intermediate).any().item()}", flush=True) output = self._run_l2(intermediate) return output diff --git a/dsv4/ops/quantize.py b/dsv4/ops/quantize.py index 10a51388..3a189555 100644 --- a/dsv4/ops/quantize.py +++ b/dsv4/ops/quantize.py @@ -334,8 +334,6 @@ def quantize_nvfp4_gpu_fused(x_bf16, divisor=6.0 * 448.0): # For M=1: gsa_gpu is (1,) contiguous — zero allocation quant_mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"]) x_fp4, x_sf = quant_mod.quantize_nvfp4_from_buffer(x_bf16, gsa_gpu) - # DEBUG: sync to catch async errors from the quantize kernels - torch.cuda.synchronize() return x_fp4, x_sf, gsa_gpu