From 676a0448c0060320ac765e2b98d38496f7a8bbdc Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Thu, 4 Jun 2026 02:06:18 +0000
Subject: [PATCH] =?UTF-8?q?CRITICAL=20FIX:=20=5Fl1=5Fout=5Fbuf=20was=202x?=
 =?UTF-8?q?=20too=20narrow=20=E2=80=94=20caused=20GPU=20memory=20corruptio?=
 =?UTF-8?q?n?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The L1 GEMM produces gate+up combined output with 2*intermediate_size
BF16 columns, but _l1_out_buf was only allocated with intermediate_size
columns. The GEMM wrote past the buffer boundary, corrupting GPU memory
and causing cudaErrorInvalidValue on subsequent operations.

This was the root cause of ALL the cudaErrorInvalidValue errors in the
shared expert and MoE L2 paths — the corrupted memory from the L1 buffer
overflow propagated downstream.

Fix: _l1_out_buf shape (max_rows, 2*intermediate_size) instead of
(max_rows, intermediate_size). Applied to both shared_expert.py and moe.py.

Also removed all DEBUG sync/print statements from quantize.py and
shared_expert.py — the bug was not in the quantize kernels, it was
the buffer overflow.
---
 dsv4/layers/moe.py           |  4 ++--
 dsv4/layers/shared_expert.py | 28 +++-------------------------
 dsv4/ops/quantize.py         |  2 --
 3 files changed, 5 insertions(+), 29 deletions(-)

diff --git a/dsv4/layers/moe.py b/dsv4/layers/moe.py
index 0dc0e89e..8743f40c 100644
--- a/dsv4/layers/moe.py
+++ b/dsv4/layers/moe.py
@@ -166,9 +166,9 @@ class Nvfp4MoE:
         self._l2_gsa_buf = torch.zeros(self.num_experts, dtype=torch.float32, device=self.device)
         
         # Pre-allocated L1 GEMM output — avoids torch.zeros() in run_fused_swiglu_grouped_gemm
-        # Shape: (max_tokens * top_k, intermediate_size) — max possible L1 output
+        # Shape: (max_tokens * top_k, 2*intermediate_size) — gate+up combined
         self._l1_out_buf = torch.zeros(
-            self.max_num_tokens * self.top_k, self.intermediate_size,
+            self.max_num_tokens * self.top_k, 2 * self.intermediate_size,
             dtype=torch.bfloat16, device=self.device
         )
         
diff --git a/dsv4/layers/shared_expert.py b/dsv4/layers/shared_expert.py
index af3951fa..7a02e58c 100644
--- a/dsv4/layers/shared_expert.py
+++ b/dsv4/layers/shared_expert.py
@@ -184,8 +184,9 @@ class Nvfp4SharedExpert:
         self._l2_gsa_buf = torch.zeros(1, dtype=torch.float32, device=self.device)
         
         # Pre-allocated L1 output buffer for graph capture
+        # L1 produces gate+up combined: 2 * intermediate_size BF16 columns
         self._l1_out_buf = torch.zeros(
-            max_rows, self.intermediate_size,
+            max_rows, 2 * self.intermediate_size,
             dtype=torch.bfloat16, device=self.device
         )
 
@@ -365,25 +366,8 @@ class Nvfp4SharedExpert:
             from dsv4.ops.quantize import quantize_nvfp4_gpu_fused
             if not intermediate.is_contiguous():
                 intermediate = intermediate.contiguous()
-            # DEBUG: isolate async CUDA error
-            torch.cuda.synchronize()  # catch any prior async error
             x_fp4, x_sf, gsa_l2_gpu = quantize_nvfp4_gpu_fused(intermediate)
-            try:
-                torch.cuda.synchronize()  # catch error from quantize kernels
-            except RuntimeError as e:
-                print(f"  SE L2: quantize_nvfp4_gpu_fused FAILED after sync: {e}", flush=True)
-                print(f"  intermediate: shape={tuple(intermediate.shape)} dtype={intermediate.dtype} dev={intermediate.device}", flush=True)
-                raise
-            # DEBUG: check gsa values before assignment
-            try:
-                gsa_first = gsa_l2_gpu[0].item()  # DEBUG: read value
-                print(f"  SE L2 gsa[0]={gsa_first:.6f} shape={tuple(gsa_l2_gpu.shape)} dev={gsa_l2_gpu.device} buf_dev={self._l2_gsa_buf.device} buf_shape={tuple(self._l2_gsa_buf.shape)}", flush=True)
-                # Try copy_ instead of scalar assign
-                self._l2_gsa_buf.copy_(gsa_l2_gpu[:1].contiguous())
-                print(f"  SE L2 gsa copy_ succeeded", flush=True)
-            except RuntimeError as e:
-                print(f"  SE L2: gsa assignment FAILED: {e}", flush=True)
-                raise
+            self._l2_gsa_buf[0] = gsa_l2_gpu[0]  # scalar GPU→GPU, no sync, graph-capturable
         else:
             x_fp4, x_sf = quantize_activation_nvfp4(
                 intermediate, self._l2_activation_global_scale
@@ -425,15 +409,11 @@ class Nvfp4SharedExpert:
         """Actual implementation — called via custom autograd to be torch.compile-safe."""
         self._ensure_initialized()
 
-        # DEBUG: check input
-        print(f"  SE input: shape={tuple(hidden_states.shape)} |max|={hidden_states.abs().max().item():.6f} nan={torch.isnan(hidden_states).any().item()}", flush=True)
-
         if self._fused_swiglu:
             # P1: Fused L1 GEMM + SwiGLU + clamp in one kernel launch
             intermediate = self._run_l1_fused(hidden_states)
         else:
             l1_out = self._run_l1(hidden_states)
-            print(f"  SE L1 out: shape={tuple(l1_out.shape)} |max|={l1_out.abs().max().item() if l1_out.numel() > 0 else 'EMPTY'} nan={torch.isnan(l1_out).any().item() if l1_out.numel() > 0 else 'N/A'}", flush=True)
             if l1_out.shape[1] < 2 * self.intermediate_size:
                 print(f"  WARNING: l1_out shape {l1_out.shape} < expected (N, {2*self.intermediate_size})", flush=True)
 
@@ -447,8 +427,6 @@ class Nvfp4SharedExpert:
                 gate = gate.clamp(max=self.swiglu_limit)
                 up = up.clamp(min=-self.swiglu_limit, max=self.swiglu_limit)
             intermediate = torch.nn.functional.silu(gate) * up
-        # DEBUG: check intermediate before L2
-        print(f"  SE intermediate: shape={tuple(intermediate.shape)} |max|={intermediate.abs().max().item():.6f} nan={torch.isnan(intermediate).any().item()}", flush=True)
 
         output = self._run_l2(intermediate)
         return output
diff --git a/dsv4/ops/quantize.py b/dsv4/ops/quantize.py
index 10a51388..3a189555 100644
--- a/dsv4/ops/quantize.py
+++ b/dsv4/ops/quantize.py
@@ -334,8 +334,6 @@ def quantize_nvfp4_gpu_fused(x_bf16, divisor=6.0 * 448.0):
     # For M=1: gsa_gpu is (1,) contiguous — zero allocation
     quant_mod = get_cuda_module("fused_amax_quantize", ["fused_amax_quantize.cu"])
     x_fp4, x_sf = quant_mod.quantize_nvfp4_from_buffer(x_bf16, gsa_gpu)
-    # DEBUG: sync to catch async errors from the quantize kernels
-    torch.cuda.synchronize()
     return x_fp4, x_sf, gsa_gpu