From 61d5e7ba532bf8b0d11454eba5c24e6d8e1f7fee Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Tue, 2 Jun 2026 07:32:10 +0000
Subject: [PATCH] =?UTF-8?q?revert:=20P2=20gsa=20fill=20elimination=20?=
 =?UTF-8?q?=E2=80=94=20revert=20to=20proven=20path=20for=20e2e=20stability?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-call fill_() is a CPU→GPU scalar write, so its cost is tiny and
the optimization that skipped it was marginal. The output-quality
regression (CJK tokens) needs to be investigated separately; P2 can
re-land once the regression is confirmed to be sampling-related (not
gsa-related).

P0/P1 (fused SwiGLU) still disabled — kernel arg-binding bug unfixed.
---
 dsv4/layers/linear.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/dsv4/layers/linear.py b/dsv4/layers/linear.py
index 4281d8ee..fb007010 100644
--- a/dsv4/layers/linear.py
+++ b/dsv4/layers/linear.py
@@ -136,7 +136,6 @@ class Nvfp4Linear:
         with torch.no_grad():
             _, _, gs = quantize_to_nvfp4(hidden_states_sample)
             self._activation_global_scale = gs
-            self._gsa_buf_initialized = False  # P2: re-fill on next call
 
 
     def run(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -178,11 +177,7 @@ class Nvfp4Linear:
             self._gsa_buf.copy_(gsa_gpu[:1].reshape(1))  # GPU → GPU, no sync
         else:
             from dsv4.ops.quantize import quantize_nvfp4_gpu
-            # P2: _activation_global_scale is set once at warmup — no per-call fill needed.
-            # The buffer retains its value across calls (GPU tensor, persistent).
-            if not getattr(self, '_gsa_buf_initialized', False):
-                self._gsa_buf.fill_(self._activation_global_scale)
-                self._gsa_buf_initialized = True
+            self._gsa_buf.fill_(self._activation_global_scale)
             x_fp4, x_sf = quantize_nvfp4_gpu(hidden_states, self._activation_global_scale)
 
         # Scatter x_fp4 into padded buffer