From 61d5e7ba532bf8b0d11454eba5c24e6d8e1f7fee Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 2 Jun 2026 07:32:10 +0000 Subject: [PATCH] =?UTF-8?q?revert:=20P2=20gsa=20fill=20elimination=20?= =?UTF-8?q?=E2=80=94=20revert=20to=20proven=20path=20for=20e2e=20stability?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fill_() is a CPU→GPU scalar write (tiny cost). The optimization was marginal and the output quality regression (CJK tokens) needs investigation separately. P2 can re-land after the regression is confirmed to be sampling-related (not gsa-related). P0/P1 (fused SwiGLU) still disabled — kernel arg-binding bug unfixed. --- dsv4/layers/linear.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/dsv4/layers/linear.py b/dsv4/layers/linear.py index 4281d8ee..fb007010 100644 --- a/dsv4/layers/linear.py +++ b/dsv4/layers/linear.py @@ -136,7 +136,6 @@ class Nvfp4Linear: with torch.no_grad(): _, _, gs = quantize_to_nvfp4(hidden_states_sample) self._activation_global_scale = gs - self._gsa_buf_initialized = False # P2: re-fill on next call def run(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -178,11 +177,7 @@ class Nvfp4Linear: self._gsa_buf.copy_(gsa_gpu[:1].reshape(1)) # GPU → GPU, no sync else: from dsv4.ops.quantize import quantize_nvfp4_gpu - # P2: _activation_global_scale is set once at warmup — no per-call fill needed. - # The buffer retains its value across calls (GPU tensor, persistent). - if not getattr(self, '_gsa_buf_initialized', False): - self._gsa_buf.fill_(self._activation_global_scale) - self._gsa_buf_initialized = True + self._gsa_buf.fill_(self._activation_global_scale) x_fp4, x_sf = quantize_nvfp4_gpu(hidden_states, self._activation_global_scale) # Scatter x_fp4 into padded buffer