diff --git a/dsv4/layers/linear.py b/dsv4/layers/linear.py
index 4281d8ee..fb007010 100644
--- a/dsv4/layers/linear.py
+++ b/dsv4/layers/linear.py
@@ -136,7 +136,6 @@ class Nvfp4Linear:
         with torch.no_grad():
             _, _, gs = quantize_to_nvfp4(hidden_states_sample)
             self._activation_global_scale = gs
-            self._gsa_buf_initialized = False  # P2: re-fill on next call
 
 
     def run(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -178,11 +177,7 @@ class Nvfp4Linear:
             self._gsa_buf.copy_(gsa_gpu[:1].reshape(1))  # GPU → GPU, no sync
         else:
             from dsv4.ops.quantize import quantize_nvfp4_gpu
-            # P2: _activation_global_scale is set once at warmup — no per-call fill needed.
-            # The buffer retains its value across calls (GPU tensor, persistent).
-            if not getattr(self, '_gsa_buf_initialized', False):
-                self._gsa_buf.fill_(self._activation_global_scale)
-                self._gsa_buf_initialized = True
+            self._gsa_buf.fill_(self._activation_global_scale)
             x_fp4, x_sf = quantize_nvfp4_gpu(hidden_states, self._activation_global_scale)
 
         # Scatter x_fp4 into padded buffer