diff --git a/dsv4/layers/linear.py b/dsv4/layers/linear.py index 4281d8ee..fb007010 100644 --- a/dsv4/layers/linear.py +++ b/dsv4/layers/linear.py @@ -136,7 +136,6 @@ class Nvfp4Linear: with torch.no_grad(): _, _, gs = quantize_to_nvfp4(hidden_states_sample) self._activation_global_scale = gs - self._gsa_buf_initialized = False # P2: re-fill on next call def run(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -178,11 +177,7 @@ class Nvfp4Linear: self._gsa_buf.copy_(gsa_gpu[:1].reshape(1)) # GPU → GPU, no sync else: from dsv4.ops.quantize import quantize_nvfp4_gpu - # P2: _activation_global_scale is set once at warmup — no per-call fill needed. - # The buffer retains its value across calls (GPU tensor, persistent). - if not getattr(self, '_gsa_buf_initialized', False): - self._gsa_buf.fill_(self._activation_global_scale) - self._gsa_buf_initialized = True + self._gsa_buf.fill_(self._activation_global_scale) x_fp4, x_sf = quantize_nvfp4_gpu(hidden_states, self._activation_global_scale) # Scatter x_fp4 into padded buffer