revert: P2 gsa fill elimination — revert to proven path for e2e stability
The per-call fill_() is only a CPU→GPU scalar write (tiny cost), so eliminating it was a marginal optimization, and the output quality regression (CJK tokens) needs to be investigated separately. P2 can re-land once the regression is confirmed to be sampling-related (not gsa-related). P0/P1 (fused SwiGLU) remain disabled — the kernel arg-binding bug is still unfixed.
This commit is contained in:
@@ -136,7 +136,6 @@ class Nvfp4Linear:
|
||||
with torch.no_grad():
|
||||
_, _, gs = quantize_to_nvfp4(hidden_states_sample)
|
||||
self._activation_global_scale = gs
|
||||
self._gsa_buf_initialized = False # P2: re-fill on next call
|
||||
|
||||
|
||||
def run(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
@@ -178,11 +177,7 @@ class Nvfp4Linear:
|
||||
self._gsa_buf.copy_(gsa_gpu[:1].reshape(1)) # GPU → GPU, no sync
|
||||
else:
|
||||
from dsv4.ops.quantize import quantize_nvfp4_gpu
|
||||
# P2: _activation_global_scale is set once at warmup — no per-call fill needed.
|
||||
# The buffer retains its value across calls (GPU tensor, persistent).
|
||||
if not getattr(self, '_gsa_buf_initialized', False):
|
||||
self._gsa_buf.fill_(self._activation_global_scale)
|
||||
self._gsa_buf_initialized = True
|
||||
self._gsa_buf.fill_(self._activation_global_scale)
|
||||
x_fp4, x_sf = quantize_nvfp4_gpu(hidden_states, self._activation_global_scale)
|
||||
|
||||
# Scatter x_fp4 into padded buffer
|
||||
|
||||
Reference in New Issue
Block a user