[Bugfix] CUDA: Clean up KV cache scales for fp8/int8_per_token_head (#39224)

Signed-off-by: JartX <sagformas@epdcenter.es>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
JartX
2026-04-08 13:08:04 +02:00
committed by GitHub
parent 6155bbd1dd
commit 140cbb1186

View File

@@ -5859,6 +5859,13 @@ class GPUModelRunner(
layer.kv_cache = (
torch.tensor([]) if isinstance(kv_cache, torch.Tensor) else []
)
# Clean up quantized KV cache scale views
# (int8_per_token_head, fp8_per_token_head)
if hasattr(layer, "impl"):
if hasattr(layer.impl, "_k_scale_cache"):
layer.impl._k_scale_cache = None
if hasattr(layer.impl, "_v_scale_cache"):
layer.impl._v_scale_cache = None
gc.collect()
torch.accelerator.empty_cache()