[Bugfix] CUDA: clean up KV cache scale views for fp8/int8_per_token_head (#39224)
Signed-off-by: JartX <sagformas@epdcenter.es>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
@@ -5859,6 +5859,13 @@ class GPUModelRunner(
|
||||
layer.kv_cache = (
|
||||
torch.tensor([]) if isinstance(kv_cache, torch.Tensor) else []
|
||||
)
|
||||
# Clean up quantized KV cache scale views
|
||||
# (int8_per_token_head, fp8_per_token_head)
|
||||
if hasattr(layer, "impl"):
|
||||
if hasattr(layer.impl, "_k_scale_cache"):
|
||||
layer.impl._k_scale_cache = None
|
||||
if hasattr(layer.impl, "_v_scale_cache"):
|
||||
layer.impl._v_scale_cache = None
|
||||
|
||||
gc.collect()
|
||||
torch.accelerator.empty_cache()
|
||||
|
||||
Reference in New Issue
Block a user