[Bugfix] CUDA: clean up KV cache scales for fp8/int8_per_token_head (#39224)

Signed-off-by: JartX <sagformas@epdcenter.es>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-04-08 13:08:04 +02:00
parent 6155bbd1dd
commit 140cbb1186
1 changed files with 7 additions and 0 deletions
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -5859,6 +5859,13 @@ class GPUModelRunner(
                layer.kv_cache = (
                    torch.tensor([]) if isinstance(kv_cache, torch.Tensor) else []
                )
+            # Clean up quantized KV cache scale views
+            # (int8_per_token_head, fp8_per_token_head)
+            if hasattr(layer, "impl"):
+                if hasattr(layer.impl, "_k_scale_cache"):
+                    layer.impl._k_scale_cache = None
+                if hasattr(layer.impl, "_v_scale_cache"):
+                    layer.impl._v_scale_cache = None

        gc.collect()
        torch.accelerator.empty_cache()