diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8dfa65da1..9405b5f72 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5859,6 +5859,13 @@ class GPUModelRunner( layer.kv_cache = ( torch.tensor([]) if isinstance(kv_cache, torch.Tensor) else [] ) + # Clean up quantized KV cache scale views + # (int8_per_token_head, fp8_per_token_head) + if hasattr(layer, "impl"): + if hasattr(layer.impl, "_k_scale_cache"): + layer.impl._k_scale_cache = None + if hasattr(layer.impl, "_v_scale_cache"): + layer.impl._v_scale_cache = None gc.collect() torch.accelerator.empty_cache()