[Bugfix][sleepmode][fp8 kv cache]: Fix FP8 KV cache + sleep(level=2) gibberish output (#28783)

Signed-off-by: vensen <vensenmu@gmail.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
This commit is contained in:
Vensen
2025-11-30 14:24:25 +08:00
committed by GitHub
parent 82c795d6f2
commit 66b5840287
4 changed files with 94 additions and 2 deletions

View File

@@ -141,6 +141,16 @@ class Worker(WorkerBase):
buffer.data.copy_(self._sleep_saved_buffers[name].data)
self._sleep_saved_buffers = {}
# If the KV cache has just been woken up,
# the internal state of cache_engine must be reset,
# especially the FP8 scaling factor.
if (
(tags is None or "kv_cache" in tags)
and self.cache_config.cache_dtype.startswith("fp8")
and hasattr(self.model_runner, "init_fp8_kv_scales")
):
self.model_runner.init_fp8_kv_scales()
def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager:
if self.vllm_config.model_config.enable_sleep_mode:
from vllm.device_allocator.cumem import CuMemAllocator