[Core] Simplify core kv-cache blocks initialization logic (#36521)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
Nick Hill
2026-03-10 13:20:02 -07:00
committed by GitHub
parent 2a68464c5b
commit 65b2f405dc
4 changed files with 28 additions and 37 deletions

View File

@@ -203,21 +203,17 @@ class Worker(WorkerBase):
self.model_runner.init_fp8_kv_scales()
def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager:
if self.vllm_config.model_config.enable_sleep_mode:
from vllm.device_allocator.cumem import CuMemAllocator
allocator = CuMemAllocator.get_instance()
if tag == "weights":
assert allocator.get_current_usage() == 0, (
"Sleep mode can only be used for one instance per process."
)
return allocator.use_memory_pool(tag=tag)
else:
if not self.vllm_config.model_config.enable_sleep_mode:
return nullcontext()
def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks
from vllm.device_allocator.cumem import CuMemAllocator
allocator = CuMemAllocator.get_instance()
if tag == "weights":
assert allocator.get_current_usage() == 0, (
"Sleep mode can only be used for one instance per process."
)
return allocator.use_memory_pool(tag=tag)
@instrument(span_name="Init device")
def init_device(self):