[Core] Simplify core kv-cache blocks initialization logic (#36521)
Signed-off-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
@@ -203,21 +203,17 @@ class Worker(WorkerBase):
|
||||
self.model_runner.init_fp8_kv_scales()
|
||||
|
||||
def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager:
|
||||
if self.vllm_config.model_config.enable_sleep_mode:
|
||||
from vllm.device_allocator.cumem import CuMemAllocator
|
||||
|
||||
allocator = CuMemAllocator.get_instance()
|
||||
if tag == "weights":
|
||||
assert allocator.get_current_usage() == 0, (
|
||||
"Sleep mode can only be used for one instance per process."
|
||||
)
|
||||
return allocator.use_memory_pool(tag=tag)
|
||||
else:
|
||||
if not self.vllm_config.model_config.enable_sleep_mode:
|
||||
return nullcontext()
|
||||
|
||||
def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
|
||||
self.cache_config.num_gpu_blocks = num_gpu_blocks
|
||||
self.cache_config.num_cpu_blocks = num_cpu_blocks
|
||||
from vllm.device_allocator.cumem import CuMemAllocator
|
||||
|
||||
allocator = CuMemAllocator.get_instance()
|
||||
if tag == "weights":
|
||||
assert allocator.get_current_usage() == 0, (
|
||||
"Sleep mode can only be used for one instance per process."
|
||||
)
|
||||
return allocator.use_memory_pool(tag=tag)
|
||||
|
||||
@instrument(span_name="Init device")
|
||||
def init_device(self):
|
||||
|
||||
Reference in New Issue
Block a user