[V1][core] Implement pipeline parallel on Ray (#12996)

This commit is contained in:
Rui Qiao
2025-02-13 00:02:46 -08:00
committed by GitHub
parent 0ccd8769fb
commit 9605c1256e
7 changed files with 110 additions and 45 deletions

View File

@@ -2,7 +2,7 @@
"""A GPU worker class."""
import gc
import os
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING, List, Optional
import torch
import torch.distributed
@@ -194,8 +194,9 @@ class Worker:
def get_kv_cache_spec(self) -> KVCacheSpec:
return self.model_runner.get_kv_cache_spec()
def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None:
def initialize_cache(self, kv_cache_configs: List[KVCacheConfig]) -> None:
"""Allocate GPU KV cache with the specified kv_cache_config."""
kv_cache_config = kv_cache_configs[self.rank]
if self.vllm_config.model_config.enable_sleep_mode:
allocator = CuMemAllocator.get_instance()
context = allocator.use_memory_pool(tag="kv_cache")