[Hybrid Allocator] Support Pipeline Parallel (#23974)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
This commit is contained in:
Chen Zhang
2025-09-14 15:55:17 -07:00
committed by GitHub
parent 90f3f7d73e
commit 8e5cdcda4e
7 changed files with 472 additions and 235 deletions

View File

@@ -10,7 +10,7 @@ from vllm import LLM
 from vllm.config import ModelImpl
 from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
 from vllm.utils import GiB_bytes
-from vllm.v1.core.kv_cache_utils import get_kv_cache_config
+from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
 from vllm.v1.engine.core import EngineCore as V1EngineCore
 from ..utils import create_new_process_for_each_test
@@ -68,11 +68,11 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
     def _initialize_kv_caches_v1(self, vllm_config):
         kv_cache_specs = self.model_executor.get_kv_cache_specs()
-        scheduler_kv_cache_config = get_kv_cache_config(
+        scheduler_kv_cache_config = get_kv_cache_configs(
             vllm_config,
-            kv_cache_specs[0],
-            10 * GiB_bytes,
-        )
+            kv_cache_specs,
+            [10 * GiB_bytes],
+        )[0]
         # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
         return 1, 0, scheduler_kv_cache_config