[Hybrid Allocator] Support Pipeline Parallel (#23974)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-09-14 15:55:17 -07:00
parent 90f3f7d73e
commit 8e5cdcda4e
7 changed files with 472 additions and 235 deletions
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -15,7 +15,7 @@ from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
 from vllm.utils import GiB_bytes, update_environment_variables
 from vllm.v1.core.kv_cache_utils import (estimate_max_model_len,
-                                         get_kv_cache_config)
+                                         get_kv_cache_configs)
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                       SchedulerOutput)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@@ -585,8 +585,8 @@ def test_init_kv_cache_without_kv_sharing():
    available_memory = 20 * GiB_bytes
    # page size for layer 0's kv_cache_spec is 32KB
    num_expected_blocks = 327680  # 20GB / 32KB / 2 (num layers)
-    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
-                                          available_memory)
+    kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
+                                           [available_memory])[0]
    assert kv_cache_config.num_blocks == num_expected_blocks
    assert len(kv_cache_config.kv_cache_tensors) == 2
    assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
@@ -657,8 +657,8 @@ def test_init_kv_cache_with_kv_sharing_valid():
    # with KV sharing, we can allocate (available_mem//page_size//1) blocks
    # which is twice as many as without KV sharing
    num_expected_blocks = 655360  # 20GB / 32KB
-    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
-                                          available_memory)
+    kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
+                                           [available_memory])[0]
    assert kv_cache_config.num_blocks == num_expected_blocks
    assert len(kv_cache_config.kv_cache_tensors) == 1
    # Each layer now has twice the available memory for KV cache
@@ -788,8 +788,8 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
        kv_cache_spec = runner.get_kv_cache_spec()

        available_memory = 5 * GiB_bytes
-        kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
-                                              available_memory)
+        kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
+                                               [available_memory])[0]
        runner.initialize_kv_cache(kv_cache_config)

        # random partition of blocks