[Hybrid Allocator] Support Pipeline Parallel (#23974)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
This commit is contained in:
@@ -15,7 +15,7 @@ from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import GiB_bytes, update_environment_variables
|
||||
from vllm.v1.core.kv_cache_utils import (estimate_max_model_len,
|
||||
-                                        get_kv_cache_config)
+                                        get_kv_cache_configs)
|
||||
from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
|
||||
SchedulerOutput)
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
@@ -585,8 +585,8 @@ def test_init_kv_cache_without_kv_sharing():
|
||||
available_memory = 20 * GiB_bytes
|
||||
# page size for layer 0's kv_cache_spec is 32KB
|
||||
num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers)
|
||||
-kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
-                                      available_memory)
+kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
+                                       [available_memory])[0]
|
||||
assert kv_cache_config.num_blocks == num_expected_blocks
|
||||
assert len(kv_cache_config.kv_cache_tensors) == 2
|
||||
assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
|
||||
@@ -657,8 +657,8 @@ def test_init_kv_cache_with_kv_sharing_valid():
|
||||
# with KV sharing, we can allocate (available_mem//page_size//1) blocks
|
||||
# which is twice as many as without KV sharing
|
||||
num_expected_blocks = 655360 # 20GB / 32KB
|
||||
-kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
-                                      available_memory)
+kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
+                                       [available_memory])[0]
|
||||
assert kv_cache_config.num_blocks == num_expected_blocks
|
||||
assert len(kv_cache_config.kv_cache_tensors) == 1
|
||||
# Each layer now has twice the available memory for KV cache
|
||||
@@ -788,8 +788,8 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
|
||||
kv_cache_spec = runner.get_kv_cache_spec()
|
||||
|
||||
available_memory = 5 * GiB_bytes
|
||||
-kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
-                                      available_memory)
+kv_cache_config = get_kv_cache_configs(vllm_config, [kv_cache_spec],
+                                       [available_memory])[0]
|
||||
runner.initialize_kv_cache(kv_cache_config)
|
||||
|
||||
# random partition of blocks
|
||||
|
||||
Reference in New Issue
Block a user