[v1] Support multiple KV cache groups in GPU model runner (#17945)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-15 09:54:54 +08:00
parent f25e0d1125
commit e60f550b38
16 changed files with 482 additions and 215 deletions
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -19,7 +19,8 @@ from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType,
                                         hash_request_tokens,
                                         unify_kv_cache_configs)
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
-                                        KVCacheGroupSpec, KVCacheTensor)
+                                        KVCacheGroupSpec, KVCacheTensor,
+                                        SlidingWindowSpec)
 from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request

@@ -54,12 +55,14 @@ def new_kv_cache_spec(block_size=16,
                      num_kv_heads=2,
                      head_size=64,
                      dtype=torch.float32,
-                      use_mla=False):
+                      use_mla=False,
+                      sliding_window=None):
    return FullAttentionSpec(block_size=block_size,
                             num_kv_heads=num_kv_heads,
                             head_size=head_size,
                             dtype=dtype,
-                             use_mla=use_mla)
+                             use_mla=use_mla,
+                             sliding_window=sliding_window)


 def test_none_hash():
@@ -471,6 +474,68 @@ def test_unify_kv_cache_configs():
        unify_kv_cache_configs(diff_kv_cache_config)


+def test_merge_kv_cache_spec():
+    same_layer_specs = [
+        new_kv_cache_spec(num_kv_heads=32),
+        new_kv_cache_spec(num_kv_heads=32),
+    ]
+    merged_layer_spec = same_layer_specs[0].merge(same_layer_specs)
+    assert merged_layer_spec.block_size == 16
+    assert merged_layer_spec.num_kv_heads == 32
+    assert merged_layer_spec.head_size == 64
+    assert merged_layer_spec.dtype == torch.float32
+    assert merged_layer_spec.sliding_window is None
+
+    different_layer_specs = [
+        new_kv_cache_spec(num_kv_heads=32),
+        new_kv_cache_spec(num_kv_heads=16),
+    ]
+    with pytest.raises(AssertionError):
+        different_layer_specs[0].merge(different_layer_specs)
+
+    full_spec = new_kv_cache_spec(num_kv_heads=32)
+    different_type_layer_specs = [
+        full_spec,
+        SlidingWindowSpec(
+            block_size=full_spec.block_size,
+            num_kv_heads=full_spec.num_kv_heads,
+            head_size=full_spec.head_size,
+            dtype=full_spec.dtype,
+            use_mla=full_spec.use_mla,
+            sliding_window=1,
+        ),
+    ]
+    with pytest.raises(AssertionError):
+        different_type_layer_specs[0].merge(different_type_layer_specs)
+    with pytest.raises(AssertionError):
+        different_type_layer_specs[1].merge(different_type_layer_specs)
+
+    different_sliding_window_layer_specs = [
+        new_kv_cache_spec(num_kv_heads=32),
+        new_kv_cache_spec(num_kv_heads=32, sliding_window=1),
+        new_kv_cache_spec(num_kv_heads=32, sliding_window=2),
+    ]
+    with pytest.raises(ValueError):
+        different_sliding_window_layer_specs[0].merge(
+            different_sliding_window_layer_specs)
+
+    same_sliding_window_layer_specs = [
+        new_kv_cache_spec(num_kv_heads=32, sliding_window=1),
+        new_kv_cache_spec(num_kv_heads=32, sliding_window=1),
+    ]
+    merged_layer_spec = same_sliding_window_layer_specs[0].merge(
+        same_sliding_window_layer_specs)
+    assert merged_layer_spec.sliding_window == 1
+
+    same_sliding_window_layer_spec_with_none = [
+        new_kv_cache_spec(num_kv_heads=32, sliding_window=1),
+        new_kv_cache_spec(num_kv_heads=32, sliding_window=None),
+    ]
+    merged_layer_spec = same_sliding_window_layer_spec_with_none[0].merge(
+        same_sliding_window_layer_spec_with_none)
+    assert merged_layer_spec.sliding_window == 1
+
+
@pytest.mark.parametrize(
    ("model_id", "max_model_len", "want_estimated_max_len"), [
        ("Qwen/Qwen1.5-7B", 16385, 16384),
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -84,7 +84,7 @@ def test_prefill(hash_algo):
    blocks = manager.allocate_slots(req0, 55,
                                    len(computed_blocks.blocks) * 16,
                                    computed_blocks)
-    assert blocks.get_block_ids() == [1, 2, 3, 4]
+    assert blocks.get_block_ids() == [[1, 2, 3, 4]]

    # Check full block metadata
    parent_block_hash = None
@@ -107,13 +107,13 @@ def test_prefill(hash_algo):
    req1 = make_request("1", common_token_ids + unique_token_ids)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(manager.req_to_block_hashes[req1.request_id]) == 3
-    assert computed_blocks.get_block_ids() == [1, 2, 3]
+    assert computed_blocks.get_block_ids() == [[1, 2, 3]]
    assert num_computed_tokens == 3 * 16
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req1, num_new_tokens,
                                    len(computed_blocks.blocks) * 16,
                                    computed_blocks)
-    assert blocks.get_block_ids() == [5]
+    assert blocks.get_block_ids() == [[5]]
    for block in computed_blocks.blocks:
        assert block.ref_cnt == 2

@@ -141,13 +141,13 @@ def test_prefill(hash_algo):
    req2 = make_request("2", common_token_ids + unique_token_ids)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert len(manager.req_to_block_hashes[req2.request_id]) == 3
-    assert computed_blocks.get_block_ids() == [1, 2, 3]
+    assert computed_blocks.get_block_ids() == [[1, 2, 3]]
    assert num_computed_tokens == 3 * 16
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req2, num_new_tokens,
                                    len(computed_blocks.blocks) * 16,
                                    computed_blocks)
-    assert blocks.get_block_ids() == [6]
+    assert blocks.get_block_ids() == [[6]]

    # Although we only have 6 free blocks, we have 8 blocks in
    # the free block queue due to lazy removal.
@@ -171,7 +171,7 @@ def test_prefill(hash_algo):
                                    len(computed_blocks.blocks) * 16,
                                    computed_blocks)
    # This block ID order also checks the eviction order.
-    assert blocks.get_block_ids() == [7, 8, 9, 10, 4, 5, 6, 3, 2, 1]
+    assert blocks.get_block_ids() == [[7, 8, 9, 10, 4, 5, 6, 3, 2, 1]]
    assert manager.block_pool.free_block_queue.num_free_blocks == 0
    assert manager.block_pool.free_block_queue.free_list_head is None
    assert manager.block_pool.free_block_queue.free_list_tail is None
@@ -208,7 +208,7 @@ def test_prefill_plp():
    blocks = manager.allocate_slots(req0, 55,
                                    len(computed_blocks.blocks) * 16,
                                    computed_blocks)
-    assert blocks.get_block_ids() == [1, 2, 3, 4]
+    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
    req0_block_hashes = [b.block_hash for b in blocks.blocks]

    # Check full block metadata
@@ -233,13 +233,13 @@ def test_prefill_plp():
    req1 = make_request("1", common_token_ids + unique_token_ids)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(manager.req_to_block_hashes[req1.request_id]) == 3
-    assert computed_blocks.get_block_ids() == [1, 2, 3]
+    assert computed_blocks.get_block_ids() == [[1, 2, 3]]
    assert num_computed_tokens == 3 * 16
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req1, num_new_tokens,
                                    len(computed_blocks.blocks) * 16,
                                    computed_blocks)
-    assert blocks.get_block_ids() == [5]
+    assert blocks.get_block_ids() == [[5]]
    for block in computed_blocks.blocks:
        assert block.ref_cnt == 2

@@ -277,11 +277,11 @@ def test_prefill_plp():
    block_ids = blocks.get_block_ids()
    # Duplicate cached blocks have different ids but same hashes vs request #0
    assert [b.block_hash for b in blocks.blocks] == req0_block_hashes
-    assert block_ids != [1, 2, 3, 4]
+    assert block_ids != [[1, 2, 3, 4]]

    # Request #2 block hashes are valid since request #0 hashes are.
    # Check block reference counts.
-    for block_id in block_ids:
+    for block_id in block_ids[0]:
        assert manager.block_pool.blocks[block_id].ref_cnt == 1

    manager.free(req2)
@@ -307,7 +307,7 @@ def test_decode():
    blocks = manager.allocate_slots(req0, 55,
                                    len(computed_blocks.blocks) * 16,
                                    computed_blocks)
-    assert blocks.get_block_ids() == [1, 2, 3, 4]
+    assert blocks.get_block_ids() == [[1, 2, 3, 4]]

    # Append slots without allocating a new block.
    req0.num_computed_tokens = 55
@@ -379,12 +379,12 @@ def test_evict():
    # Touch the first 2 blocks.
    req2 = make_request("2", list(range(2 * 16 + 3)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert computed_blocks.get_block_ids() == [1, 2]
+    assert computed_blocks.get_block_ids() == [[1, 2]]
    assert num_computed_tokens == 2 * 16
    blocks = manager.allocate_slots(req2, 3,
                                    len(computed_blocks.blocks) * 16,
                                    computed_blocks)
-    assert blocks.get_block_ids() == [10]
+    assert blocks.get_block_ids() == [[10]]
    assert manager.block_pool.free_block_queue.num_free_blocks == 7


@@ -625,7 +625,7 @@ def test_mm_prefix_caching():
    blocks = manager.allocate_slots(req0, 59,
                                    len(computed_blocks.blocks) * 16,
                                    computed_blocks)
-    assert blocks.get_block_ids() == [1, 2, 3, 4]
+    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
    req0.num_computed_tokens = 59

    # Append slots without allocating a new block.
@@ -686,7 +686,7 @@ def test_cache_key_salting():
    blocks = manager.allocate_slots(req0, 59,
                                    len(computed_blocks.blocks) * 16,
                                    computed_blocks)
-    assert blocks.get_block_ids() == [1, 2, 3, 4]
+    assert blocks.get_block_ids() == [[1, 2, 3, 4]]
    req0.num_computed_tokens = 59

    # Append slots without allocating a new block.
@@ -797,7 +797,7 @@ def test_reset_prefix_cache():
    all_token_ids = full_block_token_ids + unique_token_ids
    req0 = make_request("0", all_token_ids)
    blocks = manager.allocate_slots(req0, 55)
-    assert blocks.get_block_ids() == [1, 2, 3, 4]
+    assert blocks.get_block_ids() == [[1, 2, 3, 4]]

    unique_token_ids = [4] * 7
    all_token_ids = full_block_token_ids + unique_token_ids
@@ -808,7 +808,7 @@ def test_reset_prefix_cache():
    blocks = manager.allocate_slots(req1, 7,
                                    len(computed_blocks.blocks) * 16,
                                    computed_blocks)
-    assert blocks.get_block_ids() == [5]
+    assert blocks.get_block_ids() == [[5]]

    # Failed to reset prefix cache because some blocks are not freed yet.
    assert not manager.reset_prefix_cache()
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -9,9 +9,11 @@ import torch

 from vllm.sampling_params import SamplingParams
 from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
+                                        KVCacheGroupSpec, KVCacheTensor)
 from vllm.v1.sample.metadata import SamplingMetadata
-from vllm.v1.worker.gpu_input_batch import (BlockTable, CachedRequestState,
-                                            InputBatch)
+from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable
+from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch

 VOCAB_SIZE = 1024
 NUM_OUTPUT_TOKENS = 20
@@ -22,6 +24,27 @@ CUDA_DEVICES = [
 MAX_NUM_PROMPT_TOKENS = 64


+def get_kv_cache_config() -> KVCacheConfig:
+    return KVCacheConfig(
+        num_blocks=10,
+        tensors={
+            "layer.0": KVCacheTensor(size=1024),
+        },
+        kv_cache_groups=[
+            KVCacheGroupSpec(
+                layer_names=["layer.0"],
+                kv_cache_spec=FullAttentionSpec(
+                    block_size=1,
+                    num_kv_heads=1,
+                    head_size=16,
+                    dtype=torch.float16,
+                    use_mla=False,
+                ),
+            ),
+        ],
+    )
+
+
 def _compare_objs(obj1, obj2):
    attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a)))
    attr_names = set([
@@ -41,6 +64,10 @@ def _compare_objs(obj1, obj2):
        elif isinstance(a, np.ndarray):
            if np.allclose(a, b):
                is_same = True
+        elif isinstance(a, MultiGroupBlockTable):
+            for a_i, b_i in zip(a.block_tables, b.block_tables):
+                _compare_objs(a_i, b_i)
+            is_same = True
        elif isinstance(a, (BlockTable, SamplingMetadata)):
            _compare_objs(a, b)
            is_same = True  # if we make it here must be same
@@ -198,7 +225,7 @@ def _construct_cached_request_state(req_id_suffix: int):
        sampling_params=_create_sampling_params(),
        mm_inputs=[],
        mm_positions=[],
-        block_ids=[],
+        block_ids=[[]],
        generator=None,
        num_computed_tokens=len(output_token_ids),
        output_token_ids=output_token_ids,
@@ -220,11 +247,11 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
    input_batch: InputBatch = InputBatch(
        max_num_reqs=batch_size,
        max_model_len=1024,
-        max_num_blocks_per_req=10,
        max_num_batched_tokens=1024,
        device=torch.device(device),
        pin_memory=is_pin_memory_available(),
        vocab_size=1024,
+        kv_cache_config=get_kv_cache_config(),
    )
    reqs: list[CachedRequestState] = []
    req_id_reqs = {}
@@ -310,20 +337,20 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
    input_batch: InputBatch = InputBatch(
        max_num_reqs=batch_size,
        max_model_len=1024,
-        max_num_blocks_per_req=10,
        max_num_batched_tokens=1024,
        device=torch.device(device),
        pin_memory=is_pin_memory_available(),
        vocab_size=1024,
+        kv_cache_config=get_kv_cache_config(),
    )
    ref_input_batch: InputBatch = InputBatch(
        max_num_reqs=batch_size,
        max_model_len=1024,
-        max_num_blocks_per_req=10,
        max_num_batched_tokens=1024,
        device=torch.device(device),
        pin_memory=is_pin_memory_available(),
        vocab_size=1024,
+        kv_cache_config=get_kv_cache_config(),
    )

    reqs: list[CachedRequestState] = []
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -1,15 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
-import weakref

 import pytest
-import torch

-from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
+from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, VllmConfig)
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                       SchedulerOutput)
-from vllm.v1.kv_cache_interface import FullAttentionSpec
+from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
+                                        KVCacheGroupSpec, KVCacheTensor)
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner


@@ -17,13 +18,34 @@ def initialize_kv_cache(runner: GPUModelRunner):
    """
    Only perform necessary steps in GPUModelRunner.initialize_kv_cache()
    """
-    kv_cache_spec = FullAttentionSpec(block_size=16,
-                                      num_kv_heads=1,
-                                      head_size=64,
-                                      dtype=torch.float16,
-                                      use_mla=False)
-    runner.attn_metadata_builder = runner.attn_backend.get_builder_cls()(
-        weakref.proxy(runner), kv_cache_spec, runner.input_batch.block_table)
+    kv_cache_config = KVCacheConfig(
+        num_blocks=10,
+        tensors={
+            "layer.0": KVCacheTensor(size=1024),
+        },
+        kv_cache_groups=[
+            KVCacheGroupSpec(
+                layer_names=["layer.0"],
+                kv_cache_spec=FullAttentionSpec(
+                    block_size=16,
+                    num_kv_heads=runner.model_config.get_num_kv_heads(
+                        runner.parallel_config),
+                    head_size=runner.model_config.get_head_size(),
+                    dtype=runner.kv_cache_dtype,
+                    use_mla=False,
+                ))
+        ])
+    runner.kv_cache_config = kv_cache_config
+    runner.input_batch = InputBatch(
+        max_num_reqs=runner.max_num_reqs,
+        max_model_len=runner.max_model_len,
+        max_num_batched_tokens=runner.max_num_tokens,
+        device=runner.device,
+        pin_memory=runner.pin_memory,
+        vocab_size=runner.model_config.get_vocab_size(),
+        kv_cache_config=kv_cache_config,
+    )
+    runner.initialize_attn_backend(kv_cache_config)


@pytest.fixture
@@ -48,10 +70,12 @@ def model_runner():
        swap_space=0,
        cache_dtype="auto",
    )
+    parallel_config = ParallelConfig()
    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=cache_config,
        scheduler_config=scheduler_config,
+        parallel_config=parallel_config,
    )

    device = "cuda"
@@ -73,7 +97,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
                mm_hashes=[],
                mm_positions=[],
                sampling_params=SamplingParams(),
-                block_ids=[0],
+                block_ids=[[0]],
                num_computed_tokens=0,
                lora_request=None,
            ))
@@ -111,13 +135,14 @@ def _is_sampling_metadata_changed(model_runner,

 def _is_req_state_block_table_match(model_runner, req_id: str) -> bool:
    req_index = model_runner.input_batch.req_id_to_index[req_id]
-    block_table = model_runner.input_batch.block_table
+    block_table = model_runner.input_batch.block_table[0]
    req_state = model_runner.requests[req_id]
-    if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids):
+    if block_table.num_blocks_per_row[req_index] != len(
+            req_state.block_ids[0]):
        return False
    num_blocks = block_table.num_blocks_per_row[req_index]
    return (block_table.block_table_np[req_index, :num_blocks] ==
-            req_state.block_ids).all()
+            req_state.block_ids[0]).all()


 def test_update_states_new_request(model_runner):
@@ -200,7 +225,7 @@ def test_update_states_request_resumed(model_runner):
        req_id=req_id,
        resumed_from_preemption=False,
        new_token_ids=[],
-        new_block_ids=[],
+        new_block_ids=[[]],
        num_computed_tokens=0,
    )