[v1] Redo "Support multiple KV cache groups in GPU model runner (#17945)" (#18593)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-05-24 00:39:47 +08:00
parent 9520a989df
commit 6550114c9c
15 changed files with 469 additions and 203 deletions
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -9,9 +9,11 @@ import torch

 from vllm.sampling_params import SamplingParams
 from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
+                                        KVCacheGroupSpec, KVCacheTensor)
 from vllm.v1.sample.metadata import SamplingMetadata
-from vllm.v1.worker.gpu_input_batch import (BlockTable, CachedRequestState,
-                                            InputBatch)
+from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable
+from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch

 VOCAB_SIZE = 1024
 NUM_OUTPUT_TOKENS = 20
@@ -22,6 +24,27 @@ CUDA_DEVICES = [
 MAX_NUM_PROMPT_TOKENS = 64


+def get_kv_cache_config() -> KVCacheConfig:
+    return KVCacheConfig(
+        num_blocks=10,
+        tensors={
+            "layer.0": KVCacheTensor(size=1024),
+        },
+        kv_cache_groups=[
+            KVCacheGroupSpec(
+                layer_names=["layer.0"],
+                kv_cache_spec=FullAttentionSpec(
+                    block_size=1,
+                    num_kv_heads=1,
+                    head_size=16,
+                    dtype=torch.float16,
+                    use_mla=False,
+                ),
+            ),
+        ],
+    )
+
+
 def _compare_objs(obj1, obj2):
    attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a)))
    attr_names = set([
@@ -41,6 +64,10 @@ def _compare_objs(obj1, obj2):
        elif isinstance(a, np.ndarray):
            if np.allclose(a, b):
                is_same = True
+        elif isinstance(a, MultiGroupBlockTable):
+            for a_i, b_i in zip(a.block_tables, b.block_tables):
+                _compare_objs(a_i, b_i)
+            is_same = True
        elif isinstance(a, (BlockTable, SamplingMetadata)):
            _compare_objs(a, b)
            is_same = True  # if we make it here must be same
@@ -198,7 +225,7 @@ def _construct_cached_request_state(req_id_suffix: int):
        sampling_params=_create_sampling_params(),
        mm_inputs=[],
        mm_positions=[],
-        block_ids=[],
+        block_ids=[[]],
        generator=None,
        num_computed_tokens=len(output_token_ids),
        output_token_ids=output_token_ids,
@@ -220,11 +247,11 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
    input_batch: InputBatch = InputBatch(
        max_num_reqs=batch_size,
        max_model_len=1024,
-        max_num_blocks_per_req=10,
        max_num_batched_tokens=1024,
        device=torch.device(device),
        pin_memory=is_pin_memory_available(),
        vocab_size=1024,
+        block_size=1,
    )
    reqs: list[CachedRequestState] = []
    req_id_reqs = {}
@@ -310,20 +337,20 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
    input_batch: InputBatch = InputBatch(
        max_num_reqs=batch_size,
        max_model_len=1024,
-        max_num_blocks_per_req=10,
        max_num_batched_tokens=1024,
        device=torch.device(device),
        pin_memory=is_pin_memory_available(),
        vocab_size=1024,
+        block_size=1,
    )
    ref_input_batch: InputBatch = InputBatch(
        max_num_reqs=batch_size,
        max_model_len=1024,
-        max_num_blocks_per_req=10,
        max_num_batched_tokens=1024,
        device=torch.device(device),
        pin_memory=is_pin_memory_available(),
        vocab_size=1024,
+        block_size=1,
    )

    reqs: list[CachedRequestState] = []