[Hybrid] A simpler algorithm to find kernel_block_size (#26476)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Chen Zhang
2025-10-31 14:30:28 -07:00
committed by GitHub
parent 0e0a638c3b
commit df334868ca
3 changed files with 146 additions and 82 deletions


@@ -6,6 +6,7 @@ import pytest
import torch
from vllm.attention import Attention
from vllm.attention.backends.abstract import MultipleOf
from vllm.config import (
    CacheConfig,
    ModelConfig,
@@ -34,6 +35,7 @@ from vllm.v1.kv_cache_interface import (
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
from vllm.v1.worker.utils import AttentionGroup

BLOCK_SIZE = 16
NUM_BLOCKS = 10
@@ -181,6 +183,57 @@ def _is_req_state_block_table_match(model_runner, req_id: str) -> bool:
).all()


def _make_mock_backend_for_kernel_block_size(
    supported_sizes: list[int | MultipleOf],
):
    class _MockBackend:
        @staticmethod
        def get_supported_kernel_block_size():
            return supported_sizes

    return _MockBackend()


def _make_kv_cache_spec() -> FullAttentionSpec:
    return FullAttentionSpec(
        block_size=1, num_kv_heads=1, head_size=1, dtype=torch.float16
    )


def test_select_common_block_size_prefers_manager_block_size():
    backend_a = _make_mock_backend_for_kernel_block_size([MultipleOf(32)])
    backend_b = _make_mock_backend_for_kernel_block_size([64, MultipleOf(16)])
    attn_groups = [
        AttentionGroup(backend_a, [], [], _make_kv_cache_spec(), 0),
        AttentionGroup(backend_b, [], [], _make_kv_cache_spec(), 0),
    ]
    selected_size = GPUModelRunner.select_common_block_size(128, attn_groups)
    assert selected_size == 128


def test_select_common_block_size_uses_largest_shared_int():
    backend_a = _make_mock_backend_for_kernel_block_size([128, 64])
    backend_b = _make_mock_backend_for_kernel_block_size([64, 32])
    attn_groups = [
        AttentionGroup(backend_a, [], [], _make_kv_cache_spec(), 0),
        AttentionGroup(backend_b, [], [], _make_kv_cache_spec(), 0),
    ]
    selected_size = GPUModelRunner.select_common_block_size(256, attn_groups)
    assert selected_size == 64


def test_select_common_block_size_no_valid_option():
    backend_a = _make_mock_backend_for_kernel_block_size([64])
    backend_b = _make_mock_backend_for_kernel_block_size([MultipleOf(16)])
    attn_groups = [
        AttentionGroup(backend_a, [], [], _make_kv_cache_spec(), 0),
        AttentionGroup(backend_b, [], [], _make_kv_cache_spec(), 0),
    ]
    with pytest.raises(ValueError):
        GPUModelRunner.select_common_block_size(48, attn_groups)


def test_update_states_new_request(model_runner, dist_init):
    req_id = "req_0"
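
Read together, the three new tests pin down the selection rule: the kernel block size must evenly divide the KV cache manager's block size and be accepted by every attention backend (either as an explicit int or via a MultipleOf constraint); the manager's own block size is preferred when possible, the largest shared divisor is used otherwise, and a ValueError is raised when nothing qualifies. Below is a minimal sketch of that rule, inferred from the test cases rather than taken from the actual GPUModelRunner.select_common_block_size; the standalone MultipleOf stand-in and helper names are illustrative only.

# Sketch of the selection rule exercised by the tests above. Inferred from
# the test cases, not copied from vLLM; names here are hypothetical.
from dataclasses import dataclass


@dataclass
class MultipleOf:
    # Stand-in for vllm.attention.backends.abstract.MultipleOf: the backend
    # accepts any block size that is a multiple of `base`.
    base: int


def _backend_supports(supported: list, size: int) -> bool:
    # A backend accepts `size` if it appears as an explicit int or is a
    # multiple of some MultipleOf(base) entry.
    for entry in supported:
        if isinstance(entry, MultipleOf):
            if size % entry.base == 0:
                return True
        elif entry == size:
            return True
    return False


def select_common_block_size(manager_block_size: int, all_supported) -> int:
    # Walk the divisors of the manager's block size, largest first, so the
    # manager's own size wins whenever every backend accepts it (first test)
    # and the largest shared int wins otherwise (second test).
    for size in range(manager_block_size, 0, -1):
        if manager_block_size % size != 0:
            continue
        if all(_backend_supports(s, size) for s in all_supported):
            return size
    # No divisor works for every backend (third test: 64 does not divide 48).
    raise ValueError(
        f"no kernel block size divides {manager_block_size} and is "
        "supported by every attention backend"
    )


# Replaying the three tests:
assert select_common_block_size(128, [[MultipleOf(32)], [64, MultipleOf(16)]]) == 128
assert select_common_block_size(256, [[128, 64], [64, 32]]) == 64
# select_common_block_size(48, [[64], [MultipleOf(16)]]) raises ValueError

Scanning divisors downward from the manager block size makes the "prefer the manager size, else take the largest shared option" behavior fall out of a single loop, which fits the "simpler algorithm" framing in the commit title.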