Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
Parent: 17edd8a807
Commit: d6953beb91
1508 changed files with 115244 additions and 94146 deletions
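
The change is mechanical: yapf aligned wrapped arguments under the opening parenthesis, while ruff's Black-compatible formatter uses a hanging indent with one argument per line and a trailing comma, and collapses constructs that fit within the line limit. A minimal, self-contained sketch of the pattern seen throughout the hunks below; the helper here is a dummy stand-in for illustration, not the real vLLM test utility:

def create_scheduler(**kwargs):
    # Dummy stand-in so the snippet runs on its own; the real helper lives in
    # the scheduler test utilities touched by this commit.
    return kwargs

CHUNK_SIZE = 1000
BLOCK_SIZE = 16

# yapf + isort (before): wrapped arguments aligned under the opening parenthesis.
# scheduler = create_scheduler(async_scheduling=True,
#                              max_num_batched_tokens=CHUNK_SIZE,
#                              enable_prefix_caching=True,
#                              block_size=BLOCK_SIZE)

# ruff format (after): hanging indent, one argument per line, trailing comma.
scheduler = create_scheduler(
    async_scheduling=True,
    max_num_batched_tokens=CHUNK_SIZE,
    enable_prefix_caching=True,
    block_size=BLOCK_SIZE,
)

The same rewrapping, applied across the test suite, accounts for most of the insertions and deletions counted above.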


@@ -15,14 +15,12 @@ pytestmark = pytest.mark.cpu_test
def _make_model_runner_output(
scheduler_output: SchedulerOutput, ) -> ModelRunnerOutput:
scheduler_output: SchedulerOutput,
) -> ModelRunnerOutput:
req_ids = list(scheduler_output.num_scheduled_tokens.keys())
return ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index={
req_id: i
for i, req_id in enumerate(req_ids)
},
req_id_to_index={req_id: i for i, req_id in enumerate(req_ids)},
sampled_token_ids=[[i] for i in range(len(req_ids))],
logprobs=None,
prompt_logprobs_dict={},
@@ -75,8 +73,7 @@ def test_abort():
if not abort_order:
return
req = requests[abort_order.pop(0)]
scheduler.finish_requests(req.request_id,
RequestStatus.FINISHED_ABORTED)
scheduler.finish_requests(req.request_id, RequestStatus.FINISHED_ABORTED)
while sched_outputs:
# Abort a scheduled request.
@@ -112,8 +109,7 @@ def test_preempt():
if not abort_order:
return
req = requests[abort_order.pop(0)]
scheduler.finish_requests(req.request_id,
RequestStatus.FINISHED_ABORTED)
scheduler.finish_requests(req.request_id, RequestStatus.FINISHED_ABORTED)
while sched_outputs:
# Abort a scheduled request.
@@ -135,15 +131,19 @@ def test_prefix_caching_for_prefill_dedup():
CHUNK_SIZE = 1000
BLOCK_SIZE = 16
num_prompt_tokens = 100
scheduler = create_scheduler(async_scheduling=True,
max_num_batched_tokens=CHUNK_SIZE,
enable_prefix_caching=True,
block_size=BLOCK_SIZE)
requests = create_requests(num_requests=5,
num_tokens=num_prompt_tokens,
max_tokens=3,
same_prompt=True,
block_size=BLOCK_SIZE)
scheduler = create_scheduler(
async_scheduling=True,
max_num_batched_tokens=CHUNK_SIZE,
enable_prefix_caching=True,
block_size=BLOCK_SIZE,
)
requests = create_requests(
num_requests=5,
num_tokens=num_prompt_tokens,
max_tokens=3,
same_prompt=True,
block_size=BLOCK_SIZE,
)
requests_copy = requests.copy()
# Two requests with the same prompt.
@@ -185,14 +185,18 @@ def test_prefix_caching_for_multi_turn():
BLOCK_SIZE = 16
num_prompt_tokens = 100
num_output_tokens = 200
scheduler = create_scheduler(async_scheduling=True,
max_num_batched_tokens=CHUNK_SIZE,
enable_prefix_caching=True,
block_size=BLOCK_SIZE)
requests = create_requests(num_requests=5,
num_tokens=num_prompt_tokens,
max_tokens=num_output_tokens,
block_size=BLOCK_SIZE)
scheduler = create_scheduler(
async_scheduling=True,
max_num_batched_tokens=CHUNK_SIZE,
enable_prefix_caching=True,
block_size=BLOCK_SIZE,
)
requests = create_requests(
num_requests=5,
num_tokens=num_prompt_tokens,
max_tokens=num_output_tokens,
block_size=BLOCK_SIZE,
)
for req in requests:
scheduler.add_request(req)
@@ -212,14 +216,16 @@ def test_prefix_caching_for_multi_turn():
# Create next-turn requests whose prompts are the full output of the
# previous turn.
next_turn_requests = create_requests(num_requests=5,
num_tokens=num_prompt_tokens +
num_output_tokens,
max_tokens=num_output_tokens,
block_size=BLOCK_SIZE)
next_turn_requests = create_requests(
num_requests=5,
num_tokens=num_prompt_tokens + num_output_tokens,
max_tokens=num_output_tokens,
block_size=BLOCK_SIZE,
)
for i, req in enumerate(next_turn_requests):
req.prompt_token_ids = (requests[i].prompt_token_ids +
list(requests[i].output_token_ids))
req.prompt_token_ids = requests[i].prompt_token_ids + list(
requests[i].output_token_ids
)
req._all_token_ids = req.prompt_token_ids.copy()
req.all_token_ids = ConstantList(req._all_token_ids)
req.block_hashes = []
@@ -233,5 +239,4 @@ def test_prefix_caching_for_multi_turn():
# Make sure the next-turn requests get prefix cache hit by the previous
# requests.
for req in next_turn_requests:
assert (req.num_cached_tokens == req.num_prompt_tokens // BLOCK_SIZE *
BLOCK_SIZE)
assert req.num_cached_tokens == req.num_prompt_tokens // BLOCK_SIZE * BLOCK_SIZE


@@ -10,7 +10,6 @@ pytestmark = pytest.mark.cpu_test
# ------------------ Mock Classes ------------------ #
class MockRequest:
def __init__(self, request_id, mm_hashes, token_counts):
self.request_id = request_id
self._token_counts = token_counts
@@ -20,8 +19,7 @@ class MockRequest:
data=None,
modality="image",
identifier=mm_hash,
mm_position=PlaceholderRange(offset=0,
length=self._token_counts[i]),
mm_position=PlaceholderRange(offset=0, length=self._token_counts[i]),
)
self.mm_features.append(feature)
@@ -167,8 +165,7 @@ def test_schedule_request_multi_images_respect_space_limit():
num_tokens_to_schedule += req.get_num_encoder_tokens(0)
compute_budget -= req.get_num_encoder_tokens(0)
assert not manager.can_allocate(req, 1, compute_budget,
num_tokens_to_schedule)
assert not manager.can_allocate(req, 1, compute_budget, num_tokens_to_schedule)
def test_schedule_request_multi_images_respect_compute_limit():
@@ -180,5 +177,4 @@ def test_schedule_request_multi_images_respect_compute_limit():
num_tokens_to_schedule += req.get_num_encoder_tokens(0)
compute_budget -= req.get_num_encoder_tokens(0)
assert not manager.can_allocate(req, 1, compute_budget,
num_tokens_to_schedule)
assert not manager.can_allocate(req, 1, compute_budget, num_tokens_to_schedule)

File diff suppressed because it is too large.


@@ -26,8 +26,7 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
# However, if they have different attention backends, they will be
# placed in different attention groups for KV cache group 0
kv_cache_groups = [
KVCacheGroupSpec(["model.layers.0", "model.layers.1"],
new_kv_cache_spec()),
KVCacheGroupSpec(["model.layers.0", "model.layers.1"], new_kv_cache_spec()),
]
add_kv_sharing_layers_to_kv_cache_groups(
@@ -38,7 +37,10 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
# Check that the layers were added to the correct KV cache group
assert len(kv_cache_groups) == 1
assert kv_cache_groups[0].layer_names == [
"model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3"
"model.layers.0",
"model.layers.1",
"model.layers.2",
"model.layers.3",
]
@@ -53,8 +55,7 @@ def test_initialize_kv_cache_for_kv_sharing_same_attn_groups():
}
kv_cache_groups = [
KVCacheGroupSpec(["model.layers.0", "model.layers.1"],
new_kv_cache_spec()),
KVCacheGroupSpec(["model.layers.0", "model.layers.1"], new_kv_cache_spec()),
]
add_kv_sharing_layers_to_kv_cache_groups(
@@ -65,14 +66,17 @@ def test_initialize_kv_cache_for_kv_sharing_same_attn_groups():
# Check that the layers were added to the correct KV cache group
assert len(kv_cache_groups) == 1
assert kv_cache_groups[0].layer_names == [
"model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3"
"model.layers.0",
"model.layers.1",
"model.layers.2",
"model.layers.3",
]
def test_initialize_kv_cache_for_kv_sharing_no_attn_groups():
"""
Test KV sharing set up when no attention groups are provided.
This is the case for the TPU model runner, which doesn't have
This is the case for the TPU model runner, which doesn't have
support for attention groups yet.
"""
shared_kv_cache_layers = {
@@ -92,9 +96,5 @@ def test_initialize_kv_cache_for_kv_sharing_no_attn_groups():
# Check that the layers were added to the correct KV cache group
assert len(kv_cache_groups) == 2
assert kv_cache_groups[0].layer_names == [
"model.layers.0", "model.layers.2"
]
assert kv_cache_groups[1].layer_names == [
"model.layers.1", "model.layers.3"
]
assert kv_cache_groups[0].layer_names == ["model.layers.0", "model.layers.2"]
assert kv_cache_groups[1].layer_names == ["model.layers.1", "model.layers.3"]

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -15,13 +15,15 @@ PROMPT = "Hello my name is Robert and I"
@pytest.fixture(scope="module")
def llm() -> LLM:
return LLM(MODEL,
enforce_eager=True,
enable_prefix_caching=True,
long_prefill_token_threshold=2,
max_num_batched_tokens=6,
max_num_seqs=3,
block_size=16)
return LLM(
MODEL,
enforce_eager=True,
enable_prefix_caching=True,
long_prefill_token_threshold=2,
max_num_batched_tokens=6,
max_num_seqs=3,
block_size=16,
)
def test_concurrent_partial_prefill(llm):


@@ -7,27 +7,28 @@ import pytest
import torch
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
make_block_hash_with_group_id)
from vllm.v1.core.kv_cache_utils import (
BlockHash,
KVCacheBlock,
make_block_hash_with_group_id,
)
from vllm.v1.core.single_type_kv_cache_manager import (
ChunkedLocalAttentionManager, SlidingWindowManager)
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
SlidingWindowSpec)
ChunkedLocalAttentionManager,
SlidingWindowManager,
)
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, SlidingWindowSpec
pytestmark = pytest.mark.cpu_test
def get_sliding_window_manager(sliding_window_spec, block_pool):
return SlidingWindowManager(sliding_window_spec,
block_pool,
kv_cache_group_id=0)
return SlidingWindowManager(sliding_window_spec, block_pool, kv_cache_group_id=0)
def get_chunked_local_attention_manager(chunked_local_attention_spec,
block_pool):
return ChunkedLocalAttentionManager(chunked_local_attention_spec,
block_pool,
kv_cache_group_id=0)
def get_chunked_local_attention_manager(chunked_local_attention_spec, block_pool):
return ChunkedLocalAttentionManager(
chunked_local_attention_spec, block_pool, kv_cache_group_id=0
)
def test_chunked_local_attention_possible_cached_prefix():
@@ -41,8 +42,9 @@ def test_chunked_local_attention_possible_cached_prefix():
)
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
manager = get_chunked_local_attention_manager(chunked_local_attention_spec,
block_pool)
manager = get_chunked_local_attention_manager(
chunked_local_attention_spec, block_pool
)
def run_one_case(block_is_cached, tail_token, expect_length):
block_hash_list = [
@@ -52,12 +54,14 @@ def test_chunked_local_attention_possible_cached_prefix():
block_pool.cached_block_hash_to_block._cache.clear()
# Mock the block pool with the cached blocks
for i, (block_hash,
is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
for i, (block_hash, is_cached) in enumerate(
zip(block_hash_list, block_is_cached)
):
if is_cached:
block_pool.cached_block_hash_to_block.insert(
make_block_hash_with_group_id(block_hash, 0),
block_pool.blocks[i + 10])
block_pool.blocks[i + 10],
)
computed_blocks = manager.find_longest_cache_hit(
block_hashes=block_hash_list,
@@ -65,11 +69,14 @@ def test_chunked_local_attention_possible_cached_prefix():
kv_cache_group_ids=[0],
block_pool=block_pool,
kv_cache_spec=chunked_local_attention_spec,
use_eagle=False)[0]
use_eagle=False,
)[0]
assert len(computed_blocks) == expect_length
assert all(block == block_pool.null_block
for block in computed_blocks[:(expect_length - 1) // 2])
assert all(
block == block_pool.null_block
for block in computed_blocks[: (expect_length - 1) // 2]
)
run_one_case([True], 0, 1)
run_one_case([True], 1, 1)
@@ -115,12 +122,14 @@ def test_sliding_window_possible_cached_prefix():
block_pool.cached_block_hash_to_block._cache.clear()
# Mock the block pool with the cached blocks
for i, (block_hash,
is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
for i, (block_hash, is_cached) in enumerate(
zip(block_hash_list, block_is_cached)
):
if is_cached:
block_pool.cached_block_hash_to_block.insert(
make_block_hash_with_group_id(block_hash, 0),
block_pool.blocks[i + 10])
block_pool.blocks[i + 10],
)
computed_blocks = manager.find_longest_cache_hit(
block_hashes=block_hash_list,
@@ -128,16 +137,18 @@ def test_sliding_window_possible_cached_prefix():
kv_cache_group_ids=[0],
block_pool=block_pool,
kv_cache_spec=sliding_window_spec,
use_eagle=False)[0]
use_eagle=False,
)[0]
assert len(computed_blocks) == expect_length
assert all(block == block_pool.null_block
for block in computed_blocks[:expect_length - 2])
assert all(
block == block_pool.null_block
for block in computed_blocks[: expect_length - 2]
)
for i in range(2):
if i < expect_length:
block_index = expect_length - i - 1
assert computed_blocks[
block_index].block_id == block_index + 10
assert computed_blocks[block_index].block_id == block_index + 10
run_one_case([False] * 10, 0)
run_one_case([True], 1)
@@ -146,17 +157,16 @@ def test_sliding_window_possible_cached_prefix():
run_one_case([True, True, False], 2)
run_one_case([True, True, True], 3)
run_one_case([True, True, True, False], 3)
run_one_case([
True, True, False, True, False, False, True, True, False, True, True,
True
], 12)
run_one_case([
True, True, False, True, False, False, True, True, False, False, False
], 8)
run_one_case([
True, True, False, True, False, False, True, True, False, False, False,
True
], 8)
run_one_case(
[True, True, False, True, False, False, True, True, False, True, True, True], 12
)
run_one_case(
[True, True, False, True, False, False, True, True, False, False, False], 8
)
run_one_case(
[True, True, False, True, False, False, True, True, False, False, False, True],
8,
)
def test_chunked_local_attention_remove_skipped_blocks():
@@ -176,8 +186,8 @@ def test_chunked_local_attention_remove_skipped_blocks():
def id_to_block_table(ids) -> list[KVCacheBlock]:
return [
KVCacheBlock(id_)
if id_ != null_block_id else block_pool.null_block for id_ in ids
KVCacheBlock(id_) if id_ != null_block_id else block_pool.null_block
for id_ in ids
]
def assert_block_id(block_table: list[KVCacheBlock], ids: list[int]):
@@ -188,7 +198,17 @@ def test_chunked_local_attention_remove_skipped_blocks():
assert block.block_id == id_
original_block_ids = [
1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010
1000,
1001,
1002,
1003,
1004,
1005,
1006,
1007,
1008,
1009,
1010,
]
block_table = id_to_block_table(original_block_ids)
manager.req_to_blocks["test"] = block_table
@@ -227,8 +247,8 @@ def test_sliding_window_remove_skipped_blocks():
def id_to_block_table(ids) -> list[KVCacheBlock]:
return [
KVCacheBlock(id_)
if id_ != null_block_id else block_pool.null_block for id_ in ids
KVCacheBlock(id_) if id_ != null_block_id else block_pool.null_block
for id_ in ids
]
def assert_block_id(block_table: list[KVCacheBlock], ids: list[int]):
@@ -239,7 +259,17 @@ def test_sliding_window_remove_skipped_blocks():
assert block.block_id == id_
original_block_ids = [
1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010
1000,
1001,
1002,
1003,
1004,
1005,
1006,
1007,
1008,
1009,
1010,
]
block_table = id_to_block_table(original_block_ids)
manager.req_to_blocks["test"] = block_table
@@ -289,13 +319,16 @@ def test_get_num_blocks_to_allocate():
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
manager = get_sliding_window_manager(sliding_window_spec, block_pool)
cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
cached_blocks_2 = [block_pool.null_block for _ in range(5)
] + [KVCacheBlock(i + 1) for i in range(5)]
cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [
KVCacheBlock(i + 1) for i in range(5)
]
assert manager.get_num_blocks_to_allocate("1", 20 * block_size,
cached_blocks_1) == 20
assert manager.get_num_blocks_to_allocate("2", 20 * block_size,
cached_blocks_2) == 15
assert (
manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1) == 20
)
assert (
manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
)
def test_chunked_local_attention_get_num_blocks_to_allocate():
@@ -311,10 +344,13 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
manager = get_chunked_local_attention_manager(attention_spec, block_pool)
cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)]
cached_blocks_2 = [block_pool.null_block for _ in range(5)
] + [KVCacheBlock(i + 1) for i in range(5)]
cached_blocks_2 = [block_pool.null_block for _ in range(5)] + [
KVCacheBlock(i + 1) for i in range(5)
]
assert manager.get_num_blocks_to_allocate("1", 20 * block_size,
cached_blocks_1) == 20
assert manager.get_num_blocks_to_allocate("2", 20 * block_size,
cached_blocks_2) == 15
assert (
manager.get_num_blocks_to_allocate("1", 20 * block_size, cached_blocks_1) == 20
)
assert (
manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15
)


@@ -4,18 +4,29 @@ from typing import Optional, Union
import torch
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
from vllm.multimodal.inputs import (MultiModalFeatureSpec,
MultiModalKwargsItem, PlaceholderRange)
from vllm.config import (
CacheConfig,
KVTransferConfig,
ModelConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
)
from vllm.multimodal.inputs import (
MultiModalFeatureSpec,
MultiModalKwargsItem,
PlaceholderRange,
)
from vllm.sampling_params import SamplingParams
from vllm.utils import sha256
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
init_none_hash)
from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
from vllm.v1.core.sched.async_scheduler import AsyncScheduler
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec)
from vllm.v1.kv_cache_interface import (
FullAttentionSpec,
KVCacheConfig,
KVCacheGroupSpec,
)
from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager
@@ -37,7 +48,7 @@ def create_scheduler(
skip_tokenizer_init: bool = False,
async_scheduling: bool = False,
) -> Union[Scheduler, AsyncScheduler]:
'''Create scheduler under test.
"""Create scheduler under test.
Args:
model: model under test
@@ -49,7 +60,7 @@ def create_scheduler(
Returns:
{class}`Scheduler` instance
'''
"""
if max_model_len is None:
max_model_len = max_num_batched_tokens
scheduler_config = SchedulerConfig(
@@ -69,9 +80,11 @@ def create_scheduler(
skip_tokenizer_init=skip_tokenizer_init,
)
# Cache config, optionally force APC
kwargs_cache = ({} if enable_prefix_caching is None else {
'enable_prefix_caching': enable_prefix_caching
})
kwargs_cache = (
{}
if enable_prefix_caching is None
else {"enable_prefix_caching": enable_prefix_caching}
)
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
@@ -79,16 +92,21 @@ def create_scheduler(
cache_dtype="auto",
**kwargs_cache,
)
kv_transfer_config = KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": "local_storage"},
) if use_kv_connector else None
kv_transfer_config = (
KVTransferConfig(
kv_connector="SharedStorageConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": "local_storage"},
)
if use_kv_connector
else None
)
speculative_config: Optional[SpeculativeConfig] = None
if num_speculative_tokens is not None:
speculative_config = SpeculativeConfig(
model="ngram", num_speculative_tokens=num_speculative_tokens)
model="ngram", num_speculative_tokens=num_speculative_tokens
)
vllm_config = VllmConfig(
scheduler_config=scheduler_config,
@@ -101,9 +119,9 @@ def create_scheduler(
num_blocks=num_blocks, # A large number of blocks to hold all requests
kv_cache_tensors=[],
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(block_size, 1, 1, torch.float32,
False))
KVCacheGroupSpec(
["layer"], FullAttentionSpec(block_size, 1, 1, torch.float32, False)
)
],
)
cache_config.num_gpu_blocks = num_blocks
@@ -135,10 +153,12 @@ def create_requests(
_none_hash_initialized = True
block_hasher = get_request_block_hasher(block_size, sha256)
sampling_params = SamplingParams(ignore_eos=False,
max_tokens=max_tokens,
stop_token_ids=stop_token_ids,
prompt_logprobs=prompt_logprobs)
sampling_params = SamplingParams(
ignore_eos=False,
max_tokens=max_tokens,
stop_token_ids=stop_token_ids,
prompt_logprobs=prompt_logprobs,
)
requests = []
for i in range(num_requests):
mm_features = []
@@ -152,11 +172,11 @@ def create_requests(
data=MultiModalKwargsItem.dummy("dummy_m"),
mm_position=position,
identifier=identifier,
modality="image")
modality="image",
)
mm_features.append(mm_feature)
prompt_token_ids = ([0] * num_tokens if same_prompt else [i] *
num_tokens)
prompt_token_ids = [0] * num_tokens if same_prompt else [i] * num_tokens
request = Request(
request_id=f"{i}",
prompt_token_ids=prompt_token_ids,