Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -22,7 +22,7 @@ NUM_HEADS = [8]
|
||||
NUM_LAYERS = [4]
|
||||
DTYPES = [torch.bfloat16]
|
||||
SEEDS = [0]
|
||||
CUDA_DEVICES = ['cuda:0']
|
||||
CUDA_DEVICES = ["cuda:0"]
|
||||
NUM_MAPPINGS = [3]
|
||||
|
||||
|
||||
@@ -56,35 +56,35 @@ def test_transfer(
|
||||
current_platform.seed_everything(seed)
|
||||
|
||||
# create per-layer GPU KV caches
|
||||
attn_backends_list = [
|
||||
FlashAttentionBackend, FlashInferBackend, FlashAttnMLABackend
|
||||
]
|
||||
attn_backends_list = [FlashAttentionBackend, FlashInferBackend, FlashAttnMLABackend]
|
||||
|
||||
gpu_caches = {}
|
||||
attn_backends = {}
|
||||
for i in range(num_layers):
|
||||
layer_name = f'layer {i}'
|
||||
layer_name = f"layer {i}"
|
||||
|
||||
attn_backend = attn_backends_list[i % len(attn_backends_list)]
|
||||
attn_backends[layer_name] = attn_backend
|
||||
|
||||
gpu_cache_shape = attn_backend.get_kv_cache_shape(
|
||||
num_gpu_blocks, gpu_block_size, num_heads, head_size)
|
||||
gpu_caches[layer_name] = torch.rand(gpu_cache_shape,
|
||||
dtype=dtype,
|
||||
device=device)
|
||||
num_gpu_blocks, gpu_block_size, num_heads, head_size
|
||||
)
|
||||
gpu_caches[layer_name] = torch.rand(gpu_cache_shape, dtype=dtype, device=device)
|
||||
|
||||
# create handler
|
||||
cpu_block_size = gpu_blocks_per_cpu_block * gpu_block_size
|
||||
handler = CpuGpuOffloadingHandler(attn_backends=attn_backends,
|
||||
gpu_block_size=gpu_block_size,
|
||||
cpu_block_size=cpu_block_size,
|
||||
num_cpu_blocks=num_cpu_blocks,
|
||||
gpu_caches=gpu_caches)
|
||||
handler = CpuGpuOffloadingHandler(
|
||||
attn_backends=attn_backends,
|
||||
gpu_block_size=gpu_block_size,
|
||||
cpu_block_size=cpu_block_size,
|
||||
num_cpu_blocks=num_cpu_blocks,
|
||||
gpu_caches=gpu_caches,
|
||||
)
|
||||
|
||||
# select block mappings
|
||||
gpu_blocks = random.sample(range(num_gpu_blocks),
|
||||
num_mappings * gpu_blocks_per_cpu_block)
|
||||
gpu_blocks = random.sample(
|
||||
range(num_gpu_blocks), num_mappings * gpu_blocks_per_cpu_block
|
||||
)
|
||||
cpu_blocks = random.sample(range(num_cpu_blocks), num_mappings)
|
||||
|
||||
# convert cpu blocks to gpu block size
|
||||
@@ -96,9 +96,10 @@ def test_transfer(
|
||||
|
||||
# maybe skip a GPU block to test writing to the middle of a CPU block
|
||||
if gpu_to_cpu:
|
||||
gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1:]
|
||||
gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1 :]
|
||||
cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[
|
||||
gpu_blocks_per_cpu_block - 1:]
|
||||
gpu_blocks_per_cpu_block - 1 :
|
||||
]
|
||||
|
||||
# set transfer direction
|
||||
if gpu_to_cpu:
|
||||
@@ -124,8 +125,9 @@ def test_transfer(
|
||||
|
||||
# build dst -> src mapping
|
||||
dst_to_src = {}
|
||||
for src_block, dst_block in zip(src_blocks_in_gpu_block_size,
|
||||
dst_blocks_in_gpu_block_size):
|
||||
for src_block, dst_block in zip(
|
||||
src_blocks_in_gpu_block_size, dst_blocks_in_gpu_block_size
|
||||
):
|
||||
dst_to_src[dst_block] = src_block
|
||||
|
||||
# build transfer specs
|
||||
@@ -157,8 +159,11 @@ def test_transfer(
|
||||
for dst_block in range(dst_size_in_gpu_blocks):
|
||||
src_block_candidate = dst_to_src.get(dst_block)
|
||||
for src_cache, dst_cache, orig_dst_cache, kv_dim in zip(
|
||||
src_kv_caches, dst_kv_caches, orig_dst_caches,
|
||||
handler.kv_dim_before_num_blocks):
|
||||
src_kv_caches,
|
||||
dst_kv_caches,
|
||||
orig_dst_caches,
|
||||
handler.kv_dim_before_num_blocks,
|
||||
):
|
||||
if kv_dim:
|
||||
# iterate over key, value
|
||||
for i in range(2):
|
||||
@@ -166,12 +171,14 @@ def test_transfer(
|
||||
expected_value = src_cache[i][src_block_candidate]
|
||||
else:
|
||||
expected_value = orig_dst_cache[i][dst_block]
|
||||
torch.testing.assert_close(dst_cache[i][dst_block].cpu(),
|
||||
expected_value.cpu())
|
||||
torch.testing.assert_close(
|
||||
dst_cache[i][dst_block].cpu(), expected_value.cpu()
|
||||
)
|
||||
else:
|
||||
if src_block_candidate is not None:
|
||||
expected_value = src_cache[src_block_candidate]
|
||||
else:
|
||||
expected_value = orig_dst_cache[dst_block]
|
||||
torch.testing.assert_close(dst_cache[dst_block].cpu(),
|
||||
expected_value.cpu())
|
||||
torch.testing.assert_close(
|
||||
dst_cache[dst_block].cpu(), expected_value.cpu()
|
||||
)
|
||||
|
||||
@@ -7,8 +7,11 @@ from typing import Optional
|
||||
import numpy as np
|
||||
|
||||
from vllm.v1.core.kv_cache_utils import BlockHash
|
||||
from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent,
|
||||
PrepareStoreOutput)
|
||||
from vllm.v1.kv_offload.abstract import (
|
||||
LoadStoreSpec,
|
||||
OffloadingEvent,
|
||||
PrepareStoreOutput,
|
||||
)
|
||||
from vllm.v1.kv_offload.backends.cpu import CPUBackend
|
||||
from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
|
||||
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
|
||||
@@ -26,31 +29,38 @@ def to_hashes(int_hashes: list[int]) -> list[BlockHash]:
|
||||
|
||||
|
||||
def verify_store_output(
|
||||
prepare_store_output: Optional[PrepareStoreOutput],
|
||||
expected_prepare_store_output: ExpectedPrepareStoreOutput):
|
||||
prepare_store_output: Optional[PrepareStoreOutput],
|
||||
expected_prepare_store_output: ExpectedPrepareStoreOutput,
|
||||
):
|
||||
assert prepare_store_output is not None
|
||||
assert (prepare_store_output.block_hashes_to_store == to_hashes(
|
||||
expected_prepare_store_output.block_hashes_to_store))
|
||||
assert (prepare_store_output.block_hashes_evicted == to_hashes(
|
||||
expected_prepare_store_output.block_hashes_evicted))
|
||||
assert prepare_store_output.block_hashes_to_store == to_hashes(
|
||||
expected_prepare_store_output.block_hashes_to_store
|
||||
)
|
||||
assert prepare_store_output.block_hashes_evicted == to_hashes(
|
||||
expected_prepare_store_output.block_hashes_evicted
|
||||
)
|
||||
store_spec = prepare_store_output.store_spec
|
||||
assert isinstance(store_spec, CPULoadStoreSpec)
|
||||
expected_array = np.array(expected_prepare_store_output.store_block_ids,
|
||||
dtype=np.int64)
|
||||
expected_array = np.array(
|
||||
expected_prepare_store_output.store_block_ids, dtype=np.int64
|
||||
)
|
||||
assert np.array_equal(expected_array, store_spec.block_ids)
|
||||
|
||||
|
||||
def verify_load_output(prepare_load_output: LoadStoreSpec,
|
||||
expected_prepare_load_output: list[int]):
|
||||
def verify_load_output(
|
||||
prepare_load_output: LoadStoreSpec, expected_prepare_load_output: list[int]
|
||||
):
|
||||
assert isinstance(prepare_load_output, CPULoadStoreSpec)
|
||||
expected_array = np.array(expected_prepare_load_output, dtype=np.int64)
|
||||
assert np.array_equal(expected_array, prepare_load_output.block_ids)
|
||||
|
||||
|
||||
def verify_events(events: Iterable[OffloadingEvent],
|
||||
block_size: int,
|
||||
expected_stores: tuple[set[int], ...] = (),
|
||||
expected_evictions: tuple[set[int], ...] = ()):
|
||||
def verify_events(
|
||||
events: Iterable[OffloadingEvent],
|
||||
block_size: int,
|
||||
expected_stores: tuple[set[int], ...] = (),
|
||||
expected_evictions: tuple[set[int], ...] = (),
|
||||
):
|
||||
stores: list[set[BlockHash]] = []
|
||||
evictions: list[set[BlockHash]] = []
|
||||
for event in events:
|
||||
@@ -61,8 +71,7 @@ def verify_events(events: Iterable[OffloadingEvent],
|
||||
else:
|
||||
stores.append(set(event.block_hashes))
|
||||
|
||||
def to_hash_sets(
|
||||
int_sets: tuple[set[int], ...]) -> tuple[set[BlockHash], ...]:
|
||||
def to_hash_sets(int_sets: tuple[set[int], ...]) -> tuple[set[BlockHash], ...]:
|
||||
return tuple([set(to_hashes(list(int_set))) for int_set in int_sets])
|
||||
|
||||
assert tuple(evictions) == to_hash_sets(expected_evictions)
|
||||
@@ -86,7 +95,8 @@ def test_cpu_manager():
|
||||
block_hashes_to_store=[1, 2],
|
||||
store_block_ids=[0, 1],
|
||||
block_hashes_evicted=[],
|
||||
))
|
||||
),
|
||||
)
|
||||
|
||||
# lookup [1, 2] -> not ready
|
||||
assert cpu_manager.lookup(to_hashes([1, 2])) == 0
|
||||
@@ -96,9 +106,9 @@ def test_cpu_manager():
|
||||
|
||||
# complete store [1, 2]
|
||||
cpu_manager.complete_store(to_hashes([1, 2]))
|
||||
verify_events(cpu_manager.take_events(),
|
||||
block_size=block_size,
|
||||
expected_stores=({1, 2}, ))
|
||||
verify_events(
|
||||
cpu_manager.take_events(), block_size=block_size, expected_stores=({1, 2},)
|
||||
)
|
||||
|
||||
# lookup [1, 2]
|
||||
assert cpu_manager.lookup(to_hashes([1])) == 1
|
||||
@@ -113,12 +123,13 @@ def test_cpu_manager():
|
||||
block_hashes_to_store=[3, 4, 5],
|
||||
store_block_ids=[2, 3, 0],
|
||||
block_hashes_evicted=[1],
|
||||
))
|
||||
),
|
||||
)
|
||||
|
||||
# verify eviction event
|
||||
verify_events(cpu_manager.take_events(),
|
||||
block_size=block_size,
|
||||
expected_evictions=({1}, ))
|
||||
verify_events(
|
||||
cpu_manager.take_events(), block_size=block_size, expected_evictions=({1},)
|
||||
)
|
||||
|
||||
# prepare store with no space
|
||||
assert cpu_manager.prepare_store(to_hashes([1, 6])) is None
|
||||
@@ -144,7 +155,8 @@ def test_cpu_manager():
|
||||
block_hashes_to_store=[6, 7, 8],
|
||||
store_block_ids=[3, 2, 1],
|
||||
block_hashes_evicted=[2, 3, 4],
|
||||
))
|
||||
),
|
||||
)
|
||||
|
||||
# complete store [6, 7, 8]
|
||||
cpu_manager.complete_store(to_hashes([6, 7, 8]))
|
||||
@@ -160,7 +172,8 @@ def test_cpu_manager():
|
||||
block_hashes_to_store=[9],
|
||||
store_block_ids=[1],
|
||||
block_hashes_evicted=[8],
|
||||
))
|
||||
),
|
||||
)
|
||||
|
||||
# complete store [7, 9] with failure
|
||||
cpu_manager.complete_store(to_hashes([7, 9]), success=False)
|
||||
@@ -169,7 +182,9 @@ def test_cpu_manager():
|
||||
assert cpu_manager.lookup(to_hashes([7])) == 1
|
||||
assert cpu_manager.lookup(to_hashes([9])) == 0
|
||||
|
||||
verify_events(cpu_manager.take_events(),
|
||||
block_size=block_size,
|
||||
expected_stores=({3, 4, 5}, {6, 7, 8}),
|
||||
expected_evictions=({2, 3, 4}, {8}))
|
||||
verify_events(
|
||||
cpu_manager.take_events(),
|
||||
block_size=block_size,
|
||||
expected_stores=({3, 4, 5}, {6, 7, 8}),
|
||||
expected_evictions=({2, 3, 4}, {8}),
|
||||
)
|
||||
|
||||
@@ -20,10 +20,7 @@ def test_cpu_offloading(cpu_block_size: int) -> None:
|
||||
kv_transfer_config = KVTransferConfig(
|
||||
kv_connector="OffloadingConnector",
|
||||
kv_role="kv_both",
|
||||
kv_connector_extra_config={
|
||||
"num_cpu_blocks": 100,
|
||||
"block_size": cpu_block_size
|
||||
},
|
||||
kv_connector_extra_config={"num_cpu_blocks": 100, "block_size": cpu_block_size},
|
||||
)
|
||||
|
||||
llm = LLM(
|
||||
|
||||
@@ -1,17 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from vllm.v1.kv_offload.abstract import LoadStoreSpec
|
||||
from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
|
||||
OffloadingWorker, TransferResult,
|
||||
TransferSpec)
|
||||
from vllm.v1.kv_offload.worker.worker import (
|
||||
OffloadingHandler,
|
||||
OffloadingWorker,
|
||||
TransferResult,
|
||||
TransferSpec,
|
||||
)
|
||||
|
||||
|
||||
class LoadStoreSpec1(LoadStoreSpec):
|
||||
|
||||
def __init__(self,
|
||||
submit_success: bool = True,
|
||||
async_success: bool = True,
|
||||
exception: bool = False):
|
||||
def __init__(
|
||||
self,
|
||||
submit_success: bool = True,
|
||||
async_success: bool = True,
|
||||
exception: bool = False,
|
||||
):
|
||||
self.finished = False
|
||||
self.submit_success = submit_success
|
||||
self.async_success = async_success
|
||||
@@ -26,7 +30,6 @@ class LoadStoreSpec1(LoadStoreSpec):
|
||||
|
||||
|
||||
class LoadStoreSpec2(LoadStoreSpec):
|
||||
|
||||
@staticmethod
|
||||
def medium() -> str:
|
||||
return "2"
|
||||
@@ -36,7 +39,6 @@ class LoadStoreSpec2(LoadStoreSpec):
|
||||
|
||||
|
||||
class OffloadingHandler1To2(OffloadingHandler):
|
||||
|
||||
def __init__(self):
|
||||
self.transfers: dict[int, LoadStoreSpec1] = {}
|
||||
|
||||
@@ -63,7 +65,6 @@ class OffloadingHandler1To2(OffloadingHandler):
|
||||
|
||||
|
||||
class OffloadingHandler2To1(OffloadingHandler):
|
||||
|
||||
def __init__(self):
|
||||
self.transfers: dict[int, LoadStoreSpec1] = {}
|
||||
|
||||
@@ -144,9 +145,9 @@ def test_offloading_worker():
|
||||
assert 7 in handler2to1.transfers
|
||||
|
||||
# verify result of 3rd and 4th transfers
|
||||
assert (sorted(worker.get_finished()) == [(3, False), (4, True)])
|
||||
assert sorted(worker.get_finished()) == [(3, False), (4, True)]
|
||||
|
||||
# complete 6th and 7th transfers
|
||||
src6.finished = True
|
||||
dst7.finished = True
|
||||
assert (sorted(worker.get_finished()) == [(6, True), (7, True)])
|
||||
assert sorted(worker.get_finished()) == [(6, True), (7, True)]
|
||||
|
||||
Reference in New Issue
Block a user