[Speculative decoding 4/9] Lookahead scheduling for speculative decoding (#3250)
This commit is contained in:
@@ -498,3 +498,78 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int,
|
||||
|
||||
# After free, expect all blocks to be freed.
|
||||
assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
|
||||
|
||||
|
||||
@pytest.mark.parametrize("block_size", [1, 8])
@pytest.mark.parametrize("sequence_len", [1, 16, 129])
@pytest.mark.parametrize("num_new_tokens", [1, 16, 129])
@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8])
@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"])
def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
                                            num_new_tokens: int,
                                            num_lookahead_slots: int,
                                            allocator_type: str):
    """Check `get_num_blocks_touched_by_append_slots` against a real append.

    The test relies on copy-on-write: a block whose refcount is > 1 is
    expected to be copied before it is written. Forking the block table
    raises the refcount of its blocks, so the drop in free GPU blocks across
    an append shows how many blocks the append touched. That drop is compared
    with the value predicted by `get_num_blocks_touched_by_append_slots`.
    """
    total_gpu_blocks = 1024

    # Token ids already in the sequence, and the ones appended afterwards.
    initial_token_ids = list(range(sequence_len))
    appended_token_ids = list(range(num_new_tokens))

    allocator = CpuGpuBlockAllocator.create(
        allocator_type=allocator_type,
        num_gpu_blocks=total_gpu_blocks,
        num_cpu_blocks=0,
        block_size=block_size,
    )
    block_table = BlockTable(block_size=block_size, block_allocator=allocator)
    block_table.allocate(token_ids=initial_token_ids, device=Device.GPU)

    # Reserve the lookahead slots before forking so that both sequences end
    # up with the same lookahead blocks.
    block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots)

    # Fork so that every block has refcount > 1. The forked table is kept
    # referenced for the remainder of the test.
    forked_table = block_table.fork()  # noqa: F841

    # Prediction under test.
    predicted_touched = block_table.get_num_blocks_touched_by_append_slots(
        token_ids=appended_token_ids,
        num_lookahead_slots=num_lookahead_slots,
    )

    # Ground truth: the change in free GPU blocks across the append.
    # append_token_ids is expected to CoW every mutated block whose
    # refcount is > 1.
    free_before = allocator.get_num_free_blocks(Device.GPU)
    block_table.append_token_ids(appended_token_ids, num_lookahead_slots)
    free_after = allocator.get_num_free_blocks(Device.GPU)
    consumed_blocks = free_before - free_after

    # Without lookahead the prediction must match exactly.
    if not num_lookahead_slots > 0:
        assert consumed_blocks == predicted_touched
        return

    # TODO(cade) ensure equality when num_lookahead_slots > 0.
    # Lookahead blocks are not copied eagerly; they are copied on first
    # write, so fewer blocks than predicted may be consumed here. This will
    # cause issues for beam search + speculative decoding, which is
    # acceptable for now because combining the two is a large effort. A
    # possible fix is to ensure single-sequence ownership of lookahead blocks
    # by appending empty slots to each block, which will trigger the CoW.
    #
    # Until then, accept consumed blocks <= predicted blocks when appending
    # with lookahead.
    assert consumed_blocks <= predicted_touched
|
||||
|
||||
Reference in New Issue
Block a user