[Speculative decoding 2/9] Multi-step worker for draft model (#2424)
0
tests/worker/spec_decode/__init__.py
Normal file
261
tests/worker/spec_decode/test_multi_step_worker.py
Normal file
@@ -0,0 +1,261 @@
import torch
import random
import pytest
from unittest.mock import MagicMock

from vllm.worker.spec_decode.multi_step_worker import MultiStepWorker
from vllm.worker.worker import Worker
from vllm.model_executor.utils import set_random_seed

from .utils import (create_execute_model_data, create_worker,
                    create_seq_group_metadata_from_prompts, zero_kv_cache,
                    patch_execute_model_with_seeds,
                    assert_logprobs_dict_allclose)


@pytest.mark.parametrize('num_steps', list(range(1, 17)))
def test_assert_enough_kv_space(num_steps: int):
    """Test that the multi-step worker checks for sufficient space in the KV
    cache. It should raise if it cannot run all the steps.
    """
    block_size = 16
    num_gpu_blocks = 2048 // block_size
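    # Editor's note: 2048 KV slots at block_size=16 gives 128 blocks; the
    # prompts and outputs below are sized in whole blocks so the space
    # arithmetic is easy to follow.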

    prompts = [
        list(range(block_size * 3)),
        list(range(block_size * 2)),
    ]

    prev_output_tokens = [
        list(range(block_size * 1)),
        list(range(block_size * 2)),
    ]

    final_seq_lens = [
        len(prompt + output) + num_steps
        for prompt, output in zip(prompts, prev_output_tokens)
    ]

    inputs = create_seq_group_metadata_from_prompts(
        prompts,
        num_gpu_blocks,
        block_size,
        final_seq_lens,
        continuations=prev_output_tokens)

    assert_enough_kv_space = MultiStepWorker._assert_enough_kv_space  # pylint: disable=protected-access
    worker = MagicMock()
    worker.model_runner.block_size = block_size

    for seq_group_metadata in inputs:
        original_block_tables = seq_group_metadata.block_tables

        # No exception.
        assert_enough_kv_space(worker, inputs, num_steps)

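        # Editor's note: emptying the block tables simulates a sequence with
        # no KV space allocated, so the check below must fail.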
        seq_group_metadata.block_tables = {
            seq_id: []
            for seq_id, physical_blocks in original_block_tables.items()
        }

        # Expect exception.
        with pytest.raises(ValueError,
                           match='times but found insufficient KV space for'):
            assert_enough_kv_space(worker, inputs, num_steps)

        seq_group_metadata.block_tables = original_block_tables


@torch.inference_mode()
def test_same_output_for_single_step():
    """Verify the multi-step worker produces the same output as the normal
    worker for num_steps=1.
    """
    seed = 100
    model_name = 'JackFram/llama-68m'

    block_size = 32
    num_gpu_blocks = 2048 // block_size
    multi_step_worker = create_worker(
        MultiStepWorker,
        model_name,
        block_size,
        num_gpu_blocks,
        seed,
    )
    worker = create_worker(
        Worker,
        model_name,
        block_size,
        num_gpu_blocks,
        seed,
    )
    multi_step_worker.model_runner = worker.model_runner
    multi_step_worker.cache_engine = worker.cache_engine
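    # Editor's note: sharing the model runner and KV cache means both workers
    # run the same weights over the same memory, so any output divergence
    # comes from the multi-step logic itself.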

    num_steps = 1

    prompts = [
        [1, 2, 3, 4, 5],
        [6, 7, 8, 9, 10],
    ]

    final_seq_lens = [len(prompt) + num_steps for prompt in prompts]

    multi_step_execute_model_data = create_execute_model_data(
        seq_group_metadata_list=create_seq_group_metadata_from_prompts(
            prompts, num_gpu_blocks, block_size,
            final_seq_lens=final_seq_lens))

    single_step_execute_model_data = create_execute_model_data(
        seq_group_metadata_list=create_seq_group_metadata_from_prompts(
            prompts, num_gpu_blocks, block_size,
            final_seq_lens=final_seq_lens))

    zero_kv_cache(multi_step_worker.cache_engine)
    set_random_seed(seed)
    actual_output = multi_step_worker.execute_model_multi_step(
        **multi_step_execute_model_data.to_dict(), num_steps=num_steps)
    assert len(actual_output) == num_steps
    actual_output = actual_output[0]

    zero_kv_cache(worker.cache_engine)
    set_random_seed(seed)
    expected_output = worker.execute_model(
        **single_step_execute_model_data.to_dict(), )

    actual_token_ids = [
        output.samples[0].output_token for output in actual_output
    ]
    actual_logprobs = [output.samples[0].logprobs for output in actual_output]

    expected_token_ids = [
        output.samples[0].output_token for output in expected_output
    ]
    expected_logprobs = [
        output.samples[0].logprobs for output in expected_output
    ]

    assert actual_token_ids == expected_token_ids

    print(f'{actual_logprobs=}')
    print(f'{expected_logprobs=}')
    assert_logprobs_dict_allclose(actual_logprobs, expected_logprobs)


@torch.inference_mode()
def test_same_output_for_multi_step():
    """Verify the multi-step worker produces the same output as the normal
    worker when num_steps > 1. This test runs the multi-step worker once,
    then runs the normal worker num_steps times, and compares the outputs.
    """
    seed = 100
    model_name = 'JackFram/llama-68m'

    block_size = 16
    num_gpu_blocks = 2048 // block_size
    multi_step_worker = create_worker(
        MultiStepWorker,
        model_name,
        block_size,
        num_gpu_blocks,
        seed,
    )

    worker = create_worker(
        Worker,
        model_name,
        block_size,
        num_gpu_blocks,
        seed,
    )

    # Make sure we go over the block boundary.
    num_steps = block_size + 1

    random.seed(seed)
    prompts = [[
        random.randint(0, 1000) for _ in range(random.randint(10, 20))
    ] for _ in range(10)]

    final_seq_lens = [len(prompt) + num_steps for prompt in prompts]

    rand_seeds = list(random.randint(0, 100) for _ in range(num_steps))
    multi_step_worker.execute_model = patch_execute_model_with_seeds(
        multi_step_worker, rand_seeds)
    worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds)
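    # Editor's note: reseeding after every model call keeps the sampler's
    # RNG stream identical between the single multi-step run and the
    # num_steps single-step runs below.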

    continuations = [[1] for _ in prompts]
    execute_model_data = create_execute_model_data(
        create_seq_group_metadata_from_prompts(
            prompts,
            num_gpu_blocks,
            block_size,
            continuations=continuations,
            final_seq_lens=final_seq_lens), )

    # Run multi-step.
    zero_kv_cache(multi_step_worker.cache_engine)
    set_random_seed(seed)
    multi_step_output = multi_step_worker.execute_model_multi_step(
        **execute_model_data.to_dict(), num_steps=num_steps)

    # Run single-step repeatedly.
    zero_kv_cache(worker.cache_engine)
    single_step_output = []
    continuations = [[1] for _ in prompts]
    set_random_seed(seed)

    for _ in multi_step_output:

        execute_model_data = create_execute_model_data(
            create_seq_group_metadata_from_prompts(
                prompts,
                num_gpu_blocks,
                block_size,
                continuations=continuations,
                final_seq_lens=final_seq_lens))

        single_step_output.append(
            worker.execute_model(**execute_model_data.to_dict(), ))

        # Append output tokens to new sequence data.
        for i, seq_group_output in enumerate(single_step_output[-1]):
            continuations[i].append(seq_group_output.samples[0].output_token)

    # Get token ids and logprobs for comparison.
    multi_step_output_logprobs = [[] for _ in prompts]
    single_step_output_logprobs = [[] for _ in prompts]

    multi_step_output_token_ids = [[] for _ in prompts]
    single_step_output_token_ids = [[] for _ in prompts]
    for i, _ in enumerate(prompts):
        for multi_step, single_step in zip(multi_step_output,
                                           single_step_output):
            multi_step_output_token_ids[i].append(
                multi_step[i].samples[0].output_token)
            single_step_output_token_ids[i].append(
                single_step[i].samples[0].output_token)

            multi_step_output_logprobs[i].append(
                multi_step[i].samples[0].logprobs)
            single_step_output_logprobs[i].append(
                single_step[i].samples[0].logprobs)

    # Print per-sequence token ids.
    for i, (multi_step_tokens, single_step_tokens) in enumerate(
            zip(multi_step_output_token_ids, single_step_output_token_ids)):
        print(f'{i=} {multi_step_tokens=}')
        print(f'{i=} {single_step_tokens=}')
        print(f'{i=} equal {multi_step_tokens == single_step_tokens}')

    # Assert token ids are equal.
    for multi_step_tokens, single_step_tokens in zip(
            multi_step_output_token_ids, single_step_output_token_ids):
        assert multi_step_tokens == single_step_tokens

    # Assert logprobs are equal.
    for multi_step_logprobs, single_step_logprobs in zip(
            multi_step_output_logprobs, single_step_output_logprobs):
        assert_logprobs_dict_allclose(multi_step_logprobs,
                                      single_step_logprobs)
177
tests/worker/spec_decode/utils.py
Normal file
@@ -0,0 +1,177 @@
import torch
from typing import List, Optional, Dict

from vllm.worker.worker import Worker
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.engine.arg_utils import EngineArgs
from vllm.sequence import SequenceGroupMetadata, SequenceData
from vllm.sampling_params import SamplingParams
from vllm.worker.cache_engine import CacheEngine
from vllm.model_executor.utils import set_random_seed
from dataclasses import dataclass, fields


@dataclass
class ExecuteModelData:
    """Helper data structure which facilitates cleaner tests.
    """
    seq_group_metadata_list: List[SequenceGroupMetadata]
    blocks_to_swap_in: Dict[int, int]
    blocks_to_swap_out: Dict[int, int]
    blocks_to_copy: Dict[int, List[int]]

    def to_dict(self):
        return dict(
            (field.name, getattr(self, field.name)) for field in fields(self))


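# Editor's note: despite its name, this returns the number of blocks needed
# to hold seq_len tokens (ceil division), e.g. 17 tokens at block_size=16
# needs 2 blocks.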
def round_up_to_next_block(seq_len: int, block_size: int) -> int:
    return (seq_len + block_size - 1) // block_size


def create_execute_model_data(
    seq_group_metadata_list: List[SequenceGroupMetadata],
    blocks_to_swap_in: Optional[Dict[int, int]] = None,
    blocks_to_swap_out: Optional[Dict[int, int]] = None,
    blocks_to_copy: Optional[Dict[int, int]] = None,
) -> ExecuteModelData:
    if blocks_to_swap_in is None:
        blocks_to_swap_in = {}
    if blocks_to_swap_out is None:
        blocks_to_swap_out = {}
    if blocks_to_copy is None:
        blocks_to_copy = {}

    return ExecuteModelData(
        seq_group_metadata_list=seq_group_metadata_list,
        blocks_to_swap_in=blocks_to_swap_in,
        blocks_to_swap_out=blocks_to_swap_out,
        blocks_to_copy=blocks_to_copy,
    )


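# Editor's note: wraps worker.execute_model so the global RNG is reseeded
# with the next seed after every call; two runs given the same seed list
# then sample from identical RNG states step by step.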
def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]):
    seed_iter = iter(rand_seeds)
    original_execute_model = worker.execute_model

    def new_execute_model(*args, **kwargs):
        result = original_execute_model(*args, **kwargs)
        set_random_seed(next(seed_iter))
        return result

    return new_execute_model


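# Editor's note: zeroes the KV cache in place so stale values from a
# previous run cannot leak into the comparison between workers.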
def zero_kv_cache(cache_engine: CacheEngine):
    assert cache_engine.gpu_cache
    for key_blocks, value_blocks in cache_engine.gpu_cache:
        key_blocks.zero_()
        value_blocks.zero_()


def create_worker(cls: type,
                  model_name: str,
                  block_size: int,
                  num_gpu_blocks: int,
                  seed: int,
                  is_driver_worker: bool = True,
                  enforce_eager: bool = True):
    engine_args = EngineArgs(
        model=model_name,
        seed=seed,
        block_size=block_size,
        enforce_eager=enforce_eager,
    )

    (model_config, cache_config, parallel_config,
     scheduler_config) = engine_args.create_engine_configs()

    distributed_init_method = get_distributed_init_method(
        get_ip(), get_open_port())

    worker = cls(
        model_config=model_config,
        parallel_config=parallel_config,
        scheduler_config=scheduler_config,
        local_rank=0,
        rank=0,
        distributed_init_method=distributed_init_method,
        is_driver_worker=is_driver_worker,
    )

    worker.init_model()
    worker.load_model()

    cache_config.num_gpu_blocks = num_gpu_blocks
    cache_config.num_cpu_blocks = 0
    worker.init_cache_engine(cache_config)
    worker.warm_up_model()

    return worker


def create_seq_group_metadata_from_prompts(
    prompts: List[List[int]],
    num_gpu_blocks: int,
    block_size: int,
    final_seq_lens: List[int],
    continuations: Optional[List[List[int]]] = None,
    num_tokens_processed: Optional[List[int]] = None,
    seq_ids: Optional[List[int]] = None,
) -> List[SequenceGroupMetadata]:

    if continuations is None:
        continuations = [[] for _ in prompts]

    if num_tokens_processed is None:
        # Default to 1 token missing from kv cache for generation sequences.
        num_tokens_processed = []
        for continuation, prompt in zip(continuations, prompts):
            # If prefill, then default to zero tokens processed.
            if not continuation:
                num_tokens_processed.append(0)
            else:
                # If generation, then default to all but one token processed.
                num_tokens_processed.append(
                    len(continuation) + len(prompt) - 1)

    if seq_ids is None:
        seq_ids = list(i for i, _ in enumerate(prompts))

    free_gpu_blocks = list(range(num_gpu_blocks))
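
    # Editor's note: enough whole blocks for each sequence's final length are
    # reserved up front (handed out from the tail of the free list), so no
    # allocation is needed while the tests decode.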
    block_allocations = {
        i: [
            free_gpu_blocks.pop()
            for _ in range(round_up_to_next_block(final_len, block_size))
        ]
        for i, final_len in enumerate(final_seq_lens)
    }

    return [
        SequenceGroupMetadata(
            request_id=str(i),
            is_prompt=len(cont_token_ids) == 0,
            seq_data={
                i:
                SequenceData(prompt_token_ids=prompt_token_ids[:] +
                             cont_token_ids[:])
            },
            sampling_params=SamplingParams(temperature=0.0, ),
            block_tables={i: block_allocations[i][:]},
        ) for i, (prompt_token_ids, cont_token_ids, num_tokens_saved) in
        enumerate(zip(prompts, continuations, num_tokens_processed))
    ]


def assert_logprobs_dict_allclose(
        actual_logprobs: List[Dict[int, float]],
        expected_logprobs: List[Dict[int, float]]) -> None:
    for single_step_actual_logprobs, single_step_expected_logprobs in zip(
            actual_logprobs, expected_logprobs):
        assert set(single_step_actual_logprobs.keys()) == set(
            single_step_expected_logprobs.keys())
        for token_id in single_step_actual_logprobs:
            actual = torch.tensor(single_step_actual_logprobs[token_id])
            expected = torch.tensor(single_step_expected_logprobs[token_id])
            assert torch.allclose(actual, expected)