[Misc] [Core] Implement RFC "Augment BaseExecutor interfaces to enable hardware-agnostic speculative decoding" (#3837)
This commit is contained in:
@@ -512,8 +512,8 @@ def test_init_device():
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def test_init_cache_engine():
|
||||
"""Verify SpecDecodeWorker invokes init_cache_engine on proposer/scorer
|
||||
def test_initialize_cache():
|
||||
"""Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer
|
||||
workers.
|
||||
"""
|
||||
draft_worker = mock_worker(cls=MultiStepWorker)
|
||||
@@ -525,12 +525,11 @@ def test_init_cache_engine():
|
||||
worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
|
||||
metrics_collector)
|
||||
|
||||
cache_config = MagicMock()
|
||||
kwargs = {"num_gpu_blocks": 1024, "num_cpu_blocks": 1023}
|
||||
worker.initialize_cache(**kwargs)
|
||||
|
||||
worker.init_cache_engine(cache_config)
|
||||
|
||||
draft_worker.init_cache_engine.assert_called_once_with(cache_config)
|
||||
target_worker.init_cache_engine.assert_called_once_with(cache_config)
|
||||
draft_worker.initialize_cache.assert_called_once_with(**kwargs)
|
||||
target_worker.initialize_cache.assert_called_once_with(**kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('available_gpu_blocks', [1, 1024])
|
||||
@@ -538,10 +537,10 @@ def test_init_cache_engine():
|
||||
@pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096])
|
||||
@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096])
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_profile_num_available_blocks(available_gpu_blocks: int,
|
||||
available_cpu_blocks: int,
|
||||
target_cache_block_size_bytes: int,
|
||||
draft_kv_size_bytes: int):
|
||||
def test_determine_num_available_blocks(available_gpu_blocks: int,
|
||||
available_cpu_blocks: int,
|
||||
target_cache_block_size_bytes: int,
|
||||
draft_kv_size_bytes: int):
|
||||
"""Verify SpecDecodeWorker correctly profiles num available GPU blocks.
|
||||
Specifically, it should run profiling in the scorer worker, and then evenly
|
||||
split the blocks between proposer and scorer worker.
|
||||
@@ -552,7 +551,7 @@ def test_profile_num_available_blocks(available_gpu_blocks: int,
|
||||
rejection_sampler.token_id_dtype = torch.int64
|
||||
metrics_collector = MagicMock(spec=AsyncMetricsCollector)
|
||||
|
||||
target_worker.profile_num_available_blocks.return_value = (
|
||||
target_worker.determine_num_available_blocks.return_value = (
|
||||
available_gpu_blocks, available_cpu_blocks)
|
||||
target_worker.get_cache_block_size_bytes.return_value = (
|
||||
target_cache_block_size_bytes)
|
||||
@@ -561,17 +560,9 @@ def test_profile_num_available_blocks(available_gpu_blocks: int,
|
||||
worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
|
||||
metrics_collector)
|
||||
|
||||
# These values do not directly impact the adjusted block size calculation,
|
||||
# so they can be fixed.
|
||||
gpu_memory_utilization = 0.9
|
||||
cpu_swap_space = 100
|
||||
block_size = 16
|
||||
num_gpu_blocks, num_cpu_blocks = worker.determine_num_available_blocks()
|
||||
|
||||
num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks(
|
||||
block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype="auto")
|
||||
|
||||
target_worker.profile_num_available_blocks.assert_called_once_with(
|
||||
block_size, gpu_memory_utilization, cpu_swap_space, "auto")
|
||||
target_worker.determine_num_available_blocks.assert_called_once()
|
||||
assert num_cpu_blocks == available_cpu_blocks
|
||||
|
||||
assert num_gpu_blocks == split_num_cache_blocks_evenly(
|
||||
|
||||
Reference in New Issue
Block a user