[Misc] [Core] Implement RFC "Augment BaseExecutor interfaces to enable hardware-agnostic speculative decoding" (#3837)

2024-04-09 11:44:15 -07:00
parent 6d592eb430
commit e7c7067b45
20 changed files with 451 additions and 275 deletions
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -512,8 +512,8 @@ def test_init_device():


@torch.inference_mode()
-def test_init_cache_engine():
-    """Verify SpecDecodeWorker invokes init_cache_engine on proposer/scorer
+def test_initialize_cache():
+    """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer
    workers.
    """
    draft_worker = mock_worker(cls=MultiStepWorker)
@@ -525,12 +525,11 @@ def test_init_cache_engine():
    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
                              metrics_collector)

-    cache_config = MagicMock()
+    kwargs = {"num_gpu_blocks": 1024, "num_cpu_blocks": 1023}
+    worker.initialize_cache(**kwargs)

-    worker.init_cache_engine(cache_config)
-
-    draft_worker.init_cache_engine.assert_called_once_with(cache_config)
-    target_worker.init_cache_engine.assert_called_once_with(cache_config)
+    draft_worker.initialize_cache.assert_called_once_with(**kwargs)
+    target_worker.initialize_cache.assert_called_once_with(**kwargs)


@pytest.mark.parametrize('available_gpu_blocks', [1, 1024])
@@ -538,10 +537,10 @@ def test_init_cache_engine():
@pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096])
@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096])
@pytest.mark.skip_global_cleanup
-def test_profile_num_available_blocks(available_gpu_blocks: int,
-                                      available_cpu_blocks: int,
-                                      target_cache_block_size_bytes: int,
-                                      draft_kv_size_bytes: int):
+def test_determine_num_available_blocks(available_gpu_blocks: int,
+                                        available_cpu_blocks: int,
+                                        target_cache_block_size_bytes: int,
+                                        draft_kv_size_bytes: int):
    """Verify SpecDecodeWorker correctly profiles num available GPU blocks.
    Specifically, it should run profiling in the scorer worker, and then evenly
    split the blocks between proposer and scorer worker.
@@ -552,7 +551,7 @@ def test_profile_num_available_blocks(available_gpu_blocks: int,
    rejection_sampler.token_id_dtype = torch.int64
    metrics_collector = MagicMock(spec=AsyncMetricsCollector)

-    target_worker.profile_num_available_blocks.return_value = (
+    target_worker.determine_num_available_blocks.return_value = (
        available_gpu_blocks, available_cpu_blocks)
    target_worker.get_cache_block_size_bytes.return_value = (
        target_cache_block_size_bytes)
@@ -561,17 +560,9 @@ def test_profile_num_available_blocks(available_gpu_blocks: int,
    worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler,
                              metrics_collector)

-    # These values do not directly impact the adjusted block size calculation,
-    # so they can be fixed.
-    gpu_memory_utilization = 0.9
-    cpu_swap_space = 100
-    block_size = 16
+    num_gpu_blocks, num_cpu_blocks = worker.determine_num_available_blocks()

-    num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks(
-        block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype="auto")
-
-    target_worker.profile_num_available_blocks.assert_called_once_with(
-        block_size, gpu_memory_utilization, cpu_swap_space, "auto")
+    target_worker.determine_num_available_blocks.assert_called_once()
    assert num_cpu_blocks == available_cpu_blocks

    assert num_gpu_blocks == split_num_cache_blocks_evenly(