[Feature] [Spec decode]: Combine chunked prefill with speculative decoding (#9291)

Signed-off-by: NickLucche <nlucches@redhat.com>
2024-11-07 17:15:14 +01:00
parent ae62fd17c0
commit 9d43afcc53
17 changed files with 476 additions and 146 deletions
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -118,7 +118,8 @@ def test_ngram_algo_correctness_for_batches_not_match_all():
        num_gpu_blocks,
        block_size,
        final_prompt_lens=final_prompt_lens)
-
+    for sg in seq_group_metadata_list:
+        sg.is_prompt = False
    proposals = proposer.get_spec_proposals(
        execute_model_req=ExecuteModelRequest(
            seq_group_metadata_list=seq_group_metadata_list,
@@ -147,7 +148,7 @@ def test_ngram_algo_correctness_for_batches_not_match_all():
 def test_ngram_algo_correctness_for_batches_match_all():
    """Verify our ngram algo find the right candidate in the prompt

-    For the scenario find candidate in all batchs
+    For the scenario find candidate in all batches
    """

    block_size = 32
@@ -192,6 +193,10 @@ def test_ngram_algo_correctness_for_batches_match_all():
        block_size,
        final_prompt_lens=final_prompt_lens)

+    # Normally the drafter is run on decode requests only; here we check the
+    # output of the ngram worker, the sole proposer without a forward pass.
+    for sg in seq_group_metadata_list:
+        sg.is_prompt = False
    proposals = proposer.get_spec_proposals(
        execute_model_req=ExecuteModelRequest(
            seq_group_metadata_list=seq_group_metadata_list,