[Speculative decoding] Add ngram prompt lookup decoding (#4237)

Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2024-05-02 02:13:03 +08:00
parent 8b798eec75
commit b38e42fbca
14 changed files with 1003 additions and 318 deletions
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -1,4 +1,5 @@
 import asyncio
+from itertools import cycle
 from typing import List, Optional, Tuple, Union

 import pytest
@@ -185,3 +186,60 @@ def get_output_from_llm_generator(
        del llm

    return tokens, token_ids
+
+
+def run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len,
+                                         force_output_len: bool,
+                                         print_tokens: bool = False):
+    """Helper method that compares the outputs of both the baseline LLM and
+    the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
+    the same when temperature is zero.
+    """
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+        "San Francisco is know for its",
+        "Facebook was created in 2004 by",
+        "Curious George is a",
+        "Python 3.11 brings improvements to its",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    # If the test requires that we generated max_output_len tokens, then set the
+    # sampling params to ignore eos token.
+    ignore_eos = force_output_len
+
+    sampling_params = SamplingParams(
+        max_tokens=max_output_len,
+        ignore_eos=ignore_eos,
+        temperature=temperature,
+    )
+
+    spec_batch_tokens, spec_batch_token_ids = get_output_from_llm_generator(
+        test_llm_generator, prompts, sampling_params)
+
+    (baseline_batch_tokens,
+     baseline_batch_token_ids) = get_output_from_llm_generator(
+         baseline_llm_generator, prompts, sampling_params)
+
+    assert len(baseline_batch_token_ids) == len(prompts)
+    assert len(spec_batch_token_ids) == len(prompts)
+
+    for i, (baseline_token_ids, baseline_tokens, spec_token_ids,
+            spec_tokens) in enumerate(
+                zip(baseline_batch_token_ids, baseline_batch_tokens,
+                    spec_batch_token_ids, spec_batch_tokens)):
+        if print_tokens:
+            print(f'{i=} {baseline_tokens=}')
+            print(f'{i=}     {spec_tokens=}')
+        print(f'{i=} {baseline_token_ids=}')
+        print(f'{i=}     {spec_token_ids=}')
+        assert baseline_token_ids == spec_token_ids