[Speculative decoding 6/9] Integrate speculative decoding with LLMEngine (#3894)
@@ -1,4 +1,8 @@
+from itertools import cycle
+from typing import List, Tuple
+
 import pytest
+from transformers import AutoTokenizer
 
 from vllm import SamplingParams
 
@@ -7,18 +11,47 @@ from vllm import SamplingParams
     "common_llm_kwargs",
     [{
         # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
-        "speculative_model": "facebook/opt-125m",
-        "num_speculative_tokens": 5,
+        # Note this is repeated in the test body; to initialize a tokenizer.
+        "model": "JackFram/llama-68m",
+
+        # Skip real loading for fast test.
+        "load_format": "dummy",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
 
         # Required for spec decode.
         "use_v2_block_manager": True
     }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        {
+            "speculative_model": "JackFram/llama-68m",
+            "num_speculative_tokens": 5,
+        },
+        {
+            "speculative_model": "JackFram/llama-68m",
+            "num_speculative_tokens": 1,
+        },
+        {
+            # No spec decode.
+        },
+    ])
 @pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("batch_size", [1])
+# NOTE: We should run more permutations of this test (more BS, more seeds). But
+# because our spec decode generates gibberish token ids, the likelihood of
+# emitting an invalid token combination is nontrivial. This causes divergence in
+# behavior of vLLM detokenization vs. hf tokenizer, for example when two "utf-
+# start" bytes are emitted.
 @pytest.mark.parametrize("seed", [1])
-def test_spec_decode_config(test_llm_generator):
-    output_len = 1024
+def test_spec_decode_e2e_logical_flow(test_llm_generator, batch_size: int):
+    """Run generation with speculative decoding on a batch. Verify the engine
+    generates the correct number of tokens (via ignore_eos=True), and that the
+    detokenization matches HF transformers.
+    """
+    output_len = 32
     temperature = 0.0
 
     prompts = [
@@ -28,23 +61,91 @@ def test_spec_decode_config(test_llm_generator):
         "The future of AI is",
     ]
 
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    sampling_params = SamplingParams(
+        max_tokens=output_len,
+        ignore_eos=True,
+        temperature=temperature,
+        skip_special_tokens=True,
+        spaces_between_special_tokens=False,
+    )
+
+    batch_tokens, batch_token_ids = get_output_from_llm_generator(
+        test_llm_generator, prompts, sampling_params)
+
+    # Expect a generation for each prompt in the batch.
+    assert len(batch_token_ids) == len(prompts)
+
+    # Expect each generation to have expected number of tokens (note
+    # ignore_eos=True).
+    assert all(len(token_ids) == output_len for token_ids in batch_token_ids)
+
+    # Expect detokenized string to match.
+    tok = AutoTokenizer.from_pretrained("JackFram/llama-68m")
+    for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids):
+        expected_tokens = tok.decode(actual_token_ids)
+        print(f"{actual_token_ids=}")
+        assert actual_tokens.strip() == expected_tokens.strip()
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        "model": "JackFram/llama-68m",
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+
+        # Skip real loading for fast test.
+        "load_format": "dummy",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        {
+            # Expect failure as spec decode not supported by
+            # Ray backend.
+            "worker_use_ray": True,
+        },
+    ])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_xfail(test_llm_generator):
+    """Verify that speculative decoding with Ray fails.
+    """
+    output_len = 128
+    temperature = 0.0
+
+    prompts = [
+        "Hello, my name is",
+    ]
+
     sampling_params = SamplingParams(
         max_tokens=output_len,
         ignore_eos=True,
         temperature=temperature,
     )
 
-    with pytest.raises(
-            AssertionError,
-            match="Speculative decoding not yet supported for GPU backend"):
-        get_token_ids_from_llm_generator(test_llm_generator, prompts,
-                                         sampling_params)
+    with pytest.raises(AssertionError,
+                       match="Speculative decoding not yet supported for "):
+        get_output_from_llm_generator(test_llm_generator, prompts,
+                                      sampling_params)
 
 
-def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
+def get_output_from_llm_generator(
+        llm_generator, prompts,
+        sampling_params) -> Tuple[List[str], List[List[int]]]:
     for llm in llm_generator:
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        token_ids = [output.outputs[0].token_ids for output in outputs]
+       tokens = [output.outputs[0].text for output in outputs]
        del llm
 
-    return token_ids
+    return tokens, token_ids
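For context, a minimal usage sketch (not part of the diff): the tests above pass their engine kwargs through the test_llm_generator fixture, and the sketch below assumes the same kwargs (speculative_model, num_speculative_tokens, use_v2_block_manager, enforce_eager) are accepted directly by the vllm.LLM constructor. The generate call and output fields mirror what get_output_from_llm_generator does in the diff.

# Hypothetical example, assuming vllm.LLM forwards these kwargs to the engine
# the same way the test fixture does.
from vllm import LLM, SamplingParams

llm = LLM(
    model="JackFram/llama-68m",
    speculative_model="JackFram/llama-68m",  # draft model used to propose tokens
    num_speculative_tokens=5,                # proposal length per step
    use_v2_block_manager=True,               # required for spec decode (per the test)
    enforce_eager=True,                      # skip CUDA graph capture, as in the test
)

sampling_params = SamplingParams(max_tokens=32, temperature=0.0, ignore_eos=True)
outputs = llm.generate(["The future of AI is"], sampling_params)
print(outputs[0].outputs[0].text)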