[Speculative decoding][Re-take] Enable TP>1 speculative decoding (#4840)

Co-authored-by: Cade Daniel <edacih@gmail.com> Co-authored-by: Cade Daniel <cade@anyscale.com>
2024-05-16 00:53:51 -07:00
parent 30e754390c
commit 973617ae02
12 changed files with 295 additions and 180 deletions
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -5,56 +5,6 @@ from vllm import SamplingParams
 from .conftest import get_output_from_llm_generator


-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model": "JackFram/llama-68m",
-        "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 5,
-
-        # Required for spec decode.
-        "use_v2_block_manager": True
-    }])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-    [
-        {
-            # Expect failure as spec decode not supported by
-            # Ray backend.
-            "worker_use_ray": True,
-        },
-    ])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_xfail_ray(test_llm_generator):
-    """Verify that speculative decoding with Ray fails.
-    """
-    output_len = 128
-    temperature = 0.0
-
-    prompts = [
-        "Hello, my name is",
-    ]
-
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-
-    try:
-        with pytest.raises(
-                AssertionError,
-                match="Speculative decoding not yet supported for "):
-            get_output_from_llm_generator(test_llm_generator, prompts,
-                                          sampling_params)
-    finally:
-        # we need to free up ray resource,
-        # so that latter test could use the gpu we allocated here
-        import ray
-        ray.shutdown()
-
-
@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
@@ -0,0 +1,44 @@
+"""Tests which cover integration of the speculative decoding framework with
+other features, e.g. cuda graphs.
+"""
+
+import pytest
+
+from .conftest import run_greedy_equality_correctness_test
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Verify equality when cuda graphs allowed.
+        "enforce_eager": False,
+        "model": "JackFram/llama-68m",
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        {
+            # Identical models.
+            "speculative_model": "JackFram/llama-68m",
+            "num_speculative_tokens": 5,
+        },
+    ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize("output_len", [32])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator,
+                                batch_size, output_len):
+    """Verify spec decode equality when cuda graphs are enabled.
+    """
+    run_greedy_equality_correctness_test(
+        baseline_llm_generator,
+        test_llm_generator,
+        batch_size,
+        max_output_len=output_len,
+        force_output_len=True,
+    )
--- a/tests/spec_decode/e2e/test_integration_dist.py
+++ b/tests/spec_decode/e2e/test_integration_dist.py
@@ -0,0 +1,65 @@
+"""Tests which cover integration of the speculative decoding framework with
+tensor parallelism.
+"""
+
+import pytest
+import torch
+
+from vllm.utils import is_hip
+
+from .conftest import run_greedy_equality_correctness_test
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+        "tensor_parallel_size": 2,
+
+        # Use AsyncLLM engine, so that the engine runs in its own process.
+        # Otherwise, since vLLM does not follow true SPMD, the test runner
+        # process will have both the engine and the rank0 worker. NCCL is not
+        # cleaned up properly, and its server host thread leaks, causing the
+        # second run of the test to fail with internal NCCL error.
+        "use_async": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 3,
+    },
+    {
+        "speculative_model": "[ngram]",
+        "num_speculative_tokens": 5,
+        "ngram_prompt_lookup_max": 3,
+    },
+])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
+                              batch_size: int, output_len: int):
+    """Verify greedy equality when tensor parallelism is used.
+    """
+    if is_hip():
+        pytest.skip("hip is not well-supported yet")
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -611,40 +611,3 @@ def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
                                         batch_size,
                                         max_output_len=output_len,
                                         force_output_len=True)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Required for spec decode.
-        "use_v2_block_manager": True,
-
-        # Verify equality when cuda graphs allowed.
-        "enforce_eager": False,
-        "model": "JackFram/llama-68m",
-    }])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-    [
-        {
-            # Identical models.
-            "speculative_model": "JackFram/llama-68m",
-            "num_speculative_tokens": 5,
-        },
-    ])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize("output_len", [32])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator,
-                                batch_size, output_len):
-    """Verify spec decode equality when cuda graphs are enabled.
-    """
-    run_greedy_equality_correctness_test(
-        baseline_llm_generator,
-        test_llm_generator,
-        batch_size,
-        max_output_len=output_len,
-        force_output_len=True,
-    )