[Speculative Decoding] Support draft model on different tensor-parallel size than target model (#5414)

2024-06-25 18:56:06 +09:00
parent f23871e9ee
commit 2ce5d6688b
11 changed files with 388 additions and 59 deletions
--- a/tests/spec_decode/e2e/test_integration_dist_tp4.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
@@ -0,0 +1,60 @@
+"""Tests which cover integration of the speculative decoding framework with
+tensor parallelism.
+"""
+
+import pytest
+import torch
+
+from .conftest import run_greedy_equality_correctness_test
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 4,
+                    reason="Need at least 4 GPUs to run the test.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Use a small model for a fast test.
+        # Note this is repeated in the test body; to initialize a tokenizer.
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+        "tensor_parallel_size": 4,
+
+        # Use AsyncLLM engine, so that the engine runs in its own process.
+        # Otherwise, since vLLM does not follow true SPMD, the test runner
+        # process will have both the engine and the rank0 worker. NCCL is not
+        # cleaned up properly, and its server host thread leaks, causing the
+        # second run of the test to fail with internal NCCL error.
+        "use_async": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+    },
+])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [
+        #TODO(wooyeon): add spec_draft_dp=2 case
+        {
+            "speculative_draft_tensor_parallel_size": 1,
+        },
+    ])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize("seed", [1])
+def test_draft_model_tp_lt_target_model_tp4(test_llm_generator,
+                                            baseline_llm_generator,
+                                            batch_size: int):
+    """Verify spec decode works well with smaller tp for draft models.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=32,
+                                         force_output_len=True)