[Speculative Decoding] Test refactor (#8317)

Co-authored-by: youkaichao <youkaichao@126.com>
This commit is contained in:
Lily Liu
2024-09-11 14:07:34 -07:00
committed by GitHub
parent 8baa454937
commit 775f00f81e
12 changed files with 927 additions and 1042 deletions

View File

@@ -7,42 +7,39 @@ import torch
from vllm.utils import is_hip
from .conftest import run_greedy_equality_correctness_test
from .conftest import run_equality_correctness_test_tp
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model": "JackFram/llama-68m",
[[
# Skip cuda graph recording for fast test.
"enforce_eager": True,
"--enforce-eager",
# Required for spec decode.
"use_v2_block_manager": True,
"tensor_parallel_size": 2,
# Use AsyncLLM engine, so that the engine runs in its own process.
# Otherwise, since vLLM does not follow true SPMD, the test runner
# process will have both the engine and the rank0 worker. NCCL is not
# cleaned up properly, and its server host thread leaks, causing the
# second run of the test to fail with internal NCCL error.
"use_async": True,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
"--use-v2-block-manager",
"--tensor-parallel-size",
"2"
]])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 3,
},
{
"speculative_model": "[ngram]",
"num_speculative_tokens": 5,
"ngram_prompt_lookup_max": 3,
},
[
"--speculative-model",
"JackFram/llama-68m",
"--num-speculative-tokens",
"3",
],
[
"--speculative-model",
"[ngram]",
"--num-speculative-tokens",
"5",
"--ngram-prompt-lookup-max",
"3",
],
])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize(
@@ -52,75 +49,75 @@ from .conftest import run_greedy_equality_correctness_test
32,
])
@pytest.mark.parametrize("seed", [1])
def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
batch_size: int, output_len: int):
def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int, seed: int):
"""Verify greedy equality when tensor parallelism is used.
"""
if is_hip():
pytest.skip("hip is not well-supported yet")
run_greedy_equality_correctness_test(baseline_llm_generator,
test_llm_generator,
batch_size,
max_output_len=output_len,
force_output_len=True)
run_equality_correctness_test_tp("JackFram/llama-68m",
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
[[
# Skip cuda graph recording for fast test.
"enforce_eager": True,
"--enforce-eager",
# Required for spec decode.
"use_v2_block_manager": True,
"tensor_parallel_size": 2,
# Use AsyncLLM engine, so that the engine runs in its own process.
# Otherwise, since vLLM does not follow true SPMD, the test runner
# process will have both the engine and the rank0 worker. NCCL is not
# cleaned up properly, and its server host thread leaks, causing the
# second run of the test to fail with internal NCCL error.
"use_async": True,
"--use_v2_block_manager",
"--tensor_parallel_size",
"2",
# precision
"dtype": "float32",
}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs, test_llm_kwargs",
[
(
{
# Use a small model for a fast test.
# Note this is repeated in the test body; to initialize a
# tokenizer.
"model": "JackFram/llama-68m",
},
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_draft_tensor_parallel_size": 1,
}),
({
"model": "ibm-granite/granite-3b-code-instruct",
}, {
"speculative_model":
"ibm-granite/granite-3b-code-instruct-accelerator",
"num_speculative_tokens": 5,
"speculative_draft_tensor_parallel_size": 1,
})
])
"--dtype",
"bfloat16",
]])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
"--speculative-model",
"JackFram/llama-68m",
"--num_speculative-tokens",
"5",
"--speculative-draft-tensor-parallel-size",
"1",
]),
("ibm-granite/granite-3b-code-instruct", [
"--speculative-model",
"ibm-granite/granite-3b-code-instruct",
"--num_speculative-tokens",
"5",
"--speculative-draft-tensor-parallel-size",
"1",
])])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize("seed", [1])
def test_draft_model_tp_lt_target_model_tp2(test_llm_generator,
baseline_llm_generator,
batch_size: int):
def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs, batch_size: int,
seed: int):
"""Verify spec decode works well with smaller tp for draft models.
"""
run_greedy_equality_correctness_test(baseline_llm_generator,
test_llm_generator,
batch_size,
max_output_len=32,
force_output_len=True)
run_equality_correctness_test_tp(model,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
max_output_len=32,
seed=seed,
temperature=0.0)