[Bugfix] Make spec. decode respect per-request seed. (#6034)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
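For context, the behavior this fixes can be sketched against vLLM's offline LLM API (a hedged example, not part of the diff; the engine kwargs mirror the test config added below, and the prompts are illustrative): with speculative decoding enabled, repeated generation with the same per-request seeds should now return identical outputs.

    from vllm import LLM, SamplingParams

    # Spec-decode engine matching the kwargs used by the new test below.
    llm = LLM(model="JackFram/llama-68m",
              speculative_model="JackFram/llama-160m",
              num_speculative_tokens=3,
              use_v2_block_manager=True,
              enforce_eager=True)

    # Illustrative prompts; one SamplingParams per request, each with its
    # own seed, passed as a list parallel to the prompts.
    prompts = ["Hello, my name is", "The future of AI is"]
    params = [SamplingParams(temperature=1.0, max_tokens=10, seed=i)
              for i in range(len(prompts))]

    # With the fix, two runs with identical per-request seeds must agree.
    first = [out.outputs[0].text for out in llm.generate(prompts, params)]
    second = [out.outputs[0].text for out in llm.generate(prompts, params)]
    assert first == second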
tests/spec_decode/e2e/conftest.py
@@ -1,6 +1,6 @@
 import asyncio
 from itertools import cycle
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Union
 
 import pytest
 import ray
@@ -128,7 +128,9 @@ class AsyncLLM:
         try:
             for i in range(num_requests):
                 prompt = prompts[i] if prompts is not None else None
-                res = asyncio.run(get_output(prompt, sampling_params))
+                params = sampling_params[i] if isinstance(
+                    sampling_params, Sequence) else sampling_params
+                res = asyncio.run(get_output(prompt, params))
                 outputs.append(res)
         finally:
             ray.shutdown()
@@ -267,7 +269,31 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
     the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
     the same when temperature is zero.
     """
-    temperature = 0.0
-
+    run_equality_correctness_test(baseline_llm_generator,
+                                  test_llm_generator,
+                                  batch_size,
+                                  max_output_len,
+                                  force_output_len,
+                                  temperature=0.0,
+                                  seeded=False,
+                                  print_tokens=print_tokens,
+                                  ensure_all_accepted=ensure_all_accepted)
+
+
+def run_equality_correctness_test(baseline_llm_generator,
+                                  test_llm_generator,
+                                  batch_size,
+                                  max_output_len,
+                                  force_output_len: bool,
+                                  temperature: float,
+                                  seeded: bool,
+                                  print_tokens: bool = False,
+                                  ensure_all_accepted: bool = False):
+    """Helper method that compares the outputs of both the baseline LLM and
+    the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
+    the same when temperature is zero (or when temperature is > 0 and seeded).
+    """
+
     prompts = [
         "Hello, my name is",
@@ -286,11 +312,21 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
     # sampling params to ignore eos token.
     ignore_eos = force_output_len
 
-    sampling_params = SamplingParams(
-        max_tokens=max_output_len,
-        ignore_eos=ignore_eos,
-        temperature=temperature,
-    )
+    if seeded:
+        sampling_params = [
+            SamplingParams(
+                max_tokens=max_output_len,
+                ignore_eos=ignore_eos,
+                temperature=temperature,
+                seed=i,
+            ) for i in range(len(prompts))
+        ]
+    else:
+        sampling_params = SamplingParams(
+            max_tokens=max_output_len,
+            ignore_eos=ignore_eos,
+            temperature=temperature,
+        )
 
     (spec_batch_tokens, spec_batch_token_ids,
      acceptance_rate) = get_output_from_llm_generator(test_llm_generator,
tests/spec_decode/e2e/test_seed.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+import pytest
+
+from .conftest import run_equality_correctness_test
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # speculative model
+        "speculative_model": "JackFram/llama-160m",
+
+        # num speculative tokens
+        "num_speculative_tokens": 3,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("batch_size", [1, 8, 32])
+@pytest.mark.parametrize("temperature", [0.1, 1.0])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        10,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_seeded_consistency(baseline_llm_generator, batch_size: int,
+                            temperature: float, output_len: int):
+    """Verify outputs are consistent across multiple runs with same seed
+    """
+    run_equality_correctness_test(baseline_llm_generator,
+                                  baseline_llm_generator,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  temperature=temperature,
+                                  seeded=True,
+                                  force_output_len=True)
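Note the design choice in the new test: test_seeded_consistency passes baseline_llm_generator in both generator positions. Since common_llm_kwargs already enables the speculative model, the "baseline" engine here is itself running speculative decoding, and with seeded=True the helper builds one seeded SamplingParams per prompt and compares two independent runs of the same engine. The test therefore checks run-to-run determinism under per-request seeds rather than baseline-vs-speculative equality.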