[Perf] Async Scheduling + Speculative Decoding + Structured Outputs (#29821)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
committed by
GitHub
parent
4e67a8f616
commit
f7008ce1c4
@@ -30,8 +30,9 @@ example_prompts = [first_prompt, "In one word, the capital of France is "] + [
|
||||
|
||||
default_params = dict(
|
||||
temperature=0.0, # greedy
|
||||
max_tokens=23,
|
||||
min_tokens=18,
|
||||
max_tokens=30,
|
||||
# spec decoding currently doesn't support min_tokens
|
||||
# min_tokens=28,
|
||||
)
|
||||
|
||||
|
||||
@@ -86,7 +87,7 @@ def test_without_spec_decoding(
|
||||
run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
|
||||
|
||||
|
||||
def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
|
||||
def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
|
||||
"""Test consistency and acceptance rates with some different combos of
|
||||
preemption, executor, async scheduling, prefill chunking,
|
||||
spec decoding model length.
|
||||
@@ -100,9 +101,16 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
|
||||
# Set small draft model len to force doesn't-fit-in-drafter case.
|
||||
spec_config_short = spec_config | {"max_model_len": 50}
|
||||
|
||||
struct_outputs = StructuredOutputsParams(json=sample_json_schema)
|
||||
|
||||
test_sampling_params = [
|
||||
dict(),
|
||||
dict(logprobs=2),
|
||||
dict(structured_outputs=struct_outputs),
|
||||
dict(
|
||||
structured_outputs=struct_outputs,
|
||||
logprobs=2,
|
||||
),
|
||||
]
|
||||
|
||||
# test_preemption, executor, async_scheduling,
|
||||
|
||||
Reference in New Issue
Block a user