[Perf] Async Scheduling + Speculative Decoding + Structured Outputs (#29821)

Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
Benjamin Chislett
2026-01-06 13:50:37 -05:00
committed by GitHub
parent 4e67a8f616
commit f7008ce1c4
8 changed files with 185 additions and 55 deletions

View File

@@ -30,8 +30,9 @@ example_prompts = [first_prompt, "In one word, the capital of France is "] + [
default_params = dict(
temperature=0.0, # greedy
max_tokens=23,
min_tokens=18,
max_tokens=30,
# spec decoding currently doesn't support min_tokens
# min_tokens=28,
)
@@ -86,7 +87,7 @@ def test_without_spec_decoding(
run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
"""Test consistency and acceptance rates with some different combos of
preemption, executor, async scheduling, prefill chunking,
spec decoding model length.
@@ -100,9 +101,16 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
# Set small draft model len to force doesn't-fit-in-drafter case.
spec_config_short = spec_config | {"max_model_len": 50}
struct_outputs = StructuredOutputsParams(json=sample_json_schema)
test_sampling_params = [
dict(),
dict(logprobs=2),
dict(structured_outputs=struct_outputs),
dict(
structured_outputs=struct_outputs,
logprobs=2,
),
]
# test_preemption, executor, async_scheduling,