[Perf] Async Scheduling + Speculative Decoding + Structured Outputs (#29821)

Signed-off-by: Benjamin Chislett <bchislett@nvidia.com> Signed-off-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-01-06 13:50:37 -05:00
parent 4e67a8f616
commit f7008ce1c4
8 changed files with 185 additions and 55 deletions
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -30,8 +30,9 @@ example_prompts = [first_prompt, "In one word, the capital of France is "] + [

 default_params = dict(
    temperature=0.0,  # greedy
-    max_tokens=23,
-    min_tokens=18,
+    max_tokens=30,
+    # Spec decoding currently doesn't support min_tokens; re-enable once it does.
+    # min_tokens=28,
 )


@@ -86,7 +87,7 @@ def test_without_spec_decoding(
    run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)


-def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
+def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
    """Test consistency and acceptance rates with some different combos of
    preemption, executor, async scheduling, prefill chunking,
    spec decoding model length.
@@ -100,9 +101,16 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
    # Use a small draft-model max_model_len to force the doesn't-fit-in-drafter case.
    spec_config_short = spec_config | {"max_model_len": 50}

+    struct_outputs = StructuredOutputsParams(json=sample_json_schema)
+
    test_sampling_params = [
        dict(),
        dict(logprobs=2),
+        dict(structured_outputs=struct_outputs),
+        dict(
+            structured_outputs=struct_outputs,
+            logprobs=2,
+        ),
    ]

    # test_preemption, executor, async_scheduling,