[Core] Async Scheduling X Spec Decoding Compatibility (#24799)

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
Ronald authored 2025-11-18 04:16:20 +08:00, committed by GitHub
parent f8b19c0ffd
commit d8874c61a5
11 changed files with 314 additions and 98 deletions


@@ -15,7 +15,7 @@ from ...conftest import VllmRunner
from ...models.utils import check_outputs_equal
MODEL = "Qwen/Qwen3-0.6B"
MTP_MODEL = "XiaomiMiMo/MiMo-7B-Base"
MTP_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
first_prompt = (
@@ -29,7 +29,8 @@ example_prompts = [first_prompt, "In one word, the capital of France is "] + [
default_params = dict(
temperature=0.0, # greedy
-max_tokens=20,
+max_tokens=23,
+min_tokens=18,
)
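For reference, a minimal sketch (not part of the diff) of how these defaults translate into a SamplingParams object; the comment about min_tokens is an assumption about intent, not text from the commit:

```python
from vllm import SamplingParams

# Greedy decoding with the test's defaults above; min_tokens plausibly
# guarantees enough decode steps per request for acceptance-rate statistics.
params = SamplingParams(temperature=0.0, max_tokens=23, min_tokens=18)
```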
@@ -69,15 +70,9 @@ def test_without_spec_decoding(
(True, "uni", True, None, True),
]
-run_tests(
-monkeypatch,
-MODEL,
-test_configs,
-test_sampling_params,
-)
+run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
@pytest.mark.skip("MTP model too big to run in fp32 in CI")
def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
"""Test consistency and acceptance rates with some different combos of
preemption, executor, async scheduling, prefill chunking,
@@ -85,8 +80,9 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
"""
spec_config = {
"method": "mtp",
"method": "eagle3",
"num_speculative_tokens": 2,
"model": "nm-testing/Llama3_2_1B_speculator.eagle3",
}
spec_config_short = spec_config | {"max_model_len": 50}
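The new spec_config swaps the MTP path for an EAGLE-3 drafter. A hedged sketch of running the same configuration directly, outside the test harness (model names come from the diff above; the speculative_config and async_scheduling argument spellings are assumptions about the current engine API):

```python
from vllm import LLM, SamplingParams

# EAGLE-3 drafting on the 1B target model from the diff, combined with the
# async scheduling that this PR makes compatible with speculative decoding.
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    speculative_config={
        "method": "eagle3",
        "model": "nm-testing/Llama3_2_1B_speculator.eagle3",
        "num_speculative_tokens": 2,
    },
    async_scheduling=True,
)
outputs = llm.generate(
    ["In one word, the capital of France is "],
    SamplingParams(temperature=0.0, max_tokens=23),
)
```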
@@ -106,12 +102,7 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
(True, "uni", True, spec_config_short, True),
]
-run_tests(
-monkeypatch,
-MTP_MODEL,
-test_configs,
-[{}],
-)
+run_tests(monkeypatch, MTP_MODEL, test_configs, [{}])
@dynamo_config.patch(cache_size_limit=16)
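The dynamo_config.patch decorator raises torch.compile's recompile cache limit, presumably so the many engine configurations exercised by these tests do not hit the default cap. A sketch of the same pattern, assuming dynamo_config is torch._dynamo.config and the function name is illustrative:

```python
from torch._dynamo import config as dynamo_config

@dynamo_config.patch(cache_size_limit=16)
def run_all_configs():
    # each distinct engine configuration may trigger a fresh torch.compile
    # graph, so the default recompile cache limit is raised for the run
    ...
```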
@@ -182,15 +173,13 @@ def run_tests(
and test_acceptance_rate is not None
):
if "spec_mml=None" in test_config:
-# because the acceptance rate can vary, we use a looser
-# tolerance here.
assert (
pytest.approx(test_acceptance_rate, rel=5e-2)
== base_acceptance_rate
)
else:
# Currently the reported acceptance rate is expected to be
-# lower when we skip drafting altogether.
+# lower when we sometimes skip drafting altogether.
assert test_acceptance_rate > 0.05
print(
f"PASSED: config=[{test_config}], params={params}"
@@ -220,6 +209,7 @@ def run_test(
):
spec_decoding = spec_config is not None
cache_arg: dict[str, Any] = (
+# Force preemptions
dict(num_gpu_blocks_override=32)
if test_preemption
else dict(gpu_memory_utilization=0.9)
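The added comment documents why the block count is pinned: 32 KV-cache blocks is far too few, so the scheduler is forced to preempt. A standalone sketch of the same knob (the argument name is taken from the diff; everything else is illustrative):

```python
from vllm import LLM

# A deliberately tiny KV cache: with only 32 blocks, concurrent requests
# cannot all stay resident, so the scheduler must preempt and later resume them.
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    num_gpu_blocks_override=32,
)
```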
@@ -238,6 +228,7 @@ def run_test(
model,
max_model_len=512,
enable_chunked_prefill=test_prefill_chunking,
+# Force prefill chunking
max_num_batched_tokens=48 if test_prefill_chunking else None,
# enforce_eager=True,
async_scheduling=async_scheduling,
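Similarly, the new comment flags that the 48-token budget exists only to force chunked prefill of longer prompts. A sketch of the equivalent engine arguments (assumed to pass straight through to EngineArgs; model name illustrative):

```python
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    max_model_len=512,
    enable_chunked_prefill=True,
    max_num_batched_tokens=48,  # prompts longer than 48 tokens are prefilled in chunks
    async_scheduling=True,
)
```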
@@ -255,10 +246,7 @@ def run_test(
results.append(
vllm_model.generate(
example_prompts,
-sampling_params=SamplingParams(
-**default_params,
-**override_params,
-),
+sampling_params=SamplingParams(**default_params, **override_params),
return_logprobs=True,
)
)
@@ -270,9 +258,7 @@ def run_test(
if test_preemption:
preemptions = _get_count(
-metrics_before,
-metrics_after,
-"vllm:num_preemptions",
+metrics_before, metrics_after, "vllm:num_preemptions"
)
assert preemptions > 0, "preemption test had no preemptions"
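_get_count here presumably reports the increase in a named Prometheus counter between two metric snapshots. A self-contained sketch of that idea (the dict-based snapshot shape and the helper name are hypothetical, not vLLM's API):

```python
def counter_delta(before: dict[str, float], after: dict[str, float], name: str) -> float:
    """Increase of a named counter between two metric snapshots."""
    return after.get(name, 0.0) - before.get(name, 0.0)

metrics_before = {"vllm:num_preemptions": 3.0}  # hypothetical snapshots
metrics_after = {"vllm:num_preemptions": 7.0}

preemptions = counter_delta(metrics_before, metrics_after, "vllm:num_preemptions")
assert preemptions > 0, "preemption test had no preemptions"
```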