[Core] NGram GPU Implementation compatible with Async Scheduler (#29184)
This commit is contained in:
@@ -98,7 +98,7 @@ def test_without_spec_decoding(
|
||||
|
||||
@single_gpu_only
|
||||
@large_gpu_mark(min_gb=16)
|
||||
def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
|
||||
def test_with_eagle3_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
|
||||
"""Test consistency and acceptance rates with some different combos of
|
||||
preemption, executor, async scheduling, prefill chunking,
|
||||
spec decoding model length.
|
||||
@@ -154,6 +154,42 @@ def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch)
|
||||
)
|
||||
|
||||
|
||||
def test_with_ngram_gpu_spec_decoding(monkeypatch: pytest.MonkeyPatch):
    """Drive ``run_tests`` with a matrix of ngram_gpu speculative decoding cases.

    A single ngram_gpu speculative config (3 speculative tokens, prompt
    lookup window of 2 to 3 tokens) is combined with:

    - preemption testing on and off
    - the "mp" and "uni" executors
    - async scheduling on and off
    - prefill chunking on and off

    The first case carries no speculative config at all.
    """
    # The one speculative config shared (same dict object) by every
    # speculative case below.
    spec_cfg = dict(
        method="ngram_gpu",
        num_speculative_tokens=3,
        prompt_lookup_max=3,
        prompt_lookup_min=2,
    )

    # Each case is a tuple of:
    #   (test_preemption, executor, async_scheduling,
    #    spec_config, test_prefill_chunking)
    # Start with the single case that has no speculative config ...
    cases = [(False, "mp", False, None, False)]
    # ... then append the ngram_gpu cases, in this exact order.
    cases += [
        (preempt, executor, async_sched, spec_cfg, chunked)
        for preempt, executor, async_sched, chunked in (
            (False, "mp", False, False),
            (True, "mp", False, True),
            (False, "mp", True, False),
            (True, "mp", True, False),
            (True, "uni", True, False),
            (True, "mp", True, True),
        )
    ]

    # MODEL (Qwen) is used here because it is lighter weight and ngram_gpu
    # does not need a dedicated draft model.
    run_tests(monkeypatch, MODEL, cases, [{}])
|
||||
|
||||
|
||||
@dynamo_config.patch(cache_size_limit=16)
|
||||
def run_tests(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
@@ -282,11 +318,12 @@ def run_test(
|
||||
else dict(gpu_memory_utilization=0.9)
|
||||
)
|
||||
spec_mml = (spec_config or {}).get("max_model_len")
|
||||
spec_method = (spec_config or {}).get("method", "none")
|
||||
test_config = (
|
||||
f"executor={executor}, preemption={test_preemption}, "
|
||||
f"async_sched={async_scheduling}, "
|
||||
f"chunk_prefill={test_prefill_chunking}, "
|
||||
f"spec_decoding={spec_decoding}, spec_mml={spec_mml}"
|
||||
f"spec_decoding={spec_decoding}, spec_method={spec_method}, spec_mml={spec_mml}"
|
||||
)
|
||||
print("-" * 80)
|
||||
print(f"---- TESTING {test_str}: {test_config}")
|
||||
@@ -294,7 +331,7 @@ def run_test(
|
||||
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
max_model_len=4096,
|
||||
enable_chunked_prefill=test_prefill_chunking,
|
||||
# Force prefill chunking
|
||||
max_num_batched_tokens=48 if test_prefill_chunking else None,
|
||||
|
||||
@@ -183,6 +183,34 @@ def test_ngram_and_suffix_correctness(
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("async_scheduling", [True], ids=["async"])
@single_gpu_only
@large_gpu_mark(min_gb=20)
def test_ngram_gpu_default_with_async_scheduling(
    async_scheduling: bool,
):
    """Check ngram_gpu speculative decoding on GSM8K with async scheduling.

    Builds Qwen/Qwen3-8B with ngram_gpu speculation (2 speculative tokens,
    prompt lookup window of 2 to 3 tokens) and hands it to
    ``evaluate_llm_for_gsm8k`` with ``expected_accuracy_threshold=0.8``.
    The reference GSM8K accuracy noted for this model is 87%-92%.

    Args:
        async_scheduling: Forwarded to ``LLM(async_scheduling=...)``. The
            parametrization currently supplies only ``True``.
    """
    spec_llm = LLM(
        model="Qwen/Qwen3-8B",
        speculative_config={
            "method": "ngram_gpu",
            "prompt_lookup_max": 3,
            "prompt_lookup_min": 2,
            "num_speculative_tokens": 2,
        },
        max_model_len=4096,
        async_scheduling=async_scheduling,
    )
    try:
        evaluate_llm_for_gsm8k(spec_llm, expected_accuracy_threshold=0.8)
    finally:
        # Run the teardown even when the evaluation raises, so a failing
        # run does not skip dropping the engine reference and the cleanup
        # helper before later tests start.
        del spec_llm
        cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@single_gpu_only
|
||||
@large_gpu_mark(min_gb=20)
|
||||
def test_suffix_decoding_acceptance(
|
||||
|
||||
Reference in New Issue
Block a user