[Core] NGram GPU Implementation compatible with Async Scheduler (#29184)

This commit is contained in:
PatchyTIS
2026-03-08 05:51:37 +08:00
committed by GitHub
parent ee54f9cdb9
commit a6be75dbd2
9 changed files with 940 additions and 12 deletions

View File

@@ -183,6 +183,34 @@ def test_ngram_and_suffix_correctness(
cleanup_dist_env_and_memory()
@pytest.mark.parametrize("async_scheduling", [True], ids=["async"])
@single_gpu_only
@large_gpu_mark(min_gb=20)
def test_ngram_gpu_default_with_async_scheduling(async_scheduling: bool):
    """Validate ngram_gpu speculative decoding through GSM8K accuracy.

    Builds a Qwen/Qwen3-8B engine configured with the "ngram_gpu"
    speculative method (prompt_lookup_min=2, prompt_lookup_max=3,
    num_speculative_tokens=2) and hands it to ``evaluate_llm_for_gsm8k``
    with ``expected_accuracy_threshold=0.8``. The reference GSM8K accuracy
    quoted for this model is 87%-92%.

    Args:
        async_scheduling: Forwarded unchanged to ``LLM(async_scheduling=...)``.
            The parametrization above currently supplies only ``True``.
    """
    ngram_gpu_config = {
        "method": "ngram_gpu",
        "prompt_lookup_max": 3,
        "prompt_lookup_min": 2,
        "num_speculative_tokens": 2,
    }
    llm = LLM(
        model="Qwen/Qwen3-8B",
        speculative_config=ngram_gpu_config,
        max_model_len=4096,
        async_scheduling=async_scheduling,
    )
    evaluate_llm_for_gsm8k(llm, expected_accuracy_threshold=0.8)
    # Drop the local reference to the engine before the cleanup helper runs.
    del llm
    cleanup_dist_env_and_memory()
@single_gpu_only
@large_gpu_mark(min_gb=20)
def test_suffix_decoding_acceptance(