[Core] NGram GPU Implementation compatible with Async Scheduler (#29184)
This commit is contained in:
@@ -183,6 +183,34 @@ def test_ngram_and_suffix_correctness(
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("async_scheduling", [True], ids=["async"])
@single_gpu_only
@large_gpu_mark(min_gb=20)
def test_ngram_gpu_default_with_async_scheduling(
    async_scheduling: bool,
):
    """
    Test ngram_gpu speculative decoding correctness under async scheduling,
    validated via GSM8K accuracy.

    The engine is configured with ``num_speculative_tokens=2``,
    ``prompt_lookup_min=2`` and ``prompt_lookup_max=3``. Only
    ``async_scheduling=True`` is currently parametrized; add ``False`` to the
    parametrize list to also cover the synchronous scheduler.

    Uses Qwen/Qwen3-8B (ref GSM8K accuracy: 87%-92%); the evaluator is called
    with ``expected_accuracy_threshold=0.8``.

    Args:
        async_scheduling: Forwarded to ``LLM(async_scheduling=...)``.
    """
    qwen3_model = "Qwen/Qwen3-8B"
    spec_llm = LLM(
        model=qwen3_model,
        speculative_config={
            "method": "ngram_gpu",
            "prompt_lookup_max": 3,
            "prompt_lookup_min": 2,
            "num_speculative_tokens": 2,
        },
        max_model_len=4096,
        async_scheduling=async_scheduling,
    )
    try:
        evaluate_llm_for_gsm8k(spec_llm, expected_accuracy_threshold=0.8)
    finally:
        # Drop the engine reference and run the shared cleanup helper even if
        # the accuracy check raises, so later tests do not inherit this engine.
        del spec_llm
        cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@single_gpu_only
|
||||
@large_gpu_mark(min_gb=20)
|
||||
def test_suffix_decoding_acceptance(
|
||||
|
||||
Reference in New Issue
Block a user