[Core] NGram GPU Implementation compatible with Async Scheduler (#29184)

2026-03-08 05:51:37 +08:00
parent ee54f9cdb9
commit a6be75dbd2
9 changed files with 940 additions and 12 deletions
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -183,6 +183,34 @@ def test_ngram_and_suffix_correctness(
    cleanup_dist_env_and_memory()


+@pytest.mark.parametrize("async_scheduling", [True], ids=["async"])
+@single_gpu_only
+@large_gpu_mark(min_gb=20)
+def test_ngram_gpu_default_with_async_scheduling(
+    async_scheduling: bool,
+):
+    """
+    Test ngram_gpu speculative decoding (num_speculative_tokens=2,
+    prompt_lookup_min=2, prompt_lookup_max=3) with async scheduling enabled,
+    validated via GSM8K accuracy. Uses Qwen/Qwen3-8B (ref GSM8K accuracy: 87%-92%).
+    """
+    qwen3_model = "Qwen/Qwen3-8B"
+    spec_llm = LLM(
+        model=qwen3_model,
+        speculative_config={
+            "method": "ngram_gpu",
+            "prompt_lookup_max": 3,
+            "prompt_lookup_min": 2,
+            "num_speculative_tokens": 2,
+        },
+        max_model_len=4096,
+        async_scheduling=async_scheduling,
+    )
+    evaluate_llm_for_gsm8k(spec_llm, expected_accuracy_threshold=0.8)
+    del spec_llm
+    cleanup_dist_env_and_memory()
+
+
@single_gpu_only
@large_gpu_mark(min_gb=20)
 def test_suffix_decoding_acceptance(