[BugFix] fix num_lookahead_slots missing in async executor (#4165)

Co-authored-by: Lei Wen <wenlei03@qiyi.com>
2024-05-01 01:12:59 +08:00
parent 26f2fb5113
commit 4bb53e2dde
9 changed files with 163 additions and 19 deletions
--- a/tests/spec_decode/e2e/test_correctness.py
+++ b/tests/spec_decode/e2e/test_correctness.py
@@ -40,17 +40,24 @@ from .conftest import get_output_from_llm_generator

@pytest.mark.parametrize(
    "common_llm_kwargs",
-    [{
-        # Use a small model for a fast test.
-        # Note this is repeated in the test body; to initialize a tokenizer.
-        "model": "JackFram/llama-68m",
+    [
+        {
+            # Use a small model for a fast test.
+            # Note this is repeated in the test body; to initialize a tokenizer.
+            "model": "JackFram/llama-68m",

-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
+            # Skip cuda graph recording for fast test.
+            "enforce_eager": True,

-        # Required for spec decode.
-        "use_v2_block_manager": True
-    }])
+            # Required for spec decode.
+            "use_v2_block_manager": True,
+
+            # whether use AsyncLLM engine
+            "use_async": async_mode,
+        }
+        # Try both async and sync engine execution
+        for async_mode in [True, False]
+    ])
@pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [