[V1][Spec Decode] Handle draft tokens beyond max_model_len (#16087)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-21 12:38:50 -07:00
parent 299ebb62b2
commit 3a0fba5cf4
7 changed files with 137 additions and 15 deletions
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -30,6 +30,7 @@ def create_scheduler(
    use_kv_connector: bool = False,
    num_blocks: int = 10000,
    block_size: int = 16,
+    max_model_len: Optional[int] = None,
 ) -> Scheduler:
    '''Create scheduler under test.

@@ -44,12 +45,15 @@ def create_scheduler(
    Returns:
      :class:`Scheduler` instance
    '''
+    if max_model_len is None:
+        max_model_len = max_num_batched_tokens
    scheduler_config = SchedulerConfig(
        max_num_seqs=max_num_seqs,
        max_num_batched_tokens=max_num_batched_tokens,
-        max_model_len=max_num_batched_tokens,
+        max_model_len=max_model_len,
        long_prefill_token_threshold=long_prefill_token_threshold,
        disable_chunked_mm_input=disable_chunked_mm_input,
+        enable_chunked_prefill=True,
    )
    model_config = ModelConfig(
        model=model,
@@ -296,6 +300,7 @@ def test_no_mm_input_chunking():
        model="llava-hf/llava-1.5-7b-hf",
        max_num_batched_tokens=1024,
        disable_chunked_mm_input=True,
+        max_model_len=2048,
    )
    mm_positions = [[PlaceholderRange(offset=400, length=800)]]
    requests = create_requests(num_requests=1,