[Bugfix] Fix for Spec model TP + Chunked Prefill (#10232)

Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com>
Signed-off-by: Sourashis Roy <sroy@roblox.com>
Co-authored-by: Sourashis Roy <sroy@roblox.com>
This commit is contained in:
Murali Andoorveedu
2024-11-26 09:11:16 -08:00
committed by GitHub
parent 1f6584ee85
commit db66e018ea
8 changed files with 144 additions and 72 deletions

View File

@@ -413,6 +413,45 @@ def test_chunked_prefill_preempt():
assert out.num_batched_tokens == max_num_batched_tokens
@pytest.mark.parametrize("num_scheduler_steps", [1, 5])
def test_chunked_prefill_spec_prefill(num_scheduler_steps):
    """Verify that num_lookahead_slots is set appropriately for an
    all-prefill batch, depending on whether multi-step scheduling is
    enabled or not.
    """
    block_size = 4
    max_seqs = 30
    max_model_len = 200
    max_num_batched_tokens = 30
    num_lookahead_slots = 4
    scheduler_config = SchedulerConfig(
        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
        enable_chunked_prefill=True,
        num_lookahead_slots=num_lookahead_slots,
        num_scheduler_steps=num_scheduler_steps,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
    cache_config.num_gpu_blocks = 16
    scheduler = Scheduler(scheduler_config, cache_config, None)
    _, seq_group = create_dummy_prompt("1",
                                       prompt_length=30,
                                       block_size=block_size)
    scheduler.add_seq_group(seq_group)
    _, out = schedule_and_update_computed_tokens(scheduler)
    # The single request exactly fills the token budget, so the whole
    # batch is one scheduled prefill.
    assert len(out.scheduled_seq_groups) == 1
    assert out.num_prefill_groups == 1
    assert out.num_batched_tokens == max_num_batched_tokens
    # Single-step scheduling needs no lookahead slots for a prefill-only
    # batch; multi-step scheduling must allocate the configured amount.
    assert out.num_lookahead_slots == (0 if num_scheduler_steps == 1 else
                                       num_lookahead_slots)
def test_chunked_prefill_max_seqs():
block_size = 4
max_seqs = 2