[Core] Reduce TTFT with concurrent partial prefills (#10235)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
Author: Joe Runde
Date:   2025-02-14 16:36:07 -07:00
Committed by: GitHub
Parent: 5e5c8e091e
Commit: 3bcb8c75da
6 changed files with 699 additions and 106 deletions
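For context on the title: this change aims to lower time-to-first-token (TTFT) by letting the scheduler run several partial (chunked) prefills in the same step, so short prompts no longer wait behind one long request being prefilled to completion. A minimal usage sketch follows; it assumes the scheduler knobs introduced by this PR are exposed as the engine arguments max_num_partial_prefills, max_long_partial_prefills, and long_prefill_token_threshold (treat these names and the values below as assumptions, not verified documentation).

from vllm import LLM, SamplingParams

# Sketch only: the *_partial_prefills / long_prefill_token_threshold arguments
# are assumed to be the options added by this PR; check vllm/config.py and
# vllm/engine/arg_utils.py for the authoritative names and defaults.
llm = LLM(
    model="facebook/opt-125m",
    enable_chunked_prefill=True,
    max_num_batched_tokens=2048,       # per-step token budget shared by all prefills
    max_num_partial_prefills=2,        # allow two requests to prefill concurrently
    max_long_partial_prefills=1,       # but at most one "long" prompt at a time
    long_prefill_token_threshold=512,  # prompts above this length count as "long"
)

prompts = ["A short prompt.", "A very long prompt " * 200]
outputs = llm.generate(prompts, SamplingParams(temperature=0.0, max_tokens=16))
for out in outputs:
    print(out.outputs[0].text)

Under a mixed workload, the short prompt's first token is produced while the long prompt's prefill is still being chunked, which is where the TTFT reduction comes from.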


@@ -8,7 +8,6 @@ prefill requests are chunked.
 Run `pytest tests/models/test_chunked_prefill.py`.
 """
 import os
-from contextlib import nullcontext
 
 import pytest
@@ -233,7 +232,6 @@ def test_with_prefix_caching(
     max_num_batched_tokens = max_num_seqs = chunk_size
     outputs = {}  # type: ignore
-    check_result = True
     for enable in (True, False):
         with vllm_runner(
                 model,
@@ -245,25 +243,17 @@ def test_with_prefix_caching(
                 enforce_eager=enforce_eager,
                 max_num_seqs=max_num_seqs,
         ) as vllm_model:
-            # It should fail when prefix caching is enable and chunk
-            # size is not a multiple of block size (16).
-            should_fail = chunk_size % 16 != 0 and enable
-            check_result &= not should_fail
             outputs[enable] = []
             # Send the request one-by-one to ensure the cache is populated.
-            with pytest.raises(ValueError) if should_fail else nullcontext():
-                for prompt in full_prompts:
-                    outputs[enable] += vllm_model.generate_greedy([prompt],
-                                                                  max_tokens)
+            for prompt in full_prompts:
+                outputs[enable] += vllm_model.generate_greedy([prompt],
+                                                              max_tokens)
 
-    # Check results only if we did not expect a failure.
-    if check_result:
-        check_outputs_equal(
-            outputs_0_lst=outputs[False],
-            outputs_1_lst=outputs[True],
-            name_0="w/o prefix caching",
-            name_1="with prefix caching",
-        )
+    check_outputs_equal(
+        outputs_0_lst=outputs[False],
+        outputs_1_lst=outputs[True],
+        name_0="w/o prefix caching",
+        name_1="with prefix caching",
+    )
 
 
 @pytest.mark.parametrize("model", ["facebook/opt-125m"])
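The test simplification above reflects a behavior change shipped with this commit: prefix caching combined with chunked prefill no longer raises a ValueError when the chunk size is not a multiple of the block size (16), so the test now just compares greedy outputs with and without prefix caching. A rough sketch of the scenario the test exercises, using the offline LLM API with illustrative values:

from vllm import LLM, SamplingParams

# Previously the engine was expected to reject this configuration because
# max_num_batched_tokens (the chunk size, 30 here) is not a multiple of the
# 16-token KV-cache block size while prefix caching is enabled; after this
# commit the combination is expected to work and match the uncached outputs.
llm = LLM(
    model="facebook/opt-125m",
    enable_chunked_prefill=True,
    enable_prefix_caching=True,
    max_num_batched_tokens=30,  # deliberately not a multiple of 16
    max_num_seqs=30,
)

greedy = SamplingParams(temperature=0.0, max_tokens=10)
out = llm.generate(["The quick brown fox jumps over"], greedy)
print(out[0].outputs[0].text)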