[V1][Perf] Simpler request output queues (#15156)

Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Co-authored-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
This commit is contained in:
Nick Hill
2025-03-24 15:44:08 -07:00
committed by GitHub
parent 6dd55af6c9
commit 9d72daf4ce
3 changed files with 146 additions and 25 deletions

View File

@@ -11,11 +11,13 @@ from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST,
STOP_STRINGS,
DummyOutputProcessorTestVectors,
MockEngineCore)
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.sequence import PromptLogprobs, SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.engine.output_processor import (OutputProcessor,
RequestOutputCollector)
from vllm.v1.metrics.stats import IterationStats
@@ -834,3 +836,88 @@ def test_iteration_stats(dummy_test_vectors):
assert iteration_stats.num_prompt_tokens == 0
assert iteration_stats.num_generation_tokens == num_active
@pytest.mark.asyncio
async def test_request_output_collector():
    """Exercise RequestOutputCollector put/get in DELTA mode.

    Three scenarios are covered: a single put followed by a get, two puts
    followed by a get, and three puts (the last one finished) followed by a
    get. After every get the collector is expected to hold no pending output
    and its ``ready`` event is expected to be cleared. When several outputs
    are put before a get, the returned output is expected to carry the
    concatenated text, token ids and logprobs, and the cumulative logprob of
    the last output that was put.
    """
    NUM_REQS = 3
    TEXT = "a"

    def make_outputs() -> list[RequestOutput]:
        """Build NUM_REQS one-token outputs; only the last one is finished."""
        return [
            RequestOutput(
                request_id="my-request-id",
                prompt=None,
                prompt_token_ids=[1, 2, 3],
                prompt_logprobs=None,
                outputs=[
                    CompletionOutput(
                        index=0,
                        text=TEXT,
                        token_ids=[idx],
                        # Output number idx carries cumulative logprob
                        # idx + 1 (i.e. 1.0, 2.0, 3.0).
                        cumulative_logprob=float(idx + 1),
                        logprobs=[{
                            "a": idx,
                            "b": idx
                        }],
                        finish_reason="length" if
                        (idx == NUM_REQS - 1) else None,
                    )
                ],
                finished=(idx == NUM_REQS - 1),
            ) for idx in range(NUM_REQS)
        ]

    collector = RequestOutputCollector(RequestOutputKind.DELTA)

    async def put_then_get(num_to_put: int) -> RequestOutput:
        """Put the first ``num_to_put`` fresh outputs, then get once."""
        # Rebuild the outputs for every case so each case starts from
        # unmerged objects.
        fresh_outputs = make_outputs()
        for pending in fresh_outputs[:num_to_put]:
            collector.put(pending)
        result = await collector.get()
        # A get must drain the collector completely.
        assert not collector.ready.is_set()
        assert collector.output is None
        return result

    def check_merged(result: RequestOutput, num_put: int) -> None:
        """Check that ``result`` combines the first ``num_put`` outputs."""
        completion = result.outputs[0]
        # Text, token_ids, and logprobs should get merged.
        assert completion.text == TEXT * num_put
        # Compare whole sequences: iterating with zip() would stop at the
        # shorter input and silently accept a result with missing tokens.
        assert list(completion.token_ids) == list(range(num_put))
        assert len(completion.logprobs) == num_put
        # Cumulative logprobs should be the last one.
        assert completion.cumulative_logprob == 1.0 * num_put

    # CASE 1: Put then get.
    output = await put_then_get(1)
    assert output.outputs[0].text == "a"
    assert output.outputs[0].token_ids == [0]

    # CASE 2: 2 puts then get.
    output = await put_then_get(2)
    assert not output.finished
    check_merged(output, 2)

    # CASE 3: Put all 3 (including a finished).
    output = await put_then_get(3)
    assert output.finished
    assert output.outputs[0].finish_reason == "length"
    check_merged(output, 3)