[V1][Frontend] Coalesce bunched RequestOutputs (#12298)

Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>

Author: Nick Hill
Date: 2025-01-23 17:17:41 -08:00
Committed by: GitHub
Parent: c5cffcd0cd
Commit: 24b0205f58
3 changed files with 65 additions and 18 deletions

@@ -15,7 +15,7 @@ from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
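
For context on the new import: RequestOutputKind lives in vllm/sampling_params.py and controls the shape of streamed outputs. A paraphrased sketch of that enum (the comment wording here is mine, not verbatim from the source):

    from enum import Enum

    class RequestOutputKind(Enum):
        # Each RequestOutput carries the full output generated so far.
        CUMULATIVE = 0
        # Each RequestOutput carries only what was generated since the last one.
        DELTA = 1
        # No intermediate outputs; only the final RequestOutput is produced.
        FINAL_ONLY = 2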
@@ -214,6 +214,14 @@ class AsyncLLM(EngineClient):
                 # task switching under load which helps performance).
                 out = q.get_nowait() if not q.empty() else await q.get()
+
+                # Coalesce any additional queued outputs
+                while not q.empty():
+                    next_out = q.get_nowait()
+                    if sampling_params.output_kind == RequestOutputKind.DELTA:
+                        out.add(next_out)
+                    else:
+                        out = next_out
 
                 # Note: both OutputProcessor and EngineCore handle their
                 # own request cleanup based on finished.
                 finished = out.finished
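
Why this change helps: under load, the per-request output queue can accumulate several RequestOutputs before the consumer task runs again. Draining the queue in one pass hands the caller a single coalesced output instead of paying a task switch per item; in DELTA mode the pieces are merged, otherwise the newest (cumulative) output simply supersedes the older ones. Below is a minimal, self-contained sketch of the same pattern with a plain asyncio.Queue; Output and merge are illustrative stand-ins for vLLM's RequestOutput and RequestOutput.add, not its real API:

    import asyncio
    from dataclasses import dataclass

    @dataclass
    class Output:
        # Illustrative stand-in for vLLM's RequestOutput.
        text: str
        finished: bool = False

        def merge(self, other: "Output") -> None:
            # In DELTA mode each queued item carries only new tokens,
            # so coalescing concatenates the pieces.
            self.text += other.text
            self.finished = other.finished

    async def next_output(q: "asyncio.Queue[Output]", delta: bool) -> Output:
        # Drain without awaiting when an item is already queued
        # (avoids a task switch under load).
        out = q.get_nowait() if not q.empty() else await q.get()
        # Coalesce anything that accumulated while the consumer was busy.
        while not q.empty():
            nxt = q.get_nowait()
            if delta:
                out.merge(nxt)  # accumulate incremental pieces
            else:
                out = nxt       # cumulative outputs: latest wins
        return out

    async def demo() -> None:
        q: "asyncio.Queue[Output]" = asyncio.Queue()
        for piece in ("Hel", "lo", "!"):
            q.put_nowait(Output(piece))
        q.put_nowait(Output("", finished=True))
        out = await next_output(q, delta=True)
        print(out.text, out.finished)  # -> Hello! True

    asyncio.run(demo())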