[V1][Frontend] Coalesce bunched RequestOutputs (#12298)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
@@ -15,7 +15,7 @@ from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
@@ -214,6 +214,14 @@ class AsyncLLM(EngineClient):
                 # task switching under load which helps performance).
                 out = q.get_nowait() if not q.empty() else await q.get()
 
+                # Coalesce any additional queued outputs
+                while not q.empty():
+                    next_out = q.get_nowait()
+                    if sampling_params.output_kind == RequestOutputKind.DELTA:
+                        out.add(next_out)
+                    else:
+                        out = next_out
+
                 # Note: both OutputProcessor and EngineCore handle their
                 # own request cleanup based on finished.
                 finished = out.finished
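To make the coalescing semantics concrete, below is a minimal, self-contained sketch of the same pattern that can be run on its own. OutputKind, DummyOutput, and consume are hypothetical stand-ins for vLLM's RequestOutputKind, RequestOutput, and the AsyncLLM output loop; they illustrate the technique, not the project's actual classes.

    # Hypothetical sketch of the coalescing pattern in the diff above.
    # DummyOutput and OutputKind stand in for vLLM's RequestOutput and
    # RequestOutputKind; they are illustrations, not the real classes.
    import asyncio
    from enum import Enum, auto


    class OutputKind(Enum):
        DELTA = auto()       # each output carries only newly generated tokens
        CUMULATIVE = auto()  # each output carries the full text so far


    class DummyOutput:
        def __init__(self, text: str, finished: bool = False):
            self.text = text
            self.finished = finished

        def add(self, other: "DummyOutput") -> None:
            # Merge a later delta into this one (stand-in for RequestOutput.add).
            self.text += other.text
            self.finished = other.finished


    async def consume(q: asyncio.Queue, kind: OutputKind) -> None:
        while True:
            # Avoid an await (and a task switch) when an item is already queued.
            out = q.get_nowait() if not q.empty() else await q.get()

            # Coalesce any outputs that piled up while the consumer was busy.
            while not q.empty():
                next_out = q.get_nowait()
                if kind == OutputKind.DELTA:
                    out.add(next_out)   # deltas must be merged, not dropped
                else:
                    out = next_out      # cumulative: the latest output supersedes

            print(f"yield: {out.text!r} finished={out.finished}")
            if out.finished:
                return


    async def main() -> None:
        q: asyncio.Queue = asyncio.Queue()
        for chunk in ("Hel", "lo ", "world"):
            q.put_nowait(DummyOutput(chunk))
        q.put_nowait(DummyOutput("!", finished=True))
        # Prints once: yield: 'Hello world!' finished=True
        await consume(q, OutputKind.DELTA)


    asyncio.run(main())

The branch on output kind matters because in DELTA mode each queued output carries only the tokens generated since the previous one, so dropping an output would lose text and merging is required; in cumulative modes each output already contains the full result so far, so only the most recent one needs to be yielded.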