From e18464a57d145872899463838ca07050ea15141b Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 9 Jan 2026 19:46:11 -0500 Subject: [PATCH] [Perf] Optimize async scheduling placeholder using empty (#32056) Signed-off-by: yewentao256 --- vllm/v1/engine/output_processor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 13b332533..7f762bcbb 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -31,6 +31,9 @@ from vllm.v1.metrics.stats import ( SchedulerStats, ) +# shared empty CPU tensor used as a placeholder pooling output +EMPTY_CPU_TENSOR = torch.empty(0, device="cpu") + class RequestOutputCollector: """ @@ -426,7 +429,7 @@ class OutputProcessor: new_token_ids=[], # Set pooling_output is not None to # correctly enter the abort pooling branch - pooling_output=torch.randn(0, device="cpu") + pooling_output=EMPTY_CPU_TENSOR if req_state.detokenizer is None else None, finish_reason=FinishReason.ABORT,