From e18464a57d145872899463838ca07050ea15141b Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Fri, 9 Jan 2026 19:46:11 -0500
Subject: [PATCH] [Perf] Optimize async scheduling placeholder using empty
 (#32056)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 vllm/v1/engine/output_processor.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 13b332533..7f762bcbb 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -31,6 +31,9 @@ from vllm.v1.metrics.stats import (
     SchedulerStats,
 )
 
+# Shared empty CPU tensor: non-None placeholder pooling output for aborted requests
+EMPTY_CPU_TENSOR = torch.empty(0, device="cpu")
+
 
 class RequestOutputCollector:
     """
@@ -426,7 +429,7 @@ class OutputProcessor:
                         new_token_ids=[],
                         # Set pooling_output is not None to
                         # correctly enter the abort pooling branch
-                        pooling_output=torch.randn(0, device="cpu")
+                        pooling_output=EMPTY_CPU_TENSOR
                         if req_state.detokenizer is None
                         else None,
                         finish_reason=FinishReason.ABORT,