[Perf] Optimize async scheduling placeholder using empty (#32056)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-01-09 19:46:11 -05:00
parent 1963245ed1
commit e18464a57d
1 changed files with 4 additions and 1 deletions
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -31,6 +31,9 @@ from vllm.v1.metrics.stats import (
    SchedulerStats,
 )

+# shared empty CPU tensor used as a placeholder pooling output
+EMPTY_CPU_TENSOR = torch.empty(0, device="cpu")
+

 class RequestOutputCollector:
    """
@@ -426,7 +429,7 @@ class OutputProcessor:
                        new_token_ids=[],
                        # Set pooling_output to a non-None value to
                        # correctly enter the abort pooling branch
-                        pooling_output=torch.randn(0, device="cpu")
+                        pooling_output=EMPTY_CPU_TENSOR
                        if req_state.detokenizer is None
                        else None,
                        finish_reason=FinishReason.ABORT,