[Perf] Optimize async scheduling placeholder using empty (#32056)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -31,6 +31,9 @@ from vllm.v1.metrics.stats import (
|
|||||||
SchedulerStats,
|
SchedulerStats,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# shared empty CPU tensor used as a placeholder pooling output
|
||||||
|
EMPTY_CPU_TENSOR = torch.empty(0, device="cpu")
|
||||||
|
|
||||||
|
|
||||||
class RequestOutputCollector:
|
class RequestOutputCollector:
|
||||||
"""
|
"""
|
||||||
@@ -426,7 +429,7 @@ class OutputProcessor:
|
|||||||
new_token_ids=[],
|
new_token_ids=[],
|
||||||
# Set pooling_output is not None to
|
# Set pooling_output is not None to
|
||||||
# correctly enter the abort pooling branch
|
# correctly enter the abort pooling branch
|
||||||
pooling_output=torch.randn(0, device="cpu")
|
pooling_output=EMPTY_CPU_TENSOR
|
||||||
if req_state.detokenizer is None
|
if req_state.detokenizer is None
|
||||||
else None,
|
else None,
|
||||||
finish_reason=FinishReason.ABORT,
|
finish_reason=FinishReason.ABORT,
|
||||||
|
|||||||
Reference in New Issue
Block a user