From 3a6d5cbefd97a3dee07ba1756d8b5a9052801403 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 27 Jan 2026 17:24:41 -0500 Subject: [PATCH] [Perf] Optimize dcp allocate tensor (#33102) Signed-off-by: yewentao256 --- vllm/v1/attention/ops/common.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/vllm/v1/attention/ops/common.py b/vllm/v1/attention/ops/common.py index bd6bc864d..46c689ce0 100644 --- a/vllm/v1/attention/ops/common.py +++ b/vllm/v1/attention/ops/common.py @@ -195,14 +195,10 @@ def _cp_lse_common( if ctx is None: ctx = CPTritonContext() - lses = torch.empty( - (cp_group.world_size,) + cp_attn_lse.shape, - dtype=cp_attn_lse.dtype, - device=cp_attn_lse.device, - ) - cp_attn_lse = cp_attn_lse.contiguous() - lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses) + lses = cp_group.all_gather(cp_attn_lse, dim=0).reshape( + (cp_group.world_size,) + cp_attn_lse.shape + ) out, lse = correct_attn_out( cp_attn_out, lses,