From 3a6d5cbefd97a3dee07ba1756d8b5a9052801403 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Tue, 27 Jan 2026 17:24:41 -0500
Subject: [PATCH] [Perf] Drop redundant tensor allocation in DCP LSE all-gather (#33102)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 vllm/v1/attention/ops/common.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/vllm/v1/attention/ops/common.py b/vllm/v1/attention/ops/common.py
index bd6bc864d..46c689ce0 100644
--- a/vllm/v1/attention/ops/common.py
+++ b/vllm/v1/attention/ops/common.py
@@ -195,14 +195,10 @@ def _cp_lse_common(
     if ctx is None:
         ctx = CPTritonContext()
 
-    lses = torch.empty(
-        (cp_group.world_size,) + cp_attn_lse.shape,
-        dtype=cp_attn_lse.dtype,
-        device=cp_attn_lse.device,
-    )
-
     cp_attn_lse = cp_attn_lse.contiguous()
-    lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
+    lses = cp_group.all_gather(cp_attn_lse, dim=0).reshape(
+        (cp_group.world_size,) + cp_attn_lse.shape
+    )
     out, lse = correct_attn_out(
         cp_attn_out,
         lses,