From 7d6abdd02241a135e2429de1b583dbfb6f76d6ff Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Wed, 11 Mar 2026 12:26:14 +0800 Subject: [PATCH] [Fix] Use torch.empty for output in attention+quant fusion (#31785) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> --- vllm/compilation/passes/fusion/attn_quant_fusion.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/compilation/passes/fusion/attn_quant_fusion.py b/vllm/compilation/passes/fusion/attn_quant_fusion.py index bb064f58c..5e6bf28c0 100644 --- a/vllm/compilation/passes/fusion/attn_quant_fusion.py +++ b/vllm/compilation/passes/fusion/attn_quant_fusion.py @@ -170,9 +170,8 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern): kv_cache_dummy_dep: torch.Tensor, ) -> torch.Tensor: # attn output in quant_dtype - output_attn = torch.ops.aten.full.default( + output_attn = torch.empty( [q.shape[0], self.num_heads, self.head_size], - 0.0, dtype=self.quant_dtype, device=q.device, ) @@ -271,9 +270,8 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern): kv_cache_dummy_dep: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: # attention output in quant_dtype - output_attn = torch.ops.aten.full.default( + output_attn = torch.empty( [q.shape[0], self.num_heads, self.head_size // 2], - 0.0, dtype=self.quant_dtype, device=q.device, )