diff --git a/vllm/compilation/passes/fusion/attn_quant_fusion.py b/vllm/compilation/passes/fusion/attn_quant_fusion.py index bb064f58c..5e6bf28c0 100644 --- a/vllm/compilation/passes/fusion/attn_quant_fusion.py +++ b/vllm/compilation/passes/fusion/attn_quant_fusion.py @@ -170,9 +170,8 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern): kv_cache_dummy_dep: torch.Tensor, ) -> torch.Tensor: # attn output in quant_dtype - output_attn = torch.ops.aten.full.default( + output_attn = torch.empty( [q.shape[0], self.num_heads, self.head_size], - 0.0, dtype=self.quant_dtype, device=q.device, ) @@ -271,9 +270,8 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern): kv_cache_dummy_dep: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: # attention output in quant_dtype - output_attn = torch.ops.aten.full.default( + output_attn = torch.empty( [q.shape[0], self.num_heads, self.head_size // 2], - 0.0, dtype=self.quant_dtype, device=q.device, )