[Kernel] Move attn_type to Attention.__init__() (#11690)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Author: Chen Zhang
Date: 2025-01-07 00:11:28 +08:00
Committed by: GitHub
Commit: e20c92bb61
Parent: 32c9eff2ff
18 changed files with 159 additions and 201 deletions


@@ -89,6 +89,7 @@ class FlashAttentionImpl(AttentionImpl):
         kv_cache_dtype: str,
         blocksparse_params: Optional[Dict[str, Any]] = None,
         logits_soft_cap: Optional[float] = None,
+        attn_type: AttentionType = AttentionType.DECODER,
     ) -> None:
         if blocksparse_params is not None:
             raise ValueError(
@@ -119,6 +120,12 @@ class FlashAttentionImpl(AttentionImpl):
f"Head size {head_size} is not supported by FlashAttention. "
f"Supported head sizes are: {support_head_sizes}.")
if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "
"encoder/decoder cross-attention "
"are not implemented for "
"FlashAttentionImpl")
def forward(
self,
query: torch.Tensor,
@@ -128,7 +135,6 @@ class FlashAttentionImpl(AttentionImpl):
         attn_metadata: FlashAttentionMetadata,
         k_scale: float = 1.0,
         v_scale: float = 1.0,
-        attn_type: AttentionType = AttentionType.DECODER,
         output: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
@@ -142,12 +148,6 @@ class FlashAttentionImpl(AttentionImpl):
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "FlashAttentionImpl")
         # NOTE(woosuk): FlashAttention does not support FP8 KV cache.
         assert k_scale == 1.0 and v_scale == 1.0, (
             "key/v_scale is not supported in FlashAttention.")