Reenable features for ROCm attention backends (#36185)
Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
@@ -9,6 +9,7 @@ import torch
from vllm._aiter_ops import rocm_aiter_ops
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.config.cache import CacheDType
from vllm.logger import init_logger
from vllm.model_executor.layers.attention import Attention
from vllm.platforms import current_platform
@@ -732,6 +733,13 @@ class AiterFlashAttentionMetadataBuilder(
class AiterFlashAttentionBackend(AttentionBackend):
    accept_output_buffer: bool = True
    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
    supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [
        "auto",
        "bfloat16",
        "fp8",
        "fp8_e4m3",
        "fp8_e5m2",
    ]

    @staticmethod
    def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
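For context, class-level declarations like supported_dtypes and supported_kv_cache_dtypes are typically consumed by backend-selection logic that checks a requested configuration against what the backend advertises. The sketch below is a minimal, self-contained illustration of that pattern, not the actual vLLM selection code; the class and function names here are hypothetical.

# Hypothetical sketch: validating a requested configuration against the
# capability ClassVars a backend declares. Names are illustrative only.
from typing import ClassVar

import torch


class ExampleAiterBackend:
    # Mirrors the ClassVar declarations added in this commit.
    supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
    supported_kv_cache_dtypes: ClassVar[list[str]] = [
        "auto",
        "bfloat16",
        "fp8",
        "fp8_e4m3",
        "fp8_e5m2",
    ]


def backend_supports(backend, model_dtype: torch.dtype, kv_cache_dtype: str) -> bool:
    """Return True if the backend declares support for both requested dtypes."""
    if model_dtype not in backend.supported_dtypes:
        return False
    return kv_cache_dtype in backend.supported_kv_cache_dtypes


if __name__ == "__main__":
    print(backend_supports(ExampleAiterBackend, torch.bfloat16, "fp8_e4m3"))  # True
    print(backend_supports(ExampleAiterBackend, torch.float32, "auto"))       # False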