[CI Failure] Fix backend selection for encoder-only models (#28534)

Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-11-13 07:11:27 -08:00
parent a7791eac9d
commit 07a606aa7e
14 changed files with 75 additions and 6 deletions
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -134,6 +134,7 @@ class CpuPlatform(Platform):
        use_mla: bool,
        has_sink: bool,
        use_sparse: bool,
+        attn_type: str | None = None,
    ) -> str:
        from vllm.attention.backends.registry import AttentionBackendEnum

--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -298,6 +298,7 @@ class CudaPlatformBase(Platform):
        has_sink,
        use_sparse,
        device_capability,
+        attn_type,
    ) -> tuple[
        list[tuple["AttentionBackendEnum", int]],
        dict["AttentionBackendEnum", list[str]],
@@ -318,6 +319,7 @@ class CudaPlatformBase(Platform):
                    has_sink,
                    use_sparse,
                    device_capability,
+                    attn_type,
                )
            except ImportError:
                invalid_reasons_i = ["ImportError"]
@@ -339,7 +341,13 @@ class CudaPlatformBase(Platform):
        use_mla: bool,
        has_sink: bool,
        use_sparse: bool,
+        attn_type: str | None = None,
    ) -> str:
+        from vllm.attention import AttentionType
+
+        if attn_type is None:
+            attn_type = AttentionType.DECODER
+
        device_capability = cls.get_device_capability()
        assert device_capability is not None

@@ -356,6 +364,7 @@ class CudaPlatformBase(Platform):
                    has_sink,
                    use_sparse,
                    device_capability,
+                    attn_type,
                )
            except ImportError:
                invalid_reasons = ["ImportError"]
@@ -379,6 +388,7 @@ class CudaPlatformBase(Platform):
            has_sink,
            use_sparse,
            device_capability,
+            attn_type,
        )
        reasons_str = (
            "{"
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -222,6 +222,7 @@ class Platform:
        use_mla: bool,
        has_sink: bool,
        use_sparse: bool,
+        attn_type: str | None = None,
    ) -> str:
        """Get the attention backend class of a device."""
        return ""
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -216,6 +216,7 @@ class RocmPlatform(Platform):
        use_mla,
        has_sink,
        use_sparse,
+        attn_type: str | None = None,
    ) -> str:
        from vllm._aiter_ops import rocm_aiter_ops
        from vllm.attention.backends.registry import AttentionBackendEnum
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -61,6 +61,7 @@ class TpuPlatform(Platform):
        use_mla: bool,
        has_sink,
        use_sparse,
+        attn_type: str | None = None,
    ) -> str:
        from vllm.attention.backends.registry import AttentionBackendEnum

--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -51,6 +51,7 @@ class XPUPlatform(Platform):
        use_mla: bool,
        has_sink: bool,
        use_sparse,
+        attn_type: str | None = None,
    ) -> str:
        from vllm.v1.attention.backends.utils import set_kv_cache_layout