[CI Failure] Fix backend selection for encoder-only models (#28534)

Signed-off-by: Huamin Li <3ericli@gmail.com>
This commit is contained in:
Huamin Li
2025-11-13 07:11:27 -08:00
committed by GitHub
parent a7791eac9d
commit 07a606aa7e
14 changed files with 75 additions and 6 deletions

View File

@@ -134,6 +134,7 @@ class CpuPlatform(Platform):
use_mla: bool,
has_sink: bool,
use_sparse: bool,
attn_type: str | None = None,
) -> str:
from vllm.attention.backends.registry import AttentionBackendEnum

View File

@@ -298,6 +298,7 @@ class CudaPlatformBase(Platform):
has_sink,
use_sparse,
device_capability,
attn_type,
) -> tuple[
list[tuple["AttentionBackendEnum", int]],
dict["AttentionBackendEnum", list[str]],
@@ -318,6 +319,7 @@ class CudaPlatformBase(Platform):
has_sink,
use_sparse,
device_capability,
attn_type,
)
except ImportError:
invalid_reasons_i = ["ImportError"]
@@ -339,7 +341,13 @@ class CudaPlatformBase(Platform):
use_mla: bool,
has_sink: bool,
use_sparse: bool,
attn_type: str | None = None,
) -> str:
from vllm.attention import AttentionType
if attn_type is None:
attn_type = AttentionType.DECODER
device_capability = cls.get_device_capability()
assert device_capability is not None
@@ -356,6 +364,7 @@ class CudaPlatformBase(Platform):
has_sink,
use_sparse,
device_capability,
attn_type,
)
except ImportError:
invalid_reasons = ["ImportError"]
@@ -379,6 +388,7 @@ class CudaPlatformBase(Platform):
has_sink,
use_sparse,
device_capability,
attn_type,
)
reasons_str = (
"{"

View File

@@ -222,6 +222,7 @@ class Platform:
use_mla: bool,
has_sink: bool,
use_sparse: bool,
attn_type: str | None = None,
) -> str:
"""Get the attention backend class of a device."""
return ""

View File

@@ -216,6 +216,7 @@ class RocmPlatform(Platform):
use_mla,
has_sink,
use_sparse,
attn_type: str | None = None,
) -> str:
from vllm._aiter_ops import rocm_aiter_ops
from vllm.attention.backends.registry import AttentionBackendEnum

View File

@@ -61,6 +61,7 @@ class TpuPlatform(Platform):
use_mla: bool,
has_sink,
use_sparse,
attn_type: str | None = None,
) -> str:
from vllm.attention.backends.registry import AttentionBackendEnum

View File

@@ -51,6 +51,7 @@ class XPUPlatform(Platform):
use_mla: bool,
has_sink: bool,
use_sparse,
attn_type: str | None = None,
) -> str:
from vllm.v1.attention.backends.utils import set_kv_cache_layout