[CI Failure] Fix backend selection for encoder-only models (#28534)
Signed-off-by: Huamin Li <3ericli@gmail.com>
This commit is contained in:
@@ -134,6 +134,7 @@ class CpuPlatform(Platform):
|
||||
use_mla: bool,
|
||||
has_sink: bool,
|
||||
use_sparse: bool,
|
||||
attn_type: str | None = None,
|
||||
) -> str:
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
|
||||
@@ -298,6 +298,7 @@ class CudaPlatformBase(Platform):
|
||||
has_sink,
|
||||
use_sparse,
|
||||
device_capability,
|
||||
attn_type,
|
||||
) -> tuple[
|
||||
list[tuple["AttentionBackendEnum", int]],
|
||||
dict["AttentionBackendEnum", list[str]],
|
||||
@@ -318,6 +319,7 @@ class CudaPlatformBase(Platform):
|
||||
has_sink,
|
||||
use_sparse,
|
||||
device_capability,
|
||||
attn_type,
|
||||
)
|
||||
except ImportError:
|
||||
invalid_reasons_i = ["ImportError"]
|
||||
@@ -339,7 +341,13 @@ class CudaPlatformBase(Platform):
|
||||
use_mla: bool,
|
||||
has_sink: bool,
|
||||
use_sparse: bool,
|
||||
attn_type: str | None = None,
|
||||
) -> str:
|
||||
from vllm.attention import AttentionType
|
||||
|
||||
if attn_type is None:
|
||||
attn_type = AttentionType.DECODER
|
||||
|
||||
device_capability = cls.get_device_capability()
|
||||
assert device_capability is not None
|
||||
|
||||
@@ -356,6 +364,7 @@ class CudaPlatformBase(Platform):
|
||||
has_sink,
|
||||
use_sparse,
|
||||
device_capability,
|
||||
attn_type,
|
||||
)
|
||||
except ImportError:
|
||||
invalid_reasons = ["ImportError"]
|
||||
@@ -379,6 +388,7 @@ class CudaPlatformBase(Platform):
|
||||
has_sink,
|
||||
use_sparse,
|
||||
device_capability,
|
||||
attn_type,
|
||||
)
|
||||
reasons_str = (
|
||||
"{"
|
||||
|
||||
@@ -222,6 +222,7 @@ class Platform:
|
||||
use_mla: bool,
|
||||
has_sink: bool,
|
||||
use_sparse: bool,
|
||||
attn_type: str | None = None,
|
||||
) -> str:
|
||||
"""Get the attention backend class of a device."""
|
||||
return ""
|
||||
|
||||
@@ -216,6 +216,7 @@ class RocmPlatform(Platform):
|
||||
use_mla,
|
||||
has_sink,
|
||||
use_sparse,
|
||||
attn_type: str | None = None,
|
||||
) -> str:
|
||||
from vllm._aiter_ops import rocm_aiter_ops
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
@@ -61,6 +61,7 @@ class TpuPlatform(Platform):
|
||||
use_mla: bool,
|
||||
has_sink,
|
||||
use_sparse,
|
||||
attn_type: str | None = None,
|
||||
) -> str:
|
||||
from vllm.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@ class XPUPlatform(Platform):
|
||||
use_mla: bool,
|
||||
has_sink: bool,
|
||||
use_sparse,
|
||||
attn_type: str | None = None,
|
||||
) -> str:
|
||||
from vllm.v1.attention.backends.utils import set_kv_cache_layout
|
||||
|
||||
|
||||
Reference in New Issue
Block a user