[gpt-oss] Enable gpt-oss on ampere (#22714)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
@@ -144,6 +144,7 @@ def get_attn_backend(
     block_size: int,
     is_attention_free: bool = False,
     use_mla: bool = False,
+    has_sink: bool = False,
 ) -> type[AttentionBackend]:
     """Selects which attention backend to use and lazily imports it."""
     # Accessing envs.* behind an @lru_cache decorator can cause the wrong
@@ -158,6 +159,7 @@ def get_attn_backend(
         is_attention_free=is_attention_free,
         use_v1=envs.VLLM_USE_V1,
         use_mla=use_mla,
+        has_sink=has_sink,
     )


@@ -170,6 +172,7 @@ def _cached_get_attn_backend(
     is_attention_free: bool,
     use_v1: bool = False,
     use_mla: bool = False,
+    has_sink: bool = False,
 ) -> type[AttentionBackend]:
     # If there are no attention layers (e.g. we are running Mamba),
     # use the placeholder NO_ATTENTION
@@ -201,7 +204,7 @@ def _cached_get_attn_backend(
     # get device-specific attn_backend
     attention_cls = current_platform.get_attn_backend_cls(
         selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
-        use_mla)
+        use_mla, has_sink)
     if not attention_cls:
         raise ValueError(
             f"Invalid attention backend for {current_platform.device_name}")
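On the platform side, get_attn_backend_cls now receives has_sink as its final argument, mirroring the updated call in _cached_get_attn_backend. The following is a hypothetical sketch of how a platform hook might consume the flag; the class name, selection logic, and backend paths are placeholders for illustration, not code from this commit or from vLLM.

from typing import Optional

class ExamplePlatform:
    # Hypothetical hook matching the argument order used in
    # _cached_get_attn_backend; the body is an assumption.
    @classmethod
    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                             kv_cache_dtype, block_size, use_v1, use_mla,
                             has_sink) -> Optional[str]:
        if has_sink:
            # Attention sinks (used by gpt-oss) need backend support,
            # so route to a sink-capable implementation.
            return "example.backends.SinkAwareBackend"  # placeholder path
        return "example.backends.DefaultBackend"  # placeholder path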