[Attention] MLA decode optimizations (#12528)

Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com> Signed-off-by: simon-mo <xmo@berkeley.edu> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: simon-mo <simon.mo@hey.com> Co-authored-by: Michael Goin <mgoin64@gmail.com> Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> Co-authored-by: Tyler Michael Smith <tysmith@redhat.com> Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Co-authored-by: simon-mo <xmo@berkeley.edu>
2025-01-31 02:49:37 -05:00
parent a1fc18c030
commit cabaf4eff3
31 changed files with 2266 additions and 32 deletions
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -83,6 +83,7 @@ def get_attn_backend(
    block_size: int,
    is_attention_free: bool,
    is_blocksparse: bool = False,
+    use_mla: bool = False,
 ) -> Type[AttentionBackend]:
    """Selects which attention backend to use and lazily imports it."""
    # Accessing envs.* behind an @lru_cache decorator can cause the wrong
@@ -97,6 +98,7 @@ def get_attn_backend(
        is_attention_free=is_attention_free,
        is_blocksparse=is_blocksparse,
        use_v1=envs.VLLM_USE_V1,
+        use_mla=use_mla,
    )


@@ -109,6 +111,7 @@ def _cached_get_attn_backend(
    is_attention_free: bool,
    is_blocksparse: bool = False,
    use_v1: bool = False,
+    use_mla: bool = False,
 ) -> Type[AttentionBackend]:
    if is_blocksparse:
        logger.info("Using BlocksparseFlashAttention backend.")
@@ -141,7 +144,8 @@ def _cached_get_attn_backend(

    # get device-specific attn_backend
    attention_cls = current_platform.get_attn_backend_cls(
-        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1)
+        selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
+        use_mla)
    if not attention_cls:
        raise ValueError(
            f"Invalid attention backend for {current_platform.device_name}")