[Attention] MLA support for V1 (#13789)

Signed-off-by: Yang Chen <yangche@fb.com>
Author: Yang Chen
Date: 2025-02-27 10:14:17 -08:00
Committed by: GitHub
Parent: f1579b229d
Commit: 58d1b2aa77
10 changed files with 1340 additions and 59 deletions


@@ -162,8 +162,13 @@ class CudaPlatformBase(Platform):
                              kv_cache_dtype, block_size, use_v1,
                              use_mla) -> str:
         if use_v1:
-            logger.info("Using Flash Attention backend on V1 engine.")
-            return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
+            if use_mla:
+                logger.info("Using Triton MLA backend on V1 engine.")
+                return "vllm.v1.attention.backends.triton_mla.TritonMLABackend"
+            else:
+                logger.info("Using Flash Attention backend on V1 engine.")
+                return ("vllm.v1.attention.backends.flash_attn."
+                        "FlashAttentionBackend")
         if use_mla:
             if selected_backend == _Backend.FLASHMLA:
                 from vllm.attention.backends.flashmla import (
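
Note: with this change the V1 branch of the backend selector returns a fully-qualified class path for either the Triton MLA or the Flash Attention backend, depending on use_mla. The snippet below is a minimal sketch of how such a dotted path can be resolved into a class at runtime with importlib; the helper name resolve_backend is illustrative and is not the function vLLM itself uses.

import importlib

def resolve_backend(qualname: str):
    # Illustrative helper: split "package.module.ClassName" into a module
    # path and a class name, import the module, and return the class.
    module_name, _, class_name = qualname.rpartition(".")
    module = importlib.import_module(module_name)
    return getattr(module, class_name)

# Example with the path returned above for the V1 MLA case (requires vLLM
# to be installed, hence left commented out):
# backend_cls = resolve_backend(
#     "vllm.v1.attention.backends.triton_mla.TritonMLABackend")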


@@ -35,6 +35,7 @@ class _Backend(enum.Enum):
     OPENVINO = enum.auto()
     FLASHINFER = enum.auto()
     TRITON_MLA = enum.auto()
+    TRITON_MLA_VLLM_V1 = enum.auto()
     FLASHMLA = enum.auto()
     HPU_ATTN = enum.auto()
     PALLAS = enum.auto()
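
For reference, the new TRITON_MLA_VLLM_V1 member lets the selector distinguish the V1 MLA backend from the existing V0 TRITON_MLA entry. The sketch below shows the general enum-dispatch pattern in isolation; the function name and the V0 class path are assumptions for illustration, not vLLM's actual selector.

import enum

class _Backend(enum.Enum):
    TRITON_MLA = enum.auto()
    TRITON_MLA_VLLM_V1 = enum.auto()

def mla_backend_path(selected_backend: _Backend) -> str:
    # Illustrative dispatch only: map the requested backend enum member
    # to a fully-qualified backend class path.
    if selected_backend == _Backend.TRITON_MLA_VLLM_V1:
        return "vllm.v1.attention.backends.triton_mla.TritonMLABackend"
    # Assumed V0 path, shown only to contrast with the V1 entry above.
    return "vllm.attention.backends.triton_mla.TritonMLABackend"

print(mla_backend_path(_Backend.TRITON_MLA_VLLM_V1))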