[Attention] Update tests to remove deprecated env vars (#30563)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Author: Matthew Bonanni (committed by GitHub)
Date: 2025-12-17 12:49:59 -05:00
parent 9ca8cb38fd
commit 7eb6cb6c18
34 changed files with 580 additions and 447 deletions
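Every hunk below applies the same mechanical change: the per-backend environment variables (VLLM_ATTENTION_BACKEND, VLLM_FLASH_ATTN_VERSION, ...) move into a structured attention_config dict on BackendConfig, with string env values replaced by native Python types. A minimal sketch of the migrated dataclass and one entry, assembled from the diff (the rest of the test module is not shown, so the import is the only assumption):

from dataclasses import dataclass

@dataclass
class BackendConfig:
    name: str
    attention_config: dict  # replaces the old env_vars dict
    comp_config: dict  # compilation config
    specific_gpu_arch: tuple | None = None

# FA3 on Hopper, as configured after this commit: typed values
# (3, 16) instead of the old env-var strings ("3", "16").
fa3 = BackendConfig(
    name="FA3",
    attention_config={
        "backend": "FLASH_ATTN",
        "flash_attn_version": 3,
        "flash_attn_max_num_splits_for_cuda_graph": 16,
    },
    comp_config={"cudagraph_mode": "FULL"},
)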


@@ -249,8 +249,8 @@ def create_dummy_kv_cache(
 @dataclass
 class BackendConfig:
     name: str
-    env_vars: dict
-    comp_config: dict  # compilation config
+    attention_config: dict
+    comp_config: dict
     specific_gpu_arch: tuple | None = None
@@ -259,10 +259,10 @@ full_cg_backend_configs = {
     # FA3 on Hopper
     "FA3": BackendConfig(
         name="FA3",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-            "VLLM_FLASH_ATTN_VERSION": "3",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN",
+            "flash_attn_version": 3,
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL",
@@ -272,9 +272,7 @@ full_cg_backend_configs = {
     # FlashMLA on Hopper
     "FlashMLA": BackendConfig(
         name="FlashMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASHMLA",
-        },
+        attention_config={"backend": "FLASHMLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -283,9 +281,7 @@ full_cg_backend_configs = {
     # Cutlass MLA on Blackwell
     "CutlassMLA": BackendConfig(
         name="CutlassMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
-        },
+        attention_config={"backend": "CUTLASS_MLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -294,9 +290,7 @@ full_cg_backend_configs = {
     # FlashInfer MLA on Blackwell
     "FlashInferMLA": BackendConfig(
         name="FlashInferMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA",
-        },
+        attention_config={"backend": "FLASHINFER_MLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -305,9 +299,9 @@ full_cg_backend_configs = {
     # FlashAttention MLA on Hopper
     "FlashAttentionMLA": BackendConfig(
         name="FlashAttentionMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN_MLA",
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL_DECODE_ONLY",
@@ -317,10 +311,10 @@ full_cg_backend_configs = {
     # FA2
     "FA2": BackendConfig(
         name="FA2",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-            "VLLM_FLASH_ATTN_VERSION": "2",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN",
+            "flash_attn_version": 2,
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
@@ -329,7 +323,7 @@ full_cg_backend_configs = {
     # Triton Attention
     "TritonAttn": BackendConfig(
         name="TritonAttn",
-        env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
+        attention_config={"backend": "TRITON_ATTN"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -337,14 +331,17 @@ full_cg_backend_configs = {
     # FlashInfer
     "FlashInfer": BackendConfig(
         name="FlashInfer",
-        env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
+        attention_config={"backend": "FLASHINFER"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
     ),
     "RocmAttn": BackendConfig(
         name="RocmAttn",
-        env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
+        attention_config={
+            "backend": "ROCM_ATTN",
+            "use_prefill_decode_attention": True,
+        },
         comp_config={
             "cudagraph_mode": "FULL",
         },
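Beyond the key renames, note the value-type change that recurs in every hunk: env vars are necessarily strings ("3", "16", "1"), while attention_config carries native types (3, 16, True); compare "use_prefill_decode_attention": True with the old "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1" in the RocmAttn entry. A hedged sketch of how a test might consume the two forms; apply_env_vars and build_engine_kwargs are hypothetical helpers, and the keyword names handed to the engine are an assumption, not something this diff confirms:

import os

def apply_env_vars(env_vars: dict[str, str]) -> None:
    # Old style: mutate the process-global environment before engine
    # startup; every value must be serialized to a string.
    for key, value in env_vars.items():
        os.environ[key] = value

def build_engine_kwargs(cfg: BackendConfig) -> dict:
    # New style (assumed wiring): pass the structured dicts to the
    # engine directly, so no global state is touched and values keep
    # their native types.
    return {
        "attention_config": cfg.attention_config,
        "compilation_config": cfg.comp_config,
    }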