[Attention] Update tests to remove deprecated env vars (#30563)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
@@ -249,8 +249,8 @@ def create_dummy_kv_cache(
 @dataclass
 class BackendConfig:
     name: str
-    env_vars: dict
-    comp_config: dict  # compilation config
+    attention_config: dict
+    comp_config: dict
     specific_gpu_arch: tuple | None = None
 
 
@@ -259,10 +259,10 @@ full_cg_backend_configs = {
     # FA3 on Hopper
     "FA3": BackendConfig(
         name="FA3",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-            "VLLM_FLASH_ATTN_VERSION": "3",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN",
+            "flash_attn_version": 3,
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL",
@@ -272,9 +272,7 @@ full_cg_backend_configs = {
     # FlashMLA on Hopper
     "FlashMLA": BackendConfig(
         name="FlashMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASHMLA",
-        },
+        attention_config={"backend": "FLASHMLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -283,9 +281,7 @@ full_cg_backend_configs = {
     # Cutlass MLA on Blackwell
     "CutlassMLA": BackendConfig(
        name="CutlassMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
-        },
+        attention_config={"backend": "CUTLASS_MLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -294,9 +290,7 @@ full_cg_backend_configs = {
     # FlashInfer MLA on Blackwell
     "FlashInferMLA": BackendConfig(
         name="FlashInferMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA",
-        },
+        attention_config={"backend": "FLASHINFER_MLA"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -305,9 +299,9 @@ full_cg_backend_configs = {
     # FlashAttention MLA on Hopper
     "FlashAttentionMLA": BackendConfig(
         name="FlashAttentionMLA",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN_MLA",
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL_DECODE_ONLY",
@@ -317,10 +311,10 @@ full_cg_backend_configs = {
     # FA2
     "FA2": BackendConfig(
         name="FA2",
-        env_vars={
-            "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
-            "VLLM_FLASH_ATTN_VERSION": "2",
-            "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
+        attention_config={
+            "backend": "FLASH_ATTN",
+            "flash_attn_version": 2,
+            "flash_attn_max_num_splits_for_cuda_graph": 16,
         },
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
@@ -329,7 +323,7 @@ full_cg_backend_configs = {
     # Triton Attention
     "TritonAttn": BackendConfig(
         name="TritonAttn",
-        env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
+        attention_config={"backend": "TRITON_ATTN"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
@@ -337,14 +331,17 @@ full_cg_backend_configs = {
     # FlashInfer
     "FlashInfer": BackendConfig(
         name="FlashInfer",
-        env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
+        attention_config={"backend": "FLASHINFER"},
         comp_config={
             "cudagraph_mode": "FULL_AND_PIECEWISE",
         },
     ),
     "RocmAttn": BackendConfig(
         name="RocmAttn",
-        env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
+        attention_config={
+            "backend": "ROCM_ATTN",
+            "use_prefill_decode_attention": True,
+        },
         comp_config={
             "cudagraph_mode": "FULL",
         },
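For reference, a minimal standalone sketch of the pattern this commit moves the tests to: backend selection is expressed through the new attention_config dict on BackendConfig rather than through the deprecated VLLM_* environment variables. The dataclass and the FA3 values below are reconstructed from the diff above; the surrounding test helpers are not shown.

from dataclasses import dataclass


@dataclass
class BackendConfig:
    name: str
    attention_config: dict
    comp_config: dict
    specific_gpu_arch: tuple | None = None


# Previously (deprecated): backend choice was injected via env vars, e.g.
#   env_vars={"VLLM_ATTENTION_BACKEND": "FLASH_ATTN", "VLLM_FLASH_ATTN_VERSION": "3"}
# Now: the same information is passed as structured config entries.
fa3 = BackendConfig(
    name="FA3",
    attention_config={
        "backend": "FLASH_ATTN",
        "flash_attn_version": 3,
        "flash_attn_max_num_splits_for_cuda_graph": 16,
    },
    comp_config={"cudagraph_mode": "FULL"},
)

Note that the value types change along with the names: the env vars carried strings ("3", "16", "1"), while attention_config uses native types (3, 16, True).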