[V0 deprecation] Remove _VLLM_V1 suffixes from attention backend names (#25489)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
This commit is contained in:
Matthew Bonanni
2025-09-25 13:37:50 -04:00
committed by GitHub
parent 71b25b0d48
commit 3468f17ebe
42 changed files with 131 additions and 174 deletions

View File

@@ -31,7 +31,7 @@ DEVICE_MLA_BACKENDS = {
}
DEVICE_REGULAR_ATTN_BACKENDS = {
"cuda": ["XFORMERS", "FLASHINFER"],
"cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"],
"hip": ["ROCM_FLASH"],
"cpu": ["TORCH_SDPA"],
}
@@ -86,7 +86,7 @@ def test_env(
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float16, None, block_size)
assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
assert backend.get_name() == "TORCH_SDPA"
elif device == "hip":
with patch("vllm.attention.selector.current_platform",
@@ -125,7 +125,7 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = f"{name}_VLLM_V1"
expected = name
assert backend.get_name() == expected
else:
backend = get_attn_backend(16,
@@ -133,7 +133,7 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = "TRITON_ATTN_VLLM_V1"
expected = "TRITON_ATTN"
assert backend.get_name() == expected
elif device == "cuda":
@@ -160,7 +160,7 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = "CUTLASS_MLA_VLLM_V1"
expected = "CUTLASS_MLA"
assert backend.get_name() == expected
elif name == "FLASHINFER_MLA":
if block_size not in [32, 64]:
@@ -193,7 +193,7 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = f"{name}_VLLM_V1"
expected = name
assert backend.get_name() == expected
elif name == "FLASH_ATTN_MLA":
backend = get_attn_backend(16,
@@ -210,7 +210,7 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = "TRITON_MLA_VLLM_V1"
expected = "TRITON_MLA"
assert backend.get_name() == expected
elif name == "FLASHINFER":
backend = get_attn_backend(16,
@@ -218,25 +218,24 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = "FLASHINFER_VLLM_V1"
expected = "FLASHINFER"
assert backend.get_name() == expected
else:
elif name == "XFORMERS":
backend = get_attn_backend(32,
torch.float16,
None,
block_size,
use_mla=use_mla)
expected = "FLASH_ATTN_VLLM_V1"
expected = "XFORMERS"
assert backend.get_name() == expected
backend = get_attn_backend(16,
elif name == "FLASH_ATTN":
backend = get_attn_backend(32,
torch.float16,
None,
block_size,
use_mla=use_mla)
assert backend.get_name() == "FLEX_ATTENTION", (
"Should fallback to FlexAttention if head size is "
"not supported by FlashAttention")
expected = "FLASH_ATTN"
assert backend.get_name() == expected
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@@ -252,7 +251,7 @@ def test_fp32_fallback(
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
assert backend.get_name() == "TORCH_SDPA"
elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
@@ -266,6 +265,9 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
# TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend
pytest.skip("Skipping as current backend selector does not " \
"handle fallbacks when a backend is set via env var.")
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)