[V0 deprecation] Remove _VLLM_V1 suffixes from attention backend names (#25489)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Author: Matthew Bonanni
Date: 2025-09-25 13:37:50 -04:00 (committed by GitHub)
Parent: 71b25b0d48
Commit: 3468f17ebe

42 changed files with 131 additions and 174 deletions
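
The rename is mechanical: every attention backend loses its _VLLM_V1 suffix (TRITON_ATTN_VLLM_V1 becomes TRITON_ATTN, FLASH_ATTN_VLLM_V1 becomes FLASH_ATTN), both in VLLM_ATTENTION_BACKEND environment-variable values and in the _Backend enum members. For scripts that still export the old names, a minimal migration shim might look like the sketch below; the helper function and mapping are illustrative, not part of this commit:

import os

# Old names appearing in this diff, mapped to their suffix-free replacements.
_RENAMED_BACKENDS = {
    "TRITON_ATTN_VLLM_V1": "TRITON_ATTN",
    "FLASH_ATTN_VLLM_V1": "FLASH_ATTN",
}

def set_attention_backend(name: str) -> None:
    """Export VLLM_ATTENTION_BACKEND, translating deprecated names."""
    os.environ["VLLM_ATTENTION_BACKEND"] = _RENAMED_BACKENDS.get(name, name)

set_attention_backend("TRITON_ATTN_VLLM_V1")  # stored as "TRITON_ATTN"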


@@ -314,12 +314,11 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")
if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
# Setup draft model mock
@@ -400,16 +399,15 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")
if (attn_backend == "TREE_ATTN"):
pytest.skip("TREE_ATTN is tested separately in test_propose_tree"
"because it requires special input mocking.")
if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
# Use GPU device
@@ -510,12 +508,12 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
                              device=device)
     sampling_metadata = mock.MagicMock()
 
-    if attn_backend == "FLASH_ATTN_VLLM_V1":
+    if attn_backend == "FLASH_ATTN":
         attn_metadata_builder_cls, _ = get_attention_backend(
-            _Backend.FLASH_ATTN_VLLM_V1)
-    elif attn_backend == "TRITON_ATTN_VLLM_V1":
+            _Backend.FLASH_ATTN)
+    elif attn_backend == "TRITON_ATTN":
         attn_metadata_builder_cls, _ = get_attention_backend(
-            _Backend.TRITON_ATTN_VLLM_V1)
+            _Backend.TRITON_ATTN)
     elif attn_backend == "TREE_ATTN":
         attn_metadata_builder_cls, _ = get_attention_backend(
             _Backend.TREE_ATTN)
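
The hunk above shows the pattern the test relies on: each backend string maps through the _Backend enum to a (metadata builder class, implementation) pair returned by get_attention_backend(). A self-contained sketch of that dispatch, using stand-in classes rather than vLLM's real registry:

import enum

class _Backend(enum.Enum):
    FLASH_ATTN = enum.auto()
    TRITON_ATTN = enum.auto()
    TREE_ATTN = enum.auto()

class _StubMetadataBuilder:
    """Placeholder for a backend's attention-metadata builder class."""

_REGISTRY = {backend: (_StubMetadataBuilder, None) for backend in _Backend}

def get_attention_backend(backend: _Backend):
    """Return (metadata_builder_cls, impl) for the chosen backend."""
    return _REGISTRY[backend]

attn_metadata_builder_cls, _ = get_attention_backend(_Backend.TRITON_ATTN)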


@@ -41,12 +41,11 @@ def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch,
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")
if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
m.setenv("VLLM_ROCM_USE_AITER", "1")
llm = LLM(
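
These eagle tests all gate on the same environment variable; a minimal standalone version of the setenv-and-skip pattern (pytest and its monkeypatch fixture are real, the platform check is simplified away):

import os
import pytest

@pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "TRITON_ATTN"])
def test_backend_env(monkeypatch, attn_backend):
    # Mirrors the tests above: the backend is selected via environment variable.
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
    assert os.environ["VLLM_ATTENTION_BACKEND"] == attn_backend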


@@ -278,7 +278,7 @@ def test_tree_attn_correctness() -> None:
             block_table=block_table,
             slot_mapping=branch_slot_mapping,
             seqlen_k=sequence_position + q_len,
-            backend=_Backend.FLASH_ATTN_VLLM_V1,
+            backend=_Backend.FLASH_ATTN,
         ).view(batch_size, -1, num_heads, dim_per_head)
 
         # Compare the outputs.
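
For reference, the .view(...) call at the end of this hunk only reshapes the backend's flat output back into per-head form; a standalone illustration with assumed example shapes:

import torch

batch_size, q_len, num_heads, dim_per_head = 2, 4, 8, 64

# Flat attention output: one row per (batch, query) position.
flat = torch.randn(batch_size * q_len, num_heads * dim_per_head)

# Same reshape as in the test: recover (batch, seq, heads, head_dim).
out = flat.view(batch_size, -1, num_heads, dim_per_head)
assert out.shape == (batch_size, q_len, num_heads, dim_per_head)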