[Attention] Update tests to remove deprecated env vars (#30563)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>

Author: Matthew Bonanni
Date: 2025-12-17 12:49:59 -05:00
Committed by: GitHub
Parent: 9ca8cb38fd
Commit: 7eb6cb6c18

34 changed files with 580 additions and 447 deletions
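In short, the tests stop driving backend selection through the deprecated VLLM_ATTENTION_BACKEND and VLLM_MLA_DISABLE environment variables and instead build an AttentionConfig and scope it with set_current_vllm_config. A minimal sketch of the pattern the diff applies throughout (all names are taken from the diff itself; the backend choice here is illustrative):

    # Before: selection driven by (now deprecated) env vars.
    #   m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
    #   m.setenv("VLLM_MLA_DISABLE", "0")
    #
    # After: selection expressed through config objects.
    from vllm.attention.backends.registry import AttentionBackendEnum
    from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config

    attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASH_ATTN)
    vllm_config = VllmConfig(attention_config=attention_config)
    with set_current_vllm_config(vllm_config):
        pass  # anything calling get_attn_backend() in here sees this config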


@@ -6,7 +6,9 @@ from unittest.mock import patch
 import pytest
 import torch
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
+from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
@@ -73,18 +75,18 @@ def generate_params()
 @pytest.mark.parametrize("device, name, use_mla, block_size", generate_params())
-def test_env(
+def test_backend_selection(
     device: str,
     name: str,
     use_mla: bool,
     block_size: int,
     monkeypatch: pytest.MonkeyPatch,
 ):
     """Test attention backend selection with valid device-backend pairs."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", name)
-        m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
+    # Create AttentionConfig with the specified backend
+    attention_config = AttentionConfig(backend=AttentionBackendEnum[name])
+    vllm_config = VllmConfig(attention_config=attention_config)
+    with set_current_vllm_config(vllm_config):
        if device == "cpu":
            with patch("vllm.platforms.current_platform", CpuPlatform()):
                backend = get_attn_backend(16, torch.float16, None, block_size)
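For reference, here is one parametrized case of the test above written out flat, assuming FLASH_ATTN on CUDA as the (name, device) pair; the real values come from generate_params(), and the argument names in the trailing comment are my reading of the positional get_attn_backend() arguments:

    import torch
    from unittest.mock import patch

    from vllm.attention.backends.registry import AttentionBackendEnum
    from vllm.attention.selector import get_attn_backend
    from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
    from vllm.platforms.cuda import CudaPlatform

    # One concrete (device="cuda", name="FLASH_ATTN") case of the test above.
    attention_config = AttentionConfig(backend=AttentionBackendEnum["FLASH_ATTN"])
    vllm_config = VllmConfig(attention_config=attention_config)
    with set_current_vllm_config(vllm_config):
        with patch("vllm.platforms.current_platform", CudaPlatform()):
            # positional args: head_size, dtype, kv_cache_dtype, block_size (assumed)
            backend = get_attn_backend(16, torch.float16, None, 16)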
@@ -217,27 +219,32 @@ def test_env(
 @pytest.mark.parametrize("device", ["cpu", "cuda"])
 def test_fp32_fallback(device: str):
     """Test attention backend selection with fp32."""
-    if device == "cpu":
-        with patch("vllm.platforms.current_platform", CpuPlatform()):
-            backend = get_attn_backend(16, torch.float32, None, 16)
-            assert backend.get_name() == "CPU_ATTN"
-    elif device == "cuda":
-        with patch("vllm.platforms.current_platform", CudaPlatform()):
-            backend = get_attn_backend(16, torch.float32, None, 16)
-            assert backend.get_name() == "FLEX_ATTENTION"
+    # Use default config (no backend specified)
+    vllm_config = VllmConfig()
+    with set_current_vllm_config(vllm_config):
+        if device == "cpu":
+            with patch("vllm.platforms.current_platform", CpuPlatform()):
+                backend = get_attn_backend(16, torch.float32, None, 16)
+                assert backend.get_name() == "CPU_ATTN"
+        elif device == "cuda":
+            with patch("vllm.platforms.current_platform", CudaPlatform()):
+                backend = get_attn_backend(16, torch.float32, None, 16)
+                assert backend.get_name() == "FLEX_ATTENTION"


 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
     pytest.skip(
         "Skipping as current backend selector does not "
-        "handle fallbacks when a backend is set via env var."
+        "handle fallbacks when a backend is explicitly set."
     )
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
+    attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASH_ATTN)
+    vllm_config = VllmConfig(attention_config=attention_config)
+    with set_current_vllm_config(vllm_config):
        # Unsupported CUDA arch
        monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5))
        backend = get_attn_backend(16, torch.float16, None, 16)
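The (7, 5) pin above is what forces the fallback: it makes the selector believe it is on a Turing-class GPU, which FLASH_ATTN does not support. A self-contained sketch of the same monkeypatching technique, with a hypothetical needs_sm80() helper standing in for the selector's capability check:

    import torch

    def needs_sm80() -> bool:
        # hypothetical stand-in for the capability check done during selection
        major, _minor = torch.cuda.get_device_capability()
        return major >= 8

    original = torch.cuda.get_device_capability
    torch.cuda.get_device_capability = lambda device=None: (7, 5)  # pretend Turing
    try:
        assert needs_sm80() is False  # selection logic would skip FLASH_ATTN
    finally:
        torch.cuda.get_device_capability = original  # restore, as monkeypatch would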
@@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     assert backend.get_name() != "FLASH_ATTN"


-def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
+def test_invalid_backend():
     """Test that invalid attention backend names raise ValueError."""
-    with (
-        monkeypatch.context() as m,
-        patch("vllm.platforms.current_platform", CudaPlatform()),
-        pytest.raises(ValueError),
-    ):
-        m.setenv("VLLM_ATTENTION_BACKEND", "INVALID")
-        # Should raise ValueError for invalid backend
-        get_attn_backend(32, torch.float16, None, 16)
+    with pytest.raises(ValueError) as exc_info:
+        # Invalid backend name should raise ValueError when creating enum
+        AttentionConfig(backend=AttentionBackendEnum["INVALID"])
+    assert "Invalid value 'INVALID'" in str(exc_info.value)
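One detail worth noting: a stock Python Enum raises KeyError, not ValueError, on a failed name lookup, so the assertion above implies AttentionBackendEnum customizes member lookup. For contrast, the stock behavior (Color is a hypothetical stand-in enum, not a vLLM type):

    from enum import Enum

    class Color(Enum):  # hypothetical stand-in enum
        RED = 1

    try:
        Color["INVALID"]
    except KeyError as exc:
        print(f"stock Enum raises KeyError: {exc}")  # prints: 'INVALID'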