[Attention] Update tests to remove deprecated env vars (#30563)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-12-17 12:49:59 -05:00
parent 9ca8cb38fd
commit 7eb6cb6c18
34 changed files with 580 additions and 447 deletions
--- a/tests/compile/fullgraph/test_full_cudagraph.py
+++ b/tests/compile/fullgraph/test_full_cudagraph.py
@@ -74,7 +74,6 @@ def llm_pair(request):
        # Force native sampler to avoid potential nondeterminism in FlashInfer
        # when per-request generators are not used in V1.
        "VLLM_USE_FLASHINFER_SAMPLER": "0",
-        **backend_config.env_vars,
    }
    with temporary_environ(env_vars):
        full = LLM(
@@ -170,16 +169,10 @@ class TestFullCUDAGraph:

@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
 def test_full_cudagraph_with_invalid_backend():
-    with (
-        temporary_environ(
-            {
-                "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
-                # Flex_Attention is not supported with full cuda graph
-            }
-        ),
-        pytest.raises(RuntimeError),
-    ):
+    # Flex_Attention is not supported with full cuda graph
+    with pytest.raises(RuntimeError):
        LLM(
            model="Qwen/Qwen2-1.5B-Instruct",
            compilation_config=CompilationConfig(cudagraph_mode="FULL"),
+            attention_config={"backend": "FLEX_ATTENTION"},
        )