[Attention] Update tests to remove deprecated env vars (#30563)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
This commit is contained in:
Matthew Bonanni
2025-12-17 12:49:59 -05:00
committed by GitHub
parent 9ca8cb38fd
commit 7eb6cb6c18
34 changed files with 580 additions and 447 deletions

View File

@@ -2,23 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM pooling tests."""
 
-import os
-import warnings
-
 import pytest
 
 from vllm.platforms import current_platform
 
 
-def pytest_collection_modifyitems(config, items):
-    """Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
-    if not current_platform.is_rocm():
-        return
-
-    siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
-    if siglip_tests:
-        os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
-        warnings.warn(
-            "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
-            UserWarning,
-            stacklevel=1,
-        )
+@pytest.fixture
+def siglip_attention_config():
+    """Return attention config for SigLIP tests on ROCm.
+
+    On ROCm, SigLIP tests require FLEX_ATTENTION backend.
+    """
+    if current_platform.is_rocm():
+        return {"backend": "FLEX_ATTENTION"}
+    return None

View File

@@ -38,6 +38,7 @@ def _run_test(
     *,
     dtype: str,
     tokenization_kwargs: dict[str, Any] | None = None,
+    attention_config: dict[str, Any] | None = None,
 ) -> None:
     if tokenization_kwargs is None:
         tokenization_kwargs = {}
@@ -49,6 +50,7 @@ def _run_test(
         enforce_eager=True,
         max_model_len=64,
         gpu_memory_utilization=0.7,
+        attention_config=attention_config,
     ) as vllm_model:
         vllm_outputs = vllm_model.embed(
             input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
@@ -90,6 +92,7 @@ def test_models_text(
     hf_runner,
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -108,6 +111,7 @@ def test_models_text(
             "padding": "max_length",
             "max_length": 64,
         },  # siglip2 was trained with this padding setting.
+        attention_config=siglip_attention_config,
     )
@@ -117,6 +121,7 @@ def test_models_image(
     hf_runner,
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -133,6 +138,7 @@ def test_models_image(
         input_images,
         model,
         dtype=dtype,
+        attention_config=siglip_attention_config,
     )
@@ -141,6 +147,7 @@ def test_models_image(
 def test_models_text_image_no_crash(
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
         enforce_eager=True,
         max_model_len=64,
         gpu_memory_utilization=0.7,
+        attention_config=siglip_attention_config,
     ) as vllm_model:
         with pytest.raises(ValueError, match="not both"):
             vllm_model.embed(texts, images=images)