[Attention] Update tests to remove deprecated env vars (#30563)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Authored by Matthew Bonanni on 2025-12-17 12:49:59 -05:00; committed by GitHub
parent 9ca8cb38fd
commit 7eb6cb6c18
34 changed files with 580 additions and 447 deletions
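All of the changes below apply one migration: rather than forcing an attention backend through the deprecated VLLM_ATTENTION_BACKEND environment variable, tests now pass an attention_config dict directly to the runner. A minimal before/after sketch of the pattern (the test body is hypothetical; the names mirror the diffs below):

# Before: process-wide override via the deprecated env var.
def test_model(vllm_runner, monkeypatch):
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
    with vllm_runner(MODEL_NAME) as vllm_model:
        ...

# After: explicit per-instance configuration, no global state.
def test_model(vllm_runner):
    with vllm_runner(MODEL_NAME, attention_config={"backend": "TRITON_ATTN"}) as vllm_model:
        ...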


@@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME
 models = [MODEL_NAME]


-@pytest.fixture(autouse=True)
-def set_attention_backend_for_rocm(monkeypatch):
+@pytest.fixture
+def granite_speech_attention_config():
+    """Return attention config for Granite Speech tests on ROCm."""
     if current_platform.is_rocm():
-        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
+        return {"backend": "TRITON_ATTN"}
+    return None


 def run_test(
@@ -53,6 +55,7 @@ def run_test(
     num_logprobs: int,
     tensor_parallel_size: int,
     distributed_executor_backend: str | None = None,
+    attention_config: dict | None = None,
 ):
     """Inference result should be the same between hf and vllm.
@@ -80,6 +83,7 @@
         enable_lora=True,
         max_lora_rank=64,
         enforce_eager=True,
+        attention_config=attention_config,
     ) as vllm_model:
         lora_request = LoRARequest("audio", 1, audio_lora_path)
         vllm_outputs_per_case = [
@@ -131,6 +135,7 @@ def test_models(
     vllm_runner,
     model: str,
     audio_assets: AudioTestAssets,
+    granite_speech_attention_config,
     dtype: str,
     max_model_len: int,
     max_tokens: int,
@@ -157,4 +162,5 @@
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
         tensor_parallel_size=1,
+        attention_config=granite_speech_attention_config,
     )
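Note that the fixture is no longer autouse: a test must request granite_speech_attention_config by name and forward its value, as test_models now does. A minimal sketch of the resulting pattern (hypothetical test, reusing MODEL_NAME from this file):

def test_backend_override(vllm_runner, granite_speech_attention_config):
    # On ROCm the fixture returns {"backend": "TRITON_ATTN"}; elsewhere it
    # returns None, leaving backend selection to vLLM's defaults.
    with vllm_runner(
        MODEL_NAME,
        enforce_eager=True,
        attention_config=granite_speech_attention_config,
    ) as vllm_model:
        ...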


@@ -2,23 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM pooling tests."""
-import os
-import warnings

 import pytest

 from vllm.platforms import current_platform


-def pytest_collection_modifyitems(config, items):
-    """Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
-    if not current_platform.is_rocm():
-        return
+@pytest.fixture
+def siglip_attention_config():
+    """Return attention config for SigLIP tests on ROCm.

-    siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
-    if siglip_tests:
-        os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
-        warnings.warn(
-            "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
-            UserWarning,
-            stacklevel=1,
-        )
+    On ROCm, SigLIP tests require FLEX_ATTENTION backend.
+    """
+    if current_platform.is_rocm():
+        return {"backend": "FLEX_ATTENTION"}
+    return None
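The hook this replaces mutated os.environ at collection time, so collecting a single SigLIP test flipped the backend for the entire process (hence the UserWarning it had to emit); the fixture instead scopes the override to the tests that request it. A sketch of the difference (hypothetical test name):

import os

# Old: collection-time side effect, visible to every test that runs later.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"

# New: only a test that requests the fixture sees the override.
def test_siglip_text(vllm_runner, siglip_attention_config):
    ...  # forwards attention_config=siglip_attention_config to the runner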


@@ -38,6 +38,7 @@ def _run_test(
     *,
     dtype: str,
     tokenization_kwargs: dict[str, Any] | None = None,
+    attention_config: dict[str, Any] | None = None,
 ) -> None:
     if tokenization_kwargs is None:
         tokenization_kwargs = {}
@@ -49,6 +50,7 @@
         enforce_eager=True,
         max_model_len=64,
         gpu_memory_utilization=0.7,
+        attention_config=attention_config,
     ) as vllm_model:
         vllm_outputs = vllm_model.embed(
             input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
@@ -90,6 +92,7 @@ def test_models_text(
     hf_runner,
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -108,6 +111,7 @@
             "padding": "max_length",
             "max_length": 64,
         },  # siglip2 was trained with this padding setting.
+        attention_config=siglip_attention_config,
     )
@@ -117,6 +121,7 @@ def test_models_image(
     hf_runner,
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -133,6 +138,7 @@
         input_images,
         model,
         dtype=dtype,
+        attention_config=siglip_attention_config,
     )
@@ -141,6 +147,7 @@ def test_models_image(
 def test_models_text_image_no_crash(
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -154,6 +161,7 @@
         enforce_eager=True,
         max_model_len=64,
         gpu_memory_utilization=0.7,
+        attention_config=siglip_attention_config,
     ) as vllm_model:
         with pytest.raises(ValueError, match="not both"):
             vllm_model.embed(texts, images=images)
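End to end, the SigLIP override travels fixture -> test parameter -> _run_test keyword -> vllm_runner keyword. A condensed sketch of that plumbing, with signatures simplified and bodies elided (names as in the diffs above):

@pytest.fixture
def siglip_attention_config():
    if current_platform.is_rocm():
        return {"backend": "FLEX_ATTENTION"}
    return None

def test_models_text(hf_runner, vllm_runner, image_assets,
                     siglip_attention_config, model, dtype):
    _run_test(hf_runner, vllm_runner, model=model, dtype=dtype,
              attention_config=siglip_attention_config)

def _run_test(hf_runner, vllm_runner, *, model, dtype, attention_config=None):
    with vllm_runner(model, dtype=dtype, attention_config=attention_config) as vllm_model:
        ...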