[Attention] Update tests to remove deprecated env vars (#30563)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Authored by Matthew Bonanni on 2025-12-17 12:49:59 -05:00; committed by GitHub
parent 9ca8cb38fd
commit 7eb6cb6c18
34 changed files with 580 additions and 447 deletions
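All of the changes below apply one migration: rather than forcing an attention backend through the deprecated VLLM_ATTENTION_BACKEND environment variable, tests now pass an attention_config dict directly to the runner. A minimal before/after sketch of the pattern (the test body is hypothetical; the names mirror the diffs below):

# Before: process-wide override via the deprecated env var.
def test_model(vllm_runner, monkeypatch):
    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
    with vllm_runner(MODEL_NAME) as vllm_model:
        ...

# After: explicit per-instance configuration, no global state.
def test_model(vllm_runner):
    with vllm_runner(MODEL_NAME, attention_config={"backend": "TRITON_ATTN"}) as vllm_model:
        ...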


@@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME
 models = [MODEL_NAME]


-@pytest.fixture(autouse=True)
-def set_attention_backend_for_rocm(monkeypatch):
+@pytest.fixture
+def granite_speech_attention_config():
+    """Return attention config for Granite Speech tests on ROCm."""
     if current_platform.is_rocm():
-        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
+        return {"backend": "TRITON_ATTN"}
+    return None


 def run_test(
@@ -53,6 +55,7 @@ def run_test(
     num_logprobs: int,
     tensor_parallel_size: int,
     distributed_executor_backend: str | None = None,
+    attention_config: dict | None = None,
 ):
     """Inference result should be the same between hf and vllm.
@@ -80,6 +83,7 @@
         enable_lora=True,
         max_lora_rank=64,
         enforce_eager=True,
+        attention_config=attention_config,
     ) as vllm_model:
         lora_request = LoRARequest("audio", 1, audio_lora_path)
         vllm_outputs_per_case = [
@@ -131,6 +135,7 @@ def test_models(
     vllm_runner,
     model: str,
     audio_assets: AudioTestAssets,
+    granite_speech_attention_config,
     dtype: str,
     max_model_len: int,
     max_tokens: int,
@@ -157,4 +162,5 @@
         max_tokens=max_tokens,
         num_logprobs=num_logprobs,
         tensor_parallel_size=1,
+        attention_config=granite_speech_attention_config,
     )
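Note that the fixture is no longer autouse: a test must request granite_speech_attention_config by name and forward its value, as test_models now does. A minimal sketch of the resulting pattern (hypothetical test, reusing MODEL_NAME from this file):

def test_backend_override(vllm_runner, granite_speech_attention_config):
    # On ROCm the fixture returns {"backend": "TRITON_ATTN"}; elsewhere it
    # returns None, leaving backend selection to vLLM's defaults.
    with vllm_runner(
        MODEL_NAME,
        enforce_eager=True,
        attention_config=granite_speech_attention_config,
    ) as vllm_model:
        ...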


@@ -2,23 +2,17 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM pooling tests."""
-import os
-import warnings

 import pytest

 from vllm.platforms import current_platform


-def pytest_collection_modifyitems(config, items):
-    """Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
-    if not current_platform.is_rocm():
-        return
+@pytest.fixture
+def siglip_attention_config():
+    """Return attention config for SigLIP tests on ROCm.

-    siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
-    if siglip_tests:
-        os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
-        warnings.warn(
-            "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
-            UserWarning,
-            stacklevel=1,
-        )
+    On ROCm, SigLIP tests require FLEX_ATTENTION backend.
+    """
+    if current_platform.is_rocm():
+        return {"backend": "FLEX_ATTENTION"}
+    return None
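The hook this replaces mutated os.environ at collection time, so collecting a single SigLIP test flipped the backend for the entire process (hence the UserWarning it had to emit); the fixture instead scopes the override to the tests that request it. A sketch of the difference (hypothetical test name):

import os

# Old: collection-time side effect, visible to every test that runs later.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"

# New: only a test that requests the fixture sees the override.
def test_siglip_text(vllm_runner, siglip_attention_config):
    ...  # forwards attention_config=siglip_attention_config to the runner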


@@ -38,6 +38,7 @@ def _run_test(
     *,
     dtype: str,
     tokenization_kwargs: dict[str, Any] | None = None,
+    attention_config: dict[str, Any] | None = None,
 ) -> None:
     if tokenization_kwargs is None:
         tokenization_kwargs = {}
@@ -49,6 +50,7 @@
         enforce_eager=True,
         max_model_len=64,
         gpu_memory_utilization=0.7,
+        attention_config=attention_config,
     ) as vllm_model:
         vllm_outputs = vllm_model.embed(
             input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
@@ -90,6 +92,7 @@ def test_models_text(
     hf_runner,
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -108,6 +111,7 @@
             "padding": "max_length",
             "max_length": 64,
         },  # siglip2 was trained with this padding setting.
+        attention_config=siglip_attention_config,
     )
@@ -117,6 +121,7 @@ def test_models_image(
     hf_runner,
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -133,6 +138,7 @@
         input_images,
         model,
         dtype=dtype,
+        attention_config=siglip_attention_config,
     )
@@ -141,6 +147,7 @@ def test_models_image(
 def test_models_text_image_no_crash(
     vllm_runner,
     image_assets,
+    siglip_attention_config,
     model: str,
     dtype: str,
 ) -> None:
@@ -154,6 +161,7 @@
         enforce_eager=True,
         max_model_len=64,
         gpu_memory_utilization=0.7,
+        attention_config=siglip_attention_config,
     ) as vllm_model:
         with pytest.raises(ValueError, match="not both"):
             vllm_model.embed(texts, images=images)
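End to end, the SigLIP override travels fixture -> test parameter -> _run_test keyword -> vllm_runner keyword. A condensed sketch of that plumbing, with signatures simplified and bodies elided (names as in the diffs above):

@pytest.fixture
def siglip_attention_config():
    if current_platform.is_rocm():
        return {"backend": "FLEX_ATTENTION"}
    return None

def test_models_text(hf_runner, vllm_runner, image_assets,
                     siglip_attention_config, model, dtype):
    _run_test(hf_runner, vllm_runner, model=model, dtype=dtype,
              attention_config=siglip_attention_config)

def _run_test(hf_runner, vllm_runner, *, model, dtype, attention_config=None):
    with vllm_runner(model, dtype=dtype, attention_config=attention_config) as vllm_model:
        ...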