VLLM_USE_TRITON_FLASH_ATTN V0 variable deprecation (#27611)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Signed-off-by: Andreas Karatzas <Andreas.Karatzas@amd.com>
2025-11-11 20:34:36 -06:00
parent 7f829be7d3
commit 9f0247cfa4
15 changed files with 12 additions and 1588 deletions
--- a/tests/models/language/pooling/test_classification.py
+++ b/tests/models/language/pooling/test_classification.py
@@ -27,13 +27,7 @@ def test_models(
    example_prompts,
    model: str,
    dtype: str,
-    monkeypatch,
 ) -> None:
-    if current_platform.is_rocm():
-        # ROCm Triton FA does not currently support sliding window attention
-        # switch to use ROCm CK FA backend
-        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
-
    with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.classify(example_prompts)

--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -4,7 +4,6 @@
 import pytest

 from vllm.config import PoolerConfig
-from vllm.platforms import current_platform

 from ...utils import check_embeddings_close

@@ -51,13 +50,7 @@ def test_models(
    vllm_runner,
    example_prompts,
    model,
-    monkeypatch,
 ) -> None:
-    if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
-        # ROCm Triton FA does not currently support sliding window attention
-        # switch to use ROCm CK FA backend
-        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
-
    vllm_extra_kwargs = {}
    if model == "ssmits/Qwen2-7B-Instruct-embed-base":
        vllm_extra_kwargs["pooler_config"] = PoolerConfig(
--- a/tests/models/language/pooling/test_mm_classifier_conversion.py
+++ b/tests/models/language/pooling/test_mm_classifier_conversion.py
@@ -2,18 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from vllm.config.pooler import PoolerConfig
-from vllm.platforms import current_platform


 def test_idefics_multimodal(
    vllm_runner,
-    monkeypatch,
 ) -> None:
-    if current_platform.is_rocm():
-        # ROCm Triton FA does not currently support sliding window attention
-        # switch to use ROCm CK FA backend
-        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
-
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
@@ -59,13 +52,7 @@ def update_config(config):

 def test_gemma_multimodal(
    vllm_runner,
-    monkeypatch,
 ) -> None:
-    if current_platform.is_rocm():
-        # ROCm Triton FA does not currently support sliding window attention
-        # switch to use ROCm CK FA backend
-        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
-
    messages = [
        {
            "role": "system",
--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
@@ -76,7 +76,6 @@ def test_prm_models(
    math_step_prompts,
    model: str,
    dtype: str,
-    monkeypatch,
 ) -> None:
    check_transformers_version(
        "Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
@@ -85,11 +84,6 @@ def test_prm_models(
    if current_platform.is_cpu():
        pytest.skip("CPU only supports V1")

-    if current_platform.is_rocm():
-        # ROCm Triton FA does not currently support sliding window attention
-        # switch to use ROCm CK FA backend
-        monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
-
    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.reward(math_step_prompts)