[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-10-07 23:42:31 +08:00
Committed by: GitHub
Parent: c0a7b89d8e
Commit: 1e4ecca1d0
51 changed files with 817 additions and 1275 deletions
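
The change is mechanical across the touched test files: the explicit opt-in to the V1 engine via the VLLM_USE_V1 environment variable is dropped, since V0 is being deprecated and V1 no longer needs to be selected. A minimal sketch of the before/after shape of an affected test (the test names and the _exercise_backend_selection helper are illustrative placeholders, not code from this commit):

import pytest


def _exercise_backend_selection() -> None:
    ...  # placeholder for whatever the real test constructs and asserts


# Before this commit: tests forced the V1 engine explicitly.
def test_selection_before(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")  # the line removed throughout this commit
        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
        _exercise_backend_selection()


# After this commit: only the backend override (if any) remains.
def test_selection_after(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
        _exercise_backend_selection()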


@@ -80,7 +80,6 @@ def test_env(
 ):
     """Test attention backend selection with valid device-backend pairs."""
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv(STR_BACKEND_ENV_VAR, name)
         m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
@@ -212,30 +211,21 @@ def test_env(
 @pytest.mark.parametrize("device", ["cpu", "cuda"])
-def test_fp32_fallback(
-    device: str,
-    monkeypatch: pytest.MonkeyPatch,
-):
+def test_fp32_fallback(device: str):
     """Test attention backend selection with fp32."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        if device == "cpu":
-            with patch("vllm.attention.selector.current_platform", CpuPlatform()):
-                backend = get_attn_backend(16, torch.float32, None, 16)
-            assert backend.get_name() == "TORCH_SDPA"
+    if device == "cpu":
+        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
+            backend = get_attn_backend(16, torch.float32, None, 16)
+        assert backend.get_name() == "TORCH_SDPA"
 
-        elif device == "cuda":
-            with patch("vllm.attention.selector.current_platform", CudaPlatform()):
-                backend = get_attn_backend(16, torch.float32, None, 16)
-            assert backend.get_name() == "FLEX_ATTENTION"
+    elif device == "cuda":
+        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+            backend = get_attn_backend(16, torch.float32, None, 16)
+        assert backend.get_name() == "FLEX_ATTENTION"
 
 
 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
     # TODO: When testing for v1, pipe in `use_v1` as an argument to
     # get_attn_backend
     pytest.skip(
         "Skipping as current backend selector does not "
         "handle fallbacks when a backend is set via env var."
@@ -289,7 +279,6 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
         monkeypatch.context() as m,
         patch("vllm.attention.selector.current_platform", CudaPlatform()),
     ):
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
 
         # Should raise ValueError for invalid backend
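
For the backend-selector tests above, the remaining setup is just patching the platform and checking the name of the selected backend; no engine toggle is involved any more. A self-contained sketch of the updated fp32 fallback check, with the imports spelled out (the vllm.platforms.cpu / vllm.platforms.cuda import paths are an assumption, since this diff does not show the file's import block):

from unittest.mock import patch

import pytest
import torch

from vllm.attention.selector import get_attn_backend
# Assumed import paths; the test file's own imports are not part of this diff.
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform


@pytest.mark.parametrize("device", ["cpu", "cuda"])
def test_fp32_fallback_sketch(device: str):
    # fp32 is not handled by the default attention backends, so the selector
    # is expected to fall back to TORCH_SDPA on CPU and FLEX_ATTENTION on CUDA.
    if device == "cpu":
        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
            backend = get_attn_backend(16, torch.float32, None, 16)
        assert backend.get_name() == "TORCH_SDPA"
    elif device == "cuda":
        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
            backend = get_attn_backend(16, torch.float32, None, 16)
        assert backend.get_name() == "FLEX_ATTENTION"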


@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     # Run with flex attention
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
 
         set_seed(seed)
@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     # Run with default backend
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         set_seed(seed)
         with vllm_runner(
             model_name,
@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     # Run with flex attention
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
         with vllm_runner(
             model_name,
@@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
             flex_outputs = llm_flex.embed(prompts)
 
     # Run with default backend
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        with vllm_runner(
+    with (
+        monkeypatch.context() as m,
+        vllm_runner(
             model_name,
             runner="pooling",
             dtype=torch.bfloat16,
             tensor_parallel_size=1,
             max_model_len=100,
             enforce_eager=True,
-        ) as llm_default:
-            default_outputs = llm_default.embed(prompts)
+        ) as llm_default,
+    ):
+        default_outputs = llm_default.embed(prompts)
 
     check_embeddings_close(
         embeddings_0_lst=flex_outputs,