[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 23:42:31 +08:00
parent c0a7b89d8e
commit 1e4ecca1d0
51 changed files with 817 additions and 1275 deletions
--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
@@ -66,7 +66,6 @@ def llm_pair(request):
            pytest.skip("Only Blackwell GPUs support Cutlass MLA")

    env_vars = {
-        "VLLM_USE_V1": "1",
        # Force native sampler to avoid potential nondeterminism in FlashInfer
        # when per-request generators are not used in V1.
        "VLLM_USE_FLASHINFER_SAMPLER": "0",
@@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
    with (
        temporary_environ(
            {
-                "VLLM_USE_V1": "1",
                "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
                # Flex_Attention is not supported with full cuda graph
            }
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -18,7 +18,6 @@ from vllm.config import (
    VllmConfig,
    set_current_vllm_config,
 )
-from vllm.envs import VLLM_USE_V1
 from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils import is_torch_equal_or_newer

@@ -127,7 +126,6 @@ def _run_simple_model(
@pytest.mark.parametrize("use_inductor", [True, False])
@torch.inference_mode()
 def test_simple_piecewise_compile(use_inductor):
-    assert VLLM_USE_V1
    _run_simple_model(
        splitting_ops=["silly.attention"],
        use_inductor_graph_partition=False,
@@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
@torch.inference_mode()
@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
 def test_simple_inductor_graph_partition(splitting_ops):
-    assert VLLM_USE_V1
    if not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
@@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
        "pass_config": {"enable_async_tp": async_tp_enabled},
    }

-    async_tp_env = tp_env = {
-        "VLLM_USE_V1": "1",
-    }
-
    async_tp_args = [
        *common_args,
        "--tensor-parallel-size",
@@ -410,6 +406,4 @@ def test_async_tp_pass_correctness(
        "mp",
    ]

-    compare_two_settings(
-        model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate"
-    )
+    compare_two_settings(model_id, async_tp_args, tp_args, method="generate")
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest

-import vllm
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.utils import _is_torch_equal_or_newer
@@ -16,15 +15,10 @@ def test_version():
    assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")


-def test_use_cudagraphs_dynamic(monkeypatch):
-    assert vllm.envs.VLLM_USE_V1
+def test_use_cudagraphs_dynamic():
    vllm_config = VllmConfig()
    assert vllm_config.compilation_config.use_cudagraph

-    monkeypatch.setenv("VLLM_USE_V1", "0")
-    vllm_config = VllmConfig()
-    assert not vllm_config.compilation_config.use_cudagraph
-

 def test_custom_op():
    # proper syntax
@@ -41,8 +35,6 @@ def test_custom_op():
 # may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
 def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
-    assert vllm.envs.VLLM_USE_V1
-
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
@@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
@pytest.mark.forked
@pytest.mark.parametrize("enabled", [True, False])
 def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
-    assert vllm.envs.VLLM_USE_V1
-
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -303,7 +303,6 @@ def test_attention_quant_pattern(
    model_class: type[AttentionQuantPatternModel],
    backend: _Backend,
    use_inductor_graph_partition: bool,
-    monkeypatch,
    dist_init,
    caplog_vllm,
 ):
@@ -312,8 +311,6 @@ def test_attention_quant_pattern(
    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
    device = torch.device("cuda:0")
    torch.manual_seed(42)