[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-10-07 23:42:31 +08:00
committed by GitHub
parent c0a7b89d8e
commit 1e4ecca1d0
51 changed files with 817 additions and 1275 deletions

View File

@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")

View File

@@ -32,7 +32,7 @@ model_config = {
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
def test_sliding_window_retrieval(
monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager
model, batch_size, seed, disable_hybrid_kv_cache_manager
):
"""
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
@@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
If we tell it upfront which ones we are going to be looking for, then
it answers correctly (mostly).
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
test_config = model_config[model]
test_config = model_config[model]
llm = LLM(
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
llm = LLM(
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)
prompts, answer, indices = prep_prompts(
batch_size, ln_range=test_config.ln_range
)
check_length(prompts, llm, test_config.sliding_window)
check_length(prompts, llm, test_config.sliding_window)
# Fresh generation
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
# Fresh generation
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
# Re-generate with the same prompts to test prefix caching
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
# Re-generate with the same prompts to test prefix caching
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
def check_length(prompts: list[str], llm: LLM, sliding_window: int):

View File

@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Make scheduling deterministic for reproducibility
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

View File

@@ -13,7 +13,6 @@ Covers:
5) Multiple stop conditions
"""
import os
from typing import Optional, Union
import pytest
@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
@pytest.fixture(scope="module")
def llm_v1():
"""Create V1 LLM instance for testing"""
# Ensure V1 engine is used
os.environ["VLLM_USE_V1"] = "1"
llm = LLM(
model=TEST_MODEL,
tensor_parallel_size=1,
@@ -503,6 +499,6 @@ if __name__ == "__main__":
Usage:
cd vllm/
VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v
python -m pytest tests/v1/e2e/test_min_tokens.py -v
"""
pytest.main([__file__, "-v"])

View File

@@ -301,7 +301,6 @@ def test_mtp_correctness(
model_setup: (method, model_name, tp_size)
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_MLA_DISABLE", "1")
method, model_name, tp_size = model_setup