[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 23:42:31 +08:00
parent c0a7b89d8e
commit 1e4ecca1d0
51 changed files with 817 additions and 1275 deletions
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
@pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
 def test_basic(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    model: str,
    max_tokens: int,
    tensor_parallel_size: int,
@@ -55,23 +54,20 @@ def test_basic(
    )
    example_prompts = [prompt]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    with vllm_runner(
+        model,
+        # Note: max_num_batched_tokens == 1024 is needed here to
+        # actually test chunked prompt
+        max_num_batched_tokens=1024,
+        max_model_len=8192,
+        gpu_memory_utilization=0.7,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    output = vllm_outputs[0][1]

-        with vllm_runner(
-            model,
-            # Note: max_num_batched_tokens == 1024 is needed here to
-            # actually test chunked prompt
-            max_num_batched_tokens=1024,
-            max_model_len=8192,
-            gpu_memory_utilization=0.7,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        output = vllm_outputs[0][1]
-
-        assert "1024" in output or "0, 1" in output
+    assert "1024" in output or "0, 1" in output


@pytest.mark.skip(reason="Temporarily disabled due to timeout")
@@ -82,7 +78,6 @@ def test_basic(
@pytest.mark.parametrize("max_num_seqs", [16])
 def test_phi3(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    max_tokens: int,
    max_num_seqs: int,
 ) -> None:
@@ -99,18 +94,15 @@ def test_phi3(
    # test head dim = 96
    model = "microsoft/Phi-3-mini-128k-instruct"

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-        # vllm_outputs is a list of tuples whose first element is the token id
-        # and the second element is the output (including the prompt).
-        for output, answer in zip(vllm_outputs, answers):
-            generated_text = output[1]
-            assert answer in generated_text
+    with vllm_runner(
+        model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+    # vllm_outputs is a list of tuples whose first element is the token id
+    # and the second element is the output (including the prompt).
+    for output, answer in zip(vllm_outputs, answers):
+        generated_text = output[1]
+        assert answer in generated_text


 TP_SIZE_8 = 8
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
 )
 def test_gemma3_27b_with_text_input_and_tp(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    model = "google/gemma-3-27b-it"
    max_tokens = 16
@@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
        " but in rising every time we fall.",
    ]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model,
-            max_num_batched_tokens=256,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-        # vllm_outputs is a list of tuples whose first element is the token id
-        # and the second element is the output (including the prompt).
-        for output, answer in zip(vllm_outputs, answers):
-            generated_text = output[1]
-            assert answer in generated_text
+    with vllm_runner(
+        model,
+        max_num_batched_tokens=256,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+    # vllm_outputs is a list of tuples whose first element is the token id
+    # and the second element is the output (including the prompt).
+    for output, answer in zip(vllm_outputs, answers):
+        generated_text = output[1]
+        assert answer in generated_text


@pytest.mark.skipif(
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
 )
 def test_w8a8_quantization(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
    max_tokens = 5
@@ -176,18 +163,15 @@ def test_w8a8_quantization(
    )
    example_prompts = [prompt]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
+    with vllm_runner(
+        model,
+        max_num_batched_tokens=64,
+        max_model_len=4096,
+        gpu_memory_utilization=0.7,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    output = vllm_outputs[0][1]

-        with vllm_runner(
-            model,
-            max_num_batched_tokens=64,
-            max_model_len=4096,
-            gpu_memory_utilization=0.7,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        output = vllm_outputs[0][1]
-
-        assert "1024" in output or "0, 1" in output
+    assert "1024" in output or "0, 1" in output