[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
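Every hunk in this commit applies the same mechanical change: the tests no longer pin VLLM_USE_V1=1 through pytest's monkeypatch before constructing the runner, and the now-unused monkeypatch parameter is dropped from the test signatures, since the flag is being retired as part of the V0 deprecation. For context, a minimal, self-contained sketch of the pinning pattern being removed (hypothetical test, not part of this diff):

import os

import pytest


def test_env_pin_pattern(monkeypatch: pytest.MonkeyPatch) -> None:
    # The pattern removed by this commit: pin an env var for a scoped block.
    monkeypatch.delenv("VLLM_USE_V1", raising=False)
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        assert os.environ["VLLM_USE_V1"] == "1"
    # Once the context exits, the pinned value is rolled back automatically.
    assert "VLLM_USE_V1" not in os.environ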
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
 @pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
 def test_basic(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
     model: str,
     max_tokens: int,
     tensor_parallel_size: int,
@@ -55,23 +54,20 @@ def test_basic(
     )
     example_prompts = [prompt]
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model,
-            # Note: max_num_batched_tokens == 1024 is needed here to
-            # actually test chunked prompt
-            max_num_batched_tokens=1024,
-            max_model_len=8192,
-            gpu_memory_utilization=0.7,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-            output = vllm_outputs[0][1]
+    with vllm_runner(
+        model,
+        # Note: max_num_batched_tokens == 1024 is needed here to
+        # actually test chunked prompt
+        max_num_batched_tokens=1024,
+        max_model_len=8192,
+        gpu_memory_utilization=0.7,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        output = vllm_outputs[0][1]
 
-        assert "1024" in output or "0, 1" in output
+    assert "1024" in output or "0, 1" in output
 
 
 @pytest.mark.skip(reason="Temporarily disabled due to timeout")
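The in-line comment about max_num_batched_tokens == 1024 carries the key reasoning for this test: chunked prefill is only exercised when the prompt is longer than the per-step token budget, so the budget is deliberately kept small. A toy calculation with assumed numbers (the real prompt length depends on the tokenizer, so these values are illustrative only):

import math

# Assumed values for illustration: a prompt that enumerates a long number
# sequence easily exceeds a 1024-token per-step budget.
prompt_len_tokens = 5000
max_num_batched_tokens = 1024

# The prefill is then split across multiple scheduler steps ("chunks").
prefill_chunks = math.ceil(prompt_len_tokens / max_num_batched_tokens)
print(prefill_chunks)  # -> 5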
@@ -82,7 +78,6 @@ def test_basic(
 @pytest.mark.parametrize("max_num_seqs", [16])
 def test_phi3(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
     max_tokens: int,
     max_num_seqs: int,
 ) -> None:
@@ -99,18 +94,15 @@ def test_phi3(
     # test head dim = 96
     model = "microsoft/Phi-3-mini-128k-instruct"
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-            # vllm_outputs is a list of tuples whose first element is the token id
-            # and the second element is the output (including the prompt).
-            for output, answer in zip(vllm_outputs, answers):
-                generated_text = output[1]
-                assert answer in generated_text
+    with vllm_runner(
+        model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+        # vllm_outputs is a list of tuples whose first element is the token id
+        # and the second element is the output (including the prompt).
+        for output, answer in zip(vllm_outputs, answers):
+            generated_text = output[1]
+            assert answer in generated_text
 
 
 TP_SIZE_8 = 8
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
 )
 def test_gemma3_27b_with_text_input_and_tp(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     model = "google/gemma-3-27b-it"
     max_tokens = 16
@@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
         " but in rising every time we fall.",
     ]
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model,
-            max_num_batched_tokens=256,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
-            # vllm_outputs is a list of tuples whose first element is the token id
-            # and the second element is the output (including the prompt).
-            for output, answer in zip(vllm_outputs, answers):
-                generated_text = output[1]
-                assert answer in generated_text
+    with vllm_runner(
+        model,
+        max_num_batched_tokens=256,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
+        # vllm_outputs is a list of tuples whose first element is the token id
+        # and the second element is the output (including the prompt).
+        for output, answer in zip(vllm_outputs, answers):
+            generated_text = output[1]
+            assert answer in generated_text
 
 
 @pytest.mark.skipif(
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
 )
 def test_w8a8_quantization(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
     max_tokens = 5
@@ -176,18 +163,15 @@ def test_w8a8_quantization(
     )
     example_prompts = [prompt]
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        with vllm_runner(
-            model,
-            max_num_batched_tokens=64,
-            max_model_len=4096,
-            gpu_memory_utilization=0.7,
-            max_num_seqs=max_num_seqs,
-            tensor_parallel_size=tensor_parallel_size,
-        ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-            output = vllm_outputs[0][1]
+    with vllm_runner(
+        model,
+        max_num_batched_tokens=64,
+        max_model_len=4096,
+        gpu_memory_utilization=0.7,
+        max_num_seqs=max_num_seqs,
+        tensor_parallel_size=tensor_parallel_size,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        output = vllm_outputs[0][1]
 
-        assert "1024" in output or "0, 1" in output
+    assert "1024" in output or "0, 1" in output
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
 @pytest.mark.parametrize("params", TEST_PARAMS)
 def test_perf(
     vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
     params: TestParams,
 ) -> None:
     tokenizer = get_tokenizer(
@@ -107,48 +106,45 @@ def test_perf(
         )
     )
 
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        sampling_params = SamplingParams(
-            max_tokens=params.decode_len, temperature=1.0, min_p=0.0
-        )
+    sampling_params = SamplingParams(
+        max_tokens=params.decode_len, temperature=1.0, min_p=0.0
+    )
 
-        with vllm_runner(
-            params.model,
-            max_num_batched_tokens=MAX_MODEL_LEN,
-            max_model_len=MAX_MODEL_LEN,
-            max_num_seqs=MAX_NUM_SEQS,
-            gpu_memory_utilization=GPU_UTIL,
-            enforce_eager=False,
-            tensor_parallel_size=1,
-        ) as vllm_model:
-            print(" -- Warmup / Compile")
-            for i in range(NUM_WARMUPS):
-                _ = vllm_model.generate(prompts, sampling_params)
+    with vllm_runner(
+        params.model,
+        max_num_batched_tokens=MAX_MODEL_LEN,
+        max_model_len=MAX_MODEL_LEN,
+        max_num_seqs=MAX_NUM_SEQS,
+        gpu_memory_utilization=GPU_UTIL,
+        enforce_eager=False,
+        tensor_parallel_size=1,
+    ) as vllm_model:
+        print(" -- Warmup / Compile")
+        for i in range(NUM_WARMUPS):
+            _ = vllm_model.generate(prompts, sampling_params)
 
-            print(" -- Benchmarking... ")
-            times = []
-            for i in range(NUM_RUNS):
-                start_time = time.time()
-                _ = vllm_model.generate(prompts, sampling_params)
-                times.append(time.time() - start_time)
+        print(" -- Benchmarking... ")
+        times = []
+        for i in range(NUM_RUNS):
+            start_time = time.time()
+            _ = vllm_model.generate(prompts, sampling_params)
+            times.append(time.time() - start_time)
 
-            avg_time = sum(times) / len(times)
+        avg_time = sum(times) / len(times)
 
-            print(" -- avg_time = {}".format(avg_time))
-            print(
-                " -- expected_avg_time = {} with err_tol = {}".format(
-                    params.expected_avg_time, params.err_tol
-                )
-            )
-            diff = avg_time - params.expected_avg_time
-            ok = diff < params.err_tol
-            if diff < -params.err_tol:
-                print(
-                    " !! WARNING !! Performance has improved by {}, "
-                    "it may be necessary to fine-tune the "
-                    "expected_avg_time = {}".format(-diff, params.expected_avg_time)
-                )
+        print(" -- avg_time = {}".format(avg_time))
+        print(
+            " -- expected_avg_time = {} with err_tol = {}".format(
+                params.expected_avg_time, params.err_tol
+            )
+        )
+        diff = avg_time - params.expected_avg_time
+        ok = diff < params.err_tol
+        if diff < -params.err_tol:
+            print(
+                " !! WARNING !! Performance has improved by {}, "
+                "it may be necessary to fine-tune the "
+                "expected_avg_time = {}".format(-diff, params.expected_avg_time)
+            )
 
-            assert ok, " !! ERROR !! Regression detected"
+        assert ok, " !! ERROR !! Regression detected"
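For reference, the pass/fail logic of the perf test above is a one-sided tolerance check on the mean latency. A standalone sketch of the same arithmetic (hypothetical helper name and example values, not part of the diff):

def check_latency(avg_time: float, expected_avg_time: float, err_tol: float) -> bool:
    """Return True unless avg_time regressed past expected_avg_time + err_tol."""
    diff = avg_time - expected_avg_time
    if diff < -err_tol:
        # Faster than the baseline by more than the tolerance: the recorded
        # expected_avg_time is probably stale and could be tightened.
        print(f" !! WARNING !! Performance has improved by {-diff}")
    return diff < err_tol


# Example: a 20 ms slowdown against a 1.00 s baseline passes a 50 ms tolerance.
assert check_latency(avg_time=1.02, expected_avg_time=1.00, err_tol=0.05)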