[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -66,7 +66,6 @@ def llm_pair(request):
|
||||
pytest.skip("Only Blackwell GPUs support Cutlass MLA")
|
||||
|
||||
env_vars = {
|
||||
"VLLM_USE_V1": "1",
|
||||
# Force native sampler to avoid potential nondeterminism in FlashInfer
|
||||
# when per-request generators are not used in V1.
|
||||
"VLLM_USE_FLASHINFER_SAMPLER": "0",
|
||||
@@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
|
||||
with (
|
||||
temporary_environ(
|
||||
{
|
||||
"VLLM_USE_V1": "1",
|
||||
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
|
||||
# Flex_Attention is not supported with full cuda graph
|
||||
}
|
||||
|
||||
@@ -18,7 +18,6 @@ from vllm.config import (
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
from vllm.envs import VLLM_USE_V1
|
||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||
from vllm.utils import is_torch_equal_or_newer
|
||||
|
||||
@@ -127,7 +126,6 @@ def _run_simple_model(
|
||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
||||
@torch.inference_mode()
|
||||
def test_simple_piecewise_compile(use_inductor):
|
||||
assert VLLM_USE_V1
|
||||
_run_simple_model(
|
||||
splitting_ops=["silly.attention"],
|
||||
use_inductor_graph_partition=False,
|
||||
@@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
|
||||
@torch.inference_mode()
|
||||
@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
|
||||
def test_simple_inductor_graph_partition(splitting_ops):
|
||||
assert VLLM_USE_V1
|
||||
if not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
|
||||
@@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
|
||||
"pass_config": {"enable_async_tp": async_tp_enabled},
|
||||
}
|
||||
|
||||
async_tp_env = tp_env = {
|
||||
"VLLM_USE_V1": "1",
|
||||
}
|
||||
|
||||
async_tp_args = [
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
@@ -410,6 +406,4 @@ def test_async_tp_pass_correctness(
|
||||
"mp",
|
||||
]
|
||||
|
||||
compare_two_settings(
|
||||
model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate"
|
||||
)
|
||||
compare_two_settings(model_id, async_tp_args, tp_args, method="generate")
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.compilation.counter import compilation_counter
|
||||
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
|
||||
from vllm.utils import _is_torch_equal_or_newer
|
||||
@@ -16,15 +15,10 @@ def test_version():
|
||||
assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")
|
||||
|
||||
|
||||
def test_use_cudagraphs_dynamic(monkeypatch):
|
||||
assert vllm.envs.VLLM_USE_V1
|
||||
def test_use_cudagraphs_dynamic():
|
||||
vllm_config = VllmConfig()
|
||||
assert vllm_config.compilation_config.use_cudagraph
|
||||
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
vllm_config = VllmConfig()
|
||||
assert not vllm_config.compilation_config.use_cudagraph
|
||||
|
||||
|
||||
def test_custom_op():
|
||||
# proper syntax
|
||||
@@ -41,8 +35,6 @@ def test_custom_op():
|
||||
# may be influenced by other tests.
|
||||
@pytest.mark.parametrize("val", ["1"])
|
||||
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
|
||||
assert vllm.envs.VLLM_USE_V1
|
||||
|
||||
# Disable multiprocessing so that the counter is in the same process
|
||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
|
||||
@@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
|
||||
@pytest.mark.forked
|
||||
@pytest.mark.parametrize("enabled", [True, False])
|
||||
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
|
||||
assert vllm.envs.VLLM_USE_V1
|
||||
|
||||
# Disable multiprocessing so that the counter is in the same process
|
||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||
|
||||
|
||||
@@ -303,7 +303,6 @@ def test_attention_quant_pattern(
|
||||
model_class: type[AttentionQuantPatternModel],
|
||||
backend: _Backend,
|
||||
use_inductor_graph_partition: bool,
|
||||
monkeypatch,
|
||||
dist_init,
|
||||
caplog_vllm,
|
||||
):
|
||||
@@ -312,8 +311,6 @@ def test_attention_quant_pattern(
|
||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
device = torch.device("cuda:0")
|
||||
torch.manual_seed(42)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user