[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-10-07 23:42:31 +08:00
Committed by: GitHub
Parent: c0a7b89d8e
Commit: 1e4ecca1d0
51 changed files with 817 additions and 1275 deletions


@@ -296,6 +296,7 @@ steps:
   - tests/v1
   commands:
   # split the test to avoid interference
+  - pytest -v -s -m 'not cpu_test' v1/core
   - pytest -v -s v1/executor
   - pytest -v -s v1/kv_offload
   - pytest -v -s v1/sample
@@ -317,7 +318,7 @@ steps:
   no_gpu: true
   commands:
   # split the test to avoid interference
-  - pytest -v -s v1/core
+  - pytest -v -s -m 'cpu_test' v1/core
   - pytest -v -s v1/structured_output
   - pytest -v -s v1/test_serial_utils.py
   - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
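The two jobs above now partition tests/v1/core by the cpu_test pytest marker: the GPU job runs everything not marked cpu_test, while the no_gpu job runs only the marked subset. A minimal sketch of how a test module opts into that split (illustrative only; it assumes the cpu_test marker is registered in the project's pytest configuration), using the same module-level pytestmark pattern this commit adds to one of the v1 worker test files further below:

import pytest

# Marking the whole module lets `pytest -m 'cpu_test'` select it and
# `pytest -m 'not cpu_test'` skip it, without decorating individual tests.
pytestmark = pytest.mark.cpu_test


def test_cpu_only_helper():
    # hypothetical GPU-free test body
    assert 1 + 1 == 2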


@@ -13,7 +13,7 @@ import pytest
 import torch

 from vllm import LLM
-from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
+from vllm.v1.engine.llm_engine import LLMEngine

 from ..conftest import HfRunner, VllmRunner
 from ..models.utils import check_outputs_equal
@@ -211,16 +211,11 @@ def test_models_distributed(
 def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
-    from vllm.envs import VLLM_USE_V1
-
-    if not VLLM_USE_V1:
-        pytest.skip("Skipping V0 test, dump input not supported")
-
     # Needed to mock an error in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

     with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
-        if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
+        if isinstance(vllm_model.llm.llm_engine, LLMEngine):
             v1_test_failed_model_execution(vllm_model)


@@ -117,68 +117,59 @@ def test_cumem_with_cudagraph():
 @create_new_process_for_each_test()
 @pytest.mark.parametrize(
-    "model, use_v1",
+    "model",
     [
         # sleep mode with safetensors
-        ("meta-llama/Llama-3.2-1B", True),
+        "meta-llama/Llama-3.2-1B",
         # sleep mode with pytorch checkpoint
-        ("facebook/opt-125m", True),
+        "facebook/opt-125m",
     ],
 )
-def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
-    with monkeypatch.context() as m:
-        assert use_v1
-        m.setenv("VLLM_USE_V1", "1")
+def test_end_to_end(model: str):
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
     llm = LLM(model, enable_sleep_mode=True)
     prompt = "How are you?"
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
     output = llm.generate(prompt, sampling_params)

     # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
     # which is difficult to measure in the test. therefore, we only
     # test sleep level 1 here.
     llm.sleep(level=1)

     free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
     used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
     # now the memory usage is mostly cudagraph memory pool,
     # and it should be less than the model weights (1B model, 2GiB weights)

     # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
     # is captured but cannot be releasesd from PyTorch due to a known bug,
     # therefore high memory usage after `llm.sleep` is called is expected.
     # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
     # in V1.
-    if use_v1:
-        assert used_bytes < 7 * GiB_bytes
-    else:
-        assert used_bytes < 2 * GiB_bytes
+    assert used_bytes < 7 * GiB_bytes

     llm.wake_up()
     output2 = llm.generate(prompt, sampling_params)
     # cmp output
     assert output[0].outputs[0].text == output2[0].outputs[0].text

     llm.sleep(level=1)
     llm.wake_up(tags=["weights"])

     free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
     used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline

     # should just reallocate memory for weights (1B model, ~2GiB weights)
-    if use_v1:
-        assert used_bytes < 10 * GiB_bytes
-    else:
-        assert used_bytes < 6 * GiB_bytes
+    assert used_bytes < 10 * GiB_bytes

     # now allocate kv cache memory
     llm.wake_up(tags=["kv_cache"])
     output3 = llm.generate(prompt, sampling_params)
     # cmp output
     assert output[0].outputs[0].text == output3[0].outputs[0].text


 @create_new_process_for_each_test()


@@ -66,7 +66,6 @@ def llm_pair(request):
         pytest.skip("Only Blackwell GPUs support Cutlass MLA")

     env_vars = {
-        "VLLM_USE_V1": "1",
         # Force native sampler to avoid potential nondeterminism in FlashInfer
         # when per-request generators are not used in V1.
         "VLLM_USE_FLASHINFER_SAMPLER": "0",
@@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
     with (
         temporary_environ(
             {
-                "VLLM_USE_V1": "1",
                 "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
                 # Flex_Attention is not supported with full cuda graph
             }


@@ -18,7 +18,6 @@ from vllm.config import (
     VllmConfig,
     set_current_vllm_config,
 )
-from vllm.envs import VLLM_USE_V1
 from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils import is_torch_equal_or_newer
@@ -127,7 +126,6 @@ def _run_simple_model(
 @pytest.mark.parametrize("use_inductor", [True, False])
 @torch.inference_mode()
 def test_simple_piecewise_compile(use_inductor):
-    assert VLLM_USE_V1
     _run_simple_model(
         splitting_ops=["silly.attention"],
         use_inductor_graph_partition=False,
@@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
 @torch.inference_mode()
 @pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
 def test_simple_inductor_graph_partition(splitting_ops):
-    assert VLLM_USE_V1
     if not is_torch_equal_or_newer("2.9.0.dev"):
         pytest.skip("inductor graph partition is only available in PyTorch 2.9+")


@@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
         "pass_config": {"enable_async_tp": async_tp_enabled},
     }

-    async_tp_env = tp_env = {
-        "VLLM_USE_V1": "1",
-    }
-
     async_tp_args = [
         *common_args,
         "--tensor-parallel-size",
@@ -410,6 +406,4 @@ def test_async_tp_pass_correctness(
         "mp",
     ]

-    compare_two_settings(
-        model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate"
-    )
+    compare_two_settings(model_id, async_tp_args, tp_args, method="generate")


@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest

-import vllm
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.utils import _is_torch_equal_or_newer
@@ -16,15 +15,10 @@ def test_version():
     assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")


-def test_use_cudagraphs_dynamic(monkeypatch):
-    assert vllm.envs.VLLM_USE_V1
+def test_use_cudagraphs_dynamic():
     vllm_config = VllmConfig()
     assert vllm_config.compilation_config.use_cudagraph

-    monkeypatch.setenv("VLLM_USE_V1", "0")
-    vllm_config = VllmConfig()
-    assert not vllm_config.compilation_config.use_cudagraph
-

 def test_custom_op():
     # proper syntax
@@ -41,8 +35,6 @@ def test_custom_op():
 # may be influenced by other tests.
 @pytest.mark.parametrize("val", ["1"])
 def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
-    assert vllm.envs.VLLM_USE_V1
-
     # Disable multiprocessing so that the counter is in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
     monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
@@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
 @pytest.mark.forked
 @pytest.mark.parametrize("enabled", [True, False])
 def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
-    assert vllm.envs.VLLM_USE_V1
-
     # Disable multiprocessing so that the counter is in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")


@@ -303,7 +303,6 @@ def test_attention_quant_pattern(
     model_class: type[AttentionQuantPatternModel],
     backend: _Backend,
     use_inductor_graph_partition: bool,
-    monkeypatch,
     dist_init,
     caplog_vllm,
 ):
@@ -312,8 +311,6 @@ def test_attention_quant_pattern(
     if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
         pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
     device = torch.device("cuda:0")
     torch.manual_seed(42)


@@ -8,16 +8,13 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.v1.engine.async_llm import AsyncLLM


-def test_mp_reducer(monkeypatch):
+def test_mp_reducer():
     """
     Test that _reduce_config reducer is registered when AsyncLLM is instantiated
     without transformers_modules. This is a regression test for
     https://github.com/vllm-project/vllm/pull/18640.
     """
-    # Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
     # Ensure transformers_modules is not in sys.modules
     if "transformers_modules" in sys.modules:
         del sys.modules["transformers_modules"]


@@ -5,7 +5,7 @@ from typing import Any, Optional

 import pytest

-from vllm import LLM, SamplingParams, envs
+from vllm import LLM, SamplingParams

 MODEL = "meta-llama/llama-2-7b-hf"
 MAX_TOKENS = 200
@@ -111,9 +111,7 @@ def _stop_token_id(llm):
 @pytest.mark.skip_global_cleanup
 def test_stop_strings():
-    # If V0, must set enforce_eager=False since we use
-    # async output processing below.
-    llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
+    llm = LLM(MODEL, enforce_eager=True)
     _stop_basic(llm)

     _stop_multi_tokens(llm)


@@ -42,24 +42,10 @@ class CPTestOptions(NamedTuple):
 @dataclass
 class CPTestSettings:
     parallel_setups: list[ParallelSetup]
-    # NOTE: the length of distributed_backends and
-    # vllm_major_versions should be the same, and they
-    # are first zipped together to iterate over all
-    # test settings.
     distributed_backends: list[str]
-    # vllm major version: "0" for V0, "1" for V1
-    vllm_major_versions: list[str]
     runner: RunnerOption
     test_options: CPTestOptions

-    def __post_init__(self):
-        if len(self.distributed_backends) != len(self.vllm_major_versions):
-            raise ValueError(
-                f"Length mismatch: distributed_backends "
-                f"({len(self.distributed_backends)}) != "
-                f"vllm_major_versions ({len(self.vllm_major_versions)})"
-            )
-
     @staticmethod
     def detailed(
         *,
@@ -87,7 +73,6 @@ class CPTestSettings:
         return CPTestSettings(
             parallel_setups=parallel_setups,
             distributed_backends=["mp"],
-            vllm_major_versions=["1"],
             runner=runner,
             test_options=CPTestOptions(
                 multi_node_only=multi_node_only, load_format=load_format
@@ -98,14 +83,11 @@ class CPTestSettings:
         opts = self.test_options

         for parallel_setup in self.parallel_setups:
-            for backend, vllm_major_version in zip(
-                self.distributed_backends, self.vllm_major_versions
-            ):
+            for backend in self.distributed_backends:
                 yield (
                     model_id,
                     parallel_setup,
                     backend,
-                    vllm_major_version,
                     self.runner,
                     opts,
                 )
@@ -115,7 +97,6 @@ def _compare_cp_with_tp(
     model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
-    vllm_major_version: str,
     runner: RunnerOption,
     test_options: CPTestOptions,
     num_gpus_available: int,
@@ -191,10 +172,6 @@ def _compare_cp_with_tp(
     if hf_overrides:
         common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

-    cp_env = tp_env = {
-        "VLLM_USE_V1": vllm_major_version,  # Note(hc): DCP only support V1 engine only
-    }
-
     cp_args = [
         *common_args,
         "--tensor-parallel-size",
@@ -217,24 +194,13 @@ def _compare_cp_with_tp(
         distributed_backend,
     ]

-    try:
-        compare_two_settings(
-            model_id,
-            cp_args,
-            tp_args,
-            cp_env,
-            tp_env,
-            method=method,
-            max_wait_seconds=720,
-        )
-    except Exception:
-        testing_ray_compiled_graph = cp_env is not None
-        if testing_ray_compiled_graph and vllm_major_version == "0":
-            # Ray Compiled Graph tests are flaky for V0,
-            # so we don't want to fail the test
-            logger.exception("Ray Compiled Graph tests failed")
-        else:
-            raise
+    compare_two_settings(
+        model_id,
+        cp_args,
+        tp_args,
+        method=method,
+        max_wait_seconds=720,
+    )


 CP_TEXT_GENERATION_MODELS = {
@@ -257,7 +223,6 @@ CP_TEST_MODELS = [
         "model_id",
         "parallel_setup",
         "distributed_backend",
-        "vllm_major_version",
         "runner",
         "test_options",
     ),
@@ -274,7 +239,6 @@ def test_cp_generation(
     model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
-    vllm_major_version: str,
     runner: RunnerOption,
     test_options: CPTestOptions,
     num_gpus_available,
@@ -283,7 +247,6 @@ def test_cp_generation(
         model_id,
         parallel_setup,
         distributed_backend,
-        vllm_major_version,
         runner,
         test_options,
         num_gpus_available,


@@ -307,7 +307,6 @@ def _compare_tp(
     if distributed_backend == "ray":
         # For V1, test Ray Compiled Graph for all the tests
         pp_env = {
-            "VLLM_USE_V1": "1",
             "VLLM_USE_RAY_COMPILED_DAG": "1",
             "VLLM_USE_RAY_SPMD_WORKER": "1",
             "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
@@ -316,15 +315,11 @@ def _compare_tp(
         # terminate because of a Ray Compiled Graph issue.
         common_args.append("--disable-frontend-multiprocessing")
     elif distributed_backend == "mp":
-        pp_env = {
-            "VLLM_USE_V1": "1",
-        }
+        pp_env = None
     else:
         pp_env = None

-    tp_env = {
-        "VLLM_USE_V1": "1",
-    }
+    tp_env = None

     pp_args = [
         *common_args,


@@ -42,24 +42,10 @@ class SPTestOptions(NamedTuple):
 @dataclass
 class SPTestSettings:
     parallel_setups: list[ParallelSetup]
-    # NOTE: the length of distributed_backends and
-    # vllm_major_versions should be the same, and they
-    # are first zipped together to iterate over all
-    # test settings.
     distributed_backends: list[str]
-    # vllm major version: "0" for V0, "1" for V1
-    vllm_major_versions: list[str]
     runner: RunnerOption
     test_options: SPTestOptions

-    def __post_init__(self):
-        if len(self.distributed_backends) != len(self.vllm_major_versions):
-            raise ValueError(
-                f"Length mismatch: distributed_backends "
-                f"({len(self.distributed_backends)}) != "
-                f"vllm_major_versions ({len(self.vllm_major_versions)})"
-            )
-
     @staticmethod
     def detailed(
         *,
@@ -85,7 +71,6 @@ class SPTestSettings:
         return SPTestSettings(
             parallel_setups=parallel_setups,
             distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
             runner=runner,
             test_options=SPTestOptions(
                 multi_node_only=multi_node_only, load_format=load_format
@@ -117,7 +102,6 @@ class SPTestSettings:
         return SPTestSettings(
             parallel_setups=parallel_setups,
             distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
             runner=runner,
             test_options=SPTestOptions(
                 multi_node_only=multi_node_only, load_format=load_format
@@ -147,7 +131,6 @@ class SPTestSettings:
         return SPTestSettings(
             parallel_setups=parallel_setups,
             distributed_backends=["mp", "ray"],
-            vllm_major_versions=["1", "1"],
             runner=runner,
             test_options=SPTestOptions(
                 multi_node_only=multi_node_only, load_format=load_format
@@ -158,14 +141,11 @@ class SPTestSettings:
         opts = self.test_options

         for parallel_setup in self.parallel_setups:
-            for backend, vllm_major_version in zip(
-                self.distributed_backends, self.vllm_major_versions
-            ):
+            for backend in self.distributed_backends:
                 yield (
                     model_id,
                     parallel_setup,
                     backend,
-                    vllm_major_version,
                     self.runner,
                     opts,
                 )
@@ -175,7 +155,6 @@ def _compare_sp(
     model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
-    vllm_major_version: str,
     runner: RunnerOption,
     test_options: SPTestOptions,
     num_gpus_available: int,
@@ -265,10 +244,6 @@ def _compare_sp(
         },
     }

-    tp_sp_env = tp_env = {
-        "VLLM_USE_V1": vllm_major_version,
-    }
-
     tp_sp_args = [
         *common_args,
         "--tensor-parallel-size",
@@ -281,9 +256,6 @@ def _compare_sp(
         json.dumps(compilation_config),
     ]

-    tp_env = {
-        "VLLM_USE_V1": vllm_major_version,
-    }
-
     tp_args = [
         *common_args,
         "--tensor-parallel-size",
@@ -292,18 +264,7 @@ def _compare_sp(
         "mp",
     ]

-    try:
-        compare_two_settings(
-            model_id, tp_sp_args, tp_args, tp_sp_env, tp_env, method=method
-        )
-    except Exception:
-        testing_ray_compiled_graph = tp_sp_env is not None
-        if testing_ray_compiled_graph and vllm_major_version == "0":
-            # Ray Compiled Graph tests are flaky for V0,
-            # so we don't want to fail the test
-            logger.exception("Ray Compiled Graph tests failed")
-        else:
-            raise
+    compare_two_settings(model_id, tp_sp_args, tp_args, method=method)


 SP_TEXT_GENERATION_MODELS = {
@@ -325,7 +286,6 @@ SP_TEST_MODELS = [
         "model_id",
         "parallel_setup",
         "distributed_backend",
-        "vllm_major_version",
         "runner",
         "test_options",
     ),
@@ -341,7 +301,6 @@ def test_tp_sp_generation(
     model_id: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
-    vllm_major_version: str,
     runner: RunnerOption,
     test_options: SPTestOptions,
     num_gpus_available,
@@ -350,7 +309,6 @@ def test_tp_sp_generation(
         model_id,
         parallel_setup,
         distributed_backend,
-        vllm_major_version,
        runner,
        test_options,
        num_gpus_available,


@@ -61,50 +61,34 @@ def run_test(model_name, more_args=None):
 TPU_TP_TEST_STR = ""  # "tensor_parallel_size=4"


-@pytest.mark.skipif(
-    not current_platform.is_cuda() and not current_platform.is_tpu(),
-    reason="V1 is currently only supported on CUDA and TPU",
-)
 @pytest.mark.parametrize("model", MODEL_NAMES)
-def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
+def test_lm_eval_accuracy_v1_engine(model):
     """Run with the V1 Engine."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
     more_args = None
     if current_platform.is_tpu():
         # Limit compilation time for TPU V1
         more_args = "max_model_len=2048,max_num_seqs=64"

     # Add TP test (if provided)
     if TPU_TP_TEST_STR:
         more_args += ",{}".format(TPU_TP_TEST_STR)

     run_test(model, more_args)


-@pytest.mark.skipif(
-    not current_platform.is_cuda() and not current_platform.is_tpu(),
-    reason="V1 is currently only supported on CUDA and TPU",
-)
 @pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
-def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
-    model, monkeypatch: pytest.MonkeyPatch
-):
+def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
     """Run with the V1 Engine."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
     more_args = None
     if current_platform.is_tpu():
         # Limit compilation time for TPU V1
         more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"

     # Add TP test (if provided)
     if TPU_TP_TEST_STR:
         more_args += ",{}".format(TPU_TP_TEST_STR)

     run_test(model, more_args)


@@ -10,7 +10,6 @@ AsyncLLMEngine are working correctly.
 """

 import lm_eval
-import pytest

 from vllm.platforms import current_platform
@@ -67,21 +66,13 @@ def run_test(more_args):
     ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"


-@pytest.mark.skipif(
-    not current_platform.is_cuda()
-    and not current_platform.is_tpu()
-    and not current_platform.is_xpu(),
-    reason="V1 currently only supported on CUDA, XPU and TPU",
-)
-def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
+def test_lm_eval_accuracy_v1_engine():
     """Run with the V1 Engine."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        more_args = []
+    more_args = []

     # Limit compilation time for V1
     if current_platform.is_tpu():
         more_args = ["--max-num-seqs", "64"]

     run_test(more_args)


@@ -21,18 +21,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 @pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
-@pytest.fixture(scope="module")
-def server(monkeypatch_module, zephyr_lora_files):  # noqa: F811
-    monkeypatch_module.setenv("VLLM_USE_V1", "1")
-
+def server(zephyr_lora_files):  # noqa: F811
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",


@@ -37,21 +37,8 @@ BADREQUEST_CASES = [
 ]


-@pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
 @pytest.fixture(scope="module", params=[True])
-def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files):
-    use_v1 = request.param
-    assert use_v1
-    monkeypatch_module.setenv("VLLM_USE_V1", "1")
-
+def server_with_lora_modules_json(request, zephyr_lora_files):
     # Define the json format LoRA module configurations
     lora_module_1 = {
         "name": "zephyr-lora",


@@ -22,24 +22,6 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 PREV_MINOR_VERSION = version._prev_minor_version()


-@pytest.fixture(scope="module", params=[True])
-def use_v1(request):
-    # Module-scoped variant of run_with_both_engines
-    #
-    # Use this fixture to run a test with both v0 and v1, and
-    # also to conditionalize the test logic e.g.
-    #
-    #   def test_metrics_exist(use_v1, server, client):
-    #       ...
-    #       expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
-    #       for metric in expected:
-    #           assert metric in response.text
-    #
-    # @skip_v1 wouldn't work here because this is a module-level
-    # fixture - per-function decorators would have no effect
-    yield request.param
-
-
 @pytest.fixture(scope="module")
 def default_server_args():
     return [
@@ -63,13 +45,11 @@ def default_server_args():
         f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
     ],
 )
-def server(use_v1, default_server_args, request):
+def server(default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
-    env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
-    with RemoteOpenAIServer(
-        MODEL_NAME, default_server_args, env_dict=env_dict
-    ) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
         yield remote_server
@@ -129,7 +109,8 @@ EXPECTED_VALUES = {
 @pytest.mark.asyncio
 async def test_metrics_counts(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
@@ -145,7 +126,7 @@ async def test_metrics_counts(
     # Loop over all expected metric_families
     for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
+        if (metric_family not in EXPECTED_METRICS_V1) or (
             not server.show_hidden_metrics
             and metric_family in HIDDEN_DEPRECATED_METRICS
         ):
@@ -183,62 +164,6 @@ async def test_metrics_counts(
     assert found_metric, f"Did not find {metric_family} in prom endpoint"


-EXPECTED_METRICS = [
-    "vllm:num_requests_running",
-    "vllm:num_requests_waiting",
-    "vllm:gpu_cache_usage_perc",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
-    "vllm:e2e_request_latency_seconds_sum",
-    "vllm:e2e_request_latency_seconds_bucket",
-    "vllm:e2e_request_latency_seconds_count",
-    "vllm:request_queue_time_seconds_sum",
-    "vllm:request_queue_time_seconds_bucket",
-    "vllm:request_queue_time_seconds_count",
-    "vllm:request_inference_time_seconds_sum",
-    "vllm:request_inference_time_seconds_bucket",
-    "vllm:request_inference_time_seconds_count",
-    "vllm:request_prefill_time_seconds_sum",
-    "vllm:request_prefill_time_seconds_bucket",
-    "vllm:request_prefill_time_seconds_count",
-    "vllm:request_decode_time_seconds_sum",
-    "vllm:request_decode_time_seconds_bucket",
-    "vllm:request_decode_time_seconds_count",
-    "vllm:request_prompt_tokens_sum",
-    "vllm:request_prompt_tokens_bucket",
-    "vllm:request_prompt_tokens_count",
-    "vllm:request_generation_tokens_sum",
-    "vllm:request_generation_tokens_bucket",
-    "vllm:request_generation_tokens_count",
-    "vllm:request_params_n_sum",
-    "vllm:request_params_n_bucket",
-    "vllm:request_params_n_count",
-    "vllm:request_params_max_tokens_sum",
-    "vllm:request_params_max_tokens_bucket",
-    "vllm:request_params_max_tokens_count",
-    "vllm:iteration_tokens_total",
-    "vllm:num_preemptions_total",
-    "vllm:prompt_tokens_total",
-    "vllm:generation_tokens_total",
-    "vllm:request_success_total",
-    "vllm:cache_config_info",
-    # labels in cache_config_info
-    "block_size",
-    "cache_dtype",
-    "cpu_offload_gb",
-    "enable_prefix_caching",
-    "gpu_memory_utilization",
-    "num_cpu_blocks",
-    "num_gpu_blocks",
-    "num_gpu_blocks_override",
-    "sliding_window",
-    "swap_space_bytes",
-]
-
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
@@ -304,17 +229,21 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
 @pytest.mark.asyncio
 async def test_metrics_exist(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
     # sending a request triggers the metrics to be logged.
     await client.completions.create(
-        model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
+        model=MODEL_NAME,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        temperature=0.0,
     )

     response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK

-    for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
+    for metric in EXPECTED_METRICS_V1:
         if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
             continue
         assert metric in response.text
@@ -322,10 +251,11 @@ async def test_metrics_exist(
 @pytest.mark.asyncio
 async def test_abort_metrics_reset(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server
     )

     # Expect no running requests or kvcache usage
@@ -351,7 +281,7 @@ async def test_abort_metrics_reset(
     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server
     )

     # Expect running requests and kvcache usage
@@ -371,7 +301,7 @@ async def test_abort_metrics_reset(
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1)
+        _get_running_metrics_from_api(server)
     )

     assert running_requests_after == 0, (
@@ -385,7 +315,7 @@ async def test_abort_metrics_reset(
     )


-def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
     """Return (running_count, waiting_count, kv_cache_usage)"""

     response = requests.get(server.url_for("metrics"))
@@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None

-    kv_cache_usage_metric = (
-        "vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
-    )
+    kv_cache_usage_metric = "vllm:kv_cache_usage_perc"

     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
@@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     return running_requests, waiting_requests, kv_cache_usage


-def test_metrics_exist_run_batch(use_v1: bool):
+def test_metrics_exist_run_batch():
     input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501

     base_url = "0.0.0.0"
@@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
             "--port",
             port,
         ],
-        env={"VLLM_USE_V1": "1"},
     )

     def is_server_up(url):


@@ -15,11 +15,6 @@ from vllm.entrypoints.renderer import BaseRenderer
 from ...utils import RemoteOpenAIServer


-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
-
 @pytest.mark.asyncio
 async def test_empty_prompt():
     model_name = "gpt2"


@@ -80,7 +80,6 @@ def test_env(
 ):
     """Test attention backend selection with valid device-backend pairs."""
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv(STR_BACKEND_ENV_VAR, name)
         m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
@@ -212,30 +211,21 @@ def test_env(
 @pytest.mark.parametrize("device", ["cpu", "cuda"])
-def test_fp32_fallback(
-    device: str,
-    monkeypatch: pytest.MonkeyPatch,
-):
+def test_fp32_fallback(device: str):
     """Test attention backend selection with fp32."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
     if device == "cpu":
         with patch("vllm.attention.selector.current_platform", CpuPlatform()):
             backend = get_attn_backend(16, torch.float32, None, 16)
             assert backend.get_name() == "TORCH_SDPA"

     elif device == "cuda":
         with patch("vllm.attention.selector.current_platform", CudaPlatform()):
             backend = get_attn_backend(16, torch.float32, None, 16)
             assert backend.get_name() == "FLEX_ATTENTION"


 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
-    # TODO: When testing for v1, pipe in `use_v1` as an argument to
-    # get_attn_backend

     pytest.skip(
         "Skipping as current backend selector does not "
         "handle fallbacks when a backend is set via env var."
@@ -289,7 +279,6 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
         monkeypatch.context() as m,
         patch("vllm.attention.selector.current_platform", CudaPlatform()),
     ):
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)

         # Should raise ValueError for invalid backend


@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     # Run with flex attention
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
         set_seed(seed)
@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     # Run with default backend
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         set_seed(seed)
         with vllm_runner(
             model_name,
@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
     # Run with flex attention
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
         with vllm_runner(
             model_name,
@@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
             flex_outputs = llm_flex.embed(prompts)

     # Run with default backend
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        with vllm_runner(
+    with (
+        monkeypatch.context() as m,
+        vllm_runner(
             model_name,
             runner="pooling",
             dtype=torch.bfloat16,
             tensor_parallel_size=1,
             max_model_len=100,
             enforce_eager=True,
-        ) as llm_default:
-            default_outputs = llm_default.embed(prompts)
+        ) as llm_default,
+    ):
+        default_outputs = llm_default.embed(prompts)

     check_embeddings_close(
         embeddings_0_lst=flex_outputs,


@@ -613,7 +613,6 @@ def test_dummy_maverick(
     profile: bool = False,
 ) -> None:
     # Disable multiprocessing allows us to access model executor from LLM engine
-    monkeypatch.setenv("VLLM_USE_V1", "1")
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

     model_path = create_reduced_maverick_model(


@@ -8,7 +8,6 @@ if TYPE_CHECKING:
     from vllm.config import VllmConfig
 else:
     VllmConfig = None
-from vllm import envs


 class DummyPlatform(Platform):
@@ -19,10 +18,7 @@ class DummyPlatform(Platform):
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        if envs.VLLM_USE_V1:
-            compilation_config = vllm_config.compilation_config
-            # Activate custom ops for v1.
-            compilation_config.custom_ops = ["all"]
+        vllm_config.compilation_config.custom_ops = ["all"]

     def get_attn_backend_cls(
         self,


@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):
 def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         # Explicitly turn off engine multiprocessing so
         # that the scheduler runs in this process
         m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")


@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
 from typing import Optional

-import pytest
 from transformers import AutoTokenizer

 from vllm import LLM, SamplingParams


-@pytest.fixture(autouse=True)
-def v1(monkeypatch):
-    """Only run on vLLM v1."""
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-
-
 def _generate(
     llm: LLM,
     prompt: str,


@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
 # 100 training iterations with a training batch size of 100.


-@pytest.fixture(scope="function", autouse=True)
-def use_v1_only(monkeypatch: pytest.MonkeyPatch):
-    """
-    Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
-    for all tests in this file
-    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-        yield
-
-
 def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
     return vllm.LLM(
         model="Qwen/Qwen2.5-3B-Instruct",


@@ -305,7 +305,6 @@ full_cg_backend_configs = {
     "CutlassMLA": BackendConfig(
         name="CutlassMLA",
         env_vars={
-            "VLLM_USE_V1": "1",
            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
            "FORCE_NUM_KV_SPLITS": "1",  # TODO: remove this when hang issue is fixed
        },


@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
 import torch

 from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
 from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups

+pytestmark = pytest.mark.cpu_test
+

 def new_kv_cache_spec():
     return FullAttentionSpec(16, 1, 1, torch.float32, False)


@@ -1,14 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-
 import pytest

 from vllm import LLM

-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)
-
 MODEL = "meta-llama/Llama-3.2-1B"
 PROMPT = "Hello my name is Robert and I"


@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
     ):
         pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")

-    env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
+    env_vars = backend_configs[backend_name].env_vars
     with temporary_environ(env_vars), ExitStack() as stack:
         if not supported:
@@ -117,7 +117,7 @@ combo_cases_2 = [
 def test_cudagraph_compilation_combo(combo_case):
     backend_name, cudagraph_mode, compilation_level, supported = combo_case

-    env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
+    env_vars = backend_configs[backend_name].env_vars
     with temporary_environ(env_vars), ExitStack() as stack:
         if not supported:


@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
     )

     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

         llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")


@@ -32,7 +32,7 @@ model_config = {
 @pytest.mark.parametrize("seed", [1])
 @pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
 def test_sliding_window_retrieval(
-    monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager
+    model, batch_size, seed, disable_hybrid_kv_cache_manager
 ):
     """
     The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
@@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
     If we tell it upfront which we are going to be looking for, then
     it answers correctly (mostly).
     """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        test_config = model_config[model]
+    test_config = model_config[model]

     llm = LLM(
         model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
     )
     sampling_params = SamplingParams(temperature=0.0, max_tokens=100)

-    prompts, answer, indices = prep_prompts(
-        batch_size, ln_range=test_config.ln_range
-    )
+    prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)

     check_length(prompts, llm, test_config.sliding_window)

     # Fresh generation
     responses = llm.generate(prompts, sampling_params)
     check_answers(
         indices,
         answer,
         [response.outputs[0].text for response in responses],
         accept_rate=1.0,
     )

     # Re-generate with the same prompts to test prefix caching
     responses = llm.generate(prompts, sampling_params)
     check_answers(
         indices,
         answer,
         [response.outputs[0].text for response in responses],
         accept_rate=1.0,
     )


 def check_length(prompts: list[str], llm: LLM, sliding_window: int):


@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
     )

     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
         # Make scheduling deterministic for reproducibility
         m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")


@@ -13,7 +13,6 @@ Covers:
 5) Multiple stop conditions
 """

-import os
 from typing import Optional, Union

 import pytest
@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
 @pytest.fixture(scope="module")
 def llm_v1():
     """Create V1 LLM instance for testing"""
-    # Ensure V1 engine is used
-    os.environ["VLLM_USE_V1"] = "1"
-
     llm = LLM(
         model=TEST_MODEL,
         tensor_parallel_size=1,
@@ -503,6 +499,6 @@ if __name__ == "__main__":
     Usage:
         cd vllm/
-        VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v
+        python -m pytest tests/v1/e2e/test_min_tokens.py -v
     """
     pytest.main([__file__, "-v"])


@@ -301,7 +301,6 @@ def test_mtp_correctness(
         model_setup: (method, model_name, tp_size)
     """
     with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_MLA_DISABLE", "1")

         method, model_name, tp_size = model_setup


@@ -95,17 +95,11 @@ async def generate(
 )
 @pytest.mark.asyncio
 async def test_load(
-    monkeypatch: pytest.MonkeyPatch,
     output_kind: RequestOutputKind,
     engine_args: AsyncEngineArgs,
     prompt: PromptType,
 ):
-    # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
-    # so that in the future when we switch, we don't have to change all the
-    # tests.
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
@@ -149,14 +143,11 @@ async def test_load(
 )
 @pytest.mark.asyncio
 async def test_abort(
-    monkeypatch: pytest.MonkeyPatch,
     output_kind: RequestOutputKind,
     engine_args: AsyncEngineArgs,
     prompt: PromptType,
 ):
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
@@ -222,13 +213,8 @@ async def test_abort(
     "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
 )
 @pytest.mark.asyncio
-async def test_multi_abort(
-    monkeypatch: pytest.MonkeyPatch,
-    output_kind: RequestOutputKind,
-):
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+async def test_multi_abort(output_kind: RequestOutputKind):
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
         after.callback(engine.shutdown)
@@ -304,14 +290,11 @@ async def test_multi_abort(
 )
 @pytest.mark.asyncio
 async def test_finished_flag(
-    monkeypatch: pytest.MonkeyPatch,
     n: int,
     engine_args: AsyncEngineArgs,
     prompt: PromptType,
 ):
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
@@ -341,12 +324,10 @@ async def test_finished_flag(
 )
 @pytest.mark.asyncio
 async def test_mid_stream_cancellation(
-    monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType
+    engine_args: AsyncEngineArgs, prompt: PromptType
 ):
     """Test that requests can be cancelled mid-stream."""
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(engine_args)
         after.callback(engine.shutdown)
@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
     be added to the default loggers.
     """
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(
                 TEXT_ENGINE_ARGS,
@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
 @pytest.mark.asyncio(scope="module")
-async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+async def test_dp_rank_argument():
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
         after.callback(engine.shutdown)
@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
 @pytest.mark.asyncio
-async def test_check_health(monkeypatch: pytest.MonkeyPatch):
+async def test_check_health():
     """Test that check_health returns normally for healthy engine
     and raises EngineDeadError when the engine is dead.
     """
@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
     from vllm.v1.engine.exceptions import EngineDeadError

-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
             engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
         after.callback(engine.shutdown)
@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
     "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
 )
 @pytest.mark.asyncio
-async def test_abort_final_output(
-    monkeypatch: pytest.MonkeyPatch,
-    output_kind: RequestOutputKind,
-):
+async def test_abort_final_output(output_kind: RequestOutputKind):
     """Test that abort() returns a final output with correct information."""
-    with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
-
+    with ExitStack() as after:
         with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS) engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown) after.callback(engine.shutdown)
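For reference, the teardown idiom these updated async tests share is plain contextlib.ExitStack; a minimal sketch only, with AsyncLLM and engine_args assumed to match the test module above:

    from contextlib import ExitStack

    async def _exercise_engine(engine_args):
        with ExitStack() as after:
            engine = AsyncLLM.from_engine_args(engine_args)
            # callback() registers shutdown to run when the block exits,
            # even if the assertions inside the block raise.
            after.callback(engine.shutdown)
            ...  # submit requests and assert on the outputs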

View File

@@ -5,18 +5,11 @@ from argparse import ArgumentError
import pytest import pytest
from vllm import envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
if not envs.VLLM_USE_V1:
pytest.skip(
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
allow_module_level=True,
)
def test_prefix_caching_from_cli(): def test_prefix_caching_from_cli():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
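As a rough usage sketch of the CLI round-trip these argument tests exercise (from_cli_args is an assumed helper here; the other names appear in the imports above):

    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    args = parser.parse_args(["--model", "facebook/opt-125m"])
    engine_args = EngineArgs.from_cli_args(args)  # assumed classmethod for parsed CLI args
    vllm_config: VllmConfig = engine_args.create_engine_config()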

View File

@@ -46,188 +46,184 @@ def make_request() -> EngineCoreRequest:
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_engine_core(monkeypatch: pytest.MonkeyPatch): def test_engine_core():
with monkeypatch.context() as m: """Setup the EngineCore."""
m.setenv("VLLM_USE_V1", "1") engine_args = EngineArgs(model=MODEL_NAME)
"""Setup the EngineCore.""" vllm_config = engine_args.create_engine_config()
engine_args = EngineArgs(model=MODEL_NAME) executor_class = Executor.get_class(vllm_config)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine_core = EngineCore( engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True vllm_config=vllm_config, executor_class=executor_class, log_stats=True
) )
"""Test basic request lifecycle.""" """Test basic request lifecycle."""
# First request. # First request.
engine_core.add_request(*engine_core.preprocess_add_request(make_request())) engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
_ = engine_core.step() _ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 1 assert len(engine_core.scheduler.running) == 1
# Second request. # Second request.
engine_core.add_request(*engine_core.preprocess_add_request(make_request())) engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 1 assert len(engine_core.scheduler.running) == 1
_ = engine_core.step() _ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2 assert len(engine_core.scheduler.running) == 2
# Add two requests in a row. # Add two requests in a row.
engine_core.add_request(*engine_core.preprocess_add_request(make_request())) engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
engine_core.add_request(*engine_core.preprocess_add_request(make_request())) engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
assert len(engine_core.scheduler.waiting) == 2 assert len(engine_core.scheduler.waiting) == 2
assert len(engine_core.scheduler.running) == 2 assert len(engine_core.scheduler.running) == 2
_ = engine_core.step() _ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 4 assert len(engine_core.scheduler.running) == 4
# Loop through until they are all done. # Loop through until they are all done.
while (outs := engine_core.step()[0].get(0)) and outs.outputs: while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass pass
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
"""Test abort cycle.""" """Test abort cycle."""
# Basic abort. # Basic abort.
req = make_request() req = make_request()
request_id = req.request_id request_id = req.request_id
engine_core.add_request(*engine_core.preprocess_add_request(req)) engine_core.add_request(*engine_core.preprocess_add_request(req))
assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
assert engine_core.scheduler.has_unfinished_requests() assert engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests() assert not engine_core.scheduler.has_finished_requests()
_ = engine_core.step() _ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 1 assert len(engine_core.scheduler.running) == 1
assert engine_core.scheduler.has_unfinished_requests() assert engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests() assert not engine_core.scheduler.has_finished_requests()
engine_core.abort_requests([request_id]) engine_core.abort_requests([request_id])
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
assert not engine_core.scheduler.has_unfinished_requests() assert not engine_core.scheduler.has_unfinished_requests()
assert engine_core.scheduler.has_finished_requests() assert engine_core.scheduler.has_finished_requests()
_ = engine_core.step() _ = engine_core.step()
assert not engine_core.scheduler.has_unfinished_requests() assert not engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests() assert not engine_core.scheduler.has_finished_requests()
# Add, step, abort 1 of the 3. # Add, step, abort 1 of the 3.
req0 = make_request() req0 = make_request()
req1 = make_request() req1 = make_request()
req2 = make_request() req2 = make_request()
engine_core.add_request(*engine_core.preprocess_add_request(req0)) engine_core.add_request(*engine_core.preprocess_add_request(req0))
engine_core.add_request(*engine_core.preprocess_add_request(req1)) engine_core.add_request(*engine_core.preprocess_add_request(req1))
assert len(engine_core.scheduler.waiting) == 2 assert len(engine_core.scheduler.waiting) == 2
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
_ = engine_core.step() _ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2 assert len(engine_core.scheduler.running) == 2
engine_core.add_request(*engine_core.preprocess_add_request(req2)) engine_core.add_request(*engine_core.preprocess_add_request(req2))
assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 2 assert len(engine_core.scheduler.running) == 2
_ = engine_core.step() _ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 3 assert len(engine_core.scheduler.running) == 3
# Abort just one. # Abort just one.
engine_core.abort_requests([req1.request_id]) engine_core.abort_requests([req1.request_id])
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2 assert len(engine_core.scheduler.running) == 2
_ = engine_core.step() _ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2 assert len(engine_core.scheduler.running) == 2
# Abort the other requests at the same time. # Abort the other requests at the same time.
engine_core.abort_requests([req2.request_id, req0.request_id]) engine_core.abort_requests([req2.request_id, req0.request_id])
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
# Sending duplicate requests with the same request_id # Sending duplicate requests with the same request_id
req0 = make_request() req0 = make_request()
req1 = make_request() req1 = make_request()
req0.request_id = req1.request_id = "test" req0.request_id = req1.request_id = "test"
engine_core.add_request(*engine_core.preprocess_add_request(req0)) engine_core.add_request(*engine_core.preprocess_add_request(req0))
while (outs := engine_core.step()[0].get(0)) and outs.outputs: while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass pass
engine_core.add_request(*engine_core.preprocess_add_request(req1)) engine_core.add_request(*engine_core.preprocess_add_request(req1))
while (outs := engine_core.step()[0].get(0)) and outs.outputs: while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass pass
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): def test_engine_core_advanced_sampling():
""" """
A basic end-to-end test to verify that the engine functions correctly A basic end-to-end test to verify that the engine functions correctly
when additional sampling parameters, such as top_p, min_tokens, and when additional sampling parameters, such as top_p, min_tokens, and
presence_penalty, are set. presence_penalty, are set.
""" """
with monkeypatch.context() as m: """Setup the EngineCore."""
m.setenv("VLLM_USE_V1", "1") engine_args = EngineArgs(model=MODEL_NAME)
"""Setup the EngineCore.""" vllm_config = engine_args.create_engine_config()
engine_args = EngineArgs(model=MODEL_NAME) executor_class = Executor.get_class(vllm_config)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine_core = EngineCore( engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
"""Test basic request lifecycle."""
# First request.
request: EngineCoreRequest = make_request()
request.sampling_params = SamplingParams(
min_tokens=4,
presence_penalty=1.0,
frequency_penalty=1.0,
repetition_penalty=0.1,
stop_token_ids=[1001, 1002],
) )
engine_core.add_request(*engine_core.preprocess_add_request(request)) """Test basic request lifecycle."""
# First request.
request: EngineCoreRequest = make_request()
request.sampling_params = SamplingParams(
min_tokens=4,
presence_penalty=1.0,
frequency_penalty=1.0,
repetition_penalty=0.1,
stop_token_ids=[1001, 1002],
)
engine_core.add_request(*engine_core.preprocess_add_request(request))
def _check_engine_state(): def _check_engine_state():
assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
# Loop through until they are all done. # Loop through until they are all done.
while (outs := engine_core.step()[0].get(0)) and outs.outputs: while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass pass
assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0 assert len(engine_core.scheduler.running) == 0
_check_engine_state() _check_engine_state()
# Second request. # Second request.
request2 = make_request() request2 = make_request()
request2.sampling_params = SamplingParams( request2.sampling_params = SamplingParams(
top_p=0.99, top_p=0.99,
top_k=50, top_k=50,
) )
engine_core.add_request(*engine_core.preprocess_add_request(request2)) engine_core.add_request(*engine_core.preprocess_add_request(request2))
_check_engine_state() _check_engine_state()
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): def test_engine_core_concurrent_batches():
""" """
Test that the engine can handle multiple concurrent batches. Test that the engine can handle multiple concurrent batches.
""" """
@@ -272,173 +268,163 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
if hasattr(self, "thread_pool"): if hasattr(self, "thread_pool"):
self.thread_pool.shutdown(wait=False) self.thread_pool.shutdown(wait=False)
with monkeypatch.context() as m: engine_args = EngineArgs(
m.setenv("VLLM_USE_V1", "1") model=MODEL_NAME,
# To test concurrent batches.
engine_args = EngineArgs( max_num_seqs=2,
model=MODEL_NAME, # Avoid all requests being scheduled at once.
# To test concurrent batches. enable_prefix_caching=False,
max_num_seqs=2, max_num_batched_tokens=10,
# Avoid all requests being scheduled at once. # Reduce startup time.
enable_prefix_caching=False, enforce_eager=True,
max_num_batched_tokens=10, )
# Reduce startup time. vllm_config = engine_args.create_engine_config()
enforce_eager=True, with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor
) )
vllm_config = engine_args.create_engine_config() assert engine_core.batch_queue is not None
with set_default_torch_num_threads(1):
engine_core = EngineCore( # Add two requests in a row. Each request has 12 prompt tokens.
vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor req0 = make_request_with_max_tokens("0", 5)
engine_core.add_request(*engine_core.preprocess_add_request(req0))
req1 = make_request_with_max_tokens("1", 5)
engine_core.add_request(*engine_core.preprocess_add_request(req1))
# Schedule Batch 1: (10, req0)
assert engine_core.step_with_batch_queue()[0] is None
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 10
# num_computed_tokens should have been updated immediately.
assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
# Schedule Batch 2: (2, req0), (8, req1)
assert engine_core.step_with_batch_queue()[0] == {}
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 2
assert scheduler_output.num_scheduled_tokens["1"] == 8
# num_computed_tokens should have been updated immediately.
assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
assert engine_core.scheduler.get_num_unfinished_requests() == 2
# Finish Batch 1 and schedule Batch 3: (4, req1).
# Note that req0 cannot be scheduled
# because it is in the decoding stage now.
engine_core.step_with_batch_queue()
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["1"] == 4
# Finish Batch 2. Get first token of req0.
# Schedule Batch 4: (1, req0).
output = engine_core.step_with_batch_queue()[0].get(0)
assert output is not None
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 1
# Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
output = engine_core.step_with_batch_queue()[0].get(0)
assert output is not None
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["1"] == 1
# Loop until req0 is finished.
req_id = 0
expected_num_tokens = [
engine_core.scheduler.requests["0"].num_tokens + 1,
engine_core.scheduler.requests["1"].num_tokens + 1,
]
while engine_core.scheduler.get_num_unfinished_requests() == 2:
output = engine_core.step_with_batch_queue()[0]
# Every step consumes an output.
assert output is not None
assert len(output[0].outputs) == 1
if req_id in engine_core.scheduler.requests:
assert (
engine_core.scheduler.requests[req_id].num_tokens
== expected_num_tokens[req_id]
) )
assert engine_core.batch_queue is not None expected_num_tokens[req_id] += 1
req_id = (req_id + 1) % 2
# Add two requests in a row. Each request has 12 prompt tokens.
req0 = make_request_with_max_tokens("0", 5)
engine_core.add_request(*engine_core.preprocess_add_request(req0))
req1 = make_request_with_max_tokens("1", 5)
engine_core.add_request(*engine_core.preprocess_add_request(req1))
# Schedule Batch 1: (10, req0)
assert engine_core.step_with_batch_queue()[0] is None
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 10
# num_computed_tokens should have been updated immediately.
assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
# Schedule Batch 2: (2, req0), (8, req1)
assert engine_core.step_with_batch_queue()[0] == {}
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 2
assert scheduler_output.num_scheduled_tokens["1"] == 8
# num_computed_tokens should have been updated immediately.
assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
assert engine_core.scheduler.get_num_unfinished_requests() == 2
# Finish Batch 1 and schedule Batch 3: (4, req1).
# Note that req0 cannot be scheduled
# because it is in the decoding stage now.
engine_core.step_with_batch_queue()
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["1"] == 4
# Finish Batch 2. Get first token of req0.
# Schedule Batch 4: (1, req0).
output = engine_core.step_with_batch_queue()[0].get(0)
assert output is not None
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 1
# Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
output = engine_core.step_with_batch_queue()[0].get(0)
assert output is not None
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["1"] == 1
# Loop until req0 is finished.
req_id = 0
expected_num_tokens = [
engine_core.scheduler.requests["0"].num_tokens + 1,
engine_core.scheduler.requests["1"].num_tokens + 1,
]
while engine_core.scheduler.get_num_unfinished_requests() == 2:
output = engine_core.step_with_batch_queue()[0]
# Every step consumes an output.
assert output is not None
assert len(output[0].outputs) == 1
if req_id in engine_core.scheduler.requests:
assert (
engine_core.scheduler.requests[req_id].num_tokens
== expected_num_tokens[req_id]
)
expected_num_tokens[req_id] += 1
req_id = (req_id + 1) % 2
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch): def test_engine_core_tp():
""" """
Test that the engine can initialize workers properly under tensor parallelism Test that the engine can initialize workers properly under tensor parallelism
""" """
with monkeypatch.context() as m: """Setup the EngineCore."""
m.setenv("VLLM_USE_V1", "1") engine_args = EngineArgs(
"""Setup the EngineCore.""" model=MODEL_NAME,
engine_args = EngineArgs( tensor_parallel_size=2,
model=MODEL_NAME, # Reduce startup time.
tensor_parallel_size=2, enforce_eager=True,
# Reduce startup time. )
enforce_eager=True, vllm_config = engine_args.create_engine_config()
) executor_class = Executor.get_class(vllm_config)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1): with set_default_torch_num_threads(1):
engine_core = EngineCore( engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
def get_worker_cache_config_field(worker, key: str):
return getattr(worker.cache_config, key)
num_gpu_blocks = engine_core.collective_rpc(
get_worker_cache_config_field, args=("num_gpu_blocks",)
) )
num_cpu_blocks = engine_core.collective_rpc(
get_worker_cache_config_field, args=("num_cpu_blocks",) def get_worker_cache_config_field(worker, key: str):
) return getattr(worker.cache_config, key)
assert all(x is not None for x in num_gpu_blocks)
assert all(x is not None for x in num_cpu_blocks) num_gpu_blocks = engine_core.collective_rpc(
get_worker_cache_config_field, args=("num_gpu_blocks",)
)
num_cpu_blocks = engine_core.collective_rpc(
get_worker_cache_config_field, args=("num_cpu_blocks",)
)
assert all(x is not None for x in num_gpu_blocks)
assert all(x is not None for x in num_cpu_blocks)
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch): def test_engine_core_invalid_request_id_type():
"""Test that engine raises TypeError for non-string request_id.""" """Test that engine raises TypeError for non-string request_id."""
with monkeypatch.context() as m: engine_args = EngineArgs(model=MODEL_NAME)
m.setenv("VLLM_USE_V1", "1") vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
engine_args = EngineArgs(model=MODEL_NAME) with set_default_torch_num_threads(1):
vllm_config = engine_args.create_engine_config() engine_core = EngineCore(
executor_class = Executor.get_class(vllm_config) vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
with set_default_torch_num_threads(1): # Test with UUID object (common mistake)
engine_core = EngineCore( uuid_request = make_request()
vllm_config=vllm_config, executor_class=executor_class, log_stats=True uuid_request.request_id = uuid.uuid4() # UUID object instead of string
)
# Test with UUID object (common mistake) with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"):
uuid_request = make_request() engine_core.add_request(*engine_core.preprocess_add_request(uuid_request))
uuid_request.request_id = uuid.uuid4() # UUID object instead of string
with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"): # Test with integer
engine_core.add_request(*engine_core.preprocess_add_request(uuid_request)) int_request = make_request()
int_request.request_id = 12345
# Test with integer with pytest.raises(TypeError, match="request_id must be a string, got.*int"):
int_request = make_request() engine_core.add_request(*engine_core.preprocess_add_request(int_request))
int_request.request_id = 12345
with pytest.raises(TypeError, match="request_id must be a string, got.*int"): # Test with None
engine_core.add_request(*engine_core.preprocess_add_request(int_request)) none_request = make_request()
none_request.request_id = None
# Test with None with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"):
none_request = make_request() engine_core.add_request(*engine_core.preprocess_add_request(none_request))
none_request.request_id = None
with pytest.raises( # Verify engine is still functional after errors
TypeError, match="request_id must be a string, got.*NoneType" valid_request = make_request()
): engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
engine_core.add_request(*engine_core.preprocess_add_request(none_request)) assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
# Verify engine is still functional after errors
valid_request = make_request()
engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
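Condensed, the setup these engine-core tests now share (no environment toggles) is roughly the sequence below; a sketch that simply restates the calls from the hunks above, with imports assumed from the test module:

    engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True)
    vllm_config = engine_args.create_engine_config()
    executor_class = Executor.get_class(vllm_config)
    with set_default_torch_num_threads(1):
        engine_core = EngineCore(
            vllm_config=vllm_config, executor_class=executor_class, log_stats=True
        )
    # drive the core directly, one scheduler step at a time
    engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
    while (outs := engine_core.step()[0].get(0)) and outs.outputs:
        pass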

View File

@@ -130,8 +130,6 @@ def test_engine_core_client(
monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test. # Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False) m.setattr(EngineCore, "echo", echo, raising=False)
@@ -218,8 +216,6 @@ def test_engine_core_client(
@pytest.mark.asyncio(loop_scope="function") @pytest.mark.asyncio(loop_scope="function")
async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test. # Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False) m.setattr(EngineCore, "echo", echo, raising=False)
@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
): ):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types. # Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures(
indirect=["publisher_config"], indirect=["publisher_config"],
) )
def test_kv_cache_events( def test_kv_cache_events(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool, multiprocessing_mode: bool,
publisher_config, publisher_config,
): ):
with monkeypatch.context() as m: block_size = 16
m.setenv("VLLM_USE_V1", "1") num_blocks = 2
block_size = 16
num_blocks = 2
engine_args = EngineArgs( engine_args = EngineArgs(
model=MODEL_NAME, model=MODEL_NAME,
enforce_eager=True, enforce_eager=True,
enable_prefix_caching=True, enable_prefix_caching=True,
block_size=block_size, block_size=block_size,
)
engine_args.kv_events_config = publisher_config
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode,
asyncio_mode=False,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
) )
engine_args.kv_events_config = publisher_config endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
subscriber = MockSubscriber(
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
)
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) try:
custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1)
request = make_request(sampling_params, custom_tokens)
client.add_request(request)
executor_class = Executor.get_class(vllm_config) outputs: dict[str, list] = {request.request_id: []}
with set_default_torch_num_threads(1): loop_until_done(client, outputs)
client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode, result = subscriber.receive_one(timeout=1000)
asyncio_mode=False, assert result is not None, "No message received"
vllm_config=vllm_config,
executor_class=executor_class, seq, received = result
log_stats=False,
) assert seq == 0, "Sequence number mismatch"
endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") assert len(received.events) == 1, "We should have exactly one BlockStored event"
subscriber = MockSubscriber( event = received.events[0]
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch assert isinstance(event, BlockStored), "We should have a BlockStored event"
assert len(event.block_hashes) == num_blocks, (
"We should have a BlockStored event with 2 block_hashes"
) )
assert event.block_size == block_size, (
try: "Block size should be the same as the block size"
custom_tokens = list(range(num_blocks * block_size)) )
sampling_params = SamplingParams(max_tokens=1) assert event.parent_block_hash is None, "Parent block hash should be None"
request = make_request(sampling_params, custom_tokens) assert event.lora_id is None, "Lora id should be None"
client.add_request(request) assert len(event.token_ids) == num_blocks * block_size, (
"Token ids should be the same as the custom tokens"
outputs: dict[str, list] = {request.request_id: []} )
loop_until_done(client, outputs) assert event.token_ids == custom_tokens, (
"Token ids should be the same as the custom tokens"
result = subscriber.receive_one(timeout=1000) )
assert result is not None, "No message received" finally:
client.shutdown()
seq, received = result subscriber.close()
assert seq == 0, "Sequence number mismatch"
assert len(received.events) == 1, (
"We should have exactly one BlockStored event"
)
event = received.events[0]
assert isinstance(event, BlockStored), "We should have a BlockStored event"
assert len(event.block_hashes) == num_blocks, (
"We should have a BlockStored event with 2 block_hashes"
)
assert event.block_size == block_size, (
"Block size should be the same as the block size"
)
assert event.parent_block_hash is None, "Parent block hash should be None"
assert event.lora_id is None, "Lora id should be None"
assert len(event.token_ids) == num_blocks * block_size, (
"Token ids should be the same as the custom tokens"
)
assert event.token_ids == custom_tokens, (
"Token ids should be the same as the custom tokens"
)
finally:
client.shutdown()
subscriber.close()
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -672,101 +657,96 @@ def test_kv_cache_events(
) )
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
async def test_kv_cache_events_dp( async def test_kv_cache_events_dp(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool, multiprocessing_mode: bool,
publisher_config, publisher_config,
): ):
with monkeypatch.context() as m: block_size = 16
m.setenv("VLLM_USE_V1", "1") num_blocks = 2
block_size = 16 dp_size = 2
num_blocks = 2 tp_size = 2
dp_size = 2
tp_size = 2
engine_args = EngineArgs( engine_args = EngineArgs(
model=MODEL_NAME, model=MODEL_NAME,
enforce_eager=True, enforce_eager=True,
enable_prefix_caching=True, enable_prefix_caching=True,
data_parallel_size=dp_size, data_parallel_size=dp_size,
tensor_parallel_size=tp_size, tensor_parallel_size=tp_size,
block_size=block_size, block_size=block_size,
)
engine_args.kv_events_config = publisher_config
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode,
asyncio_mode=True,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
) )
engine_args.kv_events_config = publisher_config await asyncio.sleep(1)
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT) # Build endpoints for all DP ranks
base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
endpoints = []
for i in range(dp_size):
offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
endpoints.append(offset_endpoint)
executor_class = Executor.get_class(vllm_config) subscriber = MockSubscriber(
with set_default_torch_num_threads(1): endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
client = EngineCoreClient.make_client( )
multiprocess_mode=multiprocessing_mode,
asyncio_mode=True,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
)
await asyncio.sleep(1)
# Build endpoints for all DP ranks try:
base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") custom_tokens = list(range(num_blocks * block_size))
endpoints = [] sampling_params = SamplingParams(max_tokens=1)
for i in range(dp_size): all_request_ids = []
offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
endpoints.append(offset_endpoint)
subscriber = MockSubscriber( # Create and add 25 requests
endpoints, topic=publisher_config.topic, decode_type=KVEventBatch # NOTE: attempts to force routing to both dp groups but can be flaky
for i in range(25):
await asyncio.sleep(0.01)
request = make_request(sampling_params, custom_tokens)
await client.add_request_async(request)
all_request_ids.append(request.request_id)
await asyncio.sleep(0.1)
# Initialize outputs dict for all requests
outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
print("processing requests...")
await asyncio.wait_for(
loop_until_fully_done_async(client, outputs), timeout=20.0
) )
try: # Receive from subscriber until no more messages
custom_tokens = list(range(num_blocks * block_size)) print("collecting results...")
sampling_params = SamplingParams(max_tokens=1) results = []
all_request_ids = [] while True:
result = subscriber.receive_one(timeout=1)
print(result)
if result is None:
break
results.append(result)
# Create and add 25 requests # Collect all events and data_parallel_ranks from all results
# NOTE: attempts to force routing to both dp groups but can be flaky all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
for i in range(25): unique_dps = set(all_dp_ranks)
await asyncio.sleep(0.01) assert len(unique_dps) == 2, (
request = make_request(sampling_params, custom_tokens) f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
await client.add_request_async(request) )
all_request_ids.append(request.request_id)
await asyncio.sleep(0.1) finally:
client.shutdown()
# Initialize outputs dict for all requests subscriber.close()
outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
print("processing requests...")
await asyncio.wait_for(
loop_until_fully_done_async(client, outputs), timeout=20.0
)
# Receive from subscriber until no more messages
print("collecting results...")
results = []
while True:
result = subscriber.receive_one(timeout=1)
print(result)
if result is None:
break
results.append(result)
# Collect all events and data_parallel_ranks from all results
all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
unique_dps = set(all_dp_ranks)
assert len(unique_dps) == 2, (
f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
)
finally:
client.shutdown()
subscriber.close()
@pytest.mark.timeout(20) @pytest.mark.timeout(20)
def test_startup_failure(monkeypatch: pytest.MonkeyPatch): def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m, pytest.raises(Exception) as e_info: with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch to extract core process pid while it's starting. # Monkey-patch to extract core process pid while it's starting.
core_proc_pid = [None] core_proc_pid = [None]
cepm_ctor = CoreEngineProcManager.__init__ cepm_ctor = CoreEngineProcManager.__init__
@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
mock_executor_class.side_effect = create_mock_executor mock_executor_class.side_effect = create_mock_executor
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices
from vllm.v1.engine.utils import EngineZmqAddresses from vllm.v1.engine.utils import EngineZmqAddresses
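The client tests keep monkeypatch only for patching test hooks and unrelated settings; a minimal illustration of that remaining pattern, using pytest's MonkeyPatch API and the names from the hunks above:

    def test_client_sketch(monkeypatch: pytest.MonkeyPatch):
        with monkeypatch.context() as m:
            # raising=False lets the test attach a method that does not
            # exist on EngineCore yet.
            m.setattr(EngineCore, "echo", echo, raising=False)
            m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
            ...  # build the client and exercise the patched utility method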

View File

@@ -21,12 +21,10 @@ DTYPE = "half"
def _vllm_model( def _vllm_model(
apc: bool, apc: bool,
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
*, *,
skip_tokenizer_init: bool = False, skip_tokenizer_init: bool = False,
): ):
"""Set up VllmRunner instance.""" """Set up VllmRunner instance."""
monkeypatch.setenv("VLLM_USE_V1", "1")
return vllm_runner( return vllm_runner(
MODEL, MODEL,
dtype=DTYPE, dtype=DTYPE,
@@ -45,16 +43,16 @@ def _vllm_model(
# Prefix caching # Prefix caching
params=[False, True], params=[False, True],
) )
def vllm_model(vllm_runner, request, monkeypatch): def vllm_model(vllm_runner, request):
"""VllmRunner test fixture parameterized by APC True/False.""" """VllmRunner test fixture parameterized by APC True/False."""
with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model: with _vllm_model(request.param, vllm_runner) as vllm_model:
yield vllm_model yield vllm_model
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def vllm_model_apc(vllm_runner, monkeypatch): def vllm_model_apc(vllm_runner):
"""VllmRunner test fixture with APC.""" """VllmRunner test fixture with APC."""
with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model: with _vllm_model(True, vllm_runner) as vllm_model:
yield vllm_model yield vllm_model
@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
# Prefix caching # Prefix caching
params=[False, True], params=[False, True],
) )
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch): def vllm_model_skip_tokenizer_init(vllm_runner, request):
"""VllmRunner test fixture with APC.""" """VllmRunner test fixture with APC."""
with _vllm_model( with _vllm_model(
request.param, request.param,
vllm_runner, vllm_runner,
monkeypatch,
skip_tokenizer_init=True, skip_tokenizer_init=True,
) as vllm_model: ) as vllm_model:
yield vllm_model yield vllm_model
@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
) )
def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): def test_engine_metrics(vllm_runner, example_prompts):
max_tokens = 100 max_tokens = 100
# Use spec decoding to test num_accepted_tokens_per_pos # Use spec decoding to test num_accepted_tokens_per_pos
speculative_config = { speculative_config = {
@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
"prompt_lookup_min": 3, "prompt_lookup_min": 3,
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
} }
monkeypatch.setenv("VLLM_USE_V1", "1")
with vllm_runner( with vllm_runner(
MODEL, MODEL,
speculative_config=speculative_config, speculative_config=speculative_config,
@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"]) @pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch): def test_skip_tokenizer_initialization(model: str):
monkeypatch.setenv("VLLM_USE_V1", "1")
# This test checks if the flag skip_tokenizer_init skips the initialization # This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain # of tokenizer and detokenizer. The generated output is expected to contain
# token ids. # token ids.

View File

@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE, PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
) )
def test_structured_output( def test_structured_output(
monkeypatch: pytest.MonkeyPatch,
sample_json_schema: dict[str, Any], sample_json_schema: dict[str, Any],
unsupported_json_schema: dict[str, Any], unsupported_json_schema: dict[str, Any],
sample_sql_ebnf: str, sample_sql_ebnf: str,
@@ -115,8 +114,6 @@ def test_structured_output(
model_name: str, model_name: str,
speculative_config: dict[str, Any], speculative_config: dict[str, Any],
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
if current_platform.is_tpu() and speculative_config: if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding") pytest.skip("TPU does not support speculative decoding")
@@ -620,15 +617,12 @@ Make the response as short as possible.
], ],
) )
def test_structured_output_with_reasoning_matrices( def test_structured_output_with_reasoning_matrices(
monkeypatch: pytest.MonkeyPatch,
backend: str, backend: str,
tokenizer_mode: TokenizerMode, tokenizer_mode: TokenizerMode,
reasoning_parser: str, reasoning_parser: str,
model_name: str, model_name: str,
speculative_config: dict[str, Any] | None, speculative_config: dict[str, Any] | None,
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
if current_platform.is_tpu() and speculative_config: if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding") pytest.skip("TPU does not support speculative decoding")
@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
def test_structured_output_auto_mode( def test_structured_output_auto_mode(
monkeypatch: pytest.MonkeyPatch,
unsupported_json_schema: dict[str, Any], unsupported_json_schema: dict[str, Any],
model_name: str, model_name: str,
tokenizer_mode: str, tokenizer_mode: str,
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model=model_name, model=model_name,
max_model_len=1024, max_model_len=1024,
@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): def test_guidance_no_additional_properties():
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM( llm = LLM(
model="Qwen/Qwen2.5-1.5B-Instruct", model="Qwen/Qwen2.5-1.5B-Instruct",
max_model_len=1024, max_model_len=1024,
@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"]) @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
def test_structured_output_batched_with_non_structured_outputs_requests( def test_structured_output_batched_with_non_structured_outputs_requests(
monkeypatch: pytest.MonkeyPatch,
sample_json_schema: dict[str, Any], sample_json_schema: dict[str, Any],
backend: str, backend: str,
): ):
monkeypatch.setenv("VLLM_USE_V1", "1")
# Don't use eager execution on TPUs because we want to test for no # Don't use eager execution on TPUs because we want to test for no
# recompilation at runtime # recompilation at runtime
enforce_eager = bool(not current_platform.is_tpu()) enforce_eager = bool(not current_platform.is_tpu())

View File

@@ -53,7 +53,6 @@ cleanup() {
launch_baseline() { launch_baseline() {
BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME}; BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
@@ -73,7 +72,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
@@ -93,7 +91,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \

View File

@@ -55,7 +55,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \ VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \ VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
@@ -75,7 +74,6 @@ launch_pd() {
UCX_TLS=tcp \ UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \ VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \ VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \ PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \ VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \

View File

@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
import ray import ray
@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger
@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch):
"""
The change relies on V1 APIs, so set VLLM_USE_V1=1.
"""
monkeypatch.setenv("VLLM_USE_V1", "1")
MODELS = [ MODELS = [
"distilbert/distilgpt2", "distilbert/distilgpt2",
] ]
@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
@ray.remote(num_gpus=1) @ray.remote(num_gpus=1)
class EngineTestActor: class EngineTestActor:
async def run(self): async def run(self):
# Set environment variable inside the Ray actor since environment
# variables from pytest fixtures don't propagate to Ray actors
os.environ["VLLM_USE_V1"] = "1"
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
) )

View File

@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
batch_logprobs_composition: BatchLogprobsComposition, batch_logprobs_composition: BatchLogprobsComposition,
temperature: float, temperature: float,
example_prompts: list[str], example_prompts: list[str],
monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
"""Test V1 Engine logprobs & prompt logprobs """Test V1 Engine logprobs & prompt logprobs
@@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter temperature: "temperature" sampling parameter
example_prompts: example prompt fixture example_prompts: example prompt fixture
""" """
with monkeypatch.context() as m: do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
m.setenv("VLLM_USE_V1", "1") if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching # Skip some test-cases to save time.
if do_apc and ( pytest.skip()
temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT test_prompts = example_prompts
):
# Skip some test-cases to save time.
pytest.skip()
test_prompts = example_prompts
max_tokens = 5 max_tokens = 5
hf_outputs = hf_model.generate_greedy( hf_outputs = hf_model.generate_greedy(
test_prompts, test_prompts,
max_tokens=max_tokens,
)
hf_logprobs = hf_model.generate_greedy_logprobs(
test_prompts,
max_tokens=max_tokens,
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list = _repeat_logprob_config(
test_prompts, logprob_prompt_logprob_list
)
# Generate SamplingParams
vllm_sampling_params = [
SamplingParams(
max_tokens=max_tokens, max_tokens=max_tokens,
logprobs=num_lp,
prompt_logprobs=num_plp,
temperature=temperature,
seed=1984,
) )
hf_logprobs = hf_model.generate_greedy_logprobs( for num_lp, num_plp in logprob_prompt_logprob_list
test_prompts, ]
for _ in range(2 if do_apc else 1):
_run_and_validate(
vllm_model=vllm_model,
test_prompts=test_prompts,
vllm_sampling_params=vllm_sampling_params,
hf_logprobs=hf_logprobs,
hf_outputs=hf_outputs,
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
temperature=temperature,
max_tokens=max_tokens, max_tokens=max_tokens,
do_apc=do_apc,
) )
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing def test_max_logprobs():
logprob_prompt_logprob_list = _repeat_logprob_config(
test_prompts, logprob_prompt_logprob_list
)
# Generate SamplingParams
vllm_sampling_params = [
SamplingParams(
max_tokens=max_tokens,
logprobs=num_lp,
prompt_logprobs=num_plp,
temperature=temperature,
seed=1984,
)
for num_lp, num_plp in logprob_prompt_logprob_list
]
for _ in range(2 if do_apc else 1):
_run_and_validate(
vllm_model=vllm_model,
test_prompts=test_prompts,
vllm_sampling_params=vllm_sampling_params,
hf_logprobs=hf_logprobs,
hf_outputs=hf_outputs,
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
temperature=temperature,
max_tokens=max_tokens,
do_apc=do_apc,
)
def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs` """vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs` Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation. APC should not matter as this test checks basic request validation.
""" """
with monkeypatch.context() as m: runner = VllmRunner(
m.setenv("VLLM_USE_V1", "1") "facebook/opt-125m",
max_logprobs=1,
enable_prefix_caching=False,
# 2 other llms alive during whole session
gpu_memory_utilization=0.15,
max_model_len=256,
)
vllm_sampling_params = SamplingParams(logprobs=1)
# should pass
runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
runner = VllmRunner( bad_sampling_params = SamplingParams(logprobs=2)
"facebook/opt-125m", with pytest.raises(ValueError):
max_logprobs=1, runner.generate(["Hello world"], sampling_params=bad_sampling_params)
enable_prefix_caching=False,
# 2 other llms alive during whole session
gpu_memory_utilization=0.15,
max_model_len=256,
)
vllm_sampling_params = SamplingParams(logprobs=1)
# should pass
runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
bad_sampling_params = SamplingParams(logprobs=2)
with pytest.raises(ValueError):
runner.generate(["Hello world"], sampling_params=bad_sampling_params)

-def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_none_logprobs(vllm_model, example_prompts):
    """Engine should return `logprobs` and `prompt_logprobs` as `None`
    Args:
      vllm_model: vLLM model fixture
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    max_tokens = 5

    sampling_params_logprobs_none = SamplingParams(
        max_tokens=max_tokens,
        logprobs=None,
        prompt_logprobs=None,
        temperature=0.0,
    )
    results_logprobs_none = vllm_model.llm.generate(
        example_prompts,
        sampling_params=sampling_params_logprobs_none,
    )

    for i in range(len(results_logprobs_none)):
        # Check sample logprobs are None
        assert results_logprobs_none[i].outputs[0].logprobs is None
        assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
        # Check prompt logprobs are None
        assert results_logprobs_none[i].prompt_logprobs is None

-def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_zero_logprobs(vllm_model, example_prompts):
    """Engine should return sampled token and prompt token logprobs
    Args:
      vllm_model: vLLM model fixture
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    max_tokens = 5

    sampling_params_logprobs_zero = SamplingParams(
        max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
    )
    results_logprobs_zero = vllm_model.llm.generate(
        example_prompts, sampling_params=sampling_params_logprobs_zero
    )

    for i in range(len(results_logprobs_zero)):
        # Check that there is one sample logprob dict for each
        # sample token
        logprobs = results_logprobs_zero[i].outputs[0].logprobs
        prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
        sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
        prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
        assert logprobs is not None
        assert len(sampled_token_ids) == len(logprobs)
        assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None
        # Check that there is one prompt logprob dict for each
        # prompt token
        assert prompt_logprobs is not None
        assert len(prompt_token_ids) == len(prompt_logprobs)

-def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
+def test_all_logprobs(example_prompts):
    """Engine should return all vocabulary logprobs and prompt logprobs
    Args:
      example_prompts: list of example prompts (test fixture)
    """
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    runner = VllmRunner(
        "facebook/opt-125m",
        max_logprobs=-1,
        enable_prefix_caching=False,
        # 2 other llms alive during whole session
        gpu_memory_utilization=0.15,
        max_model_len=256,
    )

    sampling_params_logprobs_all = SamplingParams(
        max_tokens=5, logprobs=-1, prompt_logprobs=-1
    )
    results_logprobs_all = runner.llm.generate(
        example_prompts, sampling_params=sampling_params_logprobs_all
    )
    vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()

    for i in range(len(results_logprobs_all)):
        logprobs = results_logprobs_all[i].outputs[0].logprobs
        prompt_logprobs = results_logprobs_all[i].prompt_logprobs
        assert logprobs is not None
        for logprob in logprobs:
            assert len(logprob) == vocab_size
        assert prompt_logprobs is not None
        assert prompt_logprobs[0] is None
        for prompt_logprob in prompt_logprobs[1:]:
            assert len(prompt_logprob) == vocab_size

@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
-def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch):
+def test_logprobs_mode(logprobs_mode: LogprobsMode):
    """Test the LLM engine with different logprobs_mode settings.
    For logprobs, we should have non-positive values.
    For logits, we should expect at least one positive value.
    """
    from vllm import LLM

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    llm = LLM(
        "facebook/opt-125m",
        max_logprobs=5,
        enable_prefix_caching=False,
        # 2 other llms alive during whole session
        gpu_memory_utilization=0.05,
        max_model_len=16,
        logprobs_mode=logprobs_mode,
    )
    vllm_sampling_params = SamplingParams(logprobs=1)
    results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)

    total_token_with_logprobs = 0
    positive_values = 0
    for output in results[0].outputs:
        for logprobs in output.logprobs:
            for token_id in logprobs:
                logprob = logprobs[token_id]
                if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
                    assert logprob.logprob <= 0
                if logprob.logprob > 0:
                    positive_values = positive_values + 1
            total_token_with_logprobs = total_token_with_logprobs + 1
    assert total_token_with_logprobs >= len(results[0].outputs)
    if logprobs_mode in ("raw_logits", "processed_logits"):
        assert positive_values > 0
    del llm
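
(Aside, not part of the commit: a minimal sketch of why the `*_logprobs` modes can only ever see non-positive values while the `*_logits` modes may see positive ones. Log-softmax maps arbitrary logits to log-probabilities, which are at most 0; the sketch assumes only `torch`.)

import torch

# Arbitrary raw logits: positive values are perfectly legal here.
logits = torch.tensor([2.5, -0.3, 1.1])

# log_softmax converts them to log-probabilities; every probability lies in
# (0, 1], so every log-probability is <= 0.
logprobs = torch.log_softmax(logits, dim=-1)

assert (logprobs <= 0).all()  # what raw_logprobs / processed_logprobs rely on
assert (logits > 0).any()     # what raw_logits / processed_logits may observe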

View File

@@ -1,14 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import os
-
import pytest

from vllm import LLM, SamplingParams

-if os.getenv("VLLM_USE_V1", "0") != "1":
-    pytest.skip("Test package requires V1", allow_module_level=True)

MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I"
@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
        _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))


-def test_priority(llm):
-    """Check that we reject requests with priority."""
-    # Reject all allowed token ids
-    with pytest.raises(ValueError):
-        _ = llm.generate(PROMPT, priority=[1])
-
-
def test_seed(llm):
    """Check that seed impacts randomness."""

View File

@@ -38,7 +38,6 @@ def test_eagle_max_len(
    monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
):
    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)

        if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():

View File

@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
@pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
def test_basic(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    model: str,
    max_tokens: int,
    tensor_parallel_size: int,
@@ -55,23 +54,20 @@ def test_basic(
    )
    example_prompts = [prompt]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    with vllm_runner(
        model,
        # Note: max_num_batched_tokens == 1024 is needed here to
        # actually test chunked prompt
        max_num_batched_tokens=1024,
        max_model_len=8192,
        gpu_memory_utilization=0.7,
        max_num_seqs=max_num_seqs,
        tensor_parallel_size=tensor_parallel_size,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        output = vllm_outputs[0][1]
        assert "1024" in output or "0, 1" in output
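
(Aside, not part of the commit: the in-code note above is the key constraint — the prompt must be longer than max_num_batched_tokens for prefill to be split across scheduler steps. A rough sketch with purely illustrative numbers, not taken from the test:)

import math

prompt_len = 8192              # illustrative; anything > max_num_batched_tokens works
max_num_batched_tokens = 1024  # the value pinned in the test above

# Each scheduler step can prefill at most max_num_batched_tokens prompt tokens,
# so a long prompt is processed in several chunks.
prefill_steps = math.ceil(prompt_len / max_num_batched_tokens)
assert prefill_steps == 8      # more than one step, i.e. the prompt is actually chunked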

@pytest.mark.skip(reason="Temporarily disabled due to timeout")
@@ -82,7 +78,6 @@ def test_basic(
@pytest.mark.parametrize("max_num_seqs", [16])
def test_phi3(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    max_tokens: int,
    max_num_seqs: int,
) -> None:
@@ -99,18 +94,15 @@ def test_phi3(
    # test head dim = 96
    model = "microsoft/Phi-3-mini-128k-instruct"

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    with vllm_runner(
        model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
        # vllm_outputs is a list of tuples whose first element is the token id
        # and the second element is the output (including the prompt).
        for output, answer in zip(vllm_outputs, answers):
            generated_text = output[1]
            assert answer in generated_text


TP_SIZE_8 = 8
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
)
def test_gemma3_27b_with_text_input_and_tp(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
) -> None:
    model = "google/gemma-3-27b-it"
    max_tokens = 16
@@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
        " but in rising every time we fall.",
    ]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    with vllm_runner(
        model,
        max_num_batched_tokens=256,
        max_num_seqs=max_num_seqs,
        tensor_parallel_size=tensor_parallel_size,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
        # vllm_outputs is a list of tuples whose first element is the token id
        # and the second element is the output (including the prompt).
        for output, answer in zip(vllm_outputs, answers):
            generated_text = output[1]
            assert answer in generated_text


@pytest.mark.skipif(
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
)
def test_w8a8_quantization(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
) -> None:
    model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
    max_tokens = 5
@@ -176,18 +163,15 @@ def test_w8a8_quantization(
    )
    example_prompts = [prompt]

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    with vllm_runner(
        model,
        max_num_batched_tokens=64,
        max_model_len=4096,
        gpu_memory_utilization=0.7,
        max_num_seqs=max_num_seqs,
        tensor_parallel_size=tensor_parallel_size,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        output = vllm_outputs[0][1]
        assert "1024" in output or "0, 1" in output

View File

@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
@pytest.mark.parametrize("params", TEST_PARAMS)
def test_perf(
    vllm_runner: type[VllmRunner],
-    monkeypatch: pytest.MonkeyPatch,
    params: TestParams,
) -> None:
    tokenizer = get_tokenizer(
@@ -107,48 +106,45 @@ def test_perf(
        )
    )

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
    sampling_params = SamplingParams(
        max_tokens=params.decode_len, temperature=1.0, min_p=0.0
    )

    with vllm_runner(
        params.model,
        max_num_batched_tokens=MAX_MODEL_LEN,
        max_model_len=MAX_MODEL_LEN,
        max_num_seqs=MAX_NUM_SEQS,
        gpu_memory_utilization=GPU_UTIL,
        enforce_eager=False,
        tensor_parallel_size=1,
    ) as vllm_model:
        print(" -- Warmup / Compile")
        for i in range(NUM_WARMUPS):
            _ = vllm_model.generate(prompts, sampling_params)

        print(" -- Benchmarking... ")
        times = []
        for i in range(NUM_RUNS):
            start_time = time.time()
            _ = vllm_model.generate(prompts, sampling_params)
            times.append(time.time() - start_time)

        avg_time = sum(times) / len(times)

        print(" -- avg_time = {}".format(avg_time))
        print(
            " -- expected_avg_time = {} with err_tol = {}".format(
                params.expected_avg_time, params.err_tol
            )
        )
        diff = avg_time - params.expected_avg_time
        ok = diff < params.err_tol
        if diff < -params.err_tol:
            print(
                " !! WARNING !! Performance has improved by {}, "
                "it may be necessary to fine-tune the "
                "expected_avg_time = {}".format(-diff, params.expected_avg_time)
            )

        assert ok, " !! ERROR !! Regression detected"

View File

@@ -82,7 +82,7 @@ def test_traces(
):
    with monkeypatch.context() as m:
        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
-        m.setenv("VLLM_USE_V1", "1")
        sampling_params = SamplingParams(
            temperature=0.01,
            top_p=0.1,

View File

@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
        logger.info("Warming up model for the compilation...")
        # Only generate graph for the generic shape
        with _set_global_compilation_settings(self.vllm_config):
-            self._dummy_run(max(16, self.max_num_reqs))
+            self._dummy_run(
+                min(
+                    max(16, self.max_num_reqs),
+                    self.scheduler_config.max_num_batched_tokens,
+                )
+            )
        logger.info("Warming up done.")

    def _init_device_properties(self) -> None:
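
(Aside, not part of the commit: the new expression clamps the CPU warm-up batch between a floor of 16 requests and the scheduler's token budget. A small sketch with a hypothetical helper name and made-up numbers:)

def warmup_batch_size(max_num_reqs: int, max_num_batched_tokens: int) -> int:
    # Same expression as above: at least 16, but never more than the
    # max_num_batched_tokens budget from the scheduler config.
    return min(max(16, max_num_reqs), max_num_batched_tokens)

assert warmup_batch_size(4, 8192) == 16     # small runner still warms up 16 requests
assert warmup_batch_size(1024, 256) == 256  # budget now caps the old max(16, 1024)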