[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -296,6 +296,7 @@ steps:
|
|||||||
- tests/v1
|
- tests/v1
|
||||||
commands:
|
commands:
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
|
- pytest -v -s -m 'not cpu_test' v1/core
|
||||||
- pytest -v -s v1/executor
|
- pytest -v -s v1/executor
|
||||||
- pytest -v -s v1/kv_offload
|
- pytest -v -s v1/kv_offload
|
||||||
- pytest -v -s v1/sample
|
- pytest -v -s v1/sample
|
||||||
@@ -317,7 +318,7 @@ steps:
|
|||||||
no_gpu: true
|
no_gpu: true
|
||||||
commands:
|
commands:
|
||||||
# split the test to avoid interference
|
# split the test to avoid interference
|
||||||
- pytest -v -s v1/core
|
- pytest -v -s -m 'cpu_test' v1/core
|
||||||
- pytest -v -s v1/structured_output
|
- pytest -v -s v1/structured_output
|
||||||
- pytest -v -s v1/test_serial_utils.py
|
- pytest -v -s v1/test_serial_utils.py
|
||||||
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
|
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
|
from vllm.v1.engine.llm_engine import LLMEngine
|
||||||
|
|
||||||
from ..conftest import HfRunner, VllmRunner
|
from ..conftest import HfRunner, VllmRunner
|
||||||
from ..models.utils import check_outputs_equal
|
from ..models.utils import check_outputs_equal
|
||||||
@@ -211,16 +211,11 @@ def test_models_distributed(
|
|||||||
|
|
||||||
|
|
||||||
def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
|
def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
|
||||||
from vllm.envs import VLLM_USE_V1
|
|
||||||
|
|
||||||
if not VLLM_USE_V1:
|
|
||||||
pytest.skip("Skipping V0 test, dump input not supported")
|
|
||||||
|
|
||||||
# Needed to mock an error in the same process
|
# Needed to mock an error in the same process
|
||||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
|
|
||||||
with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
|
with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
|
||||||
if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
|
if isinstance(vllm_model.llm.llm_engine, LLMEngine):
|
||||||
v1_test_failed_model_execution(vllm_model)
|
v1_test_failed_model_execution(vllm_model)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -117,18 +117,15 @@ def test_cumem_with_cudagraph():
|
|||||||
|
|
||||||
@create_new_process_for_each_test()
|
@create_new_process_for_each_test()
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"model, use_v1",
|
"model",
|
||||||
[
|
[
|
||||||
# sleep mode with safetensors
|
# sleep mode with safetensors
|
||||||
("meta-llama/Llama-3.2-1B", True),
|
"meta-llama/Llama-3.2-1B",
|
||||||
# sleep mode with pytorch checkpoint
|
# sleep mode with pytorch checkpoint
|
||||||
("facebook/opt-125m", True),
|
"facebook/opt-125m",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
|
def test_end_to_end(model: str):
|
||||||
with monkeypatch.context() as m:
|
|
||||||
assert use_v1
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
free, total = torch.cuda.mem_get_info()
|
free, total = torch.cuda.mem_get_info()
|
||||||
used_bytes_baseline = total - free # in case other process is running
|
used_bytes_baseline = total - free # in case other process is running
|
||||||
llm = LLM(model, enable_sleep_mode=True)
|
llm = LLM(model, enable_sleep_mode=True)
|
||||||
@@ -151,10 +148,7 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
|
|||||||
# therefore high memory usage after `llm.sleep` is called is expected.
|
# therefore high memory usage after `llm.sleep` is called is expected.
|
||||||
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
|
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
|
||||||
# in V1.
|
# in V1.
|
||||||
if use_v1:
|
|
||||||
assert used_bytes < 7 * GiB_bytes
|
assert used_bytes < 7 * GiB_bytes
|
||||||
else:
|
|
||||||
assert used_bytes < 2 * GiB_bytes
|
|
||||||
|
|
||||||
llm.wake_up()
|
llm.wake_up()
|
||||||
output2 = llm.generate(prompt, sampling_params)
|
output2 = llm.generate(prompt, sampling_params)
|
||||||
@@ -168,10 +162,7 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
|
|||||||
used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
|
used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
|
||||||
|
|
||||||
# should just reallocate memory for weights (1B model, ~2GiB weights)
|
# should just reallocate memory for weights (1B model, ~2GiB weights)
|
||||||
if use_v1:
|
|
||||||
assert used_bytes < 10 * GiB_bytes
|
assert used_bytes < 10 * GiB_bytes
|
||||||
else:
|
|
||||||
assert used_bytes < 6 * GiB_bytes
|
|
||||||
|
|
||||||
# now allocate kv cache memory
|
# now allocate kv cache memory
|
||||||
llm.wake_up(tags=["kv_cache"])
|
llm.wake_up(tags=["kv_cache"])
|
||||||
|
|||||||
@@ -66,7 +66,6 @@ def llm_pair(request):
|
|||||||
pytest.skip("Only Blackwell GPUs support Cutlass MLA")
|
pytest.skip("Only Blackwell GPUs support Cutlass MLA")
|
||||||
|
|
||||||
env_vars = {
|
env_vars = {
|
||||||
"VLLM_USE_V1": "1",
|
|
||||||
# Force native sampler to avoid potential nondeterminism in FlashInfer
|
# Force native sampler to avoid potential nondeterminism in FlashInfer
|
||||||
# when per-request generators are not used in V1.
|
# when per-request generators are not used in V1.
|
||||||
"VLLM_USE_FLASHINFER_SAMPLER": "0",
|
"VLLM_USE_FLASHINFER_SAMPLER": "0",
|
||||||
@@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
|
|||||||
with (
|
with (
|
||||||
temporary_environ(
|
temporary_environ(
|
||||||
{
|
{
|
||||||
"VLLM_USE_V1": "1",
|
|
||||||
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
|
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
|
||||||
# Flex_Attention is not supported with full cuda graph
|
# Flex_Attention is not supported with full cuda graph
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ from vllm.config import (
|
|||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
)
|
)
|
||||||
from vllm.envs import VLLM_USE_V1
|
|
||||||
from vllm.forward_context import BatchDescriptor, set_forward_context
|
from vllm.forward_context import BatchDescriptor, set_forward_context
|
||||||
from vllm.utils import is_torch_equal_or_newer
|
from vllm.utils import is_torch_equal_or_newer
|
||||||
|
|
||||||
@@ -127,7 +126,6 @@ def _run_simple_model(
|
|||||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
@pytest.mark.parametrize("use_inductor", [True, False])
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def test_simple_piecewise_compile(use_inductor):
|
def test_simple_piecewise_compile(use_inductor):
|
||||||
assert VLLM_USE_V1
|
|
||||||
_run_simple_model(
|
_run_simple_model(
|
||||||
splitting_ops=["silly.attention"],
|
splitting_ops=["silly.attention"],
|
||||||
use_inductor_graph_partition=False,
|
use_inductor_graph_partition=False,
|
||||||
@@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
|
|||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
|
@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
|
||||||
def test_simple_inductor_graph_partition(splitting_ops):
|
def test_simple_inductor_graph_partition(splitting_ops):
|
||||||
assert VLLM_USE_V1
|
|
||||||
if not is_torch_equal_or_newer("2.9.0.dev"):
|
if not is_torch_equal_or_newer("2.9.0.dev"):
|
||||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||||
|
|
||||||
|
|||||||
@@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
|
|||||||
"pass_config": {"enable_async_tp": async_tp_enabled},
|
"pass_config": {"enable_async_tp": async_tp_enabled},
|
||||||
}
|
}
|
||||||
|
|
||||||
async_tp_env = tp_env = {
|
|
||||||
"VLLM_USE_V1": "1",
|
|
||||||
}
|
|
||||||
|
|
||||||
async_tp_args = [
|
async_tp_args = [
|
||||||
*common_args,
|
*common_args,
|
||||||
"--tensor-parallel-size",
|
"--tensor-parallel-size",
|
||||||
@@ -410,6 +406,4 @@ def test_async_tp_pass_correctness(
|
|||||||
"mp",
|
"mp",
|
||||||
]
|
]
|
||||||
|
|
||||||
compare_two_settings(
|
compare_two_settings(model_id, async_tp_args, tp_args, method="generate")
|
||||||
model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate"
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import vllm
|
|
||||||
from vllm.compilation.counter import compilation_counter
|
from vllm.compilation.counter import compilation_counter
|
||||||
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
|
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
|
||||||
from vllm.utils import _is_torch_equal_or_newer
|
from vllm.utils import _is_torch_equal_or_newer
|
||||||
@@ -16,15 +15,10 @@ def test_version():
|
|||||||
assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")
|
assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")
|
||||||
|
|
||||||
|
|
||||||
def test_use_cudagraphs_dynamic(monkeypatch):
|
def test_use_cudagraphs_dynamic():
|
||||||
assert vllm.envs.VLLM_USE_V1
|
|
||||||
vllm_config = VllmConfig()
|
vllm_config = VllmConfig()
|
||||||
assert vllm_config.compilation_config.use_cudagraph
|
assert vllm_config.compilation_config.use_cudagraph
|
||||||
|
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
|
||||||
vllm_config = VllmConfig()
|
|
||||||
assert not vllm_config.compilation_config.use_cudagraph
|
|
||||||
|
|
||||||
|
|
||||||
def test_custom_op():
|
def test_custom_op():
|
||||||
# proper syntax
|
# proper syntax
|
||||||
@@ -41,8 +35,6 @@ def test_custom_op():
|
|||||||
# may be influenced by other tests.
|
# may be influenced by other tests.
|
||||||
@pytest.mark.parametrize("val", ["1"])
|
@pytest.mark.parametrize("val", ["1"])
|
||||||
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
|
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
|
||||||
assert vllm.envs.VLLM_USE_V1
|
|
||||||
|
|
||||||
# Disable multiprocessing so that the counter is in the same process
|
# Disable multiprocessing so that the counter is in the same process
|
||||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
|
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
|
||||||
@@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
|
|||||||
@pytest.mark.forked
|
@pytest.mark.forked
|
||||||
@pytest.mark.parametrize("enabled", [True, False])
|
@pytest.mark.parametrize("enabled", [True, False])
|
||||||
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
|
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
|
||||||
assert vllm.envs.VLLM_USE_V1
|
|
||||||
|
|
||||||
# Disable multiprocessing so that the counter is in the same process
|
# Disable multiprocessing so that the counter is in the same process
|
||||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
|
|
||||||
|
|||||||
@@ -303,7 +303,6 @@ def test_attention_quant_pattern(
|
|||||||
model_class: type[AttentionQuantPatternModel],
|
model_class: type[AttentionQuantPatternModel],
|
||||||
backend: _Backend,
|
backend: _Backend,
|
||||||
use_inductor_graph_partition: bool,
|
use_inductor_graph_partition: bool,
|
||||||
monkeypatch,
|
|
||||||
dist_init,
|
dist_init,
|
||||||
caplog_vllm,
|
caplog_vllm,
|
||||||
):
|
):
|
||||||
@@ -312,8 +311,6 @@ def test_attention_quant_pattern(
|
|||||||
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
|
||||||
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
|
||||||
|
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
device = torch.device("cuda:0")
|
device = torch.device("cuda:0")
|
||||||
torch.manual_seed(42)
|
torch.manual_seed(42)
|
||||||
|
|
||||||
|
|||||||
@@ -8,16 +8,13 @@ from vllm.engine.arg_utils import AsyncEngineArgs
|
|||||||
from vllm.v1.engine.async_llm import AsyncLLM
|
from vllm.v1.engine.async_llm import AsyncLLM
|
||||||
|
|
||||||
|
|
||||||
def test_mp_reducer(monkeypatch):
|
def test_mp_reducer():
|
||||||
"""
|
"""
|
||||||
Test that _reduce_config reducer is registered when AsyncLLM is instantiated
|
Test that _reduce_config reducer is registered when AsyncLLM is instantiated
|
||||||
without transformers_modules. This is a regression test for
|
without transformers_modules. This is a regression test for
|
||||||
https://github.com/vllm-project/vllm/pull/18640.
|
https://github.com/vllm-project/vllm/pull/18640.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
|
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
# Ensure transformers_modules is not in sys.modules
|
# Ensure transformers_modules is not in sys.modules
|
||||||
if "transformers_modules" in sys.modules:
|
if "transformers_modules" in sys.modules:
|
||||||
del sys.modules["transformers_modules"]
|
del sys.modules["transformers_modules"]
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from typing import Any, Optional
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams, envs
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
MODEL = "meta-llama/llama-2-7b-hf"
|
MODEL = "meta-llama/llama-2-7b-hf"
|
||||||
MAX_TOKENS = 200
|
MAX_TOKENS = 200
|
||||||
@@ -111,9 +111,7 @@ def _stop_token_id(llm):
|
|||||||
|
|
||||||
@pytest.mark.skip_global_cleanup
|
@pytest.mark.skip_global_cleanup
|
||||||
def test_stop_strings():
|
def test_stop_strings():
|
||||||
# If V0, must set enforce_eager=False since we use
|
llm = LLM(MODEL, enforce_eager=True)
|
||||||
# async output processing below.
|
|
||||||
llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
|
|
||||||
|
|
||||||
_stop_basic(llm)
|
_stop_basic(llm)
|
||||||
_stop_multi_tokens(llm)
|
_stop_multi_tokens(llm)
|
||||||
|
|||||||
@@ -42,24 +42,10 @@ class CPTestOptions(NamedTuple):
|
|||||||
@dataclass
|
@dataclass
|
||||||
class CPTestSettings:
|
class CPTestSettings:
|
||||||
parallel_setups: list[ParallelSetup]
|
parallel_setups: list[ParallelSetup]
|
||||||
# NOTE: the length of distributed_backends and
|
|
||||||
# vllm_major_versions should be the same, and they
|
|
||||||
# are first zipped together to iterate over all
|
|
||||||
# test settings.
|
|
||||||
distributed_backends: list[str]
|
distributed_backends: list[str]
|
||||||
# vllm major version: "0" for V0, "1" for V1
|
|
||||||
vllm_major_versions: list[str]
|
|
||||||
runner: RunnerOption
|
runner: RunnerOption
|
||||||
test_options: CPTestOptions
|
test_options: CPTestOptions
|
||||||
|
|
||||||
def __post_init__(self):
|
|
||||||
if len(self.distributed_backends) != len(self.vllm_major_versions):
|
|
||||||
raise ValueError(
|
|
||||||
f"Length mismatch: distributed_backends "
|
|
||||||
f"({len(self.distributed_backends)}) != "
|
|
||||||
f"vllm_major_versions ({len(self.vllm_major_versions)})"
|
|
||||||
)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def detailed(
|
def detailed(
|
||||||
*,
|
*,
|
||||||
@@ -87,7 +73,6 @@ class CPTestSettings:
|
|||||||
return CPTestSettings(
|
return CPTestSettings(
|
||||||
parallel_setups=parallel_setups,
|
parallel_setups=parallel_setups,
|
||||||
distributed_backends=["mp"],
|
distributed_backends=["mp"],
|
||||||
vllm_major_versions=["1"],
|
|
||||||
runner=runner,
|
runner=runner,
|
||||||
test_options=CPTestOptions(
|
test_options=CPTestOptions(
|
||||||
multi_node_only=multi_node_only, load_format=load_format
|
multi_node_only=multi_node_only, load_format=load_format
|
||||||
@@ -98,14 +83,11 @@ class CPTestSettings:
|
|||||||
opts = self.test_options
|
opts = self.test_options
|
||||||
|
|
||||||
for parallel_setup in self.parallel_setups:
|
for parallel_setup in self.parallel_setups:
|
||||||
for backend, vllm_major_version in zip(
|
for backend in self.distributed_backends:
|
||||||
self.distributed_backends, self.vllm_major_versions
|
|
||||||
):
|
|
||||||
yield (
|
yield (
|
||||||
model_id,
|
model_id,
|
||||||
parallel_setup,
|
parallel_setup,
|
||||||
backend,
|
backend,
|
||||||
vllm_major_version,
|
|
||||||
self.runner,
|
self.runner,
|
||||||
opts,
|
opts,
|
||||||
)
|
)
|
||||||
@@ -115,7 +97,6 @@ def _compare_cp_with_tp(
|
|||||||
model_id: str,
|
model_id: str,
|
||||||
parallel_setup: ParallelSetup,
|
parallel_setup: ParallelSetup,
|
||||||
distributed_backend: str,
|
distributed_backend: str,
|
||||||
vllm_major_version: str,
|
|
||||||
runner: RunnerOption,
|
runner: RunnerOption,
|
||||||
test_options: CPTestOptions,
|
test_options: CPTestOptions,
|
||||||
num_gpus_available: int,
|
num_gpus_available: int,
|
||||||
@@ -191,10 +172,6 @@ def _compare_cp_with_tp(
|
|||||||
if hf_overrides:
|
if hf_overrides:
|
||||||
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||||
|
|
||||||
cp_env = tp_env = {
|
|
||||||
"VLLM_USE_V1": vllm_major_version, # Note(hc): DCP only support V1 engine only
|
|
||||||
}
|
|
||||||
|
|
||||||
cp_args = [
|
cp_args = [
|
||||||
*common_args,
|
*common_args,
|
||||||
"--tensor-parallel-size",
|
"--tensor-parallel-size",
|
||||||
@@ -217,24 +194,13 @@ def _compare_cp_with_tp(
|
|||||||
distributed_backend,
|
distributed_backend,
|
||||||
]
|
]
|
||||||
|
|
||||||
try:
|
|
||||||
compare_two_settings(
|
compare_two_settings(
|
||||||
model_id,
|
model_id,
|
||||||
cp_args,
|
cp_args,
|
||||||
tp_args,
|
tp_args,
|
||||||
cp_env,
|
|
||||||
tp_env,
|
|
||||||
method=method,
|
method=method,
|
||||||
max_wait_seconds=720,
|
max_wait_seconds=720,
|
||||||
)
|
)
|
||||||
except Exception:
|
|
||||||
testing_ray_compiled_graph = cp_env is not None
|
|
||||||
if testing_ray_compiled_graph and vllm_major_version == "0":
|
|
||||||
# Ray Compiled Graph tests are flaky for V0,
|
|
||||||
# so we don't want to fail the test
|
|
||||||
logger.exception("Ray Compiled Graph tests failed")
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
CP_TEXT_GENERATION_MODELS = {
|
CP_TEXT_GENERATION_MODELS = {
|
||||||
@@ -257,7 +223,6 @@ CP_TEST_MODELS = [
|
|||||||
"model_id",
|
"model_id",
|
||||||
"parallel_setup",
|
"parallel_setup",
|
||||||
"distributed_backend",
|
"distributed_backend",
|
||||||
"vllm_major_version",
|
|
||||||
"runner",
|
"runner",
|
||||||
"test_options",
|
"test_options",
|
||||||
),
|
),
|
||||||
@@ -274,7 +239,6 @@ def test_cp_generation(
|
|||||||
model_id: str,
|
model_id: str,
|
||||||
parallel_setup: ParallelSetup,
|
parallel_setup: ParallelSetup,
|
||||||
distributed_backend: str,
|
distributed_backend: str,
|
||||||
vllm_major_version: str,
|
|
||||||
runner: RunnerOption,
|
runner: RunnerOption,
|
||||||
test_options: CPTestOptions,
|
test_options: CPTestOptions,
|
||||||
num_gpus_available,
|
num_gpus_available,
|
||||||
@@ -283,7 +247,6 @@ def test_cp_generation(
|
|||||||
model_id,
|
model_id,
|
||||||
parallel_setup,
|
parallel_setup,
|
||||||
distributed_backend,
|
distributed_backend,
|
||||||
vllm_major_version,
|
|
||||||
runner,
|
runner,
|
||||||
test_options,
|
test_options,
|
||||||
num_gpus_available,
|
num_gpus_available,
|
||||||
|
|||||||
@@ -307,7 +307,6 @@ def _compare_tp(
|
|||||||
if distributed_backend == "ray":
|
if distributed_backend == "ray":
|
||||||
# For V1, test Ray Compiled Graph for all the tests
|
# For V1, test Ray Compiled Graph for all the tests
|
||||||
pp_env = {
|
pp_env = {
|
||||||
"VLLM_USE_V1": "1",
|
|
||||||
"VLLM_USE_RAY_COMPILED_DAG": "1",
|
"VLLM_USE_RAY_COMPILED_DAG": "1",
|
||||||
"VLLM_USE_RAY_SPMD_WORKER": "1",
|
"VLLM_USE_RAY_SPMD_WORKER": "1",
|
||||||
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
|
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
|
||||||
@@ -316,15 +315,11 @@ def _compare_tp(
|
|||||||
# terminate because of a Ray Compiled Graph issue.
|
# terminate because of a Ray Compiled Graph issue.
|
||||||
common_args.append("--disable-frontend-multiprocessing")
|
common_args.append("--disable-frontend-multiprocessing")
|
||||||
elif distributed_backend == "mp":
|
elif distributed_backend == "mp":
|
||||||
pp_env = {
|
pp_env = None
|
||||||
"VLLM_USE_V1": "1",
|
|
||||||
}
|
|
||||||
else:
|
else:
|
||||||
pp_env = None
|
pp_env = None
|
||||||
|
|
||||||
tp_env = {
|
tp_env = None
|
||||||
"VLLM_USE_V1": "1",
|
|
||||||
}
|
|
||||||
|
|
||||||
pp_args = [
|
pp_args = [
|
||||||
*common_args,
|
*common_args,
|
||||||
|
|||||||
@@ -42,24 +42,10 @@ class SPTestOptions(NamedTuple):
|
|||||||
@dataclass
|
@dataclass
|
||||||
class SPTestSettings:
|
class SPTestSettings:
|
||||||
parallel_setups: list[ParallelSetup]
|
parallel_setups: list[ParallelSetup]
|
||||||
# NOTE: the length of distributed_backends and
|
|
||||||
# vllm_major_versions should be the same, and they
|
|
||||||
# are first zipped together to iterate over all
|
|
||||||
# test settings.
|
|
||||||
distributed_backends: list[str]
|
distributed_backends: list[str]
|
||||||
# vllm major version: "0" for V0, "1" for V1
|
|
||||||
vllm_major_versions: list[str]
|
|
||||||
runner: RunnerOption
|
runner: RunnerOption
|
||||||
test_options: SPTestOptions
|
test_options: SPTestOptions
|
||||||
|
|
||||||
def __post_init__(self):
|
|
||||||
if len(self.distributed_backends) != len(self.vllm_major_versions):
|
|
||||||
raise ValueError(
|
|
||||||
f"Length mismatch: distributed_backends "
|
|
||||||
f"({len(self.distributed_backends)}) != "
|
|
||||||
f"vllm_major_versions ({len(self.vllm_major_versions)})"
|
|
||||||
)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def detailed(
|
def detailed(
|
||||||
*,
|
*,
|
||||||
@@ -85,7 +71,6 @@ class SPTestSettings:
|
|||||||
return SPTestSettings(
|
return SPTestSettings(
|
||||||
parallel_setups=parallel_setups,
|
parallel_setups=parallel_setups,
|
||||||
distributed_backends=["mp", "ray"],
|
distributed_backends=["mp", "ray"],
|
||||||
vllm_major_versions=["1", "1"],
|
|
||||||
runner=runner,
|
runner=runner,
|
||||||
test_options=SPTestOptions(
|
test_options=SPTestOptions(
|
||||||
multi_node_only=multi_node_only, load_format=load_format
|
multi_node_only=multi_node_only, load_format=load_format
|
||||||
@@ -117,7 +102,6 @@ class SPTestSettings:
|
|||||||
return SPTestSettings(
|
return SPTestSettings(
|
||||||
parallel_setups=parallel_setups,
|
parallel_setups=parallel_setups,
|
||||||
distributed_backends=["mp", "ray"],
|
distributed_backends=["mp", "ray"],
|
||||||
vllm_major_versions=["1", "1"],
|
|
||||||
runner=runner,
|
runner=runner,
|
||||||
test_options=SPTestOptions(
|
test_options=SPTestOptions(
|
||||||
multi_node_only=multi_node_only, load_format=load_format
|
multi_node_only=multi_node_only, load_format=load_format
|
||||||
@@ -147,7 +131,6 @@ class SPTestSettings:
|
|||||||
return SPTestSettings(
|
return SPTestSettings(
|
||||||
parallel_setups=parallel_setups,
|
parallel_setups=parallel_setups,
|
||||||
distributed_backends=["mp", "ray"],
|
distributed_backends=["mp", "ray"],
|
||||||
vllm_major_versions=["1", "1"],
|
|
||||||
runner=runner,
|
runner=runner,
|
||||||
test_options=SPTestOptions(
|
test_options=SPTestOptions(
|
||||||
multi_node_only=multi_node_only, load_format=load_format
|
multi_node_only=multi_node_only, load_format=load_format
|
||||||
@@ -158,14 +141,11 @@ class SPTestSettings:
|
|||||||
opts = self.test_options
|
opts = self.test_options
|
||||||
|
|
||||||
for parallel_setup in self.parallel_setups:
|
for parallel_setup in self.parallel_setups:
|
||||||
for backend, vllm_major_version in zip(
|
for backend in self.distributed_backends:
|
||||||
self.distributed_backends, self.vllm_major_versions
|
|
||||||
):
|
|
||||||
yield (
|
yield (
|
||||||
model_id,
|
model_id,
|
||||||
parallel_setup,
|
parallel_setup,
|
||||||
backend,
|
backend,
|
||||||
vllm_major_version,
|
|
||||||
self.runner,
|
self.runner,
|
||||||
opts,
|
opts,
|
||||||
)
|
)
|
||||||
@@ -175,7 +155,6 @@ def _compare_sp(
|
|||||||
model_id: str,
|
model_id: str,
|
||||||
parallel_setup: ParallelSetup,
|
parallel_setup: ParallelSetup,
|
||||||
distributed_backend: str,
|
distributed_backend: str,
|
||||||
vllm_major_version: str,
|
|
||||||
runner: RunnerOption,
|
runner: RunnerOption,
|
||||||
test_options: SPTestOptions,
|
test_options: SPTestOptions,
|
||||||
num_gpus_available: int,
|
num_gpus_available: int,
|
||||||
@@ -265,10 +244,6 @@ def _compare_sp(
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
tp_sp_env = tp_env = {
|
|
||||||
"VLLM_USE_V1": vllm_major_version,
|
|
||||||
}
|
|
||||||
|
|
||||||
tp_sp_args = [
|
tp_sp_args = [
|
||||||
*common_args,
|
*common_args,
|
||||||
"--tensor-parallel-size",
|
"--tensor-parallel-size",
|
||||||
@@ -281,9 +256,6 @@ def _compare_sp(
|
|||||||
json.dumps(compilation_config),
|
json.dumps(compilation_config),
|
||||||
]
|
]
|
||||||
|
|
||||||
tp_env = {
|
|
||||||
"VLLM_USE_V1": vllm_major_version,
|
|
||||||
}
|
|
||||||
tp_args = [
|
tp_args = [
|
||||||
*common_args,
|
*common_args,
|
||||||
"--tensor-parallel-size",
|
"--tensor-parallel-size",
|
||||||
@@ -292,18 +264,7 @@ def _compare_sp(
|
|||||||
"mp",
|
"mp",
|
||||||
]
|
]
|
||||||
|
|
||||||
try:
|
compare_two_settings(model_id, tp_sp_args, tp_args, method=method)
|
||||||
compare_two_settings(
|
|
||||||
model_id, tp_sp_args, tp_args, tp_sp_env, tp_env, method=method
|
|
||||||
)
|
|
||||||
except Exception:
|
|
||||||
testing_ray_compiled_graph = tp_sp_env is not None
|
|
||||||
if testing_ray_compiled_graph and vllm_major_version == "0":
|
|
||||||
# Ray Compiled Graph tests are flaky for V0,
|
|
||||||
# so we don't want to fail the test
|
|
||||||
logger.exception("Ray Compiled Graph tests failed")
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
SP_TEXT_GENERATION_MODELS = {
|
SP_TEXT_GENERATION_MODELS = {
|
||||||
@@ -325,7 +286,6 @@ SP_TEST_MODELS = [
|
|||||||
"model_id",
|
"model_id",
|
||||||
"parallel_setup",
|
"parallel_setup",
|
||||||
"distributed_backend",
|
"distributed_backend",
|
||||||
"vllm_major_version",
|
|
||||||
"runner",
|
"runner",
|
||||||
"test_options",
|
"test_options",
|
||||||
),
|
),
|
||||||
@@ -341,7 +301,6 @@ def test_tp_sp_generation(
|
|||||||
model_id: str,
|
model_id: str,
|
||||||
parallel_setup: ParallelSetup,
|
parallel_setup: ParallelSetup,
|
||||||
distributed_backend: str,
|
distributed_backend: str,
|
||||||
vllm_major_version: str,
|
|
||||||
runner: RunnerOption,
|
runner: RunnerOption,
|
||||||
test_options: SPTestOptions,
|
test_options: SPTestOptions,
|
||||||
num_gpus_available,
|
num_gpus_available,
|
||||||
@@ -350,7 +309,6 @@ def test_tp_sp_generation(
|
|||||||
model_id,
|
model_id,
|
||||||
parallel_setup,
|
parallel_setup,
|
||||||
distributed_backend,
|
distributed_backend,
|
||||||
vllm_major_version,
|
|
||||||
runner,
|
runner,
|
||||||
test_options,
|
test_options,
|
||||||
num_gpus_available,
|
num_gpus_available,
|
||||||
|
|||||||
@@ -61,17 +61,10 @@ def run_test(model_name, more_args=None):
|
|||||||
TPU_TP_TEST_STR = "" # "tensor_parallel_size=4"
|
TPU_TP_TEST_STR = "" # "tensor_parallel_size=4"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
|
||||||
not current_platform.is_cuda() and not current_platform.is_tpu(),
|
|
||||||
reason="V1 is currently only supported on CUDA and TPU",
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("model", MODEL_NAMES)
|
@pytest.mark.parametrize("model", MODEL_NAMES)
|
||||||
def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
|
def test_lm_eval_accuracy_v1_engine(model):
|
||||||
"""Run with the V1 Engine."""
|
"""Run with the V1 Engine."""
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
more_args = None
|
more_args = None
|
||||||
if current_platform.is_tpu():
|
if current_platform.is_tpu():
|
||||||
# Limit compilation time for TPU V1
|
# Limit compilation time for TPU V1
|
||||||
@@ -85,19 +78,10 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
|
|||||||
run_test(model, more_args)
|
run_test(model, more_args)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
|
||||||
not current_platform.is_cuda() and not current_platform.is_tpu(),
|
|
||||||
reason="V1 is currently only supported on CUDA and TPU",
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
|
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
|
||||||
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
|
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
|
||||||
model, monkeypatch: pytest.MonkeyPatch
|
|
||||||
):
|
|
||||||
"""Run with the V1 Engine."""
|
"""Run with the V1 Engine."""
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
more_args = None
|
more_args = None
|
||||||
if current_platform.is_tpu():
|
if current_platform.is_tpu():
|
||||||
# Limit compilation time for TPU V1
|
# Limit compilation time for TPU V1
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ AsyncLLMEngine are working correctly.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import lm_eval
|
import lm_eval
|
||||||
import pytest
|
|
||||||
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
@@ -67,17 +66,9 @@ def run_test(more_args):
|
|||||||
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
|
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
def test_lm_eval_accuracy_v1_engine():
|
||||||
not current_platform.is_cuda()
|
|
||||||
and not current_platform.is_tpu()
|
|
||||||
and not current_platform.is_xpu(),
|
|
||||||
reason="V1 currently only supported on CUDA, XPU and TPU",
|
|
||||||
)
|
|
||||||
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
|
|
||||||
"""Run with the V1 Engine."""
|
"""Run with the V1 Engine."""
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
more_args = []
|
more_args = []
|
||||||
|
|
||||||
# Limit compilation time for V1
|
# Limit compilation time for V1
|
||||||
|
|||||||
@@ -21,18 +21,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def monkeypatch_module():
|
def server(zephyr_lora_files): # noqa: F811
|
||||||
from _pytest.monkeypatch import MonkeyPatch
|
|
||||||
|
|
||||||
mpatch = MonkeyPatch()
|
|
||||||
yield mpatch
|
|
||||||
mpatch.undo()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
|
||||||
def server(monkeypatch_module, zephyr_lora_files): # noqa: F811
|
|
||||||
monkeypatch_module.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
args = [
|
args = [
|
||||||
# use half precision for speed and memory savings in CI environment
|
# use half precision for speed and memory savings in CI environment
|
||||||
"--dtype",
|
"--dtype",
|
||||||
|
|||||||
@@ -37,21 +37,8 @@ BADREQUEST_CASES = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
|
||||||
def monkeypatch_module():
|
|
||||||
from _pytest.monkeypatch import MonkeyPatch
|
|
||||||
|
|
||||||
mpatch = MonkeyPatch()
|
|
||||||
yield mpatch
|
|
||||||
mpatch.undo()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module", params=[True])
|
@pytest.fixture(scope="module", params=[True])
|
||||||
def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files):
|
def server_with_lora_modules_json(request, zephyr_lora_files):
|
||||||
use_v1 = request.param
|
|
||||||
assert use_v1
|
|
||||||
monkeypatch_module.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
# Define the json format LoRA module configurations
|
# Define the json format LoRA module configurations
|
||||||
lora_module_1 = {
|
lora_module_1 = {
|
||||||
"name": "zephyr-lora",
|
"name": "zephyr-lora",
|
||||||
|
|||||||
@@ -22,24 +22,6 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
|
|||||||
PREV_MINOR_VERSION = version._prev_minor_version()
|
PREV_MINOR_VERSION = version._prev_minor_version()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module", params=[True])
|
|
||||||
def use_v1(request):
|
|
||||||
# Module-scoped variant of run_with_both_engines
|
|
||||||
#
|
|
||||||
# Use this fixture to run a test with both v0 and v1, and
|
|
||||||
# also to conditionalize the test logic e.g.
|
|
||||||
#
|
|
||||||
# def test_metrics_exist(use_v1, server, client):
|
|
||||||
# ...
|
|
||||||
# expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
|
|
||||||
# for metric in expected:
|
|
||||||
# assert metric in response.text
|
|
||||||
#
|
|
||||||
# @skip_v1 wouldn't work here because this is a module-level
|
|
||||||
# fixture - per-function decorators would have no effect
|
|
||||||
yield request.param
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def default_server_args():
|
def default_server_args():
|
||||||
return [
|
return [
|
||||||
@@ -63,13 +45,11 @@ def default_server_args():
|
|||||||
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
|
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def server(use_v1, default_server_args, request):
|
def server(default_server_args, request):
|
||||||
if request.param:
|
if request.param:
|
||||||
default_server_args.append(request.param)
|
default_server_args.append(request.param)
|
||||||
env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
|
|
||||||
with RemoteOpenAIServer(
|
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
|
||||||
MODEL_NAME, default_server_args, env_dict=env_dict
|
|
||||||
) as remote_server:
|
|
||||||
yield remote_server
|
yield remote_server
|
||||||
|
|
||||||
|
|
||||||
@@ -129,7 +109,8 @@ EXPECTED_VALUES = {
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_metrics_counts(
|
async def test_metrics_counts(
|
||||||
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
|
server: RemoteOpenAIServer,
|
||||||
|
client: openai.AsyncClient,
|
||||||
):
|
):
|
||||||
for _ in range(_NUM_REQUESTS):
|
for _ in range(_NUM_REQUESTS):
|
||||||
# sending a request triggers the metrics to be logged.
|
# sending a request triggers the metrics to be logged.
|
||||||
@@ -145,7 +126,7 @@ async def test_metrics_counts(
|
|||||||
|
|
||||||
# Loop over all expected metric_families
|
# Loop over all expected metric_families
|
||||||
for metric_family, suffix_values_list in EXPECTED_VALUES.items():
|
for metric_family, suffix_values_list in EXPECTED_VALUES.items():
|
||||||
if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
|
if (metric_family not in EXPECTED_METRICS_V1) or (
|
||||||
not server.show_hidden_metrics
|
not server.show_hidden_metrics
|
||||||
and metric_family in HIDDEN_DEPRECATED_METRICS
|
and metric_family in HIDDEN_DEPRECATED_METRICS
|
||||||
):
|
):
|
||||||
@@ -183,62 +164,6 @@ async def test_metrics_counts(
|
|||||||
assert found_metric, f"Did not find {metric_family} in prom endpoint"
|
assert found_metric, f"Did not find {metric_family} in prom endpoint"
|
||||||
|
|
||||||
|
|
||||||
EXPECTED_METRICS = [
|
|
||||||
"vllm:num_requests_running",
|
|
||||||
"vllm:num_requests_waiting",
|
|
||||||
"vllm:gpu_cache_usage_perc",
|
|
||||||
"vllm:time_to_first_token_seconds_sum",
|
|
||||||
"vllm:time_to_first_token_seconds_bucket",
|
|
||||||
"vllm:time_to_first_token_seconds_count",
|
|
||||||
"vllm:time_per_output_token_seconds_sum",
|
|
||||||
"vllm:time_per_output_token_seconds_bucket",
|
|
||||||
"vllm:time_per_output_token_seconds_count",
|
|
||||||
"vllm:e2e_request_latency_seconds_sum",
|
|
||||||
"vllm:e2e_request_latency_seconds_bucket",
|
|
||||||
"vllm:e2e_request_latency_seconds_count",
|
|
||||||
"vllm:request_queue_time_seconds_sum",
|
|
||||||
"vllm:request_queue_time_seconds_bucket",
|
|
||||||
"vllm:request_queue_time_seconds_count",
|
|
||||||
"vllm:request_inference_time_seconds_sum",
|
|
||||||
"vllm:request_inference_time_seconds_bucket",
|
|
||||||
"vllm:request_inference_time_seconds_count",
|
|
||||||
"vllm:request_prefill_time_seconds_sum",
|
|
||||||
"vllm:request_prefill_time_seconds_bucket",
|
|
||||||
"vllm:request_prefill_time_seconds_count",
|
|
||||||
"vllm:request_decode_time_seconds_sum",
|
|
||||||
"vllm:request_decode_time_seconds_bucket",
|
|
||||||
"vllm:request_decode_time_seconds_count",
|
|
||||||
"vllm:request_prompt_tokens_sum",
|
|
||||||
"vllm:request_prompt_tokens_bucket",
|
|
||||||
"vllm:request_prompt_tokens_count",
|
|
||||||
"vllm:request_generation_tokens_sum",
|
|
||||||
"vllm:request_generation_tokens_bucket",
|
|
||||||
"vllm:request_generation_tokens_count",
|
|
||||||
"vllm:request_params_n_sum",
|
|
||||||
"vllm:request_params_n_bucket",
|
|
||||||
"vllm:request_params_n_count",
|
|
||||||
"vllm:request_params_max_tokens_sum",
|
|
||||||
"vllm:request_params_max_tokens_bucket",
|
|
||||||
"vllm:request_params_max_tokens_count",
|
|
||||||
"vllm:iteration_tokens_total",
|
|
||||||
"vllm:num_preemptions_total",
|
|
||||||
"vllm:prompt_tokens_total",
|
|
||||||
"vllm:generation_tokens_total",
|
|
||||||
"vllm:request_success_total",
|
|
||||||
"vllm:cache_config_info",
|
|
||||||
# labels in cache_config_info
|
|
||||||
"block_size",
|
|
||||||
"cache_dtype",
|
|
||||||
"cpu_offload_gb",
|
|
||||||
"enable_prefix_caching",
|
|
||||||
"gpu_memory_utilization",
|
|
||||||
"num_cpu_blocks",
|
|
||||||
"num_gpu_blocks",
|
|
||||||
"num_gpu_blocks_override",
|
|
||||||
"sliding_window",
|
|
||||||
"swap_space_bytes",
|
|
||||||
]
|
|
||||||
|
|
||||||
EXPECTED_METRICS_V1 = [
|
EXPECTED_METRICS_V1 = [
|
||||||
"vllm:num_requests_running",
|
"vllm:num_requests_running",
|
||||||
"vllm:num_requests_waiting",
|
"vllm:num_requests_waiting",
|
||||||
@@ -304,17 +229,21 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_metrics_exist(
|
async def test_metrics_exist(
|
||||||
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
|
server: RemoteOpenAIServer,
|
||||||
|
client: openai.AsyncClient,
|
||||||
):
|
):
|
||||||
# sending a request triggers the metrics to be logged.
|
# sending a request triggers the metrics to be logged.
|
||||||
await client.completions.create(
|
await client.completions.create(
|
||||||
model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
|
model=MODEL_NAME,
|
||||||
|
prompt="Hello, my name is",
|
||||||
|
max_tokens=5,
|
||||||
|
temperature=0.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
response = requests.get(server.url_for("metrics"))
|
response = requests.get(server.url_for("metrics"))
|
||||||
assert response.status_code == HTTPStatus.OK
|
assert response.status_code == HTTPStatus.OK
|
||||||
|
|
||||||
for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
|
for metric in EXPECTED_METRICS_V1:
|
||||||
if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
|
if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
|
||||||
continue
|
continue
|
||||||
assert metric in response.text
|
assert metric in response.text
|
||||||
@@ -322,10 +251,11 @@ async def test_metrics_exist(
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_abort_metrics_reset(
|
async def test_abort_metrics_reset(
|
||||||
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
|
server: RemoteOpenAIServer,
|
||||||
|
client: openai.AsyncClient,
|
||||||
):
|
):
|
||||||
running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
|
running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
|
||||||
server, use_v1
|
server
|
||||||
)
|
)
|
||||||
|
|
||||||
# Expect no running requests or kvcache usage
|
# Expect no running requests or kvcache usage
|
||||||
@@ -351,7 +281,7 @@ async def test_abort_metrics_reset(
|
|||||||
|
|
||||||
# Check that we have running requests
|
# Check that we have running requests
|
||||||
running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
|
running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
|
||||||
server, use_v1
|
server
|
||||||
)
|
)
|
||||||
|
|
||||||
# Expect running requests and kvcache usage
|
# Expect running requests and kvcache usage
|
||||||
@@ -371,7 +301,7 @@ async def test_abort_metrics_reset(
|
|||||||
|
|
||||||
# Verify running and waiting requests counts and KV cache usage are zero
|
# Verify running and waiting requests counts and KV cache usage are zero
|
||||||
running_requests_after, waiting_requests_after, kv_cache_usage_after = (
|
running_requests_after, waiting_requests_after, kv_cache_usage_after = (
|
||||||
_get_running_metrics_from_api(server, use_v1)
|
_get_running_metrics_from_api(server)
|
||||||
)
|
)
|
||||||
|
|
||||||
assert running_requests_after == 0, (
|
assert running_requests_after == 0, (
|
||||||
@@ -385,7 +315,7 @@ async def test_abort_metrics_reset(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
|
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
|
||||||
"""Return (running_count, waiting_count, kv_cache_usage)"""
|
"""Return (running_count, waiting_count, kv_cache_usage)"""
|
||||||
|
|
||||||
response = requests.get(server.url_for("metrics"))
|
response = requests.get(server.url_for("metrics"))
|
||||||
@@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
|
|||||||
# Verify running and waiting requests counts and KV cache usage are zero
|
# Verify running and waiting requests counts and KV cache usage are zero
|
||||||
running_requests, waiting_requests, kv_cache_usage = None, None, None
|
running_requests, waiting_requests, kv_cache_usage = None, None, None
|
||||||
|
|
||||||
kv_cache_usage_metric = (
|
kv_cache_usage_metric = "vllm:kv_cache_usage_perc"
|
||||||
"vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
|
|
||||||
)
|
|
||||||
|
|
||||||
for family in text_string_to_metric_families(response.text):
|
for family in text_string_to_metric_families(response.text):
|
||||||
if family.name == "vllm:num_requests_running":
|
if family.name == "vllm:num_requests_running":
|
||||||
@@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
|
|||||||
return running_requests, waiting_requests, kv_cache_usage
|
return running_requests, waiting_requests, kv_cache_usage
|
||||||
|
|
||||||
|
|
||||||
def test_metrics_exist_run_batch(use_v1: bool):
|
def test_metrics_exist_run_batch():
|
||||||
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501
|
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501
|
||||||
|
|
||||||
base_url = "0.0.0.0"
|
base_url = "0.0.0.0"
|
||||||
@@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
|
|||||||
"--port",
|
"--port",
|
||||||
port,
|
port,
|
||||||
],
|
],
|
||||||
env={"VLLM_USE_V1": "1"},
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def is_server_up(url):
|
def is_server_up(url):
|
||||||
|
|||||||
@@ -15,11 +15,6 @@ from vllm.entrypoints.renderer import BaseRenderer
|
|||||||
from ...utils import RemoteOpenAIServer
|
from ...utils import RemoteOpenAIServer
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
|
||||||
def use_v1_only(monkeypatch):
|
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_empty_prompt():
|
async def test_empty_prompt():
|
||||||
model_name = "gpt2"
|
model_name = "gpt2"
|
||||||
|
|||||||
@@ -80,7 +80,6 @@ def test_env(
|
|||||||
):
|
):
|
||||||
"""Test attention backend selection with valid device-backend pairs."""
|
"""Test attention backend selection with valid device-backend pairs."""
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
m.setenv(STR_BACKEND_ENV_VAR, name)
|
m.setenv(STR_BACKEND_ENV_VAR, name)
|
||||||
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
|
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
|
||||||
|
|
||||||
@@ -212,14 +211,8 @@ def test_env(
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
@pytest.mark.parametrize("device", ["cpu", "cuda"])
|
||||||
def test_fp32_fallback(
|
def test_fp32_fallback(device: str):
|
||||||
device: str,
|
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
):
|
|
||||||
"""Test attention backend selection with fp32."""
|
"""Test attention backend selection with fp32."""
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
if device == "cpu":
|
if device == "cpu":
|
||||||
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
|
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
|
||||||
backend = get_attn_backend(16, torch.float32, None, 16)
|
backend = get_attn_backend(16, torch.float32, None, 16)
|
||||||
@@ -233,9 +226,6 @@ def test_fp32_fallback(
|
|||||||
|
|
||||||
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""Test FlashAttn validation."""
|
"""Test FlashAttn validation."""
|
||||||
# TODO: When testing for v1, pipe in `use_v1` as an argument to
|
|
||||||
# get_attn_backend
|
|
||||||
|
|
||||||
pytest.skip(
|
pytest.skip(
|
||||||
"Skipping as current backend selector does not "
|
"Skipping as current backend selector does not "
|
||||||
"handle fallbacks when a backend is set via env var."
|
"handle fallbacks when a backend is set via env var."
|
||||||
@@ -289,7 +279,6 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
|
|||||||
monkeypatch.context() as m,
|
monkeypatch.context() as m,
|
||||||
patch("vllm.attention.selector.current_platform", CudaPlatform()),
|
patch("vllm.attention.selector.current_platform", CudaPlatform()),
|
||||||
):
|
):
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
|
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
|
||||||
|
|
||||||
# Should raise ValueError for invalid backend
|
# Should raise ValueError for invalid backend
|
||||||
|
|||||||
@@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
|
|||||||
|
|
||||||
# Run with flex attention
|
# Run with flex attention
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
||||||
|
|
||||||
set_seed(seed)
|
set_seed(seed)
|
||||||
@@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
|
|||||||
|
|
||||||
# Run with default backend
|
# Run with default backend
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
set_seed(seed)
|
set_seed(seed)
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model_name,
|
model_name,
|
||||||
@@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
|
|||||||
|
|
||||||
# Run with flex attention
|
# Run with flex attention
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model_name,
|
model_name,
|
||||||
@@ -126,16 +123,17 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
|
|||||||
flex_outputs = llm_flex.embed(prompts)
|
flex_outputs = llm_flex.embed(prompts)
|
||||||
|
|
||||||
# Run with default backend
|
# Run with default backend
|
||||||
with monkeypatch.context() as m:
|
with (
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
monkeypatch.context() as m,
|
||||||
with vllm_runner(
|
vllm_runner(
|
||||||
model_name,
|
model_name,
|
||||||
runner="pooling",
|
runner="pooling",
|
||||||
dtype=torch.bfloat16,
|
dtype=torch.bfloat16,
|
||||||
tensor_parallel_size=1,
|
tensor_parallel_size=1,
|
||||||
max_model_len=100,
|
max_model_len=100,
|
||||||
enforce_eager=True,
|
enforce_eager=True,
|
||||||
) as llm_default:
|
) as llm_default,
|
||||||
|
):
|
||||||
default_outputs = llm_default.embed(prompts)
|
default_outputs = llm_default.embed(prompts)
|
||||||
|
|
||||||
check_embeddings_close(
|
check_embeddings_close(
|
||||||
|
|||||||
@@ -613,7 +613,6 @@ def test_dummy_maverick(
|
|||||||
profile: bool = False,
|
profile: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
# Disable multiprocessing allows us to access model executor from LLM engine
|
# Disable multiprocessing allows us to access model executor from LLM engine
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
|
|
||||||
model_path = create_reduced_maverick_model(
|
model_path = create_reduced_maverick_model(
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ if TYPE_CHECKING:
|
|||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
else:
|
else:
|
||||||
VllmConfig = None
|
VllmConfig = None
|
||||||
from vllm import envs
|
|
||||||
|
|
||||||
|
|
||||||
class DummyPlatform(Platform):
|
class DummyPlatform(Platform):
|
||||||
@@ -19,10 +18,7 @@ class DummyPlatform(Platform):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||||
if envs.VLLM_USE_V1:
|
vllm_config.compilation_config.custom_ops = ["all"]
|
||||||
compilation_config = vllm_config.compilation_config
|
|
||||||
# Activate custom ops for v1.
|
|
||||||
compilation_config.custom_ops = ["all"]
|
|
||||||
|
|
||||||
def get_attn_backend_cls(
|
def get_attn_backend_cls(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):
|
|||||||
|
|
||||||
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
|
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
# Explicitly turn off engine multiprocessing so
|
# Explicitly turn off engine multiprocessing so
|
||||||
# that the scheduler runs in this process
|
# that the scheduler runs in this process
|
||||||
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
|
|||||||
@@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
|
|||||||
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import pytest
|
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
|
||||||
def v1(monkeypatch):
|
|
||||||
"""Only run on vLLM v1."""
|
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
|
|
||||||
def _generate(
|
def _generate(
|
||||||
llm: LLM,
|
llm: LLM,
|
||||||
prompt: str,
|
prompt: str,
|
||||||
|
|||||||
@@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
|
|||||||
# 100 training iterations with a training batch size of 100.
|
# 100 training iterations with a training batch size of 100.
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
|
||||||
def use_v1_only(monkeypatch: pytest.MonkeyPatch):
|
|
||||||
"""
|
|
||||||
Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
|
|
||||||
for all tests in this file
|
|
||||||
"""
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
yield
|
|
||||||
|
|
||||||
|
|
||||||
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
|
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
|
||||||
return vllm.LLM(
|
return vllm.LLM(
|
||||||
model="Qwen/Qwen2.5-3B-Instruct",
|
model="Qwen/Qwen2.5-3B-Instruct",
|
||||||
|
|||||||
@@ -305,7 +305,6 @@ full_cg_backend_configs = {
|
|||||||
"CutlassMLA": BackendConfig(
|
"CutlassMLA": BackendConfig(
|
||||||
name="CutlassMLA",
|
name="CutlassMLA",
|
||||||
env_vars={
|
env_vars={
|
||||||
"VLLM_USE_V1": "1",
|
|
||||||
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
|
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
|
||||||
"FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed
|
"FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -1,11 +1,14 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
|
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
|
||||||
from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
|
from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.cpu_test
|
||||||
|
|
||||||
|
|
||||||
def new_kv_cache_spec():
|
def new_kv_cache_spec():
|
||||||
return FullAttentionSpec(16, 1, 1, torch.float32, False)
|
return FullAttentionSpec(16, 1, 1, torch.float32, False)
|
||||||
|
|||||||
@@ -1,14 +1,10 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import os
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
if os.getenv("VLLM_USE_V1", "0") != "1":
|
|
||||||
pytest.skip("Test package requires V1", allow_module_level=True)
|
|
||||||
|
|
||||||
MODEL = "meta-llama/Llama-3.2-1B"
|
MODEL = "meta-llama/Llama-3.2-1B"
|
||||||
PROMPT = "Hello my name is Robert and I"
|
PROMPT = "Hello my name is Robert and I"
|
||||||
|
|
||||||
|
|||||||
@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
|
|||||||
):
|
):
|
||||||
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
|
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
|
||||||
|
|
||||||
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
|
env_vars = backend_configs[backend_name].env_vars
|
||||||
|
|
||||||
with temporary_environ(env_vars), ExitStack() as stack:
|
with temporary_environ(env_vars), ExitStack() as stack:
|
||||||
if not supported:
|
if not supported:
|
||||||
@@ -117,7 +117,7 @@ combo_cases_2 = [
|
|||||||
def test_cudagraph_compilation_combo(combo_case):
|
def test_cudagraph_compilation_combo(combo_case):
|
||||||
backend_name, cudagraph_mode, compilation_level, supported = combo_case
|
backend_name, cudagraph_mode, compilation_level, supported = combo_case
|
||||||
|
|
||||||
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
|
env_vars = backend_configs[backend_name].env_vars
|
||||||
|
|
||||||
with temporary_environ(env_vars), ExitStack() as stack:
|
with temporary_environ(env_vars), ExitStack() as stack:
|
||||||
if not supported:
|
if not supported:
|
||||||
|
|||||||
@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
|
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
|
||||||
|
|
||||||
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
|
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ model_config = {
|
|||||||
@pytest.mark.parametrize("seed", [1])
|
@pytest.mark.parametrize("seed", [1])
|
||||||
@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
|
@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
|
||||||
def test_sliding_window_retrieval(
|
def test_sliding_window_retrieval(
|
||||||
monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager
|
model, batch_size, seed, disable_hybrid_kv_cache_manager
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
|
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
|
||||||
@@ -40,9 +40,6 @@ def test_sliding_window_retrieval(
|
|||||||
If we tell it upfront which we are going to be looking for, then
|
If we tell it upfront which we are going to be looking for, then
|
||||||
it answers correctly (mostly).
|
it answers correctly (mostly).
|
||||||
"""
|
"""
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
test_config = model_config[model]
|
test_config = model_config[model]
|
||||||
|
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
@@ -50,9 +47,7 @@ def test_sliding_window_retrieval(
|
|||||||
)
|
)
|
||||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
|
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
|
||||||
|
|
||||||
prompts, answer, indices = prep_prompts(
|
prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)
|
||||||
batch_size, ln_range=test_config.ln_range
|
|
||||||
)
|
|
||||||
|
|
||||||
check_length(prompts, llm, test_config.sliding_window)
|
check_length(prompts, llm, test_config.sliding_window)
|
||||||
|
|
||||||
|
|||||||
@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
|
|||||||
)
|
)
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
# Make scheduling deterministic for reproducibility
|
# Make scheduling deterministic for reproducibility
|
||||||
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ Covers:
|
|||||||
5) Multiple stop conditions
|
5) Multiple stop conditions
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
|
|||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def llm_v1():
|
def llm_v1():
|
||||||
"""Create V1 LLM instance for testing"""
|
"""Create V1 LLM instance for testing"""
|
||||||
# Ensure V1 engine is used
|
|
||||||
os.environ["VLLM_USE_V1"] = "1"
|
|
||||||
|
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
model=TEST_MODEL,
|
model=TEST_MODEL,
|
||||||
tensor_parallel_size=1,
|
tensor_parallel_size=1,
|
||||||
@@ -503,6 +499,6 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
cd vllm/
|
cd vllm/
|
||||||
VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v
|
python -m pytest tests/v1/e2e/test_min_tokens.py -v
|
||||||
"""
|
"""
|
||||||
pytest.main([__file__, "-v"])
|
pytest.main([__file__, "-v"])
|
||||||
|
|||||||
@@ -301,7 +301,6 @@ def test_mtp_correctness(
|
|||||||
model_setup: (method, model_name, tp_size)
|
model_setup: (method, model_name, tp_size)
|
||||||
"""
|
"""
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
m.setenv("VLLM_MLA_DISABLE", "1")
|
m.setenv("VLLM_MLA_DISABLE", "1")
|
||||||
|
|
||||||
method, model_name, tp_size = model_setup
|
method, model_name, tp_size = model_setup
|
||||||
|
|||||||
@@ -95,17 +95,11 @@ async def generate(
|
|||||||
)
|
)
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_load(
|
async def test_load(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
output_kind: RequestOutputKind,
|
output_kind: RequestOutputKind,
|
||||||
engine_args: AsyncEngineArgs,
|
engine_args: AsyncEngineArgs,
|
||||||
prompt: PromptType,
|
prompt: PromptType,
|
||||||
):
|
):
|
||||||
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
|
with ExitStack() as after:
|
||||||
# so that in the future when we switch, we don't have to change all the
|
|
||||||
# tests.
|
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with set_default_torch_num_threads(1):
|
with set_default_torch_num_threads(1):
|
||||||
engine = AsyncLLM.from_engine_args(engine_args)
|
engine = AsyncLLM.from_engine_args(engine_args)
|
||||||
after.callback(engine.shutdown)
|
after.callback(engine.shutdown)
|
||||||
@@ -149,14 +143,11 @@ async def test_load(
|
|||||||
)
|
)
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_abort(
|
async def test_abort(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
output_kind: RequestOutputKind,
|
output_kind: RequestOutputKind,
|
||||||
engine_args: AsyncEngineArgs,
|
engine_args: AsyncEngineArgs,
|
||||||
prompt: PromptType,
|
prompt: PromptType,
|
||||||
):
|
):
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
with ExitStack() as after:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with set_default_torch_num_threads(1):
|
with set_default_torch_num_threads(1):
|
||||||
engine = AsyncLLM.from_engine_args(engine_args)
|
engine = AsyncLLM.from_engine_args(engine_args)
|
||||||
after.callback(engine.shutdown)
|
after.callback(engine.shutdown)
|
||||||
@@ -222,13 +213,8 @@ async def test_abort(
|
|||||||
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
|
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
|
||||||
)
|
)
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_multi_abort(
|
async def test_multi_abort(output_kind: RequestOutputKind):
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
with ExitStack() as after:
|
||||||
output_kind: RequestOutputKind,
|
|
||||||
):
|
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with set_default_torch_num_threads(1):
|
with set_default_torch_num_threads(1):
|
||||||
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
||||||
after.callback(engine.shutdown)
|
after.callback(engine.shutdown)
|
||||||
@@ -304,14 +290,11 @@ async def test_multi_abort(
|
|||||||
)
|
)
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_finished_flag(
|
async def test_finished_flag(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
n: int,
|
n: int,
|
||||||
engine_args: AsyncEngineArgs,
|
engine_args: AsyncEngineArgs,
|
||||||
prompt: PromptType,
|
prompt: PromptType,
|
||||||
):
|
):
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
with ExitStack() as after:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with set_default_torch_num_threads(1):
|
with set_default_torch_num_threads(1):
|
||||||
engine = AsyncLLM.from_engine_args(engine_args)
|
engine = AsyncLLM.from_engine_args(engine_args)
|
||||||
after.callback(engine.shutdown)
|
after.callback(engine.shutdown)
|
||||||
@@ -341,12 +324,10 @@ async def test_finished_flag(
|
|||||||
)
|
)
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_mid_stream_cancellation(
|
async def test_mid_stream_cancellation(
|
||||||
monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType
|
engine_args: AsyncEngineArgs, prompt: PromptType
|
||||||
):
|
):
|
||||||
"""Test that requests can be cancelled mid-stream."""
|
"""Test that requests can be cancelled mid-stream."""
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
with ExitStack() as after:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with set_default_torch_num_threads(1):
|
with set_default_torch_num_threads(1):
|
||||||
engine = AsyncLLM.from_engine_args(engine_args)
|
engine = AsyncLLM.from_engine_args(engine_args)
|
||||||
after.callback(engine.shutdown)
|
after.callback(engine.shutdown)
|
||||||
@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
|
|||||||
be added to the default loggers.
|
be added to the default loggers.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
with ExitStack() as after:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with set_default_torch_num_threads(1):
|
with set_default_torch_num_threads(1):
|
||||||
engine = AsyncLLM.from_engine_args(
|
engine = AsyncLLM.from_engine_args(
|
||||||
TEXT_ENGINE_ARGS,
|
TEXT_ENGINE_ARGS,
|
||||||
@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio(scope="module")
|
@pytest.mark.asyncio(scope="module")
|
||||||
async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
|
async def test_dp_rank_argument():
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
with ExitStack() as after:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with set_default_torch_num_threads(1):
|
with set_default_torch_num_threads(1):
|
||||||
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
||||||
after.callback(engine.shutdown)
|
after.callback(engine.shutdown)
|
||||||
@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_check_health(monkeypatch: pytest.MonkeyPatch):
|
async def test_check_health():
|
||||||
"""Test that check_health returns normally for healthy engine
|
"""Test that check_health returns normally for healthy engine
|
||||||
and raises EngineDeadError when the engine is dead.
|
and raises EngineDeadError when the engine is dead.
|
||||||
"""
|
"""
|
||||||
@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
|
|||||||
|
|
||||||
from vllm.v1.engine.exceptions import EngineDeadError
|
from vllm.v1.engine.exceptions import EngineDeadError
|
||||||
|
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
with ExitStack() as after:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with set_default_torch_num_threads(1):
|
with set_default_torch_num_threads(1):
|
||||||
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
||||||
after.callback(engine.shutdown)
|
after.callback(engine.shutdown)
|
||||||
@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
|
|||||||
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
|
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
|
||||||
)
|
)
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_abort_final_output(
|
async def test_abort_final_output(output_kind: RequestOutputKind):
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
output_kind: RequestOutputKind,
|
|
||||||
):
|
|
||||||
"""Test that abort() returns a final output with correct information."""
|
"""Test that abort() returns a final output with correct information."""
|
||||||
|
|
||||||
with monkeypatch.context() as m, ExitStack() as after:
|
with ExitStack() as after:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with set_default_torch_num_threads(1):
|
with set_default_torch_num_threads(1):
|
||||||
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
||||||
after.callback(engine.shutdown)
|
after.callback(engine.shutdown)
|
||||||
|
|||||||
@@ -5,18 +5,11 @@ from argparse import ArgumentError
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm import envs
|
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.usage.usage_lib import UsageContext
|
from vllm.usage.usage_lib import UsageContext
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
if not envs.VLLM_USE_V1:
|
|
||||||
pytest.skip(
|
|
||||||
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
|
|
||||||
allow_module_level=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_prefix_caching_from_cli():
|
def test_prefix_caching_from_cli():
|
||||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||||
|
|||||||
@@ -46,9 +46,7 @@ def make_request() -> EngineCoreRequest:
|
|||||||
|
|
||||||
|
|
||||||
@create_new_process_for_each_test()
|
@create_new_process_for_each_test()
|
||||||
def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
def test_engine_core():
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
"""Setup the EngineCore."""
|
"""Setup the EngineCore."""
|
||||||
engine_args = EngineArgs(model=MODEL_NAME)
|
engine_args = EngineArgs(model=MODEL_NAME)
|
||||||
vllm_config = engine_args.create_engine_config()
|
vllm_config = engine_args.create_engine_config()
|
||||||
@@ -176,14 +174,12 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
|||||||
|
|
||||||
|
|
||||||
@create_new_process_for_each_test()
|
@create_new_process_for_each_test()
|
||||||
def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
def test_engine_core_advanced_sampling():
|
||||||
"""
|
"""
|
||||||
A basic end-to-end test to verify that the engine functions correctly
|
A basic end-to-end test to verify that the engine functions correctly
|
||||||
when additional sampling parameters, such as top_p, min_tokens, and
|
when additional sampling parameters, such as top_p, min_tokens, and
|
||||||
presence_penalty, are set.
|
presence_penalty, are set.
|
||||||
"""
|
"""
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
"""Setup the EngineCore."""
|
"""Setup the EngineCore."""
|
||||||
engine_args = EngineArgs(model=MODEL_NAME)
|
engine_args = EngineArgs(model=MODEL_NAME)
|
||||||
vllm_config = engine_args.create_engine_config()
|
vllm_config = engine_args.create_engine_config()
|
||||||
@@ -227,7 +223,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
|||||||
|
|
||||||
|
|
||||||
@create_new_process_for_each_test()
|
@create_new_process_for_each_test()
|
||||||
def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
def test_engine_core_concurrent_batches():
|
||||||
"""
|
"""
|
||||||
Test that the engine can handle multiple concurrent batches.
|
Test that the engine can handle multiple concurrent batches.
|
||||||
"""
|
"""
|
||||||
@@ -272,9 +268,6 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
|||||||
if hasattr(self, "thread_pool"):
|
if hasattr(self, "thread_pool"):
|
||||||
self.thread_pool.shutdown(wait=False)
|
self.thread_pool.shutdown(wait=False)
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
model=MODEL_NAME,
|
model=MODEL_NAME,
|
||||||
# To test concurrent batches.
|
# To test concurrent batches.
|
||||||
@@ -364,13 +357,11 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
|||||||
|
|
||||||
|
|
||||||
@multi_gpu_test(num_gpus=2)
|
@multi_gpu_test(num_gpus=2)
|
||||||
def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch):
|
def test_engine_core_tp():
|
||||||
"""
|
"""
|
||||||
Test engine can initialize worker in tp properly
|
Test engine can initialize worker in tp properly
|
||||||
"""
|
"""
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
"""Setup the EngineCore."""
|
"""Setup the EngineCore."""
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
model=MODEL_NAME,
|
model=MODEL_NAME,
|
||||||
@@ -400,11 +391,8 @@ def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch):
|
|||||||
|
|
||||||
|
|
||||||
@create_new_process_for_each_test()
|
@create_new_process_for_each_test()
|
||||||
def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch):
|
def test_engine_core_invalid_request_id_type():
|
||||||
"""Test that engine raises TypeError for non-string request_id."""
|
"""Test that engine raises TypeError for non-string request_id."""
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
engine_args = EngineArgs(model=MODEL_NAME)
|
engine_args = EngineArgs(model=MODEL_NAME)
|
||||||
vllm_config = engine_args.create_engine_config()
|
vllm_config = engine_args.create_engine_config()
|
||||||
executor_class = Executor.get_class(vllm_config)
|
executor_class = Executor.get_class(vllm_config)
|
||||||
@@ -432,9 +420,7 @@ def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch):
|
|||||||
none_request = make_request()
|
none_request = make_request()
|
||||||
none_request.request_id = None
|
none_request.request_id = None
|
||||||
|
|
||||||
with pytest.raises(
|
with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"):
|
||||||
TypeError, match="request_id must be a string, got.*NoneType"
|
|
||||||
):
|
|
||||||
engine_core.add_request(*engine_core.preprocess_add_request(none_request))
|
engine_core.add_request(*engine_core.preprocess_add_request(none_request))
|
||||||
|
|
||||||
# Verify engine is still functional after errors
|
# Verify engine is still functional after errors
|
||||||
|
|||||||
@@ -130,8 +130,6 @@ def test_engine_core_client(
|
|||||||
monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
|
monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
|
||||||
):
|
):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
# Monkey-patch core engine utility function to test.
|
# Monkey-patch core engine utility function to test.
|
||||||
m.setattr(EngineCore, "echo", echo, raising=False)
|
m.setattr(EngineCore, "echo", echo, raising=False)
|
||||||
|
|
||||||
@@ -218,8 +216,6 @@ def test_engine_core_client(
|
|||||||
@pytest.mark.asyncio(loop_scope="function")
|
@pytest.mark.asyncio(loop_scope="function")
|
||||||
async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
|
async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
# Monkey-patch core engine utility function to test.
|
# Monkey-patch core engine utility function to test.
|
||||||
m.setattr(EngineCore, "echo", echo, raising=False)
|
m.setattr(EngineCore, "echo", echo, raising=False)
|
||||||
|
|
||||||
@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
|
|||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
):
|
):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
# Must set insecure serialization to allow returning custom types.
|
# Must set insecure serialization to allow returning custom types.
|
||||||
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
|
|
||||||
@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
|
|||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
):
|
):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
# Must set insecure serialization to allow returning custom types.
|
# Must set insecure serialization to allow returning custom types.
|
||||||
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
|
|
||||||
@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
|
|||||||
monkeypatch: pytest.MonkeyPatch,
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
):
|
):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
# Must set insecure serialization to allow returning custom types.
|
# Must set insecure serialization to allow returning custom types.
|
||||||
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||||
|
|
||||||
@@ -592,12 +582,9 @@ async def test_engine_core_client_util_method_nested_structures(
|
|||||||
indirect=["publisher_config"],
|
indirect=["publisher_config"],
|
||||||
)
|
)
|
||||||
def test_kv_cache_events(
|
def test_kv_cache_events(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
multiprocessing_mode: bool,
|
multiprocessing_mode: bool,
|
||||||
publisher_config,
|
publisher_config,
|
||||||
):
|
):
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
block_size = 16
|
block_size = 16
|
||||||
num_blocks = 2
|
num_blocks = 2
|
||||||
|
|
||||||
@@ -640,9 +627,7 @@ def test_kv_cache_events(
|
|||||||
seq, received = result
|
seq, received = result
|
||||||
|
|
||||||
assert seq == 0, "Sequence number mismatch"
|
assert seq == 0, "Sequence number mismatch"
|
||||||
assert len(received.events) == 1, (
|
assert len(received.events) == 1, "We should have exactly one BlockStored event"
|
||||||
"We should have exactly one BlockStored event"
|
|
||||||
)
|
|
||||||
event = received.events[0]
|
event = received.events[0]
|
||||||
assert isinstance(event, BlockStored), "We should have a BlockStored event"
|
assert isinstance(event, BlockStored), "We should have a BlockStored event"
|
||||||
assert len(event.block_hashes) == num_blocks, (
|
assert len(event.block_hashes) == num_blocks, (
|
||||||
@@ -672,12 +657,9 @@ def test_kv_cache_events(
|
|||||||
)
|
)
|
||||||
@multi_gpu_test(num_gpus=4)
|
@multi_gpu_test(num_gpus=4)
|
||||||
async def test_kv_cache_events_dp(
|
async def test_kv_cache_events_dp(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
multiprocessing_mode: bool,
|
multiprocessing_mode: bool,
|
||||||
publisher_config,
|
publisher_config,
|
||||||
):
|
):
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
block_size = 16
|
block_size = 16
|
||||||
num_blocks = 2
|
num_blocks = 2
|
||||||
dp_size = 2
|
dp_size = 2
|
||||||
@@ -765,8 +747,6 @@ async def test_kv_cache_events_dp(
|
|||||||
@pytest.mark.timeout(20)
|
@pytest.mark.timeout(20)
|
||||||
def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
|
def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
|
||||||
with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
|
with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
# Monkey-patch to extract core process pid while it's starting.
|
# Monkey-patch to extract core process pid while it's starting.
|
||||||
core_proc_pid = [None]
|
core_proc_pid = [None]
|
||||||
cepm_ctor = CoreEngineProcManager.__init__
|
cepm_ctor = CoreEngineProcManager.__init__
|
||||||
@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
|
|||||||
mock_executor_class.side_effect = create_mock_executor
|
mock_executor_class.side_effect = create_mock_executor
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices
|
m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices
|
||||||
|
|
||||||
from vllm.v1.engine.utils import EngineZmqAddresses
|
from vllm.v1.engine.utils import EngineZmqAddresses
|
||||||
|
|||||||
@@ -21,12 +21,10 @@ DTYPE = "half"
|
|||||||
def _vllm_model(
|
def _vllm_model(
|
||||||
apc: bool,
|
apc: bool,
|
||||||
vllm_runner: type[VllmRunner],
|
vllm_runner: type[VllmRunner],
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
*,
|
*,
|
||||||
skip_tokenizer_init: bool = False,
|
skip_tokenizer_init: bool = False,
|
||||||
):
|
):
|
||||||
"""Set up VllmRunner instance."""
|
"""Set up VllmRunner instance."""
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
return vllm_runner(
|
return vllm_runner(
|
||||||
MODEL,
|
MODEL,
|
||||||
dtype=DTYPE,
|
dtype=DTYPE,
|
||||||
@@ -45,16 +43,16 @@ def _vllm_model(
|
|||||||
# Prefix caching
|
# Prefix caching
|
||||||
params=[False, True],
|
params=[False, True],
|
||||||
)
|
)
|
||||||
def vllm_model(vllm_runner, request, monkeypatch):
|
def vllm_model(vllm_runner, request):
|
||||||
"""VllmRunner test fixture parameterized by APC True/False."""
|
"""VllmRunner test fixture parameterized by APC True/False."""
|
||||||
with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model:
|
with _vllm_model(request.param, vllm_runner) as vllm_model:
|
||||||
yield vllm_model
|
yield vllm_model
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
def vllm_model_apc(vllm_runner, monkeypatch):
|
def vllm_model_apc(vllm_runner):
|
||||||
"""VllmRunner test fixture with APC."""
|
"""VllmRunner test fixture with APC."""
|
||||||
with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model:
|
with _vllm_model(True, vllm_runner) as vllm_model:
|
||||||
yield vllm_model
|
yield vllm_model
|
||||||
|
|
||||||
|
|
||||||
@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
|
|||||||
# Prefix caching
|
# Prefix caching
|
||||||
params=[False, True],
|
params=[False, True],
|
||||||
)
|
)
|
||||||
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
|
def vllm_model_skip_tokenizer_init(vllm_runner, request):
|
||||||
"""VllmRunner test fixture with APC."""
|
"""VllmRunner test fixture with APC."""
|
||||||
with _vllm_model(
|
with _vllm_model(
|
||||||
request.param,
|
request.param,
|
||||||
vllm_runner,
|
vllm_runner,
|
||||||
monkeypatch,
|
|
||||||
skip_tokenizer_init=True,
|
skip_tokenizer_init=True,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
yield vllm_model
|
yield vllm_model
|
||||||
@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
|
def test_engine_metrics(vllm_runner, example_prompts):
|
||||||
max_tokens = 100
|
max_tokens = 100
|
||||||
# Use spec decoding to test num_accepted_tokens_per_pos
|
# Use spec decoding to test num_accepted_tokens_per_pos
|
||||||
speculative_config = {
|
speculative_config = {
|
||||||
@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
|
|||||||
"prompt_lookup_min": 3,
|
"prompt_lookup_min": 3,
|
||||||
"num_speculative_tokens": 5,
|
"num_speculative_tokens": 5,
|
||||||
}
|
}
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
MODEL,
|
MODEL,
|
||||||
speculative_config=speculative_config,
|
speculative_config=speculative_config,
|
||||||
@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
|
@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
|
||||||
def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch):
|
def test_skip_tokenizer_initialization(model: str):
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
# This test checks if the flag skip_tokenizer_init skips the initialization
|
# This test checks if the flag skip_tokenizer_init skips the initialization
|
||||||
# of tokenizer and detokenizer. The generated output is expected to contain
|
# of tokenizer and detokenizer. The generated output is expected to contain
|
||||||
# token ids.
|
# token ids.
|
||||||
|
|||||||
@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
|
|||||||
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
|
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
|
||||||
)
|
)
|
||||||
def test_structured_output(
|
def test_structured_output(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
sample_json_schema: dict[str, Any],
|
sample_json_schema: dict[str, Any],
|
||||||
unsupported_json_schema: dict[str, Any],
|
unsupported_json_schema: dict[str, Any],
|
||||||
sample_sql_ebnf: str,
|
sample_sql_ebnf: str,
|
||||||
@@ -115,8 +114,6 @@ def test_structured_output(
|
|||||||
model_name: str,
|
model_name: str,
|
||||||
speculative_config: dict[str, Any],
|
speculative_config: dict[str, Any],
|
||||||
):
|
):
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
if current_platform.is_tpu() and speculative_config:
|
if current_platform.is_tpu() and speculative_config:
|
||||||
pytest.skip("TPU does not support speculative decoding")
|
pytest.skip("TPU does not support speculative decoding")
|
||||||
|
|
||||||
@@ -620,15 +617,12 @@ Make the response as short as possible.
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_structured_output_with_reasoning_matrices(
|
def test_structured_output_with_reasoning_matrices(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
backend: str,
|
backend: str,
|
||||||
tokenizer_mode: TokenizerMode,
|
tokenizer_mode: TokenizerMode,
|
||||||
reasoning_parser: str,
|
reasoning_parser: str,
|
||||||
model_name: str,
|
model_name: str,
|
||||||
speculative_config: dict[str, Any] | None,
|
speculative_config: dict[str, Any] | None,
|
||||||
):
|
):
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
if current_platform.is_tpu() and speculative_config:
|
if current_platform.is_tpu() and speculative_config:
|
||||||
pytest.skip("TPU does not support speculative decoding")
|
pytest.skip("TPU does not support speculative decoding")
|
||||||
|
|
||||||
@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
|
|||||||
@pytest.mark.skip_global_cleanup
|
@pytest.mark.skip_global_cleanup
|
||||||
@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
|
@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
|
||||||
def test_structured_output_auto_mode(
|
def test_structured_output_auto_mode(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
unsupported_json_schema: dict[str, Any],
|
unsupported_json_schema: dict[str, Any],
|
||||||
model_name: str,
|
model_name: str,
|
||||||
tokenizer_mode: str,
|
tokenizer_mode: str,
|
||||||
):
|
):
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip_global_cleanup
|
@pytest.mark.skip_global_cleanup
|
||||||
def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
|
def test_guidance_no_additional_properties():
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
model="Qwen/Qwen2.5-1.5B-Instruct",
|
model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||||
max_model_len=1024,
|
max_model_len=1024,
|
||||||
@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
|
|||||||
|
|
||||||
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
|
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
|
||||||
def test_structured_output_batched_with_non_structured_outputs_requests(
|
def test_structured_output_batched_with_non_structured_outputs_requests(
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
sample_json_schema: dict[str, Any],
|
sample_json_schema: dict[str, Any],
|
||||||
backend: str,
|
backend: str,
|
||||||
):
|
):
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
# Don't use eager execution on TPUs because we want to test for no
|
# Don't use eager execution on TPUs because we want to test for no
|
||||||
# recompilation at runtime
|
# recompilation at runtime
|
||||||
enforce_eager = bool(not current_platform.is_tpu())
|
enforce_eager = bool(not current_platform.is_tpu())
|
||||||
|
|||||||
@@ -53,7 +53,6 @@ cleanup() {
|
|||||||
launch_baseline() {
|
launch_baseline() {
|
||||||
BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
|
BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
|
||||||
VLLM_LOGGING_LEVEL=DEBUG \
|
VLLM_LOGGING_LEVEL=DEBUG \
|
||||||
VLLM_USE_V1=1 \
|
|
||||||
PJRT_DEVICE=TPU \
|
PJRT_DEVICE=TPU \
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
|
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
|
||||||
@@ -73,7 +72,6 @@ launch_pd() {
|
|||||||
UCX_TLS=tcp \
|
UCX_TLS=tcp \
|
||||||
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
||||||
VLLM_LOGGING_LEVEL=DEBUG \
|
VLLM_LOGGING_LEVEL=DEBUG \
|
||||||
VLLM_USE_V1=1 \
|
|
||||||
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
|
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
|
||||||
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
|
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
|
||||||
PJRT_DEVICE=TPU \
|
PJRT_DEVICE=TPU \
|
||||||
@@ -93,7 +91,6 @@ launch_pd() {
|
|||||||
UCX_TLS=tcp \
|
UCX_TLS=tcp \
|
||||||
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
||||||
VLLM_LOGGING_LEVEL=DEBUG \
|
VLLM_LOGGING_LEVEL=DEBUG \
|
||||||
VLLM_USE_V1=1 \
|
|
||||||
PJRT_DEVICE=TPU \
|
PJRT_DEVICE=TPU \
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
|
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
|
||||||
|
|||||||
@@ -55,7 +55,6 @@ launch_pd() {
|
|||||||
UCX_TLS=tcp \
|
UCX_TLS=tcp \
|
||||||
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
||||||
VLLM_LOGGING_LEVEL=DEBUG \
|
VLLM_LOGGING_LEVEL=DEBUG \
|
||||||
VLLM_USE_V1=1 \
|
|
||||||
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
|
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
|
||||||
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
|
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
|
||||||
PJRT_DEVICE=TPU \
|
PJRT_DEVICE=TPU \
|
||||||
@@ -75,7 +74,6 @@ launch_pd() {
|
|||||||
UCX_TLS=tcp \
|
UCX_TLS=tcp \
|
||||||
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
||||||
VLLM_LOGGING_LEVEL=DEBUG \
|
VLLM_LOGGING_LEVEL=DEBUG \
|
||||||
VLLM_USE_V1=1 \
|
|
||||||
PJRT_DEVICE=TPU \
|
PJRT_DEVICE=TPU \
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
|
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import os
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import ray
|
import ray
|
||||||
@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
|
|||||||
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
|
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
|
||||||
from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger
|
from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
|
||||||
def use_v1_only(monkeypatch):
|
|
||||||
"""
|
|
||||||
The change relies on V1 APIs, so set VLLM_USE_V1=1.
|
|
||||||
"""
|
|
||||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
"distilbert/distilgpt2",
|
"distilbert/distilgpt2",
|
||||||
]
|
]
|
||||||
@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
|
|||||||
@ray.remote(num_gpus=1)
|
@ray.remote(num_gpus=1)
|
||||||
class EngineTestActor:
|
class EngineTestActor:
|
||||||
async def run(self):
|
async def run(self):
|
||||||
# Set environment variable inside the Ray actor since environment
|
|
||||||
# variables from pytest fixtures don't propagate to Ray actors
|
|
||||||
os.environ["VLLM_USE_V1"] = "1"
|
|
||||||
|
|
||||||
engine_args = AsyncEngineArgs(
|
engine_args = AsyncEngineArgs(
|
||||||
model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
|
model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
|
|||||||
batch_logprobs_composition: BatchLogprobsComposition,
|
batch_logprobs_composition: BatchLogprobsComposition,
|
||||||
temperature: float,
|
temperature: float,
|
||||||
example_prompts: list[str],
|
example_prompts: list[str],
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test V1 Engine logprobs & prompt logprobs
|
"""Test V1 Engine logprobs & prompt logprobs
|
||||||
|
|
||||||
@@ -308,12 +307,8 @@ def test_get_logprobs_and_prompt_logprobs(
|
|||||||
temperature: "temperature" sampling parameter
|
temperature: "temperature" sampling parameter
|
||||||
example_prompts: example prompt fixture
|
example_prompts: example prompt fixture
|
||||||
"""
|
"""
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
|
do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
|
||||||
if do_apc and (
|
if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
|
||||||
temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT
|
|
||||||
):
|
|
||||||
# Skip some test-cases to save time.
|
# Skip some test-cases to save time.
|
||||||
pytest.skip()
|
pytest.skip()
|
||||||
test_prompts = example_prompts
|
test_prompts = example_prompts
|
||||||
@@ -361,14 +356,11 @@ def test_get_logprobs_and_prompt_logprobs(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
|
def test_max_logprobs():
|
||||||
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
|
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
|
||||||
Should also fail for `prompt_logprobs > max_logprobs`
|
Should also fail for `prompt_logprobs > max_logprobs`
|
||||||
APC should not matter as this test checks basic request validation.
|
APC should not matter as this test checks basic request validation.
|
||||||
"""
|
"""
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
runner = VllmRunner(
|
runner = VllmRunner(
|
||||||
"facebook/opt-125m",
|
"facebook/opt-125m",
|
||||||
max_logprobs=1,
|
max_logprobs=1,
|
||||||
@@ -386,15 +378,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
|
|||||||
runner.generate(["Hello world"], sampling_params=bad_sampling_params)
|
runner.generate(["Hello world"], sampling_params=bad_sampling_params)
|
||||||
|
|
||||||
|
|
||||||
def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
|
def test_none_logprobs(vllm_model, example_prompts):
|
||||||
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
|
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vllm_model: vLLM model fixture
|
vllm_model: vLLM model fixture
|
||||||
example_prompts: list of example prompts (test fixture)
|
example_prompts: list of example prompts (test fixture)
|
||||||
"""
|
"""
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
|
|
||||||
sampling_params_logprobs_none = SamplingParams(
|
sampling_params_logprobs_none = SamplingParams(
|
||||||
@@ -416,15 +406,13 @@ def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
|
|||||||
assert results_logprobs_none[i].prompt_logprobs is None
|
assert results_logprobs_none[i].prompt_logprobs is None
|
||||||
|
|
||||||
|
|
||||||
def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
|
def test_zero_logprobs(vllm_model, example_prompts):
|
||||||
"""Engine should return sampled token and prompt token logprobs
|
"""Engine should return sampled token and prompt token logprobs
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vllm_model: vLLM model fixture
|
vllm_model: vLLM model fixture
|
||||||
example_prompts: list of example prompts (test fixture)
|
example_prompts: list of example prompts (test fixture)
|
||||||
"""
|
"""
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
|
|
||||||
sampling_params_logprobs_zero = SamplingParams(
|
sampling_params_logprobs_zero = SamplingParams(
|
||||||
@@ -450,14 +438,12 @@ def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPa
|
|||||||
assert len(prompt_token_ids) == len(prompt_logprobs)
|
assert len(prompt_token_ids) == len(prompt_logprobs)
|
||||||
|
|
||||||
|
|
||||||
def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
|
def test_all_logprobs(example_prompts):
|
||||||
"""Engine should return all vocabulary logprobs and prompt logprobs
|
"""Engine should return all vocabulary logprobs and prompt logprobs
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
example_prompts: list of example prompts (test fixture)
|
example_prompts: list of example prompts (test fixture)
|
||||||
"""
|
"""
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
runner = VllmRunner(
|
runner = VllmRunner(
|
||||||
"facebook/opt-125m",
|
"facebook/opt-125m",
|
||||||
max_logprobs=-1,
|
max_logprobs=-1,
|
||||||
@@ -488,16 +474,13 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
|
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
|
||||||
def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch):
|
def test_logprobs_mode(logprobs_mode: LogprobsMode):
|
||||||
"""Test with LLM engine with different logprobs_mode.
|
"""Test with LLM engine with different logprobs_mode.
|
||||||
For logprobs, we should have non-positive values.
|
For logprobs, we should have non-positive values.
|
||||||
For logits, we should expect at least one positive values.
|
For logits, we should expect at least one positive values.
|
||||||
"""
|
"""
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
llm = LLM(
|
llm = LLM(
|
||||||
"facebook/opt-125m",
|
"facebook/opt-125m",
|
||||||
max_logprobs=5,
|
max_logprobs=5,
|
||||||
|
|||||||
@@ -1,14 +1,10 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import os
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
if os.getenv("VLLM_USE_V1", "0") != "1":
|
|
||||||
pytest.skip("Test package requires V1", allow_module_level=True)
|
|
||||||
|
|
||||||
MODEL = "meta-llama/Llama-3.2-1B"
|
MODEL = "meta-llama/Llama-3.2-1B"
|
||||||
PROMPT = "Hello my name is Robert and I"
|
PROMPT = "Hello my name is Robert and I"
|
||||||
|
|
||||||
@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
|
|||||||
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
|
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
|
||||||
|
|
||||||
|
|
||||||
def test_priority(llm):
|
|
||||||
"""Check that we reject requests with priority."""
|
|
||||||
|
|
||||||
# Reject all allowed token ids
|
|
||||||
with pytest.raises(ValueError):
|
|
||||||
_ = llm.generate(PROMPT, priority=[1])
|
|
||||||
|
|
||||||
|
|
||||||
def test_seed(llm):
|
def test_seed(llm):
|
||||||
"""Check that seed impacts randomness."""
|
"""Check that seed impacts randomness."""
|
||||||
|
|
||||||
|
|||||||
@@ -38,7 +38,6 @@ def test_eagle_max_len(
|
|||||||
monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
|
monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
|
||||||
):
|
):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
|
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
|
||||||
|
|
||||||
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
|
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
|
||||||
|
|||||||
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
|
|||||||
@pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
|
@pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
|
||||||
def test_basic(
|
def test_basic(
|
||||||
vllm_runner: type[VllmRunner],
|
vllm_runner: type[VllmRunner],
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
model: str,
|
model: str,
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
tensor_parallel_size: int,
|
tensor_parallel_size: int,
|
||||||
@@ -55,9 +54,6 @@ def test_basic(
|
|||||||
)
|
)
|
||||||
example_prompts = [prompt]
|
example_prompts = [prompt]
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
# Note: max_num_batched_tokens == 1024 is needed here to
|
# Note: max_num_batched_tokens == 1024 is needed here to
|
||||||
@@ -82,7 +78,6 @@ def test_basic(
|
|||||||
@pytest.mark.parametrize("max_num_seqs", [16])
|
@pytest.mark.parametrize("max_num_seqs", [16])
|
||||||
def test_phi3(
|
def test_phi3(
|
||||||
vllm_runner: type[VllmRunner],
|
vllm_runner: type[VllmRunner],
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
max_tokens: int,
|
max_tokens: int,
|
||||||
max_num_seqs: int,
|
max_num_seqs: int,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -99,9 +94,6 @@ def test_phi3(
|
|||||||
# test head dim = 96
|
# test head dim = 96
|
||||||
model = "microsoft/Phi-3-mini-128k-instruct"
|
model = "microsoft/Phi-3-mini-128k-instruct"
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
|
model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
|
|||||||
)
|
)
|
||||||
def test_gemma3_27b_with_text_input_and_tp(
|
def test_gemma3_27b_with_text_input_and_tp(
|
||||||
vllm_runner: type[VllmRunner],
|
vllm_runner: type[VllmRunner],
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
model = "google/gemma-3-27b-it"
|
model = "google/gemma-3-27b-it"
|
||||||
max_tokens = 16
|
max_tokens = 16
|
||||||
@@ -140,9 +131,6 @@ def test_gemma3_27b_with_text_input_and_tp(
|
|||||||
" but in rising every time we fall.",
|
" but in rising every time we fall.",
|
||||||
]
|
]
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
max_num_batched_tokens=256,
|
max_num_batched_tokens=256,
|
||||||
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
|
|||||||
)
|
)
|
||||||
def test_w8a8_quantization(
|
def test_w8a8_quantization(
|
||||||
vllm_runner: type[VllmRunner],
|
vllm_runner: type[VllmRunner],
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
|
model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
|
||||||
max_tokens = 5
|
max_tokens = 5
|
||||||
@@ -176,9 +163,6 @@ def test_w8a8_quantization(
|
|||||||
)
|
)
|
||||||
example_prompts = [prompt]
|
example_prompts = [prompt]
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
max_num_batched_tokens=64,
|
max_num_batched_tokens=64,
|
||||||
|
|||||||
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
|
|||||||
@pytest.mark.parametrize("params", TEST_PARAMS)
|
@pytest.mark.parametrize("params", TEST_PARAMS)
|
||||||
def test_perf(
|
def test_perf(
|
||||||
vllm_runner: type[VllmRunner],
|
vllm_runner: type[VllmRunner],
|
||||||
monkeypatch: pytest.MonkeyPatch,
|
|
||||||
params: TestParams,
|
params: TestParams,
|
||||||
) -> None:
|
) -> None:
|
||||||
tokenizer = get_tokenizer(
|
tokenizer = get_tokenizer(
|
||||||
@@ -107,9 +106,6 @@ def test_perf(
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
max_tokens=params.decode_len, temperature=1.0, min_p=0.0
|
max_tokens=params.decode_len, temperature=1.0, min_p=0.0
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -82,7 +82,7 @@ def test_traces(
|
|||||||
):
|
):
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
|
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
|
||||||
m.setenv("VLLM_USE_V1", "1")
|
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
temperature=0.01,
|
temperature=0.01,
|
||||||
top_p=0.1,
|
top_p=0.1,
|
||||||
|
|||||||
@@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
|
|||||||
logger.info("Warming up model for the compilation...")
|
logger.info("Warming up model for the compilation...")
|
||||||
# Only generate graph for the generic shape
|
# Only generate graph for the generic shape
|
||||||
with _set_global_compilation_settings(self.vllm_config):
|
with _set_global_compilation_settings(self.vllm_config):
|
||||||
self._dummy_run(max(16, self.max_num_reqs))
|
self._dummy_run(
|
||||||
|
min(
|
||||||
|
max(16, self.max_num_reqs),
|
||||||
|
self.scheduler_config.max_num_batched_tokens,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
logger.info("Warming up done.")
|
logger.info("Warming up done.")
|
||||||
|
|
||||||
def _init_device_properties(self) -> None:
|
def _init_device_properties(self) -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user