[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -305,7 +305,6 @@ full_cg_backend_configs = {
|
||||
"CutlassMLA": BackendConfig(
|
||||
name="CutlassMLA",
|
||||
env_vars={
|
||||
"VLLM_USE_V1": "1",
|
||||
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
|
||||
"FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed
|
||||
},
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
|
||||
from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
|
||||
def new_kv_cache_spec():
|
||||
return FullAttentionSpec(16, 1, 1, torch.float32, False)
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
|
||||
if os.getenv("VLLM_USE_V1", "0") != "1":
|
||||
pytest.skip("Test package requires V1", allow_module_level=True)
|
||||
|
||||
MODEL = "meta-llama/Llama-3.2-1B"
|
||||
PROMPT = "Hello my name is Robert and I"
|
||||
|
||||
|
||||
@@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
|
||||
):
|
||||
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
|
||||
|
||||
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
|
||||
env_vars = backend_configs[backend_name].env_vars
|
||||
|
||||
with temporary_environ(env_vars), ExitStack() as stack:
|
||||
if not supported:
|
||||
@@ -117,7 +117,7 @@ combo_cases_2 = [
|
||||
def test_cudagraph_compilation_combo(combo_case):
|
||||
backend_name, cudagraph_mode, compilation_level, supported = combo_case
|
||||
|
||||
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
|
||||
env_vars = backend_configs[backend_name].env_vars
|
||||
|
||||
with temporary_environ(env_vars), ExitStack() as stack:
|
||||
if not supported:
|
||||
|
||||
@@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
|
||||
)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
|
||||
|
||||
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
|
||||
|
||||
@@ -32,7 +32,7 @@ model_config = {
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
|
||||
def test_sliding_window_retrieval(
|
||||
monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager
|
||||
model, batch_size, seed, disable_hybrid_kv_cache_manager
|
||||
):
|
||||
"""
|
||||
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
|
||||
@@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
|
||||
If we tell it upfront which we are going to be looking for, then
|
||||
it answers correctly (mostly).
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
test_config = model_config[model]
|
||||
|
||||
test_config = model_config[model]
|
||||
llm = LLM(
|
||||
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
|
||||
)
|
||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
|
||||
|
||||
llm = LLM(
|
||||
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
|
||||
)
|
||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
|
||||
prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)
|
||||
|
||||
prompts, answer, indices = prep_prompts(
|
||||
batch_size, ln_range=test_config.ln_range
|
||||
)
|
||||
check_length(prompts, llm, test_config.sliding_window)
|
||||
|
||||
check_length(prompts, llm, test_config.sliding_window)
|
||||
# Fresh generation
|
||||
responses = llm.generate(prompts, sampling_params)
|
||||
check_answers(
|
||||
indices,
|
||||
answer,
|
||||
[response.outputs[0].text for response in responses],
|
||||
accept_rate=1.0,
|
||||
)
|
||||
|
||||
# Fresh generation
|
||||
responses = llm.generate(prompts, sampling_params)
|
||||
check_answers(
|
||||
indices,
|
||||
answer,
|
||||
[response.outputs[0].text for response in responses],
|
||||
accept_rate=1.0,
|
||||
)
|
||||
|
||||
# Re-generate with the same prompts to test prefix caching
|
||||
responses = llm.generate(prompts, sampling_params)
|
||||
check_answers(
|
||||
indices,
|
||||
answer,
|
||||
[response.outputs[0].text for response in responses],
|
||||
accept_rate=1.0,
|
||||
)
|
||||
# Re-generate with the same prompts to test prefix caching
|
||||
responses = llm.generate(prompts, sampling_params)
|
||||
check_answers(
|
||||
indices,
|
||||
answer,
|
||||
[response.outputs[0].text for response in responses],
|
||||
accept_rate=1.0,
|
||||
)
|
||||
|
||||
|
||||
def check_length(prompts: list[str], llm: LLM, sliding_window: int):
|
||||
|
||||
@@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
|
||||
)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
# Make scheduling deterministic for reproducibility
|
||||
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
||||
|
||||
|
||||
@@ -13,7 +13,6 @@ Covers:
|
||||
5) Multiple stop conditions
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Optional, Union
|
||||
|
||||
import pytest
|
||||
@@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
|
||||
@pytest.fixture(scope="module")
|
||||
def llm_v1():
|
||||
"""Create V1 LLM instance for testing"""
|
||||
# Ensure V1 engine is used
|
||||
os.environ["VLLM_USE_V1"] = "1"
|
||||
|
||||
llm = LLM(
|
||||
model=TEST_MODEL,
|
||||
tensor_parallel_size=1,
|
||||
@@ -503,6 +499,6 @@ if __name__ == "__main__":
|
||||
|
||||
Usage:
|
||||
cd vllm/
|
||||
VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v
|
||||
python -m pytest tests/v1/e2e/test_min_tokens.py -v
|
||||
"""
|
||||
pytest.main([__file__, "-v"])
|
||||
|
||||
@@ -301,7 +301,6 @@ def test_mtp_correctness(
|
||||
model_setup: (method, model_name, tp_size)
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv("VLLM_MLA_DISABLE", "1")
|
||||
|
||||
method, model_name, tp_size = model_setup
|
||||
|
||||
@@ -95,17 +95,11 @@ async def generate(
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_load(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
output_kind: RequestOutputKind,
|
||||
engine_args: AsyncEngineArgs,
|
||||
prompt: PromptType,
|
||||
):
|
||||
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
|
||||
# so that in the future when we switch, we don't have to change all the
|
||||
# tests.
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
with ExitStack() as after:
|
||||
with set_default_torch_num_threads(1):
|
||||
engine = AsyncLLM.from_engine_args(engine_args)
|
||||
after.callback(engine.shutdown)
|
||||
@@ -149,14 +143,11 @@ async def test_load(
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_abort(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
output_kind: RequestOutputKind,
|
||||
engine_args: AsyncEngineArgs,
|
||||
prompt: PromptType,
|
||||
):
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
with ExitStack() as after:
|
||||
with set_default_torch_num_threads(1):
|
||||
engine = AsyncLLM.from_engine_args(engine_args)
|
||||
after.callback(engine.shutdown)
|
||||
@@ -222,13 +213,8 @@ async def test_abort(
|
||||
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_multi_abort(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
output_kind: RequestOutputKind,
|
||||
):
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
async def test_multi_abort(output_kind: RequestOutputKind):
|
||||
with ExitStack() as after:
|
||||
with set_default_torch_num_threads(1):
|
||||
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
||||
after.callback(engine.shutdown)
|
||||
@@ -304,14 +290,11 @@ async def test_multi_abort(
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_finished_flag(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
n: int,
|
||||
engine_args: AsyncEngineArgs,
|
||||
prompt: PromptType,
|
||||
):
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
with ExitStack() as after:
|
||||
with set_default_torch_num_threads(1):
|
||||
engine = AsyncLLM.from_engine_args(engine_args)
|
||||
after.callback(engine.shutdown)
|
||||
@@ -341,12 +324,10 @@ async def test_finished_flag(
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_mid_stream_cancellation(
|
||||
monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType
|
||||
engine_args: AsyncEngineArgs, prompt: PromptType
|
||||
):
|
||||
"""Test that requests can be cancelled mid-stream."""
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
with ExitStack() as after:
|
||||
with set_default_torch_num_threads(1):
|
||||
engine = AsyncLLM.from_engine_args(engine_args)
|
||||
after.callback(engine.shutdown)
|
||||
@@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
|
||||
be added to the default loggers.
|
||||
"""
|
||||
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
with ExitStack() as after:
|
||||
with set_default_torch_num_threads(1):
|
||||
engine = AsyncLLM.from_engine_args(
|
||||
TEXT_ENGINE_ARGS,
|
||||
@@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
|
||||
|
||||
|
||||
@pytest.mark.asyncio(scope="module")
|
||||
async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
async def test_dp_rank_argument():
|
||||
with ExitStack() as after:
|
||||
with set_default_torch_num_threads(1):
|
||||
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
||||
after.callback(engine.shutdown)
|
||||
@@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_check_health(monkeypatch: pytest.MonkeyPatch):
|
||||
async def test_check_health():
|
||||
"""Test that check_health returns normally for healthy engine
|
||||
and raises EngineDeadError when the engine is dead.
|
||||
"""
|
||||
@@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
from vllm.v1.engine.exceptions import EngineDeadError
|
||||
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
with ExitStack() as after:
|
||||
with set_default_torch_num_threads(1):
|
||||
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
||||
after.callback(engine.shutdown)
|
||||
@@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
|
||||
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
|
||||
)
|
||||
@pytest.mark.asyncio
|
||||
async def test_abort_final_output(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
output_kind: RequestOutputKind,
|
||||
):
|
||||
async def test_abort_final_output(output_kind: RequestOutputKind):
|
||||
"""Test that abort() returns a final output with correct information."""
|
||||
|
||||
with monkeypatch.context() as m, ExitStack() as after:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
with ExitStack() as after:
|
||||
with set_default_torch_num_threads(1):
|
||||
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
|
||||
after.callback(engine.shutdown)
|
||||
|
||||
@@ -5,18 +5,11 @@ from argparse import ArgumentError
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
if not envs.VLLM_USE_V1:
|
||||
pytest.skip(
|
||||
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
|
||||
allow_module_level=True,
|
||||
)
|
||||
|
||||
|
||||
def test_prefix_caching_from_cli():
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
|
||||
@@ -46,188 +46,184 @@ def make_request() -> EngineCoreRequest:
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
def test_engine_core(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
"""Setup the EngineCore."""
|
||||
engine_args = EngineArgs(model=MODEL_NAME)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
def test_engine_core():
|
||||
"""Setup the EngineCore."""
|
||||
engine_args = EngineArgs(model=MODEL_NAME)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
|
||||
with set_default_torch_num_threads(1):
|
||||
engine_core = EngineCore(
|
||||
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
|
||||
)
|
||||
"""Test basic request lifecycle."""
|
||||
with set_default_torch_num_threads(1):
|
||||
engine_core = EngineCore(
|
||||
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
|
||||
)
|
||||
"""Test basic request lifecycle."""
|
||||
|
||||
# First request.
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
# First request.
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 1
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 1
|
||||
|
||||
# Second request.
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 1
|
||||
# Second request.
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 1
|
||||
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
|
||||
# Add two requests in a row.
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
|
||||
assert len(engine_core.scheduler.waiting) == 2
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
# Add two requests in a row.
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
|
||||
assert len(engine_core.scheduler.waiting) == 2
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 4
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 4
|
||||
|
||||
# Loop through until they are all done.
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
# Loop through until they are all done.
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
"""Test abort cycle."""
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
"""Test abort cycle."""
|
||||
|
||||
# Basic abort.
|
||||
req = make_request()
|
||||
request_id = req.request_id
|
||||
# Basic abort.
|
||||
req = make_request()
|
||||
request_id = req.request_id
|
||||
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req))
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
assert engine_core.scheduler.has_unfinished_requests()
|
||||
assert not engine_core.scheduler.has_finished_requests()
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req))
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
assert engine_core.scheduler.has_unfinished_requests()
|
||||
assert not engine_core.scheduler.has_finished_requests()
|
||||
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 1
|
||||
assert engine_core.scheduler.has_unfinished_requests()
|
||||
assert not engine_core.scheduler.has_finished_requests()
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 1
|
||||
assert engine_core.scheduler.has_unfinished_requests()
|
||||
assert not engine_core.scheduler.has_finished_requests()
|
||||
|
||||
engine_core.abort_requests([request_id])
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
assert not engine_core.scheduler.has_unfinished_requests()
|
||||
assert engine_core.scheduler.has_finished_requests()
|
||||
engine_core.abort_requests([request_id])
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
assert not engine_core.scheduler.has_unfinished_requests()
|
||||
assert engine_core.scheduler.has_finished_requests()
|
||||
|
||||
_ = engine_core.step()
|
||||
assert not engine_core.scheduler.has_unfinished_requests()
|
||||
assert not engine_core.scheduler.has_finished_requests()
|
||||
_ = engine_core.step()
|
||||
assert not engine_core.scheduler.has_unfinished_requests()
|
||||
assert not engine_core.scheduler.has_finished_requests()
|
||||
|
||||
# Add, step, abort 1 of the 3.
|
||||
req0 = make_request()
|
||||
req1 = make_request()
|
||||
req2 = make_request()
|
||||
# Add, step, abort 1 of the 3.
|
||||
req0 = make_request()
|
||||
req1 = make_request()
|
||||
req2 = make_request()
|
||||
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req0))
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req1))
|
||||
assert len(engine_core.scheduler.waiting) == 2
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req0))
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req1))
|
||||
assert len(engine_core.scheduler.waiting) == 2
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req2))
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req2))
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 3
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 3
|
||||
|
||||
# Abort just one.
|
||||
engine_core.abort_requests([req1.request_id])
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
# Abort just one.
|
||||
engine_core.abort_requests([req1.request_id])
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
_ = engine_core.step()
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 2
|
||||
|
||||
# Abort the other requests at the same time.
|
||||
engine_core.abort_requests([req2.request_id, req0.request_id])
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
# Abort the other requests at the same time.
|
||||
engine_core.abort_requests([req2.request_id, req0.request_id])
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
|
||||
# Sending duplicate requests with same request_id
|
||||
req0 = make_request()
|
||||
req1 = make_request()
|
||||
req0.request_id = req1.request_id = "test"
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req0))
|
||||
# Sending duplicate requests with same request_id
|
||||
req0 = make_request()
|
||||
req1 = make_request()
|
||||
req0.request_id = req1.request_id = "test"
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req0))
|
||||
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req1))
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req1))
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
|
||||
def test_engine_core_advanced_sampling():
|
||||
"""
|
||||
A basic end-to-end test to verify that the engine functions correctly
|
||||
when additional sampling parameters, such as top_p, min_tokens, and
|
||||
presence_penalty, are set.
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
"""Setup the EngineCore."""
|
||||
engine_args = EngineArgs(model=MODEL_NAME)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
"""Setup the EngineCore."""
|
||||
engine_args = EngineArgs(model=MODEL_NAME)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
|
||||
with set_default_torch_num_threads(1):
|
||||
engine_core = EngineCore(
|
||||
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
|
||||
)
|
||||
"""Test basic request lifecycle."""
|
||||
# First request.
|
||||
request: EngineCoreRequest = make_request()
|
||||
request.sampling_params = SamplingParams(
|
||||
min_tokens=4,
|
||||
presence_penalty=1.0,
|
||||
frequency_penalty=1.0,
|
||||
repetition_penalty=0.1,
|
||||
stop_token_ids=[1001, 1002],
|
||||
with set_default_torch_num_threads(1):
|
||||
engine_core = EngineCore(
|
||||
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
|
||||
)
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(request))
|
||||
"""Test basic request lifecycle."""
|
||||
# First request.
|
||||
request: EngineCoreRequest = make_request()
|
||||
request.sampling_params = SamplingParams(
|
||||
min_tokens=4,
|
||||
presence_penalty=1.0,
|
||||
frequency_penalty=1.0,
|
||||
repetition_penalty=0.1,
|
||||
stop_token_ids=[1001, 1002],
|
||||
)
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(request))
|
||||
|
||||
def _check_engine_state():
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
# Loop through until they are all done.
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
def _check_engine_state():
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
# Loop through until they are all done.
|
||||
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
|
||||
pass
|
||||
assert len(engine_core.scheduler.waiting) == 0
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
|
||||
_check_engine_state()
|
||||
_check_engine_state()
|
||||
|
||||
# Second request.
|
||||
request2 = make_request()
|
||||
request2.sampling_params = SamplingParams(
|
||||
top_p=0.99,
|
||||
top_k=50,
|
||||
)
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(request2))
|
||||
_check_engine_state()
|
||||
# Second request.
|
||||
request2 = make_request()
|
||||
request2.sampling_params = SamplingParams(
|
||||
top_p=0.99,
|
||||
top_k=50,
|
||||
)
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(request2))
|
||||
_check_engine_state()
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
def test_engine_core_concurrent_batches():
|
||||
"""
|
||||
Test that the engine can handle multiple concurrent batches.
|
||||
"""
|
||||
@@ -272,173 +268,163 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
|
||||
if hasattr(self, "thread_pool"):
|
||||
self.thread_pool.shutdown(wait=False)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
# To test concurrent batches.
|
||||
max_num_seqs=2,
|
||||
# Avoid all requests being scheduled once.
|
||||
enable_prefix_caching=False,
|
||||
max_num_batched_tokens=10,
|
||||
# Reduce startup time.
|
||||
enforce_eager=True,
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
# To test concurrent batches.
|
||||
max_num_seqs=2,
|
||||
# Avoid all requests being scheduled once.
|
||||
enable_prefix_caching=False,
|
||||
max_num_batched_tokens=10,
|
||||
# Reduce startup time.
|
||||
enforce_eager=True,
|
||||
)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
with set_default_torch_num_threads(1):
|
||||
engine_core = EngineCore(
|
||||
vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor
|
||||
)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
with set_default_torch_num_threads(1):
|
||||
engine_core = EngineCore(
|
||||
vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor
|
||||
assert engine_core.batch_queue is not None
|
||||
|
||||
# Add two requests in a row. Each request have 12 prompt tokens.
|
||||
req0 = make_request_with_max_tokens("0", 5)
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req0))
|
||||
req1 = make_request_with_max_tokens("1", 5)
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req1))
|
||||
|
||||
# Schedule Batch 1: (10, req0)
|
||||
assert engine_core.step_with_batch_queue()[0] is None
|
||||
assert len(engine_core.batch_queue) == 1
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["0"] == 10
|
||||
# num_computed_tokens should have been updated immediately.
|
||||
assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
|
||||
|
||||
# Schedule Batch 2: (2, req0), (8, req1)
|
||||
assert engine_core.step_with_batch_queue()[0] == {}
|
||||
assert len(engine_core.batch_queue) == 1
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["0"] == 2
|
||||
assert scheduler_output.num_scheduled_tokens["1"] == 8
|
||||
# num_computed_tokens should have been updated immediately.
|
||||
assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
|
||||
assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
|
||||
|
||||
assert engine_core.scheduler.get_num_unfinished_requests() == 2
|
||||
|
||||
# Finish Batch 1 and schedule Batch 3: (4, req1).
|
||||
# Note that req0 cannot be scheduled
|
||||
# because it is in the decoding stage now.
|
||||
engine_core.step_with_batch_queue()
|
||||
assert len(engine_core.batch_queue) == 1
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["1"] == 4
|
||||
|
||||
# Finish Batch 2. Get first token of req0.
|
||||
# Schedule Batch 4: (1, req0).
|
||||
output = engine_core.step_with_batch_queue()[0].get(0)
|
||||
assert output is not None
|
||||
assert len(output.outputs) == 1
|
||||
assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["0"] == 1
|
||||
|
||||
# Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
|
||||
output = engine_core.step_with_batch_queue()[0].get(0)
|
||||
assert output is not None
|
||||
assert len(output.outputs) == 1
|
||||
assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["1"] == 1
|
||||
|
||||
# Loop until req0 is finished.
|
||||
req_id = 0
|
||||
expected_num_tokens = [
|
||||
engine_core.scheduler.requests["0"].num_tokens + 1,
|
||||
engine_core.scheduler.requests["1"].num_tokens + 1,
|
||||
]
|
||||
while engine_core.scheduler.get_num_unfinished_requests() == 2:
|
||||
output = engine_core.step_with_batch_queue()[0]
|
||||
# Every step consumes an output.
|
||||
assert output is not None
|
||||
assert len(output[0].outputs) == 1
|
||||
if req_id in engine_core.scheduler.requests:
|
||||
assert (
|
||||
engine_core.scheduler.requests[req_id].num_tokens
|
||||
== expected_num_tokens[req_id]
|
||||
)
|
||||
assert engine_core.batch_queue is not None
|
||||
|
||||
# Add two requests in a row. Each request have 12 prompt tokens.
|
||||
req0 = make_request_with_max_tokens("0", 5)
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req0))
|
||||
req1 = make_request_with_max_tokens("1", 5)
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(req1))
|
||||
|
||||
# Schedule Batch 1: (10, req0)
|
||||
assert engine_core.step_with_batch_queue()[0] is None
|
||||
assert len(engine_core.batch_queue) == 1
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["0"] == 10
|
||||
# num_computed_tokens should have been updated immediately.
|
||||
assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
|
||||
|
||||
# Schedule Batch 2: (2, req0), (8, req1)
|
||||
assert engine_core.step_with_batch_queue()[0] == {}
|
||||
assert len(engine_core.batch_queue) == 1
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["0"] == 2
|
||||
assert scheduler_output.num_scheduled_tokens["1"] == 8
|
||||
# num_computed_tokens should have been updated immediately.
|
||||
assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
|
||||
assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
|
||||
|
||||
assert engine_core.scheduler.get_num_unfinished_requests() == 2
|
||||
|
||||
# Finish Batch 1 and schedule Batch 3: (4, req1).
|
||||
# Note that req0 cannot be scheduled
|
||||
# because it is in the decoding stage now.
|
||||
engine_core.step_with_batch_queue()
|
||||
assert len(engine_core.batch_queue) == 1
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["1"] == 4
|
||||
|
||||
# Finish Batch 2. Get first token of req0.
|
||||
# Schedule Batch 4: (1, req0).
|
||||
output = engine_core.step_with_batch_queue()[0].get(0)
|
||||
assert output is not None
|
||||
assert len(output.outputs) == 1
|
||||
assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["0"] == 1
|
||||
|
||||
# Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
|
||||
output = engine_core.step_with_batch_queue()[0].get(0)
|
||||
assert output is not None
|
||||
assert len(output.outputs) == 1
|
||||
assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
|
||||
scheduler_output = engine_core.batch_queue[-1][1]
|
||||
assert scheduler_output.num_scheduled_tokens["1"] == 1
|
||||
|
||||
# Loop until req0 is finished.
|
||||
req_id = 0
|
||||
expected_num_tokens = [
|
||||
engine_core.scheduler.requests["0"].num_tokens + 1,
|
||||
engine_core.scheduler.requests["1"].num_tokens + 1,
|
||||
]
|
||||
while engine_core.scheduler.get_num_unfinished_requests() == 2:
|
||||
output = engine_core.step_with_batch_queue()[0]
|
||||
# Every step consumes an output.
|
||||
assert output is not None
|
||||
assert len(output[0].outputs) == 1
|
||||
if req_id in engine_core.scheduler.requests:
|
||||
assert (
|
||||
engine_core.scheduler.requests[req_id].num_tokens
|
||||
== expected_num_tokens[req_id]
|
||||
)
|
||||
expected_num_tokens[req_id] += 1
|
||||
req_id = (req_id + 1) % 2
|
||||
expected_num_tokens[req_id] += 1
|
||||
req_id = (req_id + 1) % 2
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch):
|
||||
def test_engine_core_tp():
|
||||
"""
|
||||
Test engine can initialize worker in tp properly
|
||||
"""
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
"""Setup the EngineCore."""
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
tensor_parallel_size=2,
|
||||
# Reduce startup time.
|
||||
enforce_eager=True,
|
||||
)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
"""Setup the EngineCore."""
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
tensor_parallel_size=2,
|
||||
# Reduce startup time.
|
||||
enforce_eager=True,
|
||||
)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
|
||||
with set_default_torch_num_threads(1):
|
||||
engine_core = EngineCore(
|
||||
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
|
||||
)
|
||||
|
||||
def get_worker_cache_config_field(worker, key: str):
|
||||
return getattr(worker.cache_config, key)
|
||||
|
||||
num_gpu_blocks = engine_core.collective_rpc(
|
||||
get_worker_cache_config_field, args=("num_gpu_blocks",)
|
||||
with set_default_torch_num_threads(1):
|
||||
engine_core = EngineCore(
|
||||
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
|
||||
)
|
||||
num_cpu_blocks = engine_core.collective_rpc(
|
||||
get_worker_cache_config_field, args=("num_cpu_blocks",)
|
||||
)
|
||||
assert all(x is not None for x in num_gpu_blocks)
|
||||
assert all(x is not None for x in num_cpu_blocks)
|
||||
|
||||
def get_worker_cache_config_field(worker, key: str):
|
||||
return getattr(worker.cache_config, key)
|
||||
|
||||
num_gpu_blocks = engine_core.collective_rpc(
|
||||
get_worker_cache_config_field, args=("num_gpu_blocks",)
|
||||
)
|
||||
num_cpu_blocks = engine_core.collective_rpc(
|
||||
get_worker_cache_config_field, args=("num_cpu_blocks",)
|
||||
)
|
||||
assert all(x is not None for x in num_gpu_blocks)
|
||||
assert all(x is not None for x in num_cpu_blocks)
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch):
|
||||
def test_engine_core_invalid_request_id_type():
|
||||
"""Test that engine raises TypeError for non-string request_id."""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
engine_args = EngineArgs(model=MODEL_NAME)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
|
||||
engine_args = EngineArgs(model=MODEL_NAME)
|
||||
vllm_config = engine_args.create_engine_config()
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
with set_default_torch_num_threads(1):
|
||||
engine_core = EngineCore(
|
||||
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
|
||||
)
|
||||
|
||||
with set_default_torch_num_threads(1):
|
||||
engine_core = EngineCore(
|
||||
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
|
||||
)
|
||||
# Test with UUID object (common mistake)
|
||||
uuid_request = make_request()
|
||||
uuid_request.request_id = uuid.uuid4() # UUID object instead of string
|
||||
|
||||
# Test with UUID object (common mistake)
|
||||
uuid_request = make_request()
|
||||
uuid_request.request_id = uuid.uuid4() # UUID object instead of string
|
||||
with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"):
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(uuid_request))
|
||||
|
||||
with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"):
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(uuid_request))
|
||||
# Test with integer
|
||||
int_request = make_request()
|
||||
int_request.request_id = 12345
|
||||
|
||||
# Test with integer
|
||||
int_request = make_request()
|
||||
int_request.request_id = 12345
|
||||
with pytest.raises(TypeError, match="request_id must be a string, got.*int"):
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(int_request))
|
||||
|
||||
with pytest.raises(TypeError, match="request_id must be a string, got.*int"):
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(int_request))
|
||||
# Test with None
|
||||
none_request = make_request()
|
||||
none_request.request_id = None
|
||||
|
||||
# Test with None
|
||||
none_request = make_request()
|
||||
none_request.request_id = None
|
||||
with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"):
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(none_request))
|
||||
|
||||
with pytest.raises(
|
||||
TypeError, match="request_id must be a string, got.*NoneType"
|
||||
):
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(none_request))
|
||||
|
||||
# Verify engine is still functional after errors
|
||||
valid_request = make_request()
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
# Verify engine is still functional after errors
|
||||
valid_request = make_request()
|
||||
engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
|
||||
assert len(engine_core.scheduler.waiting) == 1
|
||||
assert len(engine_core.scheduler.running) == 0
|
||||
|
||||
@@ -130,8 +130,6 @@ def test_engine_core_client(
|
||||
monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
# Monkey-patch core engine utility function to test.
|
||||
m.setattr(EngineCore, "echo", echo, raising=False)
|
||||
|
||||
@@ -218,8 +216,6 @@ def test_engine_core_client(
|
||||
@pytest.mark.asyncio(loop_scope="function")
|
||||
async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
# Monkey-patch core engine utility function to test.
|
||||
m.setattr(EngineCore, "echo", echo, raising=False)
|
||||
|
||||
@@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
# Must set insecure serialization to allow returning custom types.
|
||||
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
@@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
# Must set insecure serialization to allow returning custom types.
|
||||
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
@@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
# Must set insecure serialization to allow returning custom types.
|
||||
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
@@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures(
|
||||
indirect=["publisher_config"],
|
||||
)
|
||||
def test_kv_cache_events(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
multiprocessing_mode: bool,
|
||||
publisher_config,
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
block_size = 16
|
||||
num_blocks = 2
|
||||
block_size = 16
|
||||
num_blocks = 2
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=True,
|
||||
block_size=block_size,
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=True,
|
||||
block_size=block_size,
|
||||
)
|
||||
engine_args.kv_events_config = publisher_config
|
||||
|
||||
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
|
||||
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
with set_default_torch_num_threads(1):
|
||||
client = EngineCoreClient.make_client(
|
||||
multiprocess_mode=multiprocessing_mode,
|
||||
asyncio_mode=False,
|
||||
vllm_config=vllm_config,
|
||||
executor_class=executor_class,
|
||||
log_stats=False,
|
||||
)
|
||||
engine_args.kv_events_config = publisher_config
|
||||
endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
|
||||
subscriber = MockSubscriber(
|
||||
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
|
||||
)
|
||||
|
||||
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
|
||||
try:
|
||||
custom_tokens = list(range(num_blocks * block_size))
|
||||
sampling_params = SamplingParams(max_tokens=1)
|
||||
request = make_request(sampling_params, custom_tokens)
|
||||
client.add_request(request)
|
||||
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
with set_default_torch_num_threads(1):
|
||||
client = EngineCoreClient.make_client(
|
||||
multiprocess_mode=multiprocessing_mode,
|
||||
asyncio_mode=False,
|
||||
vllm_config=vllm_config,
|
||||
executor_class=executor_class,
|
||||
log_stats=False,
|
||||
)
|
||||
endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
|
||||
subscriber = MockSubscriber(
|
||||
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
|
||||
outputs: dict[str, list] = {request.request_id: []}
|
||||
loop_until_done(client, outputs)
|
||||
|
||||
result = subscriber.receive_one(timeout=1000)
|
||||
assert result is not None, "No message received"
|
||||
|
||||
seq, received = result
|
||||
|
||||
assert seq == 0, "Sequence number mismatch"
|
||||
assert len(received.events) == 1, "We should have exactly one BlockStored event"
|
||||
event = received.events[0]
|
||||
assert isinstance(event, BlockStored), "We should have a BlockStored event"
|
||||
assert len(event.block_hashes) == num_blocks, (
|
||||
"We should have a BlockStored event with 2 block_hashes"
|
||||
)
|
||||
|
||||
try:
|
||||
custom_tokens = list(range(num_blocks * block_size))
|
||||
sampling_params = SamplingParams(max_tokens=1)
|
||||
request = make_request(sampling_params, custom_tokens)
|
||||
client.add_request(request)
|
||||
|
||||
outputs: dict[str, list] = {request.request_id: []}
|
||||
loop_until_done(client, outputs)
|
||||
|
||||
result = subscriber.receive_one(timeout=1000)
|
||||
assert result is not None, "No message received"
|
||||
|
||||
seq, received = result
|
||||
|
||||
assert seq == 0, "Sequence number mismatch"
|
||||
assert len(received.events) == 1, (
|
||||
"We should have exactly one BlockStored event"
|
||||
)
|
||||
event = received.events[0]
|
||||
assert isinstance(event, BlockStored), "We should have a BlockStored event"
|
||||
assert len(event.block_hashes) == num_blocks, (
|
||||
"We should have a BlockStored event with 2 block_hashes"
|
||||
)
|
||||
assert event.block_size == block_size, (
|
||||
"Block size should be the same as the block size"
|
||||
)
|
||||
assert event.parent_block_hash is None, "Parent block hash should be None"
|
||||
assert event.lora_id is None, "Lora id should be None"
|
||||
assert len(event.token_ids) == num_blocks * block_size, (
|
||||
"Token ids should be the same as the custom tokens"
|
||||
)
|
||||
assert event.token_ids == custom_tokens, (
|
||||
"Token ids should be the same as the custom tokens"
|
||||
)
|
||||
finally:
|
||||
client.shutdown()
|
||||
subscriber.close()
|
||||
assert event.block_size == block_size, (
|
||||
"Block size should be the same as the block size"
|
||||
)
|
||||
assert event.parent_block_hash is None, "Parent block hash should be None"
|
||||
assert event.lora_id is None, "Lora id should be None"
|
||||
assert len(event.token_ids) == num_blocks * block_size, (
|
||||
"Token ids should be the same as the custom tokens"
|
||||
)
|
||||
assert event.token_ids == custom_tokens, (
|
||||
"Token ids should be the same as the custom tokens"
|
||||
)
|
||||
finally:
|
||||
client.shutdown()
|
||||
subscriber.close()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -672,101 +657,96 @@ def test_kv_cache_events(
|
||||
)
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
async def test_kv_cache_events_dp(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
multiprocessing_mode: bool,
|
||||
publisher_config,
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
block_size = 16
|
||||
num_blocks = 2
|
||||
dp_size = 2
|
||||
tp_size = 2
|
||||
block_size = 16
|
||||
num_blocks = 2
|
||||
dp_size = 2
|
||||
tp_size = 2
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=True,
|
||||
data_parallel_size=dp_size,
|
||||
tensor_parallel_size=tp_size,
|
||||
block_size=block_size,
|
||||
engine_args = EngineArgs(
|
||||
model=MODEL_NAME,
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=True,
|
||||
data_parallel_size=dp_size,
|
||||
tensor_parallel_size=tp_size,
|
||||
block_size=block_size,
|
||||
)
|
||||
engine_args.kv_events_config = publisher_config
|
||||
|
||||
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
|
||||
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
with set_default_torch_num_threads(1):
|
||||
client = EngineCoreClient.make_client(
|
||||
multiprocess_mode=multiprocessing_mode,
|
||||
asyncio_mode=True,
|
||||
vllm_config=vllm_config,
|
||||
executor_class=executor_class,
|
||||
log_stats=False,
|
||||
)
|
||||
engine_args.kv_events_config = publisher_config
|
||||
await asyncio.sleep(1)
|
||||
|
||||
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
|
||||
# Build endpoints for all DP ranks
|
||||
base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
|
||||
endpoints = []
|
||||
for i in range(dp_size):
|
||||
offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
|
||||
endpoints.append(offset_endpoint)
|
||||
|
||||
executor_class = Executor.get_class(vllm_config)
|
||||
with set_default_torch_num_threads(1):
|
||||
client = EngineCoreClient.make_client(
|
||||
multiprocess_mode=multiprocessing_mode,
|
||||
asyncio_mode=True,
|
||||
vllm_config=vllm_config,
|
||||
executor_class=executor_class,
|
||||
log_stats=False,
|
||||
)
|
||||
await asyncio.sleep(1)
|
||||
subscriber = MockSubscriber(
|
||||
endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
|
||||
)
|
||||
|
||||
# Build endpoints for all DP ranks
|
||||
base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
|
||||
endpoints = []
|
||||
for i in range(dp_size):
|
||||
offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
|
||||
endpoints.append(offset_endpoint)
|
||||
try:
|
||||
custom_tokens = list(range(num_blocks * block_size))
|
||||
sampling_params = SamplingParams(max_tokens=1)
|
||||
all_request_ids = []
|
||||
|
||||
subscriber = MockSubscriber(
|
||||
endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
|
||||
# Create and add 25 requests
|
||||
# NOTE: attempts to force routing to both dp groups but can be flaky
|
||||
for i in range(25):
|
||||
await asyncio.sleep(0.01)
|
||||
request = make_request(sampling_params, custom_tokens)
|
||||
await client.add_request_async(request)
|
||||
all_request_ids.append(request.request_id)
|
||||
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
# Initialize outputs dict for all requests
|
||||
outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
|
||||
|
||||
print("processing requests...")
|
||||
await asyncio.wait_for(
|
||||
loop_until_fully_done_async(client, outputs), timeout=20.0
|
||||
)
|
||||
|
||||
try:
|
||||
custom_tokens = list(range(num_blocks * block_size))
|
||||
sampling_params = SamplingParams(max_tokens=1)
|
||||
all_request_ids = []
|
||||
# Receive from subscriber until no more messages
|
||||
print("collecting results...")
|
||||
results = []
|
||||
while True:
|
||||
result = subscriber.receive_one(timeout=1)
|
||||
print(result)
|
||||
if result is None:
|
||||
break
|
||||
results.append(result)
|
||||
|
||||
# Create and add 25 requests
|
||||
# NOTE: attempts to force routing to both dp groups but can be flaky
|
||||
for i in range(25):
|
||||
await asyncio.sleep(0.01)
|
||||
request = make_request(sampling_params, custom_tokens)
|
||||
await client.add_request_async(request)
|
||||
all_request_ids.append(request.request_id)
|
||||
# Collect all events and data_parallel_ranks from all results
|
||||
all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
|
||||
unique_dps = set(all_dp_ranks)
|
||||
assert len(unique_dps) == 2, (
|
||||
f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
|
||||
)
|
||||
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
# Initialize outputs dict for all requests
|
||||
outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
|
||||
|
||||
print("processing requests...")
|
||||
await asyncio.wait_for(
|
||||
loop_until_fully_done_async(client, outputs), timeout=20.0
|
||||
)
|
||||
|
||||
# Receive from subscriber until no more messages
|
||||
print("collecting results...")
|
||||
results = []
|
||||
while True:
|
||||
result = subscriber.receive_one(timeout=1)
|
||||
print(result)
|
||||
if result is None:
|
||||
break
|
||||
results.append(result)
|
||||
|
||||
# Collect all events and data_parallel_ranks from all results
|
||||
all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
|
||||
unique_dps = set(all_dp_ranks)
|
||||
assert len(unique_dps) == 2, (
|
||||
f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
|
||||
)
|
||||
|
||||
finally:
|
||||
client.shutdown()
|
||||
subscriber.close()
|
||||
finally:
|
||||
client.shutdown()
|
||||
subscriber.close()
|
||||
|
||||
|
||||
@pytest.mark.timeout(20)
|
||||
def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
# Monkey-patch to extract core process pid while it's starting.
|
||||
core_proc_pid = [None]
|
||||
cepm_ctor = CoreEngineProcManager.__init__
|
||||
@@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
|
||||
mock_executor_class.side_effect = create_mock_executor
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices
|
||||
|
||||
from vllm.v1.engine.utils import EngineZmqAddresses
|
||||
|
||||
@@ -21,12 +21,10 @@ DTYPE = "half"
|
||||
def _vllm_model(
|
||||
apc: bool,
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
*,
|
||||
skip_tokenizer_init: bool = False,
|
||||
):
|
||||
"""Set up VllmRunner instance."""
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
return vllm_runner(
|
||||
MODEL,
|
||||
dtype=DTYPE,
|
||||
@@ -45,16 +43,16 @@ def _vllm_model(
|
||||
# Prefix caching
|
||||
params=[False, True],
|
||||
)
|
||||
def vllm_model(vllm_runner, request, monkeypatch):
|
||||
def vllm_model(vllm_runner, request):
|
||||
"""VllmRunner test fixture parameterized by APC True/False."""
|
||||
with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model:
|
||||
with _vllm_model(request.param, vllm_runner) as vllm_model:
|
||||
yield vllm_model
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def vllm_model_apc(vllm_runner, monkeypatch):
|
||||
def vllm_model_apc(vllm_runner):
|
||||
"""VllmRunner test fixture with APC."""
|
||||
with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model:
|
||||
with _vllm_model(True, vllm_runner) as vllm_model:
|
||||
yield vllm_model
|
||||
|
||||
|
||||
@@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
|
||||
# Prefix caching
|
||||
params=[False, True],
|
||||
)
|
||||
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
|
||||
def vllm_model_skip_tokenizer_init(vllm_runner, request):
|
||||
"""VllmRunner test fixture with APC."""
|
||||
with _vllm_model(
|
||||
request.param,
|
||||
vllm_runner,
|
||||
monkeypatch,
|
||||
skip_tokenizer_init=True,
|
||||
) as vllm_model:
|
||||
yield vllm_model
|
||||
@@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
|
||||
)
|
||||
|
||||
|
||||
def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
|
||||
def test_engine_metrics(vllm_runner, example_prompts):
|
||||
max_tokens = 100
|
||||
# Use spec decoding to test num_accepted_tokens_per_pos
|
||||
speculative_config = {
|
||||
@@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
|
||||
"prompt_lookup_min": 3,
|
||||
"num_speculative_tokens": 5,
|
||||
}
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
with vllm_runner(
|
||||
MODEL,
|
||||
speculative_config=speculative_config,
|
||||
@@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
|
||||
def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
def test_skip_tokenizer_initialization(model: str):
|
||||
# This test checks if the flag skip_tokenizer_init skips the initialization
|
||||
# of tokenizer and detokenizer. The generated output is expected to contain
|
||||
# token ids.
|
||||
|
||||
@@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
|
||||
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
|
||||
)
|
||||
def test_structured_output(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
sample_json_schema: dict[str, Any],
|
||||
unsupported_json_schema: dict[str, Any],
|
||||
sample_sql_ebnf: str,
|
||||
@@ -115,8 +114,6 @@ def test_structured_output(
|
||||
model_name: str,
|
||||
speculative_config: dict[str, Any],
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
if current_platform.is_tpu() and speculative_config:
|
||||
pytest.skip("TPU does not support speculative decoding")
|
||||
|
||||
@@ -620,15 +617,12 @@ Make the response as short as possible.
|
||||
],
|
||||
)
|
||||
def test_structured_output_with_reasoning_matrices(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
backend: str,
|
||||
tokenizer_mode: TokenizerMode,
|
||||
reasoning_parser: str,
|
||||
model_name: str,
|
||||
speculative_config: dict[str, Any] | None,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
if current_platform.is_tpu() and speculative_config:
|
||||
pytest.skip("TPU does not support speculative decoding")
|
||||
|
||||
@@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
|
||||
def test_structured_output_auto_mode(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
unsupported_json_schema: dict[str, Any],
|
||||
model_name: str,
|
||||
tokenizer_mode: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=1024,
|
||||
@@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
def test_guidance_no_additional_properties():
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
max_model_len=1024,
|
||||
@@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
|
||||
def test_structured_output_batched_with_non_structured_outputs_requests(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
sample_json_schema: dict[str, Any],
|
||||
backend: str,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
# Don't use eager execution on TPUs because we want to test for no
|
||||
# recompilation at runtime
|
||||
enforce_eager = bool(not current_platform.is_tpu())
|
||||
|
||||
@@ -53,7 +53,6 @@ cleanup() {
|
||||
launch_baseline() {
|
||||
BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
|
||||
VLLM_LOGGING_LEVEL=DEBUG \
|
||||
VLLM_USE_V1=1 \
|
||||
PJRT_DEVICE=TPU \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
|
||||
@@ -73,7 +72,6 @@ launch_pd() {
|
||||
UCX_TLS=tcp \
|
||||
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
||||
VLLM_LOGGING_LEVEL=DEBUG \
|
||||
VLLM_USE_V1=1 \
|
||||
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
|
||||
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
|
||||
PJRT_DEVICE=TPU \
|
||||
@@ -93,7 +91,6 @@ launch_pd() {
|
||||
UCX_TLS=tcp \
|
||||
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
||||
VLLM_LOGGING_LEVEL=DEBUG \
|
||||
VLLM_USE_V1=1 \
|
||||
PJRT_DEVICE=TPU \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
|
||||
|
||||
@@ -55,7 +55,6 @@ launch_pd() {
|
||||
UCX_TLS=tcp \
|
||||
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
||||
VLLM_LOGGING_LEVEL=DEBUG \
|
||||
VLLM_USE_V1=1 \
|
||||
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
|
||||
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
|
||||
PJRT_DEVICE=TPU \
|
||||
@@ -75,7 +74,6 @@ launch_pd() {
|
||||
UCX_TLS=tcp \
|
||||
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
|
||||
VLLM_LOGGING_LEVEL=DEBUG \
|
||||
VLLM_USE_V1=1 \
|
||||
PJRT_DEVICE=TPU \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
@@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
|
||||
from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v1_only(monkeypatch):
|
||||
"""
|
||||
The change relies on V1 APIs, so set VLLM_USE_V1=1.
|
||||
"""
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
|
||||
MODELS = [
|
||||
"distilbert/distilgpt2",
|
||||
]
|
||||
@@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
|
||||
@ray.remote(num_gpus=1)
|
||||
class EngineTestActor:
|
||||
async def run(self):
|
||||
# Set environment variable inside the Ray actor since environment
|
||||
# variables from pytest fixtures don't propagate to Ray actors
|
||||
os.environ["VLLM_USE_V1"] = "1"
|
||||
|
||||
engine_args = AsyncEngineArgs(
|
||||
model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
|
||||
)
|
||||
|
||||
@@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
|
||||
batch_logprobs_composition: BatchLogprobsComposition,
|
||||
temperature: float,
|
||||
example_prompts: list[str],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""Test V1 Engine logprobs & prompt logprobs
|
||||
|
||||
@@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs(
|
||||
temperature: "temperature" sampling parameter
|
||||
example_prompts: example prompt fixture
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
|
||||
if do_apc and (
|
||||
temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT
|
||||
):
|
||||
# Skip some test-cases to save time.
|
||||
pytest.skip()
|
||||
test_prompts = example_prompts
|
||||
do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
|
||||
if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
|
||||
# Skip some test-cases to save time.
|
||||
pytest.skip()
|
||||
test_prompts = example_prompts
|
||||
|
||||
max_tokens = 5
|
||||
hf_outputs = hf_model.generate_greedy(
|
||||
test_prompts,
|
||||
max_tokens = 5
|
||||
hf_outputs = hf_model.generate_greedy(
|
||||
test_prompts,
|
||||
max_tokens=max_tokens,
|
||||
)
|
||||
hf_logprobs = hf_model.generate_greedy_logprobs(
|
||||
test_prompts,
|
||||
max_tokens=max_tokens,
|
||||
)
|
||||
|
||||
# Batch has mixed sample params
|
||||
# (different logprobs/prompt logprobs combos)
|
||||
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
|
||||
|
||||
# Ensure that each test prompt has a logprob config for testing
|
||||
logprob_prompt_logprob_list = _repeat_logprob_config(
|
||||
test_prompts, logprob_prompt_logprob_list
|
||||
)
|
||||
# Generate SamplingParams
|
||||
vllm_sampling_params = [
|
||||
SamplingParams(
|
||||
max_tokens=max_tokens,
|
||||
logprobs=num_lp,
|
||||
prompt_logprobs=num_plp,
|
||||
temperature=temperature,
|
||||
seed=1984,
|
||||
)
|
||||
hf_logprobs = hf_model.generate_greedy_logprobs(
|
||||
test_prompts,
|
||||
for num_lp, num_plp in logprob_prompt_logprob_list
|
||||
]
|
||||
for _ in range(2 if do_apc else 1):
|
||||
_run_and_validate(
|
||||
vllm_model=vllm_model,
|
||||
test_prompts=test_prompts,
|
||||
vllm_sampling_params=vllm_sampling_params,
|
||||
hf_logprobs=hf_logprobs,
|
||||
hf_outputs=hf_outputs,
|
||||
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
do_apc=do_apc,
|
||||
)
|
||||
|
||||
# Batch has mixed sample params
|
||||
# (different logprobs/prompt logprobs combos)
|
||||
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
|
||||
|
||||
# Ensure that each test prompt has a logprob config for testing
|
||||
logprob_prompt_logprob_list = _repeat_logprob_config(
|
||||
test_prompts, logprob_prompt_logprob_list
|
||||
)
|
||||
# Generate SamplingParams
|
||||
vllm_sampling_params = [
|
||||
SamplingParams(
|
||||
max_tokens=max_tokens,
|
||||
logprobs=num_lp,
|
||||
prompt_logprobs=num_plp,
|
||||
temperature=temperature,
|
||||
seed=1984,
|
||||
)
|
||||
for num_lp, num_plp in logprob_prompt_logprob_list
|
||||
]
|
||||
for _ in range(2 if do_apc else 1):
|
||||
_run_and_validate(
|
||||
vllm_model=vllm_model,
|
||||
test_prompts=test_prompts,
|
||||
vllm_sampling_params=vllm_sampling_params,
|
||||
hf_logprobs=hf_logprobs,
|
||||
hf_outputs=hf_outputs,
|
||||
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
do_apc=do_apc,
|
||||
)
|
||||
|
||||
|
||||
def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
|
||||
def test_max_logprobs():
|
||||
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
|
||||
Should also fail for `prompt_logprobs > max_logprobs`
|
||||
APC should not matter as this test checks basic request validation.
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
runner = VllmRunner(
|
||||
"facebook/opt-125m",
|
||||
max_logprobs=1,
|
||||
enable_prefix_caching=False,
|
||||
# 2 other llms alive during whole session
|
||||
gpu_memory_utilization=0.15,
|
||||
max_model_len=256,
|
||||
)
|
||||
vllm_sampling_params = SamplingParams(logprobs=1)
|
||||
# should pass
|
||||
runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
|
||||
|
||||
runner = VllmRunner(
|
||||
"facebook/opt-125m",
|
||||
max_logprobs=1,
|
||||
enable_prefix_caching=False,
|
||||
# 2 other llms alive during whole session
|
||||
gpu_memory_utilization=0.15,
|
||||
max_model_len=256,
|
||||
)
|
||||
vllm_sampling_params = SamplingParams(logprobs=1)
|
||||
# should pass
|
||||
runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
|
||||
|
||||
bad_sampling_params = SamplingParams(logprobs=2)
|
||||
with pytest.raises(ValueError):
|
||||
runner.generate(["Hello world"], sampling_params=bad_sampling_params)
|
||||
bad_sampling_params = SamplingParams(logprobs=2)
|
||||
with pytest.raises(ValueError):
|
||||
runner.generate(["Hello world"], sampling_params=bad_sampling_params)
|
||||
|
||||
|
||||
def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
|
||||
def test_none_logprobs(vllm_model, example_prompts):
|
||||
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
|
||||
|
||||
Args:
|
||||
vllm_model: vLLM model fixture
|
||||
example_prompts: list of example prompts (test fixture)
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
max_tokens = 5
|
||||
max_tokens = 5
|
||||
|
||||
sampling_params_logprobs_none = SamplingParams(
|
||||
max_tokens=max_tokens,
|
||||
logprobs=None,
|
||||
prompt_logprobs=None,
|
||||
temperature=0.0,
|
||||
)
|
||||
results_logprobs_none = vllm_model.llm.generate(
|
||||
example_prompts,
|
||||
sampling_params=sampling_params_logprobs_none,
|
||||
)
|
||||
sampling_params_logprobs_none = SamplingParams(
|
||||
max_tokens=max_tokens,
|
||||
logprobs=None,
|
||||
prompt_logprobs=None,
|
||||
temperature=0.0,
|
||||
)
|
||||
results_logprobs_none = vllm_model.llm.generate(
|
||||
example_prompts,
|
||||
sampling_params=sampling_params_logprobs_none,
|
||||
)
|
||||
|
||||
for i in range(len(results_logprobs_none)):
|
||||
# Check sample logprobs are None
|
||||
assert results_logprobs_none[i].outputs[0].logprobs is None
|
||||
assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
|
||||
# Check prompt logprobs are None
|
||||
assert results_logprobs_none[i].prompt_logprobs is None
|
||||
for i in range(len(results_logprobs_none)):
|
||||
# Check sample logprobs are None
|
||||
assert results_logprobs_none[i].outputs[0].logprobs is None
|
||||
assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
|
||||
# Check prompt logprobs are None
|
||||
assert results_logprobs_none[i].prompt_logprobs is None
|
||||
|
||||
|
||||
def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
|
||||
def test_zero_logprobs(vllm_model, example_prompts):
|
||||
"""Engine should return sampled token and prompt token logprobs
|
||||
|
||||
Args:
|
||||
vllm_model: vLLM model fixture
|
||||
example_prompts: list of example prompts (test fixture)
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
max_tokens = 5
|
||||
max_tokens = 5
|
||||
|
||||
sampling_params_logprobs_zero = SamplingParams(
|
||||
max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
|
||||
)
|
||||
results_logprobs_zero = vllm_model.llm.generate(
|
||||
example_prompts, sampling_params=sampling_params_logprobs_zero
|
||||
)
|
||||
sampling_params_logprobs_zero = SamplingParams(
|
||||
max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
|
||||
)
|
||||
results_logprobs_zero = vllm_model.llm.generate(
|
||||
example_prompts, sampling_params=sampling_params_logprobs_zero
|
||||
)
|
||||
|
||||
for i in range(len(results_logprobs_zero)):
|
||||
# Check that there is one sample logprob dict for each
|
||||
# sample token
|
||||
logprobs = results_logprobs_zero[i].outputs[0].logprobs
|
||||
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
|
||||
sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
|
||||
prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
|
||||
assert logprobs is not None
|
||||
assert len(sampled_token_ids) == len(logprobs)
|
||||
assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None
|
||||
# Check that there is one prompt logprob dict for each
|
||||
# prompt token
|
||||
assert prompt_logprobs is not None
|
||||
assert len(prompt_token_ids) == len(prompt_logprobs)
|
||||
for i in range(len(results_logprobs_zero)):
|
||||
# Check that there is one sample logprob dict for each
|
||||
# sample token
|
||||
logprobs = results_logprobs_zero[i].outputs[0].logprobs
|
||||
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
|
||||
sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
|
||||
prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
|
||||
assert logprobs is not None
|
||||
assert len(sampled_token_ids) == len(logprobs)
|
||||
assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None
|
||||
# Check that there is one prompt logprob dict for each
|
||||
# prompt token
|
||||
assert prompt_logprobs is not None
|
||||
assert len(prompt_token_ids) == len(prompt_logprobs)
|
||||
|
||||
|
||||
def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
|
||||
def test_all_logprobs(example_prompts):
|
||||
"""Engine should return all vocabulary logprobs and prompt logprobs
|
||||
|
||||
Args:
|
||||
example_prompts: list of example prompts (test fixture)
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
runner = VllmRunner(
|
||||
"facebook/opt-125m",
|
||||
max_logprobs=-1,
|
||||
enable_prefix_caching=False,
|
||||
# 2 other llms alive during whole session
|
||||
gpu_memory_utilization=0.15,
|
||||
max_model_len=256,
|
||||
)
|
||||
runner = VllmRunner(
|
||||
"facebook/opt-125m",
|
||||
max_logprobs=-1,
|
||||
enable_prefix_caching=False,
|
||||
# 2 other llms alive during whole session
|
||||
gpu_memory_utilization=0.15,
|
||||
max_model_len=256,
|
||||
)
|
||||
|
||||
sampling_params_logprobs_all = SamplingParams(
|
||||
max_tokens=5, logprobs=-1, prompt_logprobs=-1
|
||||
)
|
||||
results_logprobs_all = runner.llm.generate(
|
||||
example_prompts, sampling_params=sampling_params_logprobs_all
|
||||
)
|
||||
vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
|
||||
sampling_params_logprobs_all = SamplingParams(
|
||||
max_tokens=5, logprobs=-1, prompt_logprobs=-1
|
||||
)
|
||||
results_logprobs_all = runner.llm.generate(
|
||||
example_prompts, sampling_params=sampling_params_logprobs_all
|
||||
)
|
||||
vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
|
||||
|
||||
for i in range(len(results_logprobs_all)):
|
||||
logprobs = results_logprobs_all[i].outputs[0].logprobs
|
||||
prompt_logprobs = results_logprobs_all[i].prompt_logprobs
|
||||
assert logprobs is not None
|
||||
for logprob in logprobs:
|
||||
assert len(logprob) == vocab_size
|
||||
assert prompt_logprobs is not None
|
||||
assert prompt_logprobs[0] is None
|
||||
for prompt_logprob in prompt_logprobs[1:]:
|
||||
assert len(prompt_logprob) == vocab_size
|
||||
for i in range(len(results_logprobs_all)):
|
||||
logprobs = results_logprobs_all[i].outputs[0].logprobs
|
||||
prompt_logprobs = results_logprobs_all[i].prompt_logprobs
|
||||
assert logprobs is not None
|
||||
for logprob in logprobs:
|
||||
assert len(logprob) == vocab_size
|
||||
assert prompt_logprobs is not None
|
||||
assert prompt_logprobs[0] is None
|
||||
for prompt_logprob in prompt_logprobs[1:]:
|
||||
assert len(prompt_logprob) == vocab_size
|
||||
|
||||
|
||||
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
|
||||
def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch):
|
||||
def test_logprobs_mode(logprobs_mode: LogprobsMode):
|
||||
"""Test with LLM engine with different logprobs_mode.
|
||||
For logprobs, we should have non-positive values.
|
||||
For logits, we should expect at least one positive values.
|
||||
"""
|
||||
from vllm import LLM
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(
|
||||
"facebook/opt-125m",
|
||||
max_logprobs=5,
|
||||
enable_prefix_caching=False,
|
||||
# 2 other llms alive during whole session
|
||||
gpu_memory_utilization=0.05,
|
||||
max_model_len=16,
|
||||
logprobs_mode=logprobs_mode,
|
||||
)
|
||||
vllm_sampling_params = SamplingParams(logprobs=1)
|
||||
results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)
|
||||
|
||||
llm = LLM(
|
||||
"facebook/opt-125m",
|
||||
max_logprobs=5,
|
||||
enable_prefix_caching=False,
|
||||
# 2 other llms alive during whole session
|
||||
gpu_memory_utilization=0.05,
|
||||
max_model_len=16,
|
||||
logprobs_mode=logprobs_mode,
|
||||
)
|
||||
vllm_sampling_params = SamplingParams(logprobs=1)
|
||||
results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)
|
||||
|
||||
total_token_with_logprobs = 0
|
||||
positive_values = 0
|
||||
for output in results[0].outputs:
|
||||
for logprobs in output.logprobs:
|
||||
for token_id in logprobs:
|
||||
logprob = logprobs[token_id]
|
||||
if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
|
||||
assert logprob.logprob <= 0
|
||||
if logprob.logprob > 0:
|
||||
positive_values = positive_values + 1
|
||||
total_token_with_logprobs = total_token_with_logprobs + 1
|
||||
assert total_token_with_logprobs >= len(results[0].outputs)
|
||||
if logprobs_mode in ("raw_logits", "processed_logits"):
|
||||
assert positive_values > 0
|
||||
del llm
|
||||
total_token_with_logprobs = 0
|
||||
positive_values = 0
|
||||
for output in results[0].outputs:
|
||||
for logprobs in output.logprobs:
|
||||
for token_id in logprobs:
|
||||
logprob = logprobs[token_id]
|
||||
if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
|
||||
assert logprob.logprob <= 0
|
||||
if logprob.logprob > 0:
|
||||
positive_values = positive_values + 1
|
||||
total_token_with_logprobs = total_token_with_logprobs + 1
|
||||
assert total_token_with_logprobs >= len(results[0].outputs)
|
||||
if logprobs_mode in ("raw_logits", "processed_logits"):
|
||||
assert positive_values > 0
|
||||
del llm
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
if os.getenv("VLLM_USE_V1", "0") != "1":
|
||||
pytest.skip("Test package requires V1", allow_module_level=True)
|
||||
|
||||
MODEL = "meta-llama/Llama-3.2-1B"
|
||||
PROMPT = "Hello my name is Robert and I"
|
||||
|
||||
@@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
|
||||
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
|
||||
|
||||
|
||||
def test_priority(llm):
|
||||
"""Check that we reject requests with priority."""
|
||||
|
||||
# Reject all allowed token ids
|
||||
with pytest.raises(ValueError):
|
||||
_ = llm.generate(PROMPT, priority=[1])
|
||||
|
||||
|
||||
def test_seed(llm):
|
||||
"""Check that seed impacts randomness."""
|
||||
|
||||
|
||||
@@ -38,7 +38,6 @@ def test_eagle_max_len(
|
||||
monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
|
||||
|
||||
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
|
||||
|
||||
@@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
|
||||
@pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
|
||||
def test_basic(
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
tensor_parallel_size: int,
|
||||
@@ -55,23 +54,20 @@ def test_basic(
|
||||
)
|
||||
example_prompts = [prompt]
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
with vllm_runner(
|
||||
model,
|
||||
# Note: max_num_batched_tokens == 1024 is needed here to
|
||||
# actually test chunked prompt
|
||||
max_num_batched_tokens=1024,
|
||||
max_model_len=8192,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_num_seqs=max_num_seqs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
output = vllm_outputs[0][1]
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
# Note: max_num_batched_tokens == 1024 is needed here to
|
||||
# actually test chunked prompt
|
||||
max_num_batched_tokens=1024,
|
||||
max_model_len=8192,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_num_seqs=max_num_seqs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
output = vllm_outputs[0][1]
|
||||
|
||||
assert "1024" in output or "0, 1" in output
|
||||
assert "1024" in output or "0, 1" in output
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Temporarily disabled due to timeout")
|
||||
@@ -82,7 +78,6 @@ def test_basic(
|
||||
@pytest.mark.parametrize("max_num_seqs", [16])
|
||||
def test_phi3(
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
max_tokens: int,
|
||||
max_num_seqs: int,
|
||||
) -> None:
|
||||
@@ -99,18 +94,15 @@ def test_phi3(
|
||||
# test head dim = 96
|
||||
model = "microsoft/Phi-3-mini-128k-instruct"
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
with vllm_runner(
|
||||
model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
|
||||
# vllm_outputs is a list of tuples whose first element is the token id
|
||||
# and the second element is the output (including the prompt).
|
||||
for output, answer in zip(vllm_outputs, answers):
|
||||
generated_text = output[1]
|
||||
assert answer in generated_text
|
||||
with vllm_runner(
|
||||
model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
|
||||
# vllm_outputs is a list of tuples whose first element is the token id
|
||||
# and the second element is the output (including the prompt).
|
||||
for output, answer in zip(vllm_outputs, answers):
|
||||
generated_text = output[1]
|
||||
assert answer in generated_text
|
||||
|
||||
|
||||
TP_SIZE_8 = 8
|
||||
@@ -123,7 +115,6 @@ TP_SIZE_8 = 8
|
||||
)
|
||||
def test_gemma3_27b_with_text_input_and_tp(
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
model = "google/gemma-3-27b-it"
|
||||
max_tokens = 16
|
||||
@@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
|
||||
" but in rising every time we fall.",
|
||||
]
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_num_batched_tokens=256,
|
||||
max_num_seqs=max_num_seqs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
|
||||
# vllm_outputs is a list of tuples whose first element is the token id
|
||||
# and the second element is the output (including the prompt).
|
||||
for output, answer in zip(vllm_outputs, answers):
|
||||
generated_text = output[1]
|
||||
assert answer in generated_text
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_num_batched_tokens=256,
|
||||
max_num_seqs=max_num_seqs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
|
||||
# vllm_outputs is a list of tuples whose first element is the token id
|
||||
# and the second element is the output (including the prompt).
|
||||
for output, answer in zip(vllm_outputs, answers):
|
||||
generated_text = output[1]
|
||||
assert answer in generated_text
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
|
||||
)
|
||||
def test_w8a8_quantization(
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
|
||||
max_tokens = 5
|
||||
@@ -176,18 +163,15 @@ def test_w8a8_quantization(
|
||||
)
|
||||
example_prompts = [prompt]
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_num_batched_tokens=64,
|
||||
max_model_len=4096,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_num_seqs=max_num_seqs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
output = vllm_outputs[0][1]
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_num_batched_tokens=64,
|
||||
max_model_len=4096,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_num_seqs=max_num_seqs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
output = vllm_outputs[0][1]
|
||||
|
||||
assert "1024" in output or "0, 1" in output
|
||||
assert "1024" in output or "0, 1" in output
|
||||
|
||||
@@ -86,7 +86,6 @@ GPU_UTIL = 0.9
|
||||
@pytest.mark.parametrize("params", TEST_PARAMS)
|
||||
def test_perf(
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
params: TestParams,
|
||||
) -> None:
|
||||
tokenizer = get_tokenizer(
|
||||
@@ -107,48 +106,45 @@ def test_perf(
|
||||
)
|
||||
)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=params.decode_len, temperature=1.0, min_p=0.0
|
||||
)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
max_tokens=params.decode_len, temperature=1.0, min_p=0.0
|
||||
)
|
||||
with vllm_runner(
|
||||
params.model,
|
||||
max_num_batched_tokens=MAX_MODEL_LEN,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
gpu_memory_utilization=GPU_UTIL,
|
||||
enforce_eager=False,
|
||||
tensor_parallel_size=1,
|
||||
) as vllm_model:
|
||||
print(" -- Warmup / Compile")
|
||||
for i in range(NUM_WARMUPS):
|
||||
_ = vllm_model.generate(prompts, sampling_params)
|
||||
|
||||
with vllm_runner(
|
||||
params.model,
|
||||
max_num_batched_tokens=MAX_MODEL_LEN,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
gpu_memory_utilization=GPU_UTIL,
|
||||
enforce_eager=False,
|
||||
tensor_parallel_size=1,
|
||||
) as vllm_model:
|
||||
print(" -- Warmup / Compile")
|
||||
for i in range(NUM_WARMUPS):
|
||||
_ = vllm_model.generate(prompts, sampling_params)
|
||||
print(" -- Benchmarking... ")
|
||||
times = []
|
||||
for i in range(NUM_RUNS):
|
||||
start_time = time.time()
|
||||
_ = vllm_model.generate(prompts, sampling_params)
|
||||
times.append(time.time() - start_time)
|
||||
|
||||
print(" -- Benchmarking... ")
|
||||
times = []
|
||||
for i in range(NUM_RUNS):
|
||||
start_time = time.time()
|
||||
_ = vllm_model.generate(prompts, sampling_params)
|
||||
times.append(time.time() - start_time)
|
||||
avg_time = sum(times) / len(times)
|
||||
|
||||
avg_time = sum(times) / len(times)
|
||||
|
||||
print(" -- avg_time = {}".format(avg_time))
|
||||
print(
|
||||
" -- expected_avg_time = {} with err_tol = {}".format(
|
||||
params.expected_avg_time, params.err_tol
|
||||
)
|
||||
print(" -- avg_time = {}".format(avg_time))
|
||||
print(
|
||||
" -- expected_avg_time = {} with err_tol = {}".format(
|
||||
params.expected_avg_time, params.err_tol
|
||||
)
|
||||
)
|
||||
diff = avg_time - params.expected_avg_time
|
||||
ok = diff < params.err_tol
|
||||
if diff < -params.err_tol:
|
||||
print(
|
||||
" !! WARNING !! Performance has improved by {}, "
|
||||
"it may be necessary to fine-tune the "
|
||||
"expected_avg_time = {}".format(-diff, params.expected_avg_time)
|
||||
)
|
||||
diff = avg_time - params.expected_avg_time
|
||||
ok = diff < params.err_tol
|
||||
if diff < -params.err_tol:
|
||||
print(
|
||||
" !! WARNING !! Performance has improved by {}, "
|
||||
"it may be necessary to fine-tune the "
|
||||
"expected_avg_time = {}".format(-diff, params.expected_avg_time)
|
||||
)
|
||||
|
||||
assert ok, " !! ERROR !! Regression detected"
|
||||
assert ok, " !! ERROR !! Regression detected"
|
||||
|
||||
@@ -82,7 +82,7 @@ def test_traces(
|
||||
):
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.01,
|
||||
top_p=0.1,
|
||||
|
||||
Reference in New Issue
Block a user