[v1] EngineArgs for better config handling for v1 (#10382)
Signed-off-by: rickyx <rickyx@anyscale.com>
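In short: EngineArgs.create_engine_config now accepts a UsageContext so V1 can
resolve defaults per entrypoint. A minimal sketch of the new call pattern, using
only names that appear in the diff below:

    from vllm.engine.arg_utils import EngineArgs
    from vllm.usage.usage_lib import UsageContext

    engine_args = EngineArgs(model="facebook/opt-125m")
    # V1 defaults (e.g. prefix caching on) are resolved per usage context.
    vllm_config = engine_args.create_engine_config(
        usage_context=UsageContext.OPENAI_API_SERVER)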
@@ -32,6 +32,9 @@ async def generate(engine: AsyncLLM, request_id: str,
 @pytest.mark.asyncio
 async def test_load(monkeypatch):
+    # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
+    # so that in the future when we switch, we don't have to change all the
+    # tests.
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
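The tests opt into V1 by setting the flag inside a monkeypatch context, so the
environment is restored when the block exits. A generic sketch of that pattern
(standalone pytest, not vLLM-specific):

    import os

    def test_env_flag(monkeypatch):
        with monkeypatch.context() as m:
            m.setenv("VLLM_USE_V1", "1")
            # Code under test sees the V1 flag here.
            assert os.environ["VLLM_USE_V1"] == "1"
        # The context manager restores the previous environment on exit.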
tests/v1/engine/test_engine_args.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+import pytest
+
+from vllm import envs
+from vllm.config import VllmConfig
+from vllm.engine.arg_utils import EngineArgs
+from vllm.usage.usage_lib import UsageContext
+
+if not envs.VLLM_USE_V1:
+    pytest.skip(
+        "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
+        allow_module_level=True,
+    )
+
+
+def test_defaults():
+    engine_args = EngineArgs(model="facebook/opt-125m")
+
+    # Assert V1 defaults
+    assert (engine_args.enable_prefix_caching
+            ), "V1 turns on prefix caching by default"
+
+
+def test_defaults_with_usage_context():
+    engine_args = EngineArgs(model="facebook/opt-125m")
+    vllm_config: VllmConfig = engine_args.create_engine_config(
+        UsageContext.LLM_CLASS)
+
+    assert vllm_config.scheduler_config.max_num_seqs == 1024
+    assert vllm_config.scheduler_config.max_num_batched_tokens == 8192
+
+    engine_args = EngineArgs(model="facebook/opt-125m")
+    vllm_config = engine_args.create_engine_config(
+        UsageContext.OPENAI_API_SERVER)
+    assert vllm_config.scheduler_config.max_num_seqs == 1024
+    assert vllm_config.scheduler_config.max_num_batched_tokens == 2048
+
+
+def test_prefix_cache_disabled_with_multimodel():
+    engine_args = EngineArgs(model="llava-hf/llava-1.5-7b-hf")
+
+    vllm_config = engine_args.create_engine_config(UsageContext.LLM_CLASS)
+    assert not vllm_config.cache_config.enable_prefix_caching
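The new test pins down per-entrypoint scheduler defaults for V1: 8192 batched
tokens for UsageContext.LLM_CLASS versus 2048 for UsageContext.OPENAI_API_SERVER,
with max_num_seqs at 1024 for both. A hypothetical sketch of such a mapping (the
dict name and shape are illustrative, not taken from this commit):

    from vllm.usage.usage_lib import UsageContext

    # Illustrative only: defaults implied by the assertions above.
    DEFAULT_MAX_NUM_BATCHED_TOKENS = {
        UsageContext.LLM_CLASS: 8192,
        UsageContext.OPENAI_API_SERVER: 2048,
    }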
@@ -43,7 +43,8 @@ def test_engine_core(monkeypatch):
         m.setenv("VLLM_USE_V1", "1")
         """Setup the EngineCore."""
         engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config()
+        vllm_config = engine_args.create_engine_config(
+            usage_context=UsageContext.UNKNOWN_CONTEXT)
         executor_class = AsyncLLM._get_executor_cls(vllm_config)

         engine_core = EngineCore(vllm_config=vllm_config,

@@ -82,7 +82,8 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
         m.setenv("VLLM_USE_V1", "1")

         engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3)
-        vllm_config = engine_args.create_engine_config()
+        vllm_config = engine_args.create_engine_config(
+            UsageContext.UNKNOWN_CONTEXT)
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
         client = EngineCoreClient.make_client(
             vllm_config,

@@ -153,7 +154,8 @@ async def test_engine_core_client_asyncio(monkeypatch):
         m.setenv("VLLM_USE_V1", "1")

         engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config()
+        vllm_config = engine_args.create_engine_config(
+            usage_context=UsageContext.UNKNOWN_CONTEXT)
         executor_class = AsyncLLM._get_executor_cls(vllm_config)
         client = EngineCoreClient.make_client(
             vllm_config,
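Across all three test hunks the change is the same: create_engine_config now
takes a usage context, and the tests pass UsageContext.UNKNOWN_CONTEXT explicitly
so they do not pick up entrypoint-specific defaults. A hypothetical signature
sketch; only the parameter name and the enum values are visible in this diff,
and the default shown is an assumption:

    from vllm.config import VllmConfig
    from vllm.usage.usage_lib import UsageContext

    class EngineArgs:
        def create_engine_config(
            self,
            # Assumed default; the diff only shows explicit call sites.
            usage_context: UsageContext = UsageContext.UNKNOWN_CONTEXT,
        ) -> VllmConfig:
            ...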