[V1] V1 Enablement Oracle (#13726)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Michael Goin <michael@neuralmagic.com>
This commit is contained in:
@@ -18,19 +18,19 @@ if not envs.VLLM_USE_V1:
|
||||
def test_prefix_caching_from_cli():
|
||||
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
|
||||
args = parser.parse_args([])
|
||||
engine_args = EngineArgs.from_cli_args(args=args)
|
||||
assert (engine_args.enable_prefix_caching
|
||||
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
|
||||
assert (vllm_config.cache_config.enable_prefix_caching
|
||||
), "V1 turns on prefix caching by default."
|
||||
|
||||
# Turn it off possible with flag.
|
||||
args = parser.parse_args(["--no-enable-prefix-caching"])
|
||||
engine_args = EngineArgs.from_cli_args(args=args)
|
||||
assert not engine_args.enable_prefix_caching
|
||||
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
|
||||
assert not vllm_config.cache_config.enable_prefix_caching
|
||||
|
||||
# Turn it on with flag.
|
||||
args = parser.parse_args(["--enable-prefix-caching"])
|
||||
engine_args = EngineArgs.from_cli_args(args=args)
|
||||
assert engine_args.enable_prefix_caching
|
||||
vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
|
||||
assert vllm_config.cache_config.enable_prefix_caching
|
||||
|
||||
|
||||
def test_defaults_with_usage_context():
|
||||
@@ -38,11 +38,21 @@ def test_defaults_with_usage_context():
|
||||
vllm_config: VllmConfig = engine_args.create_engine_config(
|
||||
UsageContext.LLM_CLASS)
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
device_name = current_platform.get_device_name().lower()
|
||||
if "h100" in device_name or "h200" in device_name:
|
||||
# For H100 and H200, we use larger default values.
|
||||
default_llm_tokens = 16384
|
||||
default_server_tokens = 8192
|
||||
else:
|
||||
default_llm_tokens = 8192
|
||||
default_server_tokens = 2048
|
||||
|
||||
assert vllm_config.scheduler_config.max_num_seqs == 1024
|
||||
assert vllm_config.scheduler_config.max_num_batched_tokens == 8192
|
||||
assert vllm_config.scheduler_config.max_num_batched_tokens == default_llm_tokens # noqa: E501
|
||||
|
||||
engine_args = EngineArgs(model="facebook/opt-125m")
|
||||
vllm_config = engine_args.create_engine_config(
|
||||
UsageContext.OPENAI_API_SERVER)
|
||||
assert vllm_config.scheduler_config.max_num_seqs == 1024
|
||||
assert vllm_config.scheduler_config.max_num_batched_tokens == 2048
|
||||
assert vllm_config.scheduler_config.max_num_batched_tokens == default_server_tokens # noqa: E501
|
||||
|
||||
Reference in New Issue
Block a user