Add vllm bench [latency, throughput] CLI commands (#16508)
Signed-off-by: mgoin <mgoin64@gmail.com>
tests/benchmarks/test_serve_cli.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: Apache-2.0
+import subprocess
+
+import pytest
+
+from ..utils import RemoteOpenAIServer
+
+MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--max-model-len", "1024", "--enforce-eager", "--load-format", "dummy"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.benchmark
+def test_bench_serve(server):
+    command = [
+        "vllm",
+        "bench",
+        "serve",
+        "--model",
+        MODEL_NAME,
+        "--host",
+        server.host,
+        "--port",
+        str(server.port),
+        "--random-input-len",
+        "32",
+        "--random-output-len",
+        "4",
+        "--num-prompts",
+        "5",
+    ]
+    result = subprocess.run(command, capture_output=True, text=True)
+    print(result.stdout)
+    print(result.stderr)
+
+    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
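The diff above only smoke-tests the new serve subcommand end-to-end against a running server. As an illustration of how the latency subcommand named in the commit title could be exercised in the same style, here is a minimal sketch; it is not part of this commit, and the flag names (--input-len, --output-len, --num-iters, --num-iters-warmup) are assumptions based on the existing benchmarks/benchmark_latency.py script.

# Hypothetical smoke test for `vllm bench latency` (not part of this
# commit). Flag names are assumed to mirror benchmarks/benchmark_latency.py.
import subprocess

import pytest

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"


@pytest.mark.benchmark
def test_bench_latency():
    command = [
        "vllm", "bench", "latency",
        "--model", MODEL_NAME,
        # Keep the run small and cheap, mirroring the serve test above.
        "--input-len", "32",
        "--output-len", "4",
        "--num-iters", "2",
        "--num-iters-warmup", "1",
        "--load-format", "dummy",
        "--enforce-eager",
    ]
    result = subprocess.run(command, capture_output=True, text=True)
    print(result.stdout)
    print(result.stderr)

    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"

Unlike the serve test, this runs the engine offline, so no server fixture is needed.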
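A companion sketch for the throughput subcommand follows the same pattern, again assuming its flags mirror benchmarks/benchmark_throughput.py:

# Hypothetical smoke test for `vllm bench throughput` (not part of this
# commit). Flag names are assumed to mirror benchmarks/benchmark_throughput.py.
import subprocess

import pytest

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"


@pytest.mark.benchmark
def test_bench_throughput():
    command = [
        "vllm", "bench", "throughput",
        "--model", MODEL_NAME,
        "--input-len", "32",
        "--output-len", "4",
        "--num-prompts", "5",
        "--load-format", "dummy",
        "--enforce-eager",
    ]
    result = subprocess.run(command, capture_output=True, text=True)
    print(result.stdout)
    print(result.stderr)

    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"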