[Frontend] Use init_app_state and FrontendArgs in run_batch (#32967)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -447,7 +447,7 @@ def test_metrics_exist_run_batch():
|
||||
"--model",
|
||||
"intfloat/multilingual-e5-small",
|
||||
"--enable-metrics",
|
||||
"--url",
|
||||
"--host",
|
||||
base_url,
|
||||
"--port",
|
||||
port,
|
||||
|
||||
@@ -10,59 +10,361 @@ import pytest
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.entrypoints.openai.run_batch import BatchRequestOutput
|
||||
|
||||
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
|
||||
CHAT_MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
|
||||
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
RERANKER_MODEL_NAME = "BAAI/bge-reranker-v2-m3"
|
||||
REASONING_MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
SPEECH_LARGE_MODEL_NAME = "openai/whisper-large-v3"
|
||||
SPEECH_SMALL_MODEL_NAME = "openai/whisper-small"
|
||||
|
||||
# ruff: noqa: E501
|
||||
INPUT_BATCH = (
|
||||
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
|
||||
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
|
||||
'{{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "NonExistModel", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
|
||||
'{{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
|
||||
'{{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {{"stream": "True", "model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
|
||||
).format(MODEL_NAME)
|
||||
|
||||
INVALID_INPUT_BATCH = (
|
||||
'{{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
|
||||
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
|
||||
).format(MODEL_NAME)
|
||||
|
||||
INPUT_EMBEDDING_BATCH = (
|
||||
'{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}\n'
|
||||
'{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}\n'
|
||||
'{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}\n'
|
||||
'{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}'
|
||||
# Chat-completion batch input, one JSON request per line. Each row below is
# (custom_id, url, model, system prompt, body fields emitted before "model").
INPUT_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": custom_id,
            "method": "POST",
            "url": url,
            "body": {
                # Unpacked first so these keys keep their leading position
                # in the serialized body.
                **leading_fields,
                "model": model,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": "Hello world!"},
                ],
                "max_tokens": 1000,
            },
        }
    )
    for custom_id, url, model, system_prompt, leading_fields in [
        (
            "request-1",
            "/v1/chat/completions",
            CHAT_MODEL_NAME,
            "You are a helpful assistant.",
            {},
        ),
        (
            "request-2",
            "/v1/chat/completions",
            CHAT_MODEL_NAME,
            "You are an unhelpful assistant.",
            {},
        ),
        # Model name that differs from CHAT_MODEL_NAME.
        (
            "request-3",
            "/v1/chat/completions",
            "NonExistModel",
            "You are an unhelpful assistant.",
            {},
        ),
        # URL that is not one of the chat-completions routes.
        (
            "request-4",
            "/bad_url",
            CHAT_MODEL_NAME,
            "You are an unhelpful assistant.",
            {},
        ),
        # Body carries a "stream" field (as the string "True").
        (
            "request-5",
            "/v1/chat/completions",
            CHAT_MODEL_NAME,
            "You are an unhelpful assistant.",
            {"stream": "True"},
        ),
    ]
)
|
||||
|
||||
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
|
||||
# Batch input whose first line uses "invalid_field" in place of "custom_id".
# Each row below is (identifier key, identifier value, system prompt).
INVALID_INPUT_BATCH = "\n".join(
    json.dumps(
        {
            id_key: id_value,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": CHAT_MODEL_NAME,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": "Hello world!"},
                ],
                "max_tokens": 1000,
            },
        }
    )
    for id_key, id_value, system_prompt in [
        ("invalid_field", "request-1", "You are a helpful assistant."),
        ("custom_id", "request-2", "You are an unhelpful assistant."),
    ]
)
|
||||
|
||||
INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
|
||||
# Embedding batch input, one JSON request per line. Each row below is
# (model, input text); custom ids are numbered from 1 in row order.
INPUT_EMBEDDING_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{index}",
            "method": "POST",
            "url": "/v1/embeddings",
            "body": {"model": model, "input": text},
        }
    )
    for index, (model, text) in enumerate(
        [
            (EMBEDDING_MODEL_NAME, "You are a helpful assistant."),
            (EMBEDDING_MODEL_NAME, "You are an unhelpful assistant."),
            (EMBEDDING_MODEL_NAME, "Hello world!"),
            # Model name that differs from EMBEDDING_MODEL_NAME.
            ("NonExistModel", "Hello world!"),
        ],
        start=1,
    )
)
|
||||
|
||||
INPUT_REASONING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Solve this math problem: 2+2=?"}]}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "What is the capital of France?"}]}}"""
|
||||
# Candidate documents shared by the score and rerank batch inputs.
_SCORE_RERANK_DOCUMENTS = [
    "The capital of Brazil is Brasilia.",
    "The capital of France is Paris.",
]

# Score batch input: the same request sent to each score route, with custom
# ids numbered from 1 in route order.
INPUT_SCORE_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{index}",
            "method": "POST",
            "url": route,
            "body": {
                "model": RERANKER_MODEL_NAME,
                "queries": "What is the capital of France?",
                "documents": _SCORE_RERANK_DOCUMENTS,
            },
        }
    )
    for index, route in enumerate(("/score", "/v1/score"), start=1)
)
|
||||
|
||||
# Rerank batch input: the same request sent to each rerank route.
# Custom ids are derived from the route position so every line gets a unique
# id; previously the "/v2/rerank" request reused "request-2", which made its
# result indistinguishable from the "/v1/rerank" one in the output file.
INPUT_RERANK_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{index}",
            "method": "POST",
            "url": route,
            "body": {
                "model": RERANKER_MODEL_NAME,
                "query": "What is the capital of France?",
                "documents": _SCORE_RERANK_DOCUMENTS,
            },
        }
    )
    for index, route in enumerate(
        ("/rerank", "/v1/rerank", "/v2/rerank"),
        start=1,
    )
)
|
||||
|
||||
# Chat-completion batch input for the reasoning model, one request per user
# question; custom ids are numbered from 1 in question order. No "max_tokens"
# field is set on these requests.
INPUT_REASONING_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{index}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": REASONING_MODEL_NAME,
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": question},
                ],
            },
        }
    )
    for index, question in enumerate(
        (
            "Solve this math problem: 2+2=?",
            "What is the capital of France?",
        ),
        start=1,
    )
)
|
||||
|
||||
# This is a valid but minimal audio file for testing
|
||||
MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA="
|
||||
INPUT_TRANSCRIPTION_BATCH = (
|
||||
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/transcriptions", '
|
||||
'"body": {{"model": "openai/whisper-large-v3", "file_url": "data:audio/wav;base64,{}", '
|
||||
'"response_format": "json"}}}}\n'
|
||||
).format(MINIMAL_WAV_BASE64)
|
||||
json.dumps(
|
||||
{
|
||||
"custom_id": "request-1",
|
||||
"method": "POST",
|
||||
"url": "/v1/audio/transcriptions",
|
||||
"body": {
|
||||
"model": SPEECH_LARGE_MODEL_NAME,
|
||||
"file_url": f"data:audio/wav;base64,{MINIMAL_WAV_BASE64}",
|
||||
"response_format": "json",
|
||||
},
|
||||
}
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
INPUT_TRANSCRIPTION_HTTP_BATCH = (
|
||||
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/transcriptions", '
|
||||
'"body": {{"model": "openai/whisper-large-v3", "file_url": "{}", '
|
||||
'"response_format": "json"}}}}\n'
|
||||
).format(AudioAsset("mary_had_lamb").url)
|
||||
json.dumps(
|
||||
{
|
||||
"custom_id": "request-1",
|
||||
"method": "POST",
|
||||
"url": "/v1/audio/transcriptions",
|
||||
"body": {
|
||||
"model": SPEECH_LARGE_MODEL_NAME,
|
||||
"file_url": AudioAsset("mary_had_lamb").url,
|
||||
"response_format": "json",
|
||||
},
|
||||
}
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
INPUT_TRANSLATION_BATCH = (
|
||||
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/translations", '
|
||||
'"body": {{"model": "openai/whisper-small", "file_url": "{}", '
|
||||
'"response_format": "text", "language": "it", "to_language": "en", '
|
||||
'"temperature": 0.0}}}}\n'
|
||||
).format(AudioAsset("mary_had_lamb").url)
|
||||
json.dumps(
|
||||
{
|
||||
"custom_id": "request-1",
|
||||
"method": "POST",
|
||||
"url": "/v1/audio/translations",
|
||||
"body": {
|
||||
"model": SPEECH_SMALL_MODEL_NAME,
|
||||
"file_url": AudioAsset("mary_had_lamb").url,
|
||||
"response_format": "text",
|
||||
"language": "it",
|
||||
"to_language": "en",
|
||||
"temperature": 0.0,
|
||||
},
|
||||
}
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
# JSON-schema description of the arguments accepted by the weather function.
_WEATHER_TOOL_PARAMETERS = {
    "type": "object",
    "properties": {
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
        "unit": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"],
        },
    },
    "required": ["location"],
}

# Function-tool definition in the OpenAI "tools" entry layout.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": _WEATHER_TOOL_PARAMETERS,
    },
}
|
||||
|
||||
# Body of the single tool-calling request: one user question, the weather
# tool, and tool_choice="required".
_TOOL_CALLING_REQUEST_BODY = {
    "model": REASONING_MODEL_NAME,
    "messages": [
        {"role": "user", "content": "What is the weather in San Francisco?"},
    ],
    "tools": [WEATHER_TOOL],
    "tool_choice": "required",
    "max_tokens": 1000,
}

# One-line batch input wrapping the body above in a chat-completion request.
INPUT_TOOL_CALLING_BATCH = json.dumps(
    {
        "custom_id": "request-1",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": _TOOL_CALLING_REQUEST_BODY,
    }
)
|
||||
|
||||
|
||||
def test_empty_file():
|
||||
@@ -81,7 +383,7 @@ def test_empty_file():
|
||||
"-o",
|
||||
output_file.name,
|
||||
"--model",
|
||||
"intfloat/multilingual-e5-small",
|
||||
EMBEDDING_MODEL_NAME,
|
||||
],
|
||||
)
|
||||
proc.communicate()
|
||||
@@ -108,7 +410,7 @@ def test_completions():
|
||||
"-o",
|
||||
output_file.name,
|
||||
"--model",
|
||||
MODEL_NAME,
|
||||
CHAT_MODEL_NAME,
|
||||
],
|
||||
)
|
||||
proc.communicate()
|
||||
@@ -141,7 +443,7 @@ def test_completions_invalid_input():
|
||||
"-o",
|
||||
output_file.name,
|
||||
"--model",
|
||||
MODEL_NAME,
|
||||
CHAT_MODEL_NAME,
|
||||
],
|
||||
)
|
||||
proc.communicate()
|
||||
@@ -165,7 +467,7 @@ def test_embeddings():
|
||||
"-o",
|
||||
output_file.name,
|
||||
"--model",
|
||||
"intfloat/multilingual-e5-small",
|
||||
EMBEDDING_MODEL_NAME,
|
||||
],
|
||||
)
|
||||
proc.communicate()
|
||||
@@ -196,7 +498,7 @@ def test_score(input_batch):
|
||||
"-o",
|
||||
output_file.name,
|
||||
"--model",
|
||||
"BAAI/bge-reranker-v2-m3",
|
||||
RERANKER_MODEL_NAME,
|
||||
],
|
||||
)
|
||||
proc.communicate()
|
||||
@@ -234,7 +536,7 @@ def test_reasoning_parser():
|
||||
"-o",
|
||||
output_file.name,
|
||||
"--model",
|
||||
"Qwen/Qwen3-0.6B",
|
||||
REASONING_MODEL_NAME,
|
||||
"--reasoning-parser",
|
||||
"qwen3",
|
||||
],
|
||||
@@ -278,7 +580,7 @@ def test_transcription():
|
||||
"-o",
|
||||
output_file.name,
|
||||
"--model",
|
||||
"openai/whisper-large-v3",
|
||||
SPEECH_LARGE_MODEL_NAME,
|
||||
],
|
||||
)
|
||||
proc.communicate()
|
||||
@@ -316,7 +618,7 @@ def test_transcription_http_url():
|
||||
"-o",
|
||||
output_file.name,
|
||||
"--model",
|
||||
"openai/whisper-large-v3",
|
||||
SPEECH_LARGE_MODEL_NAME,
|
||||
],
|
||||
)
|
||||
proc.communicate()
|
||||
@@ -356,7 +658,7 @@ def test_translation():
|
||||
"-o",
|
||||
output_file.name,
|
||||
"--model",
|
||||
"openai/whisper-small",
|
||||
SPEECH_SMALL_MODEL_NAME,
|
||||
],
|
||||
)
|
||||
proc.communicate()
|
||||
@@ -378,3 +680,69 @@ def test_translation():
|
||||
translation_text = response_body["text"]
|
||||
translation_text_lower = str(translation_text).strip().lower()
|
||||
assert "mary" in translation_text_lower or "lamb" in translation_text_lower
|
||||
|
||||
|
||||
def test_tool_calling():
    """
    Test that tool calling works correctly in run_batch.

    Runs `vllm run-batch` on INPUT_TOOL_CALLING_BATCH with auto tool choice
    and the hermes tool-call parser enabled, then verifies that every output
    line is a valid, error-free BatchRequestOutput whose message contains
    well-formed tool_calls for `get_current_weather`.
    """
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_TOOL_CALLING_BATCH)
        # Flush so the subprocess sees the request when it opens the file.
        input_file.flush()
        proc = subprocess.Popen(
            [
                "vllm",
                "run-batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                REASONING_MODEL_NAME,
                "--enable-auto-tool-choice",
                "--tool-call-parser",
                "hermes",
            ],
        )
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"

        contents = output_file.read()
        # Keep only non-blank lines; each one is a single batch result.
        result_lines = [
            line for line in contents.strip().split("\n") if line.strip()
        ]
        # Without this check an empty output file would skip the loop below
        # and let the test pass without verifying any tool call.
        assert result_lines, "run-batch produced no output lines"

        for line in result_lines:
            # Ensure that the output format conforms to the openai api.
            # Validation should throw if the schema is wrong.
            BatchRequestOutput.model_validate_json(line)

            # Ensure that there is no error in the response.
            line_dict = json.loads(line)
            assert isinstance(line_dict, dict)
            assert line_dict["error"] is None

            # Check that tool_calls are present in the response
            # With tool_choice="required", the model must call a tool
            response_body = line_dict["response"]["body"]
            assert response_body is not None
            message = response_body["choices"][0]["message"]
            assert "tool_calls" in message
            tool_calls = message.get("tool_calls")
            # With tool_choice="required", tool_calls must be present and
            # non-empty
            assert tool_calls is not None
            assert isinstance(tool_calls, list)
            assert len(tool_calls) > 0
            # Verify tool_calls have the expected structure
            for tool_call in tool_calls:
                assert "id" in tool_call
                assert "type" in tool_call
                assert tool_call["type"] == "function"
                assert "function" in tool_call
                assert "name" in tool_call["function"]
                assert "arguments" in tool_call["function"]
                # Verify the tool name matches our tool definition
                assert tool_call["function"]["name"] == "get_current_weather"
|
||||
|
||||
@@ -67,7 +67,149 @@ class LoRAParserAction(argparse.Action):
|
||||
|
||||
|
||||
@config
|
||||
class FrontendArgs:
|
||||
class BaseFrontendArgs:
    """Base arguments for the OpenAI-compatible frontend server.

    This base class does not include host, port, and server-specific arguments
    like SSL, CORS, and HTTP server settings. Those arguments are added by
    the subclasses.
    """

    lora_modules: list[LoRAModulePath] | None = None
    """LoRA modules configurations in either 'name=path' format or JSON format
    or JSON list format. Example (old format): `'name=path'` Example (new
    format): `{\"name\": \"name\", \"path\": \"lora_path\",
    \"base_model_name\": \"id\"}`"""
    chat_template: str | None = None
    """The file path to the chat template, or the template in single-line form
    for the specified model."""
    chat_template_content_format: ChatTemplateContentFormatOption = "auto"
    """The format to render message content within a chat template.

    * "string" will render the content as a string. Example: `"Hello World"`
    * "openai" will render the content as a list of dictionaries, similar to
      OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
    trust_request_chat_template: bool = False
    """Whether to trust the chat template provided in the request. If False,
    the server will always use the chat template specified by `--chat-template`
    or the ones from tokenizer."""
    default_chat_template_kwargs: dict[str, Any] | None = None
    """Default keyword arguments to pass to the chat template renderer.
    These will be merged with request-level chat_template_kwargs,
    with request values taking precedence. Useful for setting default
    behavior for reasoning models. Example: '{"enable_thinking": false}'
    to disable thinking mode by default for Qwen3/DeepSeek models."""
    response_role: str = "assistant"
    """The role name to return if `request.add_generation_prompt=true`."""
    return_tokens_as_token_ids: bool = False
    """When `--max-logprobs` is specified, represents single tokens as
    strings of the form 'token_id:{token_id}' so that tokens that are not
    JSON-encodable can be identified."""
    disable_frontend_multiprocessing: bool = False
    """If specified, will run the OpenAI frontend server in the same process as
    the model serving engine."""
    enable_auto_tool_choice: bool = False
    """Enable auto tool choice for supported models. Use `--tool-call-parser`
    to specify which parser to use."""
    exclude_tools_when_tool_choice_none: bool = False
    """If specified, exclude tool definitions in prompts when
    tool_choice='none'."""
    tool_call_parser: str | None = None
    """Select the tool call parser depending on the model that you're using.
    This is used to parse the model-generated tool call into OpenAI API format.
    Required for `--enable-auto-tool-choice`. You can choose any option from
    the built-in parsers or register a plugin via `--tool-parser-plugin`."""
    tool_parser_plugin: str = ""
    """Specify the tool parser plugin used to parse model-generated tool calls
    into OpenAI API format. The names registered in this plugin can be used in
    `--tool-call-parser`."""
    tool_server: str | None = None
    """Comma-separated list of host:port pairs (IPv4, IPv6, or hostname).
    Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo
    purpose."""
    log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH
    """Path to logging config JSON file for both vllm and uvicorn"""
    max_log_len: int | None = None
    """Max number of prompt characters or prompt ID numbers being printed in
    log. The default of None means unlimited."""
    enable_prompt_tokens_details: bool = False
    """If set to True, enable prompt_tokens_details in usage."""
    enable_server_load_tracking: bool = False
    """If set to True, enable tracking server_load_metrics in the app state."""
    enable_force_include_usage: bool = False
    """If set to True, include usage on every request."""
    enable_tokenizer_info_endpoint: bool = False
    """Enable the `/tokenizer_info` endpoint. May expose chat
    templates and other tokenizer configuration."""
    enable_log_outputs: bool = False
    """If set to True, log model outputs (generations).
    Requires --enable-log-requests."""
    enable_log_deltas: bool = True
    """If set to False, output deltas will not be logged. Relevant only if
    --enable-log-outputs is set.
    """
    log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
    """If set to True, log the stack trace of error responses"""
    tokens_only: bool = False
    """
    If set to True, only enable the Tokens In<>Out endpoint.
    This is intended for use in a Disaggregated Everything setup.
    """

    @classmethod
    def _customize_cli_kwargs(
        cls,
        frontend_kwargs: dict[str, Any],
    ) -> dict[str, Any]:
        """Customize argparse kwargs before arguments are registered.

        Subclasses should override this and call
        ``super()._customize_cli_kwargs(frontend_kwargs)`` first.

        Args:
            frontend_kwargs: Mapping from field name to the keyword arguments
                later passed to ``add_argument`` for that field. It is
                modified in place.

        Returns:
            The same ``frontend_kwargs`` mapping, after customization.
        """
        # Special case: default_chat_template_kwargs needs json.loads type
        frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads

        # Special case: LoRA modules need custom parser action and
        # optional_type(str)
        frontend_kwargs["lora_modules"]["type"] = optional_type(str)
        frontend_kwargs["lora_modules"]["action"] = LoRAParserAction

        # Special case: Tool call parser shows built-in options.
        valid_tool_parsers = list(ToolParserManager.list_registered())
        parsers_str = ",".join(valid_tool_parsers)
        # The doubled braces render as literal "{" / "}" around the list.
        frontend_kwargs["tool_call_parser"]["metavar"] = (
            f"{{{parsers_str}}} or name registered in --tool-parser-plugin"
        )
        return frontend_kwargs

    @classmethod
    def add_cli_args(cls, parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Register CLI arguments for this frontend class.

        Subclasses should override ``_customize_cli_kwargs`` instead of
        this method so that base-class postprocessing is always applied.

        Args:
            parser: Parser that receives a new argument group holding one
                ``--field-name`` option per entry returned by ``get_kwargs``.

        Returns:
            The same ``parser`` instance, with the arguments added.
        """
        # NOTE(review): imported locally rather than at module level,
        # presumably to avoid a circular import -- confirm before moving.
        from vllm.engine.arg_utils import get_kwargs

        frontend_kwargs = get_kwargs(cls)
        frontend_kwargs = cls._customize_cli_kwargs(frontend_kwargs)

        # The group title is the class name with "Args" removed.
        group_name = cls.__name__.replace("Args", "")
        frontend_group = parser.add_argument_group(
            title=group_name,
            description=cls.__doc__,
        )
        for key, value in frontend_kwargs.items():
            # "flags" holds additional option strings and is not an
            # add_argument keyword, so it is removed before unpacking.
            extra_flags = value.pop("flags", [])
            frontend_group.add_argument(
                *extra_flags, f"--{key.replace('_', '-')}", **value
            )

        return parser
|
||||
|
||||
|
||||
@config
|
||||
class FrontendArgs(BaseFrontendArgs):
|
||||
"""Arguments for the OpenAI-compatible frontend server."""
|
||||
|
||||
host: str | None = None
|
||||
@@ -99,32 +241,6 @@ class FrontendArgs:
|
||||
api_key: list[str] | None = None
|
||||
"""If provided, the server will require one of these keys to be presented in
|
||||
the header."""
|
||||
lora_modules: list[LoRAModulePath] | None = None
|
||||
"""LoRA modules configurations in either 'name=path' format or JSON format
|
||||
or JSON list format. Example (old format): `'name=path'` Example (new
|
||||
format): `{\"name\": \"name\", \"path\": \"lora_path\",
|
||||
\"base_model_name\": \"id\"}`"""
|
||||
chat_template: str | None = None
|
||||
"""The file path to the chat template, or the template in single-line form
|
||||
for the specified model."""
|
||||
chat_template_content_format: ChatTemplateContentFormatOption = "auto"
|
||||
"""The format to render message content within a chat template.
|
||||
|
||||
* "string" will render the content as a string. Example: `"Hello World"`
|
||||
* "openai" will render the content as a list of dictionaries, similar to
|
||||
OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
|
||||
trust_request_chat_template: bool = False
|
||||
"""Whether to trust the chat template provided in the request. If False,
|
||||
the server will always use the chat template specified by `--chat-template`
|
||||
or the ones from tokenizer."""
|
||||
default_chat_template_kwargs: dict[str, Any] | None = None
|
||||
"""Default keyword arguments to pass to the chat template renderer.
|
||||
These will be merged with request-level chat_template_kwargs,
|
||||
with request values taking precedence. Useful for setting default
|
||||
behavior for reasoning models. Example: '{"enable_thinking": false}'
|
||||
to disable thinking mode by default for Qwen3/DeepSeek models."""
|
||||
response_role: str = "assistant"
|
||||
"""The role name to return if `request.add_generation_prompt=true`."""
|
||||
ssl_keyfile: str | None = None
|
||||
"""The file path to the SSL key file."""
|
||||
ssl_certfile: str | None = None
|
||||
@@ -146,81 +262,28 @@ class FrontendArgs:
|
||||
is provided, vLLM will add it to the server using
|
||||
`@app.middleware('http')`. If a class is provided, vLLM will
|
||||
add it to the server using `app.add_middleware()`."""
|
||||
return_tokens_as_token_ids: bool = False
|
||||
"""When `--max-logprobs` is specified, represents single tokens as
|
||||
strings of the form 'token_id:{token_id}' so that tokens that are not
|
||||
JSON-encodable can be identified."""
|
||||
disable_frontend_multiprocessing: bool = False
|
||||
"""If specified, will run the OpenAI frontend server in the same process as
|
||||
the model serving engine."""
|
||||
enable_request_id_headers: bool = False
|
||||
"""If specified, API server will add X-Request-Id header to responses."""
|
||||
enable_auto_tool_choice: bool = False
|
||||
"""Enable auto tool choice for supported models. Use `--tool-call-parser`
|
||||
to specify which parser to use."""
|
||||
exclude_tools_when_tool_choice_none: bool = False
|
||||
"""If specified, exclude tool definitions in prompts when
|
||||
tool_choice='none'."""
|
||||
tool_call_parser: str | None = None
|
||||
"""Select the tool call parser depending on the model that you're using.
|
||||
This is used to parse the model-generated tool call into OpenAI API format.
|
||||
Required for `--enable-auto-tool-choice`. You can choose any option from
|
||||
the built-in parsers or register a plugin via `--tool-parser-plugin`."""
|
||||
tool_parser_plugin: str = ""
|
||||
"""Special the tool parser plugin write to parse the model-generated tool
|
||||
into OpenAI API format, the name register in this plugin can be used in
|
||||
`--tool-call-parser`."""
|
||||
tool_server: str | None = None
|
||||
"""Comma-separated list of host:port pairs (IPv4, IPv6, or hostname).
|
||||
Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo
|
||||
purpose."""
|
||||
log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH
|
||||
"""Path to logging config JSON file for both vllm and uvicorn"""
|
||||
max_log_len: int | None = None
|
||||
"""Max number of prompt characters or prompt ID numbers being printed in
|
||||
log. The default of None means unlimited."""
|
||||
disable_fastapi_docs: bool = False
|
||||
"""Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."""
|
||||
enable_prompt_tokens_details: bool = False
|
||||
"""If set to True, enable prompt_tokens_details in usage."""
|
||||
enable_server_load_tracking: bool = False
|
||||
"""If set to True, enable tracking server_load_metrics in the app state."""
|
||||
enable_force_include_usage: bool = False
|
||||
"""If set to True, including usage on every request."""
|
||||
enable_tokenizer_info_endpoint: bool = False
|
||||
"""Enable the `/tokenizer_info` endpoint. May expose chat
|
||||
templates and other tokenizer configuration."""
|
||||
enable_log_outputs: bool = False
|
||||
"""If set to True, log model outputs (generations).
|
||||
Requires --enable-log-requests."""
|
||||
enable_log_deltas: bool = True
|
||||
"""If set to False, output deltas will not be logged. Relevant only if
|
||||
--enable-log-outputs is set.
|
||||
"""
|
||||
h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
|
||||
"""Maximum size (bytes) of an incomplete HTTP event (header or body) for
|
||||
h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB)."""
|
||||
h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT
|
||||
"""Maximum number of HTTP headers allowed in a request for h11 parser.
|
||||
Helps mitigate header abuse. Default: 256."""
|
||||
log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
|
||||
"""If set to True, log the stack trace of error responses"""
|
||||
tokens_only: bool = False
|
||||
"""
|
||||
If set to True, only enable the Tokens In<>Out endpoint.
|
||||
This is intended for use in a Disaggregated Everything setup.
|
||||
"""
|
||||
enable_offline_docs: bool = False
|
||||
"""
|
||||
Enable offline FastAPI documentation for air-gapped environments.
|
||||
Uses vendored static assets bundled with vLLM.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
|
||||
from vllm.engine.arg_utils import get_kwargs
|
||||
|
||||
frontend_kwargs = get_kwargs(FrontendArgs)
|
||||
@classmethod
|
||||
def _customize_cli_kwargs(
|
||||
cls,
|
||||
frontend_kwargs: dict[str, Any],
|
||||
) -> dict[str, Any]:
|
||||
frontend_kwargs = super()._customize_cli_kwargs(frontend_kwargs)
|
||||
|
||||
# Special case: allowed_origins, allowed_methods, allowed_headers all
|
||||
# need json.loads type
|
||||
@@ -232,14 +295,6 @@ class FrontendArgs:
|
||||
del frontend_kwargs["allowed_methods"]["nargs"]
|
||||
del frontend_kwargs["allowed_headers"]["nargs"]
|
||||
|
||||
# Special case: default_chat_template_kwargs needs json.loads type
|
||||
frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads
|
||||
|
||||
# Special case: LoRA modules need custom parser action and
|
||||
# optional_type(str)
|
||||
frontend_kwargs["lora_modules"]["type"] = optional_type(str)
|
||||
frontend_kwargs["lora_modules"]["action"] = LoRAParserAction
|
||||
|
||||
# Special case: Middleware needs to append action
|
||||
frontend_kwargs["middleware"]["action"] = "append"
|
||||
frontend_kwargs["middleware"]["type"] = str
|
||||
@@ -252,22 +307,7 @@ class FrontendArgs:
|
||||
if "nargs" in frontend_kwargs["disable_access_log_for_endpoints"]:
|
||||
del frontend_kwargs["disable_access_log_for_endpoints"]["nargs"]
|
||||
|
||||
# Special case: Tool call parser shows built-in options.
|
||||
valid_tool_parsers = list(ToolParserManager.list_registered())
|
||||
parsers_str = ",".join(valid_tool_parsers)
|
||||
frontend_kwargs["tool_call_parser"]["metavar"] = (
|
||||
f"{{{parsers_str}}} or name registered in --tool-parser-plugin"
|
||||
)
|
||||
|
||||
frontend_group = parser.add_argument_group(
|
||||
title="Frontend",
|
||||
description=FrontendArgs.__doc__,
|
||||
)
|
||||
|
||||
for key, value in frontend_kwargs.items():
|
||||
frontend_group.add_argument(f"--{key.replace('_', '-')}", **value)
|
||||
|
||||
return parser
|
||||
return frontend_kwargs
|
||||
|
||||
|
||||
def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import sys
|
||||
import tempfile
|
||||
from argparse import Namespace
|
||||
from collections.abc import Awaitable, Callable
|
||||
@@ -17,23 +18,23 @@ from fastapi import UploadFile
|
||||
from prometheus_client import start_http_server
|
||||
from pydantic import Field, TypeAdapter, field_validator, model_validator
|
||||
from pydantic_core.core_schema import ValidationInfo
|
||||
from starlette.datastructures import State
|
||||
from tqdm import tqdm
|
||||
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
|
||||
from vllm.config import config
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.api_server import init_app_state
|
||||
from vllm.entrypoints.openai.chat_completion.protocol import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
)
|
||||
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.cli_args import BaseFrontendArgs
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorInfo,
|
||||
ErrorResponse,
|
||||
OpenAIBaseModel,
|
||||
)
|
||||
from vllm.entrypoints.openai.models.protocol import BaseModelPath
|
||||
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
|
||||
from vllm.entrypoints.openai.speech_to_text.protocol import (
|
||||
TranscriptionRequest,
|
||||
TranscriptionResponse,
|
||||
@@ -42,25 +43,18 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
|
||||
TranslationResponse,
|
||||
TranslationResponseVerbose,
|
||||
)
|
||||
from vllm.entrypoints.openai.speech_to_text.serving import (
|
||||
OpenAIServingTranscription,
|
||||
OpenAIServingTranslation,
|
||||
)
|
||||
from vllm.entrypoints.pooling.embed.protocol import (
|
||||
EmbeddingRequest,
|
||||
EmbeddingResponse,
|
||||
)
|
||||
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
|
||||
from vllm.entrypoints.pooling.score.protocol import (
|
||||
RerankRequest,
|
||||
RerankResponse,
|
||||
ScoreRequest,
|
||||
ScoreResponse,
|
||||
)
|
||||
from vllm.entrypoints.pooling.score.serving import ServingScores
|
||||
from vllm.logger import init_logger
|
||||
from vllm.reasoning import ReasoningParserManager
|
||||
from vllm.tasks import SupportedTask
|
||||
from vllm.utils import random_uuid
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
@@ -219,87 +213,73 @@ class BatchRequestOutput(OpenAIBaseModel):
|
||||
error: Any | None
|
||||
|
||||
|
||||
@config
|
||||
class BatchFrontendArgs(BaseFrontendArgs):
|
||||
"""Arguments for the batch runner frontend."""
|
||||
|
||||
input_file: str | None = None
|
||||
"""The path or url to a single input file. Currently supports local file
|
||||
paths, or the http protocol (http or https). If a URL is specified,
|
||||
the file should be available via HTTP GET."""
|
||||
output_file: str | None = None
|
||||
"""The path or url to a single output file. Currently supports
|
||||
local file paths, or web (http or https) urls. If a URL is specified,
|
||||
the file should be available via HTTP PUT."""
|
||||
output_tmp_dir: str | None = None
|
||||
"""The directory to store the output file before uploading it
|
||||
to the output URL."""
|
||||
enable_metrics: bool = False
|
||||
"""Enable Prometheus metrics"""
|
||||
host: str | None = None
|
||||
"""Host name for the Prometheus metrics server
|
||||
(only needed if enable-metrics is set)."""
|
||||
port: int = 8000
|
||||
"""Port number for the Prometheus metrics server
|
||||
(only needed if enable-metrics is set)."""
|
||||
url: str = "0.0.0.0"
|
||||
"""[DEPRECATED] Host name for the Prometheus metrics server
|
||||
(only needed if enable-metrics is set). Use --host instead."""
|
||||
|
||||
@classmethod
|
||||
def _customize_cli_kwargs(
|
||||
cls,
|
||||
frontend_kwargs: dict[str, Any],
|
||||
) -> dict[str, Any]:
|
||||
frontend_kwargs = super()._customize_cli_kwargs(frontend_kwargs)
|
||||
|
||||
frontend_kwargs["input_file"]["flags"] = ["-i"]
|
||||
frontend_kwargs["input_file"]["required"] = True
|
||||
frontend_kwargs["output_file"]["flags"] = ["-o"]
|
||||
frontend_kwargs["output_file"]["required"] = True
|
||||
|
||||
frontend_kwargs["enable_metrics"]["action"] = "store_true"
|
||||
|
||||
frontend_kwargs["url"]["deprecated"] = True
|
||||
return frontend_kwargs
|
||||
|
||||
|
||||
def make_arg_parser(parser: FlexibleArgumentParser):
|
||||
parser.add_argument(
|
||||
"-i",
|
||||
"--input-file",
|
||||
required=True,
|
||||
type=str,
|
||||
help="The path or url to a single input file. Currently supports local file "
|
||||
"paths, or the http protocol (http or https). If a URL is specified, "
|
||||
"the file should be available via HTTP GET.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o",
|
||||
"--output-file",
|
||||
required=True,
|
||||
type=str,
|
||||
help="The path or url to a single output file. Currently supports "
|
||||
"local file paths, or web (http or https) urls. If a URL is specified,"
|
||||
" the file should be available via HTTP PUT.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-tmp-dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The directory to store the output file before uploading it "
|
||||
"to the output URL.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--response-role",
|
||||
type=optional_type(str),
|
||||
default="assistant",
|
||||
help="The role name to return if `request.add_generation_prompt=True`.",
|
||||
)
|
||||
|
||||
parser = BatchFrontendArgs.add_cli_args(parser)
|
||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||
|
||||
parser.add_argument(
|
||||
"--max-log-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Max number of prompt characters or prompt "
|
||||
"ID numbers being printed in log."
|
||||
"\n\nDefault: Unlimited",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--enable-metrics", action="store_true", help="Enable Prometheus metrics"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--url",
|
||||
type=str,
|
||||
default="0.0.0.0",
|
||||
help="URL to the Prometheus metrics server "
|
||||
"(only needed if enable-metrics is set).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--port",
|
||||
type=int,
|
||||
default=8000,
|
||||
help="Port number for the Prometheus metrics server "
|
||||
"(only needed if enable-metrics is set).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-prompt-tokens-details",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="If set to True, enable prompt_tokens_details in usage.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-force-include-usage",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="If set to True, include usage on every request "
|
||||
"(even when stream_options is not specified)",
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = FlexibleArgumentParser(description="vLLM OpenAI-Compatible batch runner.")
|
||||
return make_arg_parser(parser).parse_args()
|
||||
args = make_arg_parser(parser).parse_args()
|
||||
|
||||
# Backward compatibility: If --url is set, use it for host
|
||||
url_explicit = any(arg == "--url" or arg.startswith("--url=") for arg in sys.argv)
|
||||
host_explicit = any(
|
||||
arg == "--host" or arg.startswith("--host=") for arg in sys.argv
|
||||
)
|
||||
if url_explicit and hasattr(args, "url") and not host_explicit:
|
||||
args.host = args.url
|
||||
logger.warning_once(
|
||||
"Using --url for metrics is deprecated. Please use --host instead."
|
||||
)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
# explicitly use pure text format, with a newline at the end
|
||||
@@ -671,12 +651,9 @@ def make_transcription_wrapper(is_translation: bool) -> WrapperFn:
|
||||
return wrapper
|
||||
|
||||
|
||||
def build_endpoint_registry(
|
||||
async def build_endpoint_registry(
|
||||
engine_client: EngineClient,
|
||||
args: Namespace,
|
||||
base_model_paths: list[BaseModelPath],
|
||||
request_logger: RequestLogger | None,
|
||||
supported_tasks: tuple[SupportedTask, ...],
|
||||
) -> dict[str, dict[str, Any]]:
|
||||
"""
|
||||
Build the endpoint registry with all serving objects and handler configurations.
|
||||
@@ -684,90 +661,27 @@ def build_endpoint_registry(
|
||||
Args:
|
||||
engine_client: The engine client
|
||||
args: Command line arguments
|
||||
base_model_paths: List of base model paths
|
||||
request_logger: Optional request logger
|
||||
supported_tasks: Tuple of supported tasks
|
||||
|
||||
Returns:
|
||||
Dictionary mapping endpoint keys to their configurations
|
||||
"""
|
||||
model_config = engine_client.model_config
|
||||
supported_tasks = await engine_client.get_supported_tasks()
|
||||
logger.info("Supported tasks: %s", supported_tasks)
|
||||
|
||||
# Create the openai serving objects.
|
||||
openai_serving_models = OpenAIServingModels(
|
||||
engine_client=engine_client,
|
||||
base_model_paths=base_model_paths,
|
||||
lora_modules=None,
|
||||
)
|
||||
# Create a state object to hold serving objects
|
||||
state = State()
|
||||
|
||||
openai_serving_chat = (
|
||||
OpenAIServingChat(
|
||||
engine_client,
|
||||
openai_serving_models,
|
||||
args.response_role,
|
||||
request_logger=request_logger,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
reasoning_parser=args.structured_outputs_config.reasoning_parser,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
enable_force_include_usage=args.enable_force_include_usage,
|
||||
default_chat_template_kwargs=getattr(
|
||||
args, "default_chat_template_kwargs", None
|
||||
),
|
||||
)
|
||||
if "generate" in supported_tasks
|
||||
else None
|
||||
)
|
||||
# Initialize all serving objects using init_app_state
|
||||
# This provides full functionality including chat template processing,
|
||||
# LoRA support, tool servers, etc.
|
||||
await init_app_state(engine_client, state, args, supported_tasks)
|
||||
|
||||
openai_serving_embedding = (
|
||||
OpenAIServingEmbedding(
|
||||
engine_client,
|
||||
openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
chat_template=None,
|
||||
chat_template_content_format="auto",
|
||||
)
|
||||
if "embed" in supported_tasks
|
||||
else None
|
||||
)
|
||||
|
||||
enable_serving_reranking = (
|
||||
"classify" in supported_tasks
|
||||
and getattr(model_config.hf_config, "num_labels", 0) == 1
|
||||
)
|
||||
|
||||
openai_serving_scores = (
|
||||
ServingScores(
|
||||
engine_client,
|
||||
openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
score_template=None,
|
||||
)
|
||||
if ("embed" in supported_tasks or enable_serving_reranking)
|
||||
else None
|
||||
)
|
||||
|
||||
openai_serving_transcription = (
|
||||
OpenAIServingTranscription(
|
||||
engine_client,
|
||||
openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
enable_force_include_usage=args.enable_force_include_usage,
|
||||
)
|
||||
if "transcription" in supported_tasks
|
||||
else None
|
||||
)
|
||||
|
||||
openai_serving_translation = (
|
||||
OpenAIServingTranslation(
|
||||
engine_client,
|
||||
openai_serving_models,
|
||||
request_logger=request_logger,
|
||||
enable_force_include_usage=args.enable_force_include_usage,
|
||||
)
|
||||
if "transcription" in supported_tasks
|
||||
else None
|
||||
)
|
||||
# Get serving objects from state (defaulting to None if not set)
|
||||
openai_serving_chat = getattr(state, "openai_serving_chat", None)
|
||||
openai_serving_embedding = getattr(state, "openai_serving_embedding", None)
|
||||
openai_serving_scores = getattr(state, "openai_serving_scores", None)
|
||||
openai_serving_transcription = getattr(state, "openai_serving_transcription", None)
|
||||
openai_serving_translation = getattr(state, "openai_serving_translation", None)
|
||||
|
||||
# Registry of endpoint configurations
|
||||
endpoint_registry: dict[str, dict[str, Any]] = {
|
||||
@@ -845,29 +759,9 @@ async def run_batch(
|
||||
engine_client: EngineClient,
|
||||
args: Namespace,
|
||||
) -> None:
|
||||
if args.served_model_name is not None:
|
||||
served_model_names = args.served_model_name
|
||||
else:
|
||||
served_model_names = [args.model]
|
||||
|
||||
if args.enable_log_requests:
|
||||
request_logger = RequestLogger(max_log_len=args.max_log_len)
|
||||
else:
|
||||
request_logger = None
|
||||
|
||||
base_model_paths = [
|
||||
BaseModelPath(name=name, model_path=args.model) for name in served_model_names
|
||||
]
|
||||
|
||||
supported_tasks = await engine_client.get_supported_tasks()
|
||||
logger.info("Supported tasks: %s", supported_tasks)
|
||||
|
||||
endpoint_registry = build_endpoint_registry(
|
||||
endpoint_registry = await build_endpoint_registry(
|
||||
engine_client=engine_client,
|
||||
args=args,
|
||||
base_model_paths=base_model_paths,
|
||||
request_logger=request_logger,
|
||||
supported_tasks=supported_tasks,
|
||||
)
|
||||
|
||||
tracker = BatchProgressTracker()
|
||||
@@ -942,7 +836,7 @@ if __name__ == "__main__":
|
||||
# to publish metrics at the /metrics endpoint.
|
||||
if args.enable_metrics:
|
||||
logger.info("Prometheus metrics enabled")
|
||||
start_http_server(port=args.port, addr=args.url)
|
||||
start_http_server(port=args.port, addr=args.host)
|
||||
else:
|
||||
logger.info("Prometheus metrics disabled")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user