[Frontend] Use init_app_state and FrontendArgs in run_batch (#32967)

Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
Pooya Davoodi
2026-02-24 19:40:39 -08:00
committed by GitHub
parent dbf0da817a
commit e3b2324ec4
4 changed files with 652 additions and 350 deletions

View File

@@ -447,7 +447,7 @@ def test_metrics_exist_run_batch():
"--model",
"intfloat/multilingual-e5-small",
"--enable-metrics",
"--url",
"--host",
base_url,
"--port",
port,

View File

@@ -10,59 +10,361 @@ import pytest
from vllm.assets.audio import AudioAsset
from vllm.entrypoints.openai.run_batch import BatchRequestOutput
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
CHAT_MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-small"
RERANKER_MODEL_NAME = "BAAI/bge-reranker-v2-m3"
REASONING_MODEL_NAME = "Qwen/Qwen3-0.6B"
SPEECH_LARGE_MODEL_NAME = "openai/whisper-large-v3"
SPEECH_SMALL_MODEL_NAME = "openai/whisper-small"
# ruff: noqa: E501
INPUT_BATCH = (
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "NonExistModel", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {{"stream": "True", "model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
).format(MODEL_NAME)
INVALID_INPUT_BATCH = (
'{{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
).format(MODEL_NAME)
INPUT_EMBEDDING_BATCH = (
'{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}\n'
'{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}\n'
'{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}\n'
'{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}'
# Five chat-completion batch lines: two valid requests, one with an unknown
# model, one aimed at a bad endpoint URL, and one that requests streaming.
INPUT_BATCH = "\n".join(
    map(
        json.dumps,
        [
            {
                "custom_id": "request-1",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
            {
                "custom_id": "request-2",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are an unhelpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
            {
                "custom_id": "request-3",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "NonExistModel",
                    "messages": [
                        {"role": "system", "content": "You are an unhelpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
            {
                "custom_id": "request-4",
                "method": "POST",
                "url": "/bad_url",
                "body": {
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are an unhelpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
            {
                "custom_id": "request-5",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "stream": "True",
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are an unhelpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
        ],
    )
)
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
# Same shape as INPUT_BATCH, except the first line carries "invalid_field"
# instead of the required "custom_id" key.
INVALID_INPUT_BATCH = "\n".join(
    map(
        json.dumps,
        [
            {
                "invalid_field": "request-1",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
            {
                "custom_id": "request-2",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are an unhelpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
        ],
    )
)
INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
# Embedding batch: three requests against the real embedding model plus one
# against an unknown model name.
INPUT_EMBEDDING_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": "/v1/embeddings",
            "body": {"model": model_name, "input": text},
        }
    )
    for idx, (model_name, text) in enumerate(
        [
            (EMBEDDING_MODEL_NAME, "You are a helpful assistant."),
            (EMBEDDING_MODEL_NAME, "You are an unhelpful assistant."),
            (EMBEDDING_MODEL_NAME, "Hello world!"),
            ("NonExistModel", "Hello world!"),
        ],
        start=1,
    )
)
INPUT_REASONING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Solve this math problem: 2+2=?"}]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "What is the capital of France?"}]}}"""
_SCORE_RERANK_DOCUMENTS = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
# Score batch: the same query/document pair sent to both score endpoint
# aliases (/score and /v1/score).
INPUT_SCORE_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": score_url,
            "body": {
                "model": RERANKER_MODEL_NAME,
                "queries": "What is the capital of France?",
                "documents": _SCORE_RERANK_DOCUMENTS,
            },
        }
    )
    for idx, score_url in enumerate(["/score", "/v1/score"], start=1)
)
# Rerank batch: one identical request per rerank endpoint alias.
# Fix: the third line previously reused custom_id "request-2"; custom_ids
# within a batch are expected to be unique, so the /v2/rerank request is
# now "request-3".
INPUT_RERANK_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": rerank_url,
            "body": {
                "model": RERANKER_MODEL_NAME,
                "query": "What is the capital of France?",
                "documents": _SCORE_RERANK_DOCUMENTS,
            },
        }
    )
    for idx, rerank_url in enumerate(["/rerank", "/v1/rerank", "/v2/rerank"], start=1)
)
# Reasoning batch: two chat requests (no max_tokens cap) for the reasoning
# model, differing only in the user prompt.
INPUT_REASONING_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": REASONING_MODEL_NAME,
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": user_prompt},
                ],
            },
        }
    )
    for idx, user_prompt in enumerate(
        ["Solve this math problem: 2+2=?", "What is the capital of France?"],
        start=1,
    )
)
# A valid but minimal base64-encoded WAV file (RIFF/WAVE header with a
# zero-length data chunk), used as an inline audio fixture for testing.
MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA="
INPUT_TRANSCRIPTION_BATCH = (
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/transcriptions", '
'"body": {{"model": "openai/whisper-large-v3", "file_url": "data:audio/wav;base64,{}", '
'"response_format": "json"}}}}\n'
).format(MINIMAL_WAV_BASE64)
json.dumps(
{
"custom_id": "request-1",
"method": "POST",
"url": "/v1/audio/transcriptions",
"body": {
"model": SPEECH_LARGE_MODEL_NAME,
"file_url": f"data:audio/wav;base64,{MINIMAL_WAV_BASE64}",
"response_format": "json",
},
}
)
+ "\n"
)
INPUT_TRANSCRIPTION_HTTP_BATCH = (
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/transcriptions", '
'"body": {{"model": "openai/whisper-large-v3", "file_url": "{}", '
'"response_format": "json"}}}}\n'
).format(AudioAsset("mary_had_lamb").url)
json.dumps(
{
"custom_id": "request-1",
"method": "POST",
"url": "/v1/audio/transcriptions",
"body": {
"model": SPEECH_LARGE_MODEL_NAME,
"file_url": AudioAsset("mary_had_lamb").url,
"response_format": "json",
},
}
)
+ "\n"
)
INPUT_TRANSLATION_BATCH = (
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/translations", '
'"body": {{"model": "openai/whisper-small", "file_url": "{}", '
'"response_format": "text", "language": "it", "to_language": "en", '
'"temperature": 0.0}}}}\n'
).format(AudioAsset("mary_had_lamb").url)
json.dumps(
{
"custom_id": "request-1",
"method": "POST",
"url": "/v1/audio/translations",
"body": {
"model": SPEECH_SMALL_MODEL_NAME,
"file_url": AudioAsset("mary_had_lamb").url,
"response_format": "text",
"language": "it",
"to_language": "en",
"temperature": 0.0,
},
}
)
+ "\n"
)
# OpenAI-style tool definition used by the tool-calling batch fixture:
# a single function taking a required "location" and an optional "unit".
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["location"],
        },
    },
}
# Single chat-completion request that must trigger a call to WEATHER_TOOL
# (tool_choice="required").
INPUT_TOOL_CALLING_BATCH = json.dumps(
    dict(
        custom_id="request-1",
        method="POST",
        url="/v1/chat/completions",
        body=dict(
            model=REASONING_MODEL_NAME,
            messages=[
                {"role": "user", "content": "What is the weather in San Francisco?"},
            ],
            tools=[WEATHER_TOOL],
            tool_choice="required",
            max_tokens=1000,
        ),
    )
)
def test_empty_file():
@@ -81,7 +383,7 @@ def test_empty_file():
"-o",
output_file.name,
"--model",
"intfloat/multilingual-e5-small",
EMBEDDING_MODEL_NAME,
],
)
proc.communicate()
@@ -108,7 +410,7 @@ def test_completions():
"-o",
output_file.name,
"--model",
MODEL_NAME,
CHAT_MODEL_NAME,
],
)
proc.communicate()
@@ -141,7 +443,7 @@ def test_completions_invalid_input():
"-o",
output_file.name,
"--model",
MODEL_NAME,
CHAT_MODEL_NAME,
],
)
proc.communicate()
@@ -165,7 +467,7 @@ def test_embeddings():
"-o",
output_file.name,
"--model",
"intfloat/multilingual-e5-small",
EMBEDDING_MODEL_NAME,
],
)
proc.communicate()
@@ -196,7 +498,7 @@ def test_score(input_batch):
"-o",
output_file.name,
"--model",
"BAAI/bge-reranker-v2-m3",
RERANKER_MODEL_NAME,
],
)
proc.communicate()
@@ -234,7 +536,7 @@ def test_reasoning_parser():
"-o",
output_file.name,
"--model",
"Qwen/Qwen3-0.6B",
REASONING_MODEL_NAME,
"--reasoning-parser",
"qwen3",
],
@@ -278,7 +580,7 @@ def test_transcription():
"-o",
output_file.name,
"--model",
"openai/whisper-large-v3",
SPEECH_LARGE_MODEL_NAME,
],
)
proc.communicate()
@@ -316,7 +618,7 @@ def test_transcription_http_url():
"-o",
output_file.name,
"--model",
"openai/whisper-large-v3",
SPEECH_LARGE_MODEL_NAME,
],
)
proc.communicate()
@@ -356,7 +658,7 @@ def test_translation():
"-o",
output_file.name,
"--model",
"openai/whisper-small",
SPEECH_SMALL_MODEL_NAME,
],
)
proc.communicate()
@@ -378,3 +680,69 @@ def test_translation():
translation_text = response_body["text"]
translation_text_lower = str(translation_text).strip().lower()
assert "mary" in translation_text_lower or "lamb" in translation_text_lower
def test_tool_calling():
    """
    Run a tool-calling batch through `vllm run-batch` and verify that every
    output line is schema-valid, error-free, and contains non-empty
    tool_calls targeting the declared weather tool.
    """
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_TOOL_CALLING_BATCH)
        input_file.flush()
        proc = subprocess.Popen(
            [
                "vllm",
                "run-batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                REASONING_MODEL_NAME,
                "--enable-auto-tool-choice",
                "--tool-call-parser",
                "hermes",
            ],
        )
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"

        raw_output = output_file.read()
        for record_line in raw_output.strip().split("\n"):
            if not record_line.strip():  # ignore blank lines in the output
                continue

            # Each line must round-trip through the OpenAI batch output
            # schema; validation raises on a malformed record.
            BatchRequestOutput.model_validate_json(record_line)

            record = json.loads(record_line)
            assert isinstance(record, dict)
            assert record["error"] is None

            # With tool_choice="required" the model must call a tool, so
            # the response body has to carry a non-empty tool_calls list.
            response_body = record["response"]["body"]
            assert response_body is not None
            msg = response_body["choices"][0]["message"]
            assert "tool_calls" in msg
            calls = msg.get("tool_calls")
            assert calls is not None
            assert isinstance(calls, list)
            assert len(calls) > 0

            # Every call must be a well-formed function call naming the
            # single tool we declared.
            for call in calls:
                assert "id" in call
                assert "type" in call
                assert call["type"] == "function"
                assert "function" in call
                assert "name" in call["function"]
                assert "arguments" in call["function"]
                assert call["function"]["name"] == "get_current_weather"

View File

@@ -67,7 +67,149 @@ class LoRAParserAction(argparse.Action):
@config
class FrontendArgs:
class BaseFrontendArgs:
"""Base arguments for the OpenAI-compatible frontend server.
This base class does not include host, port, and server-specific arguments
like SSL, CORS, and HTTP server settings. Those arguments are added by
the subclasses.
"""
lora_modules: list[LoRAModulePath] | None = None
"""LoRA modules configurations in either 'name=path' format or JSON format
or JSON list format. Example (old format): `'name=path'` Example (new
format): `{\"name\": \"name\", \"path\": \"lora_path\",
\"base_model_name\": \"id\"}`"""
chat_template: str | None = None
"""The file path to the chat template, or the template in single-line form
for the specified model."""
chat_template_content_format: ChatTemplateContentFormatOption = "auto"
"""The format to render message content within a chat template.
* "string" will render the content as a string. Example: `"Hello World"`
* "openai" will render the content as a list of dictionaries, similar to
OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
trust_request_chat_template: bool = False
"""Whether to trust the chat template provided in the request. If False,
the server will always use the chat template specified by `--chat-template`
or the ones from tokenizer."""
default_chat_template_kwargs: dict[str, Any] | None = None
"""Default keyword arguments to pass to the chat template renderer.
These will be merged with request-level chat_template_kwargs,
with request values taking precedence. Useful for setting default
behavior for reasoning models. Example: '{"enable_thinking": false}'
to disable thinking mode by default for Qwen3/DeepSeek models."""
response_role: str = "assistant"
"""The role name to return if `request.add_generation_prompt=true`."""
return_tokens_as_token_ids: bool = False
"""When `--max-logprobs` is specified, represents single tokens as
strings of the form 'token_id:{token_id}' so that tokens that are not
JSON-encodable can be identified."""
disable_frontend_multiprocessing: bool = False
"""If specified, will run the OpenAI frontend server in the same process as
the model serving engine."""
enable_auto_tool_choice: bool = False
"""Enable auto tool choice for supported models. Use `--tool-call-parser`
to specify which parser to use."""
exclude_tools_when_tool_choice_none: bool = False
"""If specified, exclude tool definitions in prompts when
tool_choice='none'."""
tool_call_parser: str | None = None
"""Select the tool call parser depending on the model that you're using.
This is used to parse the model-generated tool call into OpenAI API format.
Required for `--enable-auto-tool-choice`. You can choose any option from
the built-in parsers or register a plugin via `--tool-parser-plugin`."""
tool_parser_plugin: str = ""
"""Special the tool parser plugin write to parse the model-generated tool
into OpenAI API format, the name register in this plugin can be used in
`--tool-call-parser`."""
tool_server: str | None = None
"""Comma-separated list of host:port pairs (IPv4, IPv6, or hostname).
Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo
purpose."""
log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH
"""Path to logging config JSON file for both vllm and uvicorn"""
max_log_len: int | None = None
"""Max number of prompt characters or prompt ID numbers being printed in
log. The default of None means unlimited."""
enable_prompt_tokens_details: bool = False
"""If set to True, enable prompt_tokens_details in usage."""
enable_server_load_tracking: bool = False
"""If set to True, enable tracking server_load_metrics in the app state."""
enable_force_include_usage: bool = False
"""If set to True, including usage on every request."""
enable_tokenizer_info_endpoint: bool = False
"""Enable the `/tokenizer_info` endpoint. May expose chat
templates and other tokenizer configuration."""
enable_log_outputs: bool = False
"""If set to True, log model outputs (generations).
Requires --enable-log-requests."""
enable_log_deltas: bool = True
"""If set to False, output deltas will not be logged. Relevant only if
--enable-log-outputs is set.
"""
log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
"""If set to True, log the stack trace of error responses"""
tokens_only: bool = False
"""
If set to True, only enable the Tokens In<>Out endpoint.
This is intended for use in a Disaggregated Everything setup.
"""
@classmethod
def _customize_cli_kwargs(
cls,
frontend_kwargs: dict[str, Any],
) -> dict[str, Any]:
"""Customize argparse kwargs before arguments are registered.
Subclasses should override this and call
``super()._customize_cli_kwargs(frontend_kwargs)`` first.
"""
# Special case: default_chat_template_kwargs needs json.loads type
frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads
# Special case: LoRA modules need custom parser action and
# optional_type(str)
frontend_kwargs["lora_modules"]["type"] = optional_type(str)
frontend_kwargs["lora_modules"]["action"] = LoRAParserAction
# Special case: Tool call parser shows built-in options.
valid_tool_parsers = list(ToolParserManager.list_registered())
parsers_str = ",".join(valid_tool_parsers)
frontend_kwargs["tool_call_parser"]["metavar"] = (
f"{{{parsers_str}}} or name registered in --tool-parser-plugin"
)
return frontend_kwargs
@classmethod
def add_cli_args(cls, parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"""Register CLI arguments for this frontend class.
Subclasses should override ``_customize_cli_kwargs`` instead of
this method so that base-class postprocessing is always applied.
"""
from vllm.engine.arg_utils import get_kwargs
frontend_kwargs = get_kwargs(cls)
frontend_kwargs = cls._customize_cli_kwargs(frontend_kwargs)
group_name = cls.__name__.replace("Args", "")
frontend_group = parser.add_argument_group(
title=group_name,
description=cls.__doc__,
)
for key, value in frontend_kwargs.items():
extra_flags = value.pop("flags", [])
frontend_group.add_argument(
*extra_flags, f"--{key.replace('_', '-')}", **value
)
return parser
@config
class FrontendArgs(BaseFrontendArgs):
"""Arguments for the OpenAI-compatible frontend server."""
host: str | None = None
@@ -99,32 +241,6 @@ class FrontendArgs:
api_key: list[str] | None = None
"""If provided, the server will require one of these keys to be presented in
the header."""
lora_modules: list[LoRAModulePath] | None = None
"""LoRA modules configurations in either 'name=path' format or JSON format
or JSON list format. Example (old format): `'name=path'` Example (new
format): `{\"name\": \"name\", \"path\": \"lora_path\",
\"base_model_name\": \"id\"}`"""
chat_template: str | None = None
"""The file path to the chat template, or the template in single-line form
for the specified model."""
chat_template_content_format: ChatTemplateContentFormatOption = "auto"
"""The format to render message content within a chat template.
* "string" will render the content as a string. Example: `"Hello World"`
* "openai" will render the content as a list of dictionaries, similar to
OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
trust_request_chat_template: bool = False
"""Whether to trust the chat template provided in the request. If False,
the server will always use the chat template specified by `--chat-template`
or the ones from tokenizer."""
default_chat_template_kwargs: dict[str, Any] | None = None
"""Default keyword arguments to pass to the chat template renderer.
These will be merged with request-level chat_template_kwargs,
with request values taking precedence. Useful for setting default
behavior for reasoning models. Example: '{"enable_thinking": false}'
to disable thinking mode by default for Qwen3/DeepSeek models."""
response_role: str = "assistant"
"""The role name to return if `request.add_generation_prompt=true`."""
ssl_keyfile: str | None = None
"""The file path to the SSL key file."""
ssl_certfile: str | None = None
@@ -146,81 +262,28 @@ class FrontendArgs:
is provided, vLLM will add it to the server using
`@app.middleware('http')`. If a class is provided, vLLM will
add it to the server using `app.add_middleware()`."""
return_tokens_as_token_ids: bool = False
"""When `--max-logprobs` is specified, represents single tokens as
strings of the form 'token_id:{token_id}' so that tokens that are not
JSON-encodable can be identified."""
disable_frontend_multiprocessing: bool = False
"""If specified, will run the OpenAI frontend server in the same process as
the model serving engine."""
enable_request_id_headers: bool = False
"""If specified, API server will add X-Request-Id header to responses."""
enable_auto_tool_choice: bool = False
"""Enable auto tool choice for supported models. Use `--tool-call-parser`
to specify which parser to use."""
exclude_tools_when_tool_choice_none: bool = False
"""If specified, exclude tool definitions in prompts when
tool_choice='none'."""
tool_call_parser: str | None = None
"""Select the tool call parser depending on the model that you're using.
This is used to parse the model-generated tool call into OpenAI API format.
Required for `--enable-auto-tool-choice`. You can choose any option from
the built-in parsers or register a plugin via `--tool-parser-plugin`."""
tool_parser_plugin: str = ""
"""Special the tool parser plugin write to parse the model-generated tool
into OpenAI API format, the name register in this plugin can be used in
`--tool-call-parser`."""
tool_server: str | None = None
"""Comma-separated list of host:port pairs (IPv4, IPv6, or hostname).
Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo
purpose."""
log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH
"""Path to logging config JSON file for both vllm and uvicorn"""
max_log_len: int | None = None
"""Max number of prompt characters or prompt ID numbers being printed in
log. The default of None means unlimited."""
disable_fastapi_docs: bool = False
"""Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."""
enable_prompt_tokens_details: bool = False
"""If set to True, enable prompt_tokens_details in usage."""
enable_server_load_tracking: bool = False
"""If set to True, enable tracking server_load_metrics in the app state."""
enable_force_include_usage: bool = False
"""If set to True, including usage on every request."""
enable_tokenizer_info_endpoint: bool = False
"""Enable the `/tokenizer_info` endpoint. May expose chat
templates and other tokenizer configuration."""
enable_log_outputs: bool = False
"""If set to True, log model outputs (generations).
Requires --enable-log-requests."""
enable_log_deltas: bool = True
"""If set to False, output deltas will not be logged. Relevant only if
--enable-log-outputs is set.
"""
h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
"""Maximum size (bytes) of an incomplete HTTP event (header or body) for
h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB)."""
h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT
"""Maximum number of HTTP headers allowed in a request for h11 parser.
Helps mitigate header abuse. Default: 256."""
log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
"""If set to True, log the stack trace of error responses"""
tokens_only: bool = False
"""
If set to True, only enable the Tokens In<>Out endpoint.
This is intended for use in a Disaggregated Everything setup.
"""
enable_offline_docs: bool = False
"""
Enable offline FastAPI documentation for air-gapped environments.
Uses vendored static assets bundled with vLLM.
"""
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
from vllm.engine.arg_utils import get_kwargs
frontend_kwargs = get_kwargs(FrontendArgs)
@classmethod
def _customize_cli_kwargs(
cls,
frontend_kwargs: dict[str, Any],
) -> dict[str, Any]:
frontend_kwargs = super()._customize_cli_kwargs(frontend_kwargs)
# Special case: allowed_origins, allowed_methods, allowed_headers all
# need json.loads type
@@ -232,14 +295,6 @@ class FrontendArgs:
del frontend_kwargs["allowed_methods"]["nargs"]
del frontend_kwargs["allowed_headers"]["nargs"]
# Special case: default_chat_template_kwargs needs json.loads type
frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads
# Special case: LoRA modules need custom parser action and
# optional_type(str)
frontend_kwargs["lora_modules"]["type"] = optional_type(str)
frontend_kwargs["lora_modules"]["action"] = LoRAParserAction
# Special case: Middleware needs to append action
frontend_kwargs["middleware"]["action"] = "append"
frontend_kwargs["middleware"]["type"] = str
@@ -252,22 +307,7 @@ class FrontendArgs:
if "nargs" in frontend_kwargs["disable_access_log_for_endpoints"]:
del frontend_kwargs["disable_access_log_for_endpoints"]["nargs"]
# Special case: Tool call parser shows built-in options.
valid_tool_parsers = list(ToolParserManager.list_registered())
parsers_str = ",".join(valid_tool_parsers)
frontend_kwargs["tool_call_parser"]["metavar"] = (
f"{{{parsers_str}}} or name registered in --tool-parser-plugin"
)
frontend_group = parser.add_argument_group(
title="Frontend",
description=FrontendArgs.__doc__,
)
for key, value in frontend_kwargs.items():
frontend_group.add_argument(f"--{key.replace('_', '-')}", **value)
return parser
return frontend_kwargs
def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:

View File

@@ -3,6 +3,7 @@
import asyncio
import base64
import sys
import tempfile
from argparse import Namespace
from collections.abc import Awaitable, Callable
@@ -17,23 +18,23 @@ from fastapi import UploadFile
from prometheus_client import start_http_server
from pydantic import Field, TypeAdapter, field_validator, model_validator
from pydantic_core.core_schema import ValidationInfo
from starlette.datastructures import State
from tqdm import tqdm
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
from vllm.config import config
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.api_server import init_app_state
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
)
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.cli_args import BaseFrontendArgs
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
OpenAIBaseModel,
)
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.speech_to_text.protocol import (
TranscriptionRequest,
TranscriptionResponse,
@@ -42,25 +43,18 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
TranslationResponse,
TranslationResponseVerbose,
)
from vllm.entrypoints.openai.speech_to_text.serving import (
OpenAIServingTranscription,
OpenAIServingTranslation,
)
from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingRequest,
EmbeddingResponse,
)
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
from vllm.entrypoints.pooling.score.protocol import (
RerankRequest,
RerankResponse,
ScoreRequest,
ScoreResponse,
)
from vllm.entrypoints.pooling.score.serving import ServingScores
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.tasks import SupportedTask
from vllm.utils import random_uuid
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.version import __version__ as VLLM_VERSION
@@ -219,87 +213,73 @@ class BatchRequestOutput(OpenAIBaseModel):
error: Any | None
@config
class BatchFrontendArgs(BaseFrontendArgs):
    """Arguments for the batch runner frontend."""

    # Batch-runner-specific options; the shared frontend options are
    # inherited from BaseFrontendArgs.
    input_file: str | None = None
    """The path or url to a single input file. Currently supports local file
    paths, or the http protocol (http or https). If a URL is specified,
    the file should be available via HTTP GET."""
    output_file: str | None = None
    """The path or url to a single output file. Currently supports
    local file paths, or web (http or https) urls. If a URL is specified,
    the file should be available via HTTP PUT."""
    output_tmp_dir: str | None = None
    """The directory to store the output file before uploading it
    to the output URL."""
    enable_metrics: bool = False
    """Enable Prometheus metrics"""
    host: str | None = None
    """Host name for the Prometheus metrics server
    (only needed if enable-metrics is set)."""
    port: int = 8000
    """Port number for the Prometheus metrics server
    (only needed if enable-metrics is set)."""
    url: str = "0.0.0.0"
    """[DEPRECATED] Host name for the Prometheus metrics server
    (only needed if enable-metrics is set). Use --host instead."""

    @classmethod
    def _customize_cli_kwargs(
        cls,
        frontend_kwargs: dict[str, Any],
    ) -> dict[str, Any]:
        """Adjust argparse kwargs for the batch-specific options.

        Adds the ``-i``/``-o`` short flags, marks the input/output file
        arguments as required, and marks ``--url`` as deprecated in favor
        of ``--host``.
        """
        # Apply the base-class customizations first (chat-template kwargs,
        # LoRA parser action, tool-call-parser metavar).
        frontend_kwargs = super()._customize_cli_kwargs(frontend_kwargs)
        frontend_kwargs["input_file"]["flags"] = ["-i"]
        frontend_kwargs["input_file"]["required"] = True
        frontend_kwargs["output_file"]["flags"] = ["-o"]
        frontend_kwargs["output_file"]["required"] = True
        frontend_kwargs["enable_metrics"]["action"] = "store_true"
        # NOTE(review): relies on argparse's `deprecated` add_argument
        # kwarg — available from Python 3.13; confirm the minimum
        # supported interpreter version.
        frontend_kwargs["url"]["deprecated"] = True
        return frontend_kwargs
def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """Register all CLI arguments for the batch runner on ``parser``.

    Frontend options (input/output files, metrics host/port, usage
    reporting, ...) are declared on ``BatchFrontendArgs`` and engine
    options on ``AsyncEngineArgs``; both classes install themselves via
    ``add_cli_args``. Registering any of those options manually here as
    well would make argparse raise a conflict for every duplicated
    option string (e.g. ``-i/--input-file``, ``--enable-metrics``).

    Args:
        parser: The parser to populate.

    Returns:
        The same parser, with all arguments registered.
    """
    parser = BatchFrontendArgs.add_cli_args(parser)
    parser = AsyncEngineArgs.add_cli_args(parser)
    return parser
def parse_args():
    """Parse batch-runner CLI arguments, honoring the deprecated ``--url``.

    ``--url`` used to name the Prometheus metrics host; it now maps onto
    ``--host`` unless ``--host`` was itself given explicitly. The stray
    early ``return make_arg_parser(parser).parse_args()`` that previously
    preceded the shim made the whole back-compat branch dead code.

    Returns:
        The parsed ``argparse`` namespace, with ``args.host`` populated
        from ``args.url`` when only the deprecated flag was used.
    """
    parser = FlexibleArgumentParser(description="vLLM OpenAI-Compatible batch runner.")
    args = make_arg_parser(parser).parse_args()

    # Backward compatibility: if --url was passed explicitly (and --host
    # was not), copy its value into args.host and warn once. We inspect
    # sys.argv because argparse alone cannot distinguish "defaulted" from
    # "explicitly set to the default value".
    url_explicit = any(arg == "--url" or arg.startswith("--url=") for arg in sys.argv)
    host_explicit = any(
        arg == "--host" or arg.startswith("--host=") for arg in sys.argv
    )
    if url_explicit and hasattr(args, "url") and not host_explicit:
        args.host = args.url
        logger.warning_once(
            "Using --url for metrics is deprecated. Please use --host instead."
        )
    return args
# explicitly use pure text format, with a newline at the end
@@ -671,12 +651,9 @@ def make_transcription_wrapper(is_translation: bool) -> WrapperFn:
return wrapper
# NOTE(review): the two signature lines below are the pre- and post-refactor
# versions rendered together by the diff (see the @@ hunk markers); only the
# async variant taking (engine_client, args) should survive.
def build_endpoint_registry(
async def build_endpoint_registry(
    engine_client: EngineClient,
    args: Namespace,
    # NOTE(review): the three parameters below belong to the removed
    # signature; the async version derives them internally instead.
    base_model_paths: list[BaseModelPath],
    request_logger: RequestLogger | None,
    supported_tasks: tuple[SupportedTask, ...],
) -> dict[str, dict[str, Any]]:
    """
    Build the endpoint registry with all serving objects and handler configurations.
@@ -684,90 +661,27 @@ def build_endpoint_registry(
    Args:
        engine_client: The engine client
        args: Command line arguments
        base_model_paths: List of base model paths
        request_logger: Optional request logger
        supported_tasks: Tuple of supported tasks
    Returns:
        Dictionary mapping endpoint keys to their configurations
    """
    # NOTE(review): model_config is only read by the superseded manual
    # construction below; the init_app_state path does not need it.
    model_config = engine_client.model_config
    # Task discovery is awaited here in the new version, which is why the
    # function became async.
    supported_tasks = await engine_client.get_supported_tasks()
    logger.info("Supported tasks: %s", supported_tasks)
    # NOTE(review): from here down to the init_app_state call, the
    # OpenAIServingModels / OpenAIServingChat construction is the removed
    # pre-refactor code; init_app_state builds these objects now.
    # Create the openai serving objects.
    openai_serving_models = OpenAIServingModels(
        engine_client=engine_client,
        base_model_paths=base_model_paths,
        lora_modules=None,
    )
    # Create a state object to hold serving objects
    state = State()
    openai_serving_chat = (
        OpenAIServingChat(
            engine_client,
            openai_serving_models,
            args.response_role,
            request_logger=request_logger,
            chat_template=None,
            chat_template_content_format="auto",
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
            default_chat_template_kwargs=getattr(
                args, "default_chat_template_kwargs", None
            ),
        )
        if "generate" in supported_tasks
        else None
    )
    # Initialize all serving objects using init_app_state
    # This provides full functionality including chat template processing,
    # LoRA support, tool servers, etc.
    await init_app_state(engine_client, state, args, supported_tasks)
    # NOTE(review): the embedding/score/transcription/translation
    # constructions below (down to the getattr block) are likewise the
    # removed pre-refactor code, superseded by init_app_state above.
    openai_serving_embedding = (
        OpenAIServingEmbedding(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
            chat_template=None,
            chat_template_content_format="auto",
        )
        if "embed" in supported_tasks
        else None
    )
    enable_serving_reranking = (
        "classify" in supported_tasks
        and getattr(model_config.hf_config, "num_labels", 0) == 1
    )
    openai_serving_scores = (
        ServingScores(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
            score_template=None,
        )
        if ("embed" in supported_tasks or enable_serving_reranking)
        else None
    )
    openai_serving_transcription = (
        OpenAIServingTranscription(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
            enable_force_include_usage=args.enable_force_include_usage,
        )
        if "transcription" in supported_tasks
        else None
    )
    openai_serving_translation = (
        OpenAIServingTranslation(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
            enable_force_include_usage=args.enable_force_include_usage,
        )
        if "transcription" in supported_tasks
        else None
    )
    # Get serving objects from state (defaulting to None if not set)
    openai_serving_chat = getattr(state, "openai_serving_chat", None)
    openai_serving_embedding = getattr(state, "openai_serving_embedding", None)
    openai_serving_scores = getattr(state, "openai_serving_scores", None)
    openai_serving_transcription = getattr(state, "openai_serving_transcription", None)
    openai_serving_translation = getattr(state, "openai_serving_translation", None)
    # Registry of endpoint configurations
    # NOTE(review): the registry dict contents are elided by the next @@
    # hunk marker and are not visible in this view.
    endpoint_registry: dict[str, dict[str, Any]] = {
@@ -845,29 +759,9 @@ async def run_batch(
    engine_client: EngineClient,
    args: Namespace,
) -> None:
    # NOTE(review): the served-model-name / request-logger / base-model-path
    # setup below is the pre-refactor body shown alongside the new code by
    # the diff rendering; that setup now happens inside
    # build_endpoint_registry (via init_app_state) and should be removed.
    if args.served_model_name is not None:
        served_model_names = args.served_model_name
    else:
        served_model_names = [args.model]
    if args.enable_log_requests:
        request_logger = RequestLogger(max_log_len=args.max_log_len)
    else:
        request_logger = None
    base_model_paths = [
        BaseModelPath(name=name, model_path=args.model) for name in served_model_names
    ]
    supported_tasks = await engine_client.get_supported_tasks()
    logger.info("Supported tasks: %s", supported_tasks)
    # NOTE(review): old (non-await) call line left in by the diff; only the
    # awaited call with (engine_client, args) should survive.
    endpoint_registry = build_endpoint_registry(
    endpoint_registry = await build_endpoint_registry(
        engine_client=engine_client,
        args=args,
        # NOTE(review): these three keyword args belong to the removed
        # signature and are not accepted by the new async function.
        base_model_paths=base_model_paths,
        request_logger=request_logger,
        supported_tasks=supported_tasks,
    )
    tracker = BatchProgressTracker()
@@ -942,7 +836,7 @@ if __name__ == "__main__":
# to publish metrics at the /metrics endpoint.
if args.enable_metrics:
logger.info("Prometheus metrics enabled")
start_http_server(port=args.port, addr=args.url)
start_http_server(port=args.port, addr=args.host)
else:
logger.info("Prometheus metrics disabled")