[Frontend] Use init_app_state and FrontendArgs in run_batch (#32967)

Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
Pooya Davoodi
2026-02-24 19:40:39 -08:00
committed by GitHub
parent dbf0da817a
commit e3b2324ec4
4 changed files with 652 additions and 350 deletions

View File

@@ -447,7 +447,7 @@ def test_metrics_exist_run_batch():
"--model",
"intfloat/multilingual-e5-small",
"--enable-metrics",
"--url",
"--host",
base_url,
"--port",
port,

View File

@@ -10,59 +10,361 @@ import pytest
from vllm.assets.audio import AudioAsset
from vllm.entrypoints.openai.run_batch import BatchRequestOutput
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
CHAT_MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-small"
RERANKER_MODEL_NAME = "BAAI/bge-reranker-v2-m3"
REASONING_MODEL_NAME = "Qwen/Qwen3-0.6B"
SPEECH_LARGE_MODEL_NAME = "openai/whisper-large-v3"
SPEECH_SMALL_MODEL_NAME = "openai/whisper-small"
# ruff: noqa: E501
INPUT_BATCH = (
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "NonExistModel", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {{"stream": "True", "model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
).format(MODEL_NAME)
INVALID_INPUT_BATCH = (
'{{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n'
'{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}'
).format(MODEL_NAME)
INPUT_EMBEDDING_BATCH = (
'{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}\n'
'{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}\n'
'{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}\n'
'{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}'
# Five chat-completion batch lines: two valid requests, one with an unknown
# model, one aimed at a bad endpoint URL, and one that requests streaming.
INPUT_BATCH = "\n".join(
    map(
        json.dumps,
        [
            {
                "custom_id": "request-1",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
            {
                "custom_id": "request-2",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are an unhelpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
            {
                "custom_id": "request-3",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "NonExistModel",
                    "messages": [
                        {"role": "system", "content": "You are an unhelpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
            {
                "custom_id": "request-4",
                "method": "POST",
                "url": "/bad_url",
                "body": {
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are an unhelpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
            {
                "custom_id": "request-5",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "stream": "True",
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are an unhelpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
        ],
    )
)
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
# Same shape as INPUT_BATCH, except the first line carries "invalid_field"
# instead of the required "custom_id" key.
INVALID_INPUT_BATCH = "\n".join(
    map(
        json.dumps,
        [
            {
                "invalid_field": "request-1",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
            {
                "custom_id": "request-2",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": CHAT_MODEL_NAME,
                    "messages": [
                        {"role": "system", "content": "You are an unhelpful assistant."},
                        {"role": "user", "content": "Hello world!"},
                    ],
                    "max_tokens": 1000,
                },
            },
        ],
    )
)
INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
# Embedding batch: three requests against the real embedding model plus one
# against an unknown model name.
INPUT_EMBEDDING_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": "/v1/embeddings",
            "body": {"model": model_name, "input": text},
        }
    )
    for idx, (model_name, text) in enumerate(
        [
            (EMBEDDING_MODEL_NAME, "You are a helpful assistant."),
            (EMBEDDING_MODEL_NAME, "You are an unhelpful assistant."),
            (EMBEDDING_MODEL_NAME, "Hello world!"),
            ("NonExistModel", "Hello world!"),
        ],
        start=1,
    )
)
INPUT_REASONING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Solve this math problem: 2+2=?"}]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "What is the capital of France?"}]}}"""
_SCORE_RERANK_DOCUMENTS = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris.",
]
# Score batch: the same query/document pair sent to both score endpoint
# aliases (/score and /v1/score).
INPUT_SCORE_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": score_url,
            "body": {
                "model": RERANKER_MODEL_NAME,
                "queries": "What is the capital of France?",
                "documents": _SCORE_RERANK_DOCUMENTS,
            },
        }
    )
    for idx, score_url in enumerate(["/score", "/v1/score"], start=1)
)
# Rerank batch: one identical request per rerank endpoint alias.
# Fix: the third line previously reused custom_id "request-2"; custom_ids
# within a batch are expected to be unique, so the /v2/rerank request is
# now "request-3".
INPUT_RERANK_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": rerank_url,
            "body": {
                "model": RERANKER_MODEL_NAME,
                "query": "What is the capital of France?",
                "documents": _SCORE_RERANK_DOCUMENTS,
            },
        }
    )
    for idx, rerank_url in enumerate(["/rerank", "/v1/rerank", "/v2/rerank"], start=1)
)
# Reasoning batch: two chat requests (no max_tokens cap) for the reasoning
# model, differing only in the user prompt.
INPUT_REASONING_BATCH = "\n".join(
    json.dumps(
        {
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": REASONING_MODEL_NAME,
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": user_prompt},
                ],
            },
        }
    )
    for idx, user_prompt in enumerate(
        ["Solve this math problem: 2+2=?", "What is the capital of France?"],
        start=1,
    )
)
# A valid but minimal base64-encoded WAV file (RIFF/WAVE header with a
# zero-length data chunk), used as an inline audio fixture for testing.
MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA="
INPUT_TRANSCRIPTION_BATCH = (
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/transcriptions", '
'"body": {{"model": "openai/whisper-large-v3", "file_url": "data:audio/wav;base64,{}", '
'"response_format": "json"}}}}\n'
).format(MINIMAL_WAV_BASE64)
json.dumps(
{
"custom_id": "request-1",
"method": "POST",
"url": "/v1/audio/transcriptions",
"body": {
"model": SPEECH_LARGE_MODEL_NAME,
"file_url": f"data:audio/wav;base64,{MINIMAL_WAV_BASE64}",
"response_format": "json",
},
}
)
+ "\n"
)
INPUT_TRANSCRIPTION_HTTP_BATCH = (
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/transcriptions", '
'"body": {{"model": "openai/whisper-large-v3", "file_url": "{}", '
'"response_format": "json"}}}}\n'
).format(AudioAsset("mary_had_lamb").url)
json.dumps(
{
"custom_id": "request-1",
"method": "POST",
"url": "/v1/audio/transcriptions",
"body": {
"model": SPEECH_LARGE_MODEL_NAME,
"file_url": AudioAsset("mary_had_lamb").url,
"response_format": "json",
},
}
)
+ "\n"
)
INPUT_TRANSLATION_BATCH = (
'{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/translations", '
'"body": {{"model": "openai/whisper-small", "file_url": "{}", '
'"response_format": "text", "language": "it", "to_language": "en", '
'"temperature": 0.0}}}}\n'
).format(AudioAsset("mary_had_lamb").url)
json.dumps(
{
"custom_id": "request-1",
"method": "POST",
"url": "/v1/audio/translations",
"body": {
"model": SPEECH_SMALL_MODEL_NAME,
"file_url": AudioAsset("mary_had_lamb").url,
"response_format": "text",
"language": "it",
"to_language": "en",
"temperature": 0.0,
},
}
)
+ "\n"
)
# OpenAI-style tool definition used by the tool-calling batch fixture:
# a single function taking a required "location" and an optional "unit".
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["location"],
        },
    },
}
# Single chat-completion request that must trigger a call to WEATHER_TOOL
# (tool_choice="required").
INPUT_TOOL_CALLING_BATCH = json.dumps(
    dict(
        custom_id="request-1",
        method="POST",
        url="/v1/chat/completions",
        body=dict(
            model=REASONING_MODEL_NAME,
            messages=[
                {"role": "user", "content": "What is the weather in San Francisco?"},
            ],
            tools=[WEATHER_TOOL],
            tool_choice="required",
            max_tokens=1000,
        ),
    )
)
def test_empty_file():
@@ -81,7 +383,7 @@ def test_empty_file():
"-o",
output_file.name,
"--model",
"intfloat/multilingual-e5-small",
EMBEDDING_MODEL_NAME,
],
)
proc.communicate()
@@ -108,7 +410,7 @@ def test_completions():
"-o",
output_file.name,
"--model",
MODEL_NAME,
CHAT_MODEL_NAME,
],
)
proc.communicate()
@@ -141,7 +443,7 @@ def test_completions_invalid_input():
"-o",
output_file.name,
"--model",
MODEL_NAME,
CHAT_MODEL_NAME,
],
)
proc.communicate()
@@ -165,7 +467,7 @@ def test_embeddings():
"-o",
output_file.name,
"--model",
"intfloat/multilingual-e5-small",
EMBEDDING_MODEL_NAME,
],
)
proc.communicate()
@@ -196,7 +498,7 @@ def test_score(input_batch):
"-o",
output_file.name,
"--model",
"BAAI/bge-reranker-v2-m3",
RERANKER_MODEL_NAME,
],
)
proc.communicate()
@@ -234,7 +536,7 @@ def test_reasoning_parser():
"-o",
output_file.name,
"--model",
"Qwen/Qwen3-0.6B",
REASONING_MODEL_NAME,
"--reasoning-parser",
"qwen3",
],
@@ -278,7 +580,7 @@ def test_transcription():
"-o",
output_file.name,
"--model",
"openai/whisper-large-v3",
SPEECH_LARGE_MODEL_NAME,
],
)
proc.communicate()
@@ -316,7 +618,7 @@ def test_transcription_http_url():
"-o",
output_file.name,
"--model",
"openai/whisper-large-v3",
SPEECH_LARGE_MODEL_NAME,
],
)
proc.communicate()
@@ -356,7 +658,7 @@ def test_translation():
"-o",
output_file.name,
"--model",
"openai/whisper-small",
SPEECH_SMALL_MODEL_NAME,
],
)
proc.communicate()
@@ -378,3 +680,69 @@ def test_translation():
translation_text = response_body["text"]
translation_text_lower = str(translation_text).strip().lower()
assert "mary" in translation_text_lower or "lamb" in translation_text_lower
def test_tool_calling():
    """
    Run a tool-calling batch through `vllm run-batch` and verify that every
    output line is schema-valid, error-free, and contains non-empty
    tool_calls targeting the declared weather tool.
    """
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_TOOL_CALLING_BATCH)
        input_file.flush()
        proc = subprocess.Popen(
            [
                "vllm",
                "run-batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                REASONING_MODEL_NAME,
                "--enable-auto-tool-choice",
                "--tool-call-parser",
                "hermes",
            ],
        )
        proc.communicate()
        proc.wait()
        assert proc.returncode == 0, f"{proc=}"

        raw_output = output_file.read()
        for record_line in raw_output.strip().split("\n"):
            if not record_line.strip():  # ignore blank lines in the output
                continue

            # Each line must round-trip through the OpenAI batch output
            # schema; validation raises on a malformed record.
            BatchRequestOutput.model_validate_json(record_line)

            record = json.loads(record_line)
            assert isinstance(record, dict)
            assert record["error"] is None

            # With tool_choice="required" the model must call a tool, so
            # the response body has to carry a non-empty tool_calls list.
            response_body = record["response"]["body"]
            assert response_body is not None
            msg = response_body["choices"][0]["message"]
            assert "tool_calls" in msg
            calls = msg.get("tool_calls")
            assert calls is not None
            assert isinstance(calls, list)
            assert len(calls) > 0

            # Every call must be a well-formed function call naming the
            # single tool we declared.
            for call in calls:
                assert "id" in call
                assert "type" in call
                assert call["type"] == "function"
                assert "function" in call
                assert "name" in call["function"]
                assert "arguments" in call["function"]
                assert call["function"]["name"] == "get_current_weather"

View File

@@ -67,7 +67,149 @@ class LoRAParserAction(argparse.Action):
@config
class FrontendArgs:
class BaseFrontendArgs:
"""Base arguments for the OpenAI-compatible frontend server.
This base class does not include host, port, and server-specific arguments
like SSL, CORS, and HTTP server settings. Those arguments are added by
the subclasses.
"""
lora_modules: list[LoRAModulePath] | None = None
"""LoRA modules configurations in either 'name=path' format or JSON format
or JSON list format. Example (old format): `'name=path'` Example (new
format): `{\"name\": \"name\", \"path\": \"lora_path\",
\"base_model_name\": \"id\"}`"""
chat_template: str | None = None
"""The file path to the chat template, or the template in single-line form
for the specified model."""
chat_template_content_format: ChatTemplateContentFormatOption = "auto"
"""The format to render message content within a chat template.
* "string" will render the content as a string. Example: `"Hello World"`
* "openai" will render the content as a list of dictionaries, similar to
OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
trust_request_chat_template: bool = False
"""Whether to trust the chat template provided in the request. If False,
the server will always use the chat template specified by `--chat-template`
or the ones from tokenizer."""
default_chat_template_kwargs: dict[str, Any] | None = None
"""Default keyword arguments to pass to the chat template renderer.
These will be merged with request-level chat_template_kwargs,
with request values taking precedence. Useful for setting default
behavior for reasoning models. Example: '{"enable_thinking": false}'
to disable thinking mode by default for Qwen3/DeepSeek models."""
response_role: str = "assistant"
"""The role name to return if `request.add_generation_prompt=true`."""
return_tokens_as_token_ids: bool = False
"""When `--max-logprobs` is specified, represents single tokens as
strings of the form 'token_id:{token_id}' so that tokens that are not
JSON-encodable can be identified."""
disable_frontend_multiprocessing: bool = False
"""If specified, will run the OpenAI frontend server in the same process as
the model serving engine."""
enable_auto_tool_choice: bool = False
"""Enable auto tool choice for supported models. Use `--tool-call-parser`
to specify which parser to use."""
exclude_tools_when_tool_choice_none: bool = False
"""If specified, exclude tool definitions in prompts when
tool_choice='none'."""
tool_call_parser: str | None = None
"""Select the tool call parser depending on the model that you're using.
This is used to parse the model-generated tool call into OpenAI API format.
Required for `--enable-auto-tool-choice`. You can choose any option from
the built-in parsers or register a plugin via `--tool-parser-plugin`."""
tool_parser_plugin: str = ""
"""Special the tool parser plugin write to parse the model-generated tool
into OpenAI API format, the name register in this plugin can be used in
`--tool-call-parser`."""
tool_server: str | None = None
"""Comma-separated list of host:port pairs (IPv4, IPv6, or hostname).
Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo
purpose."""
log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH
"""Path to logging config JSON file for both vllm and uvicorn"""
max_log_len: int | None = None
"""Max number of prompt characters or prompt ID numbers being printed in
log. The default of None means unlimited."""
enable_prompt_tokens_details: bool = False
"""If set to True, enable prompt_tokens_details in usage."""
enable_server_load_tracking: bool = False
"""If set to True, enable tracking server_load_metrics in the app state."""
enable_force_include_usage: bool = False
"""If set to True, including usage on every request."""
enable_tokenizer_info_endpoint: bool = False
"""Enable the `/tokenizer_info` endpoint. May expose chat
templates and other tokenizer configuration."""
enable_log_outputs: bool = False
"""If set to True, log model outputs (generations).
Requires --enable-log-requests."""
enable_log_deltas: bool = True
"""If set to False, output deltas will not be logged. Relevant only if
--enable-log-outputs is set.
"""
log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
"""If set to True, log the stack trace of error responses"""
tokens_only: bool = False
"""
If set to True, only enable the Tokens In<>Out endpoint.
This is intended for use in a Disaggregated Everything setup.
"""
@classmethod
def _customize_cli_kwargs(
cls,
frontend_kwargs: dict[str, Any],
) -> dict[str, Any]:
"""Customize argparse kwargs before arguments are registered.
Subclasses should override this and call
``super()._customize_cli_kwargs(frontend_kwargs)`` first.
"""
# Special case: default_chat_template_kwargs needs json.loads type
frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads
# Special case: LoRA modules need custom parser action and
# optional_type(str)
frontend_kwargs["lora_modules"]["type"] = optional_type(str)
frontend_kwargs["lora_modules"]["action"] = LoRAParserAction
# Special case: Tool call parser shows built-in options.
valid_tool_parsers = list(ToolParserManager.list_registered())
parsers_str = ",".join(valid_tool_parsers)
frontend_kwargs["tool_call_parser"]["metavar"] = (
f"{{{parsers_str}}} or name registered in --tool-parser-plugin"
)
return frontend_kwargs
@classmethod
def add_cli_args(cls, parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"""Register CLI arguments for this frontend class.
Subclasses should override ``_customize_cli_kwargs`` instead of
this method so that base-class postprocessing is always applied.
"""
from vllm.engine.arg_utils import get_kwargs
frontend_kwargs = get_kwargs(cls)
frontend_kwargs = cls._customize_cli_kwargs(frontend_kwargs)
group_name = cls.__name__.replace("Args", "")
frontend_group = parser.add_argument_group(
title=group_name,
description=cls.__doc__,
)
for key, value in frontend_kwargs.items():
extra_flags = value.pop("flags", [])
frontend_group.add_argument(
*extra_flags, f"--{key.replace('_', '-')}", **value
)
return parser
@config
class FrontendArgs(BaseFrontendArgs):
"""Arguments for the OpenAI-compatible frontend server."""
host: str | None = None
@@ -99,32 +241,6 @@ class FrontendArgs:
api_key: list[str] | None = None
"""If provided, the server will require one of these keys to be presented in
the header."""
lora_modules: list[LoRAModulePath] | None = None
"""LoRA modules configurations in either 'name=path' format or JSON format
or JSON list format. Example (old format): `'name=path'` Example (new
format): `{\"name\": \"name\", \"path\": \"lora_path\",
\"base_model_name\": \"id\"}`"""
chat_template: str | None = None
"""The file path to the chat template, or the template in single-line form
for the specified model."""
chat_template_content_format: ChatTemplateContentFormatOption = "auto"
"""The format to render message content within a chat template.
* "string" will render the content as a string. Example: `"Hello World"`
* "openai" will render the content as a list of dictionaries, similar to
OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
trust_request_chat_template: bool = False
"""Whether to trust the chat template provided in the request. If False,
the server will always use the chat template specified by `--chat-template`
or the ones from tokenizer."""
default_chat_template_kwargs: dict[str, Any] | None = None
"""Default keyword arguments to pass to the chat template renderer.
These will be merged with request-level chat_template_kwargs,
with request values taking precedence. Useful for setting default
behavior for reasoning models. Example: '{"enable_thinking": false}'
to disable thinking mode by default for Qwen3/DeepSeek models."""
response_role: str = "assistant"
"""The role name to return if `request.add_generation_prompt=true`."""
ssl_keyfile: str | None = None
"""The file path to the SSL key file."""
ssl_certfile: str | None = None
@@ -146,81 +262,28 @@ class FrontendArgs:
is provided, vLLM will add it to the server using
`@app.middleware('http')`. If a class is provided, vLLM will
add it to the server using `app.add_middleware()`."""
return_tokens_as_token_ids: bool = False
"""When `--max-logprobs` is specified, represents single tokens as
strings of the form 'token_id:{token_id}' so that tokens that are not
JSON-encodable can be identified."""
disable_frontend_multiprocessing: bool = False
"""If specified, will run the OpenAI frontend server in the same process as
the model serving engine."""
enable_request_id_headers: bool = False
"""If specified, API server will add X-Request-Id header to responses."""
enable_auto_tool_choice: bool = False
"""Enable auto tool choice for supported models. Use `--tool-call-parser`
to specify which parser to use."""
exclude_tools_when_tool_choice_none: bool = False
"""If specified, exclude tool definitions in prompts when
tool_choice='none'."""
tool_call_parser: str | None = None
"""Select the tool call parser depending on the model that you're using.
This is used to parse the model-generated tool call into OpenAI API format.
Required for `--enable-auto-tool-choice`. You can choose any option from
the built-in parsers or register a plugin via `--tool-parser-plugin`."""
tool_parser_plugin: str = ""
"""Special the tool parser plugin write to parse the model-generated tool
into OpenAI API format, the name register in this plugin can be used in
`--tool-call-parser`."""
tool_server: str | None = None
"""Comma-separated list of host:port pairs (IPv4, IPv6, or hostname).
Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo
purpose."""
log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH
"""Path to logging config JSON file for both vllm and uvicorn"""
max_log_len: int | None = None
"""Max number of prompt characters or prompt ID numbers being printed in
log. The default of None means unlimited."""
disable_fastapi_docs: bool = False
"""Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."""
enable_prompt_tokens_details: bool = False
"""If set to True, enable prompt_tokens_details in usage."""
enable_server_load_tracking: bool = False
"""If set to True, enable tracking server_load_metrics in the app state."""
enable_force_include_usage: bool = False
"""If set to True, including usage on every request."""
enable_tokenizer_info_endpoint: bool = False
"""Enable the `/tokenizer_info` endpoint. May expose chat
templates and other tokenizer configuration."""
enable_log_outputs: bool = False
"""If set to True, log model outputs (generations).
Requires --enable-log-requests."""
enable_log_deltas: bool = True
"""If set to False, output deltas will not be logged. Relevant only if
--enable-log-outputs is set.
"""
h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
"""Maximum size (bytes) of an incomplete HTTP event (header or body) for
h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB)."""
h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT
"""Maximum number of HTTP headers allowed in a request for h11 parser.
Helps mitigate header abuse. Default: 256."""
log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
"""If set to True, log the stack trace of error responses"""
tokens_only: bool = False
"""
If set to True, only enable the Tokens In<>Out endpoint.
This is intended for use in a Disaggregated Everything setup.
"""
enable_offline_docs: bool = False
"""
Enable offline FastAPI documentation for air-gapped environments.
Uses vendored static assets bundled with vLLM.
"""
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
from vllm.engine.arg_utils import get_kwargs
frontend_kwargs = get_kwargs(FrontendArgs)
@classmethod
def _customize_cli_kwargs(
cls,
frontend_kwargs: dict[str, Any],
) -> dict[str, Any]:
frontend_kwargs = super()._customize_cli_kwargs(frontend_kwargs)
# Special case: allowed_origins, allowed_methods, allowed_headers all
# need json.loads type
@@ -232,14 +295,6 @@ class FrontendArgs:
del frontend_kwargs["allowed_methods"]["nargs"]
del frontend_kwargs["allowed_headers"]["nargs"]
# Special case: default_chat_template_kwargs needs json.loads type
frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads
# Special case: LoRA modules need custom parser action and
# optional_type(str)
frontend_kwargs["lora_modules"]["type"] = optional_type(str)
frontend_kwargs["lora_modules"]["action"] = LoRAParserAction
# Special case: Middleware needs to append action
frontend_kwargs["middleware"]["action"] = "append"
frontend_kwargs["middleware"]["type"] = str
@@ -252,22 +307,7 @@ class FrontendArgs:
if "nargs" in frontend_kwargs["disable_access_log_for_endpoints"]:
del frontend_kwargs["disable_access_log_for_endpoints"]["nargs"]
# Special case: Tool call parser shows built-in options.
valid_tool_parsers = list(ToolParserManager.list_registered())
parsers_str = ",".join(valid_tool_parsers)
frontend_kwargs["tool_call_parser"]["metavar"] = (
f"{{{parsers_str}}} or name registered in --tool-parser-plugin"
)
frontend_group = parser.add_argument_group(
title="Frontend",
description=FrontendArgs.__doc__,
)
for key, value in frontend_kwargs.items():
frontend_group.add_argument(f"--{key.replace('_', '-')}", **value)
return parser
return frontend_kwargs
def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:

View File

@@ -3,6 +3,7 @@
import asyncio
import base64
import sys
import tempfile
from argparse import Namespace
from collections.abc import Awaitable, Callable
@@ -17,23 +18,23 @@ from fastapi import UploadFile
from prometheus_client import start_http_server
from pydantic import Field, TypeAdapter, field_validator, model_validator
from pydantic_core.core_schema import ValidationInfo
from starlette.datastructures import State
from tqdm import tqdm
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
from vllm.config import config
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.api_server import init_app_state
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
)
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.cli_args import BaseFrontendArgs
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
OpenAIBaseModel,
)
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.speech_to_text.protocol import (
TranscriptionRequest,
TranscriptionResponse,
@@ -42,25 +43,18 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
TranslationResponse,
TranslationResponseVerbose,
)
from vllm.entrypoints.openai.speech_to_text.serving import (
OpenAIServingTranscription,
OpenAIServingTranslation,
)
from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingRequest,
EmbeddingResponse,
)
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
from vllm.entrypoints.pooling.score.protocol import (
RerankRequest,
RerankResponse,
ScoreRequest,
ScoreResponse,
)
from vllm.entrypoints.pooling.score.serving import ServingScores
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.tasks import SupportedTask
from vllm.utils import random_uuid
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.version import __version__ as VLLM_VERSION
@@ -219,87 +213,73 @@ class BatchRequestOutput(OpenAIBaseModel):
error: Any | None
@config
class BatchFrontendArgs(BaseFrontendArgs):
    """Arguments for the batch runner frontend."""

    # Batch-runner-specific options; the shared frontend options are
    # inherited from BaseFrontendArgs.
    input_file: str | None = None
    """The path or url to a single input file. Currently supports local file
    paths, or the http protocol (http or https). If a URL is specified,
    the file should be available via HTTP GET."""
    output_file: str | None = None
    """The path or url to a single output file. Currently supports
    local file paths, or web (http or https) urls. If a URL is specified,
    the file should be available via HTTP PUT."""
    output_tmp_dir: str | None = None
    """The directory to store the output file before uploading it
    to the output URL."""
    enable_metrics: bool = False
    """Enable Prometheus metrics"""
    host: str | None = None
    """Host name for the Prometheus metrics server
    (only needed if enable-metrics is set)."""
    port: int = 8000
    """Port number for the Prometheus metrics server
    (only needed if enable-metrics is set)."""
    url: str = "0.0.0.0"
    """[DEPRECATED] Host name for the Prometheus metrics server
    (only needed if enable-metrics is set). Use --host instead."""

    @classmethod
    def _customize_cli_kwargs(
        cls,
        frontend_kwargs: dict[str, Any],
    ) -> dict[str, Any]:
        """Adjust argparse kwargs for the batch-specific options.

        Adds the ``-i``/``-o`` short flags, marks the input/output file
        arguments as required, and marks ``--url`` as deprecated in favor
        of ``--host``.
        """
        # Apply the base-class customizations first (chat-template kwargs,
        # LoRA parser action, tool-call-parser metavar).
        frontend_kwargs = super()._customize_cli_kwargs(frontend_kwargs)
        frontend_kwargs["input_file"]["flags"] = ["-i"]
        frontend_kwargs["input_file"]["required"] = True
        frontend_kwargs["output_file"]["flags"] = ["-o"]
        frontend_kwargs["output_file"]["required"] = True
        frontend_kwargs["enable_metrics"]["action"] = "store_true"
        # NOTE(review): relies on argparse's `deprecated` add_argument
        # kwarg — available from Python 3.13; confirm the minimum
        # supported interpreter version.
        frontend_kwargs["url"]["deprecated"] = True
        return frontend_kwargs
def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """Register all CLI arguments for the batch runner on ``parser``.

    Frontend options (input/output files, metrics host/port, usage
    reporting, ...) are declared on ``BatchFrontendArgs`` and engine
    options on ``AsyncEngineArgs``; both classes install themselves via
    ``add_cli_args``. Registering any of those options manually here as
    well would make argparse raise a conflict for every duplicated
    option string (e.g. ``-i/--input-file``, ``--enable-metrics``).

    Args:
        parser: The parser to populate.

    Returns:
        The same parser, with all arguments registered.
    """
    parser = BatchFrontendArgs.add_cli_args(parser)
    parser = AsyncEngineArgs.add_cli_args(parser)
    return parser
def parse_args():
    """Parse batch-runner CLI arguments, honoring the deprecated ``--url``.

    ``--url`` used to name the Prometheus metrics host; it now maps onto
    ``--host`` unless ``--host`` was itself given explicitly. The stray
    early ``return make_arg_parser(parser).parse_args()`` that previously
    preceded the shim made the whole back-compat branch dead code.

    Returns:
        The parsed ``argparse`` namespace, with ``args.host`` populated
        from ``args.url`` when only the deprecated flag was used.
    """
    parser = FlexibleArgumentParser(description="vLLM OpenAI-Compatible batch runner.")
    args = make_arg_parser(parser).parse_args()

    # Backward compatibility: if --url was passed explicitly (and --host
    # was not), copy its value into args.host and warn once. We inspect
    # sys.argv because argparse alone cannot distinguish "defaulted" from
    # "explicitly set to the default value".
    url_explicit = any(arg == "--url" or arg.startswith("--url=") for arg in sys.argv)
    host_explicit = any(
        arg == "--host" or arg.startswith("--host=") for arg in sys.argv
    )
    if url_explicit and hasattr(args, "url") and not host_explicit:
        args.host = args.url
        logger.warning_once(
            "Using --url for metrics is deprecated. Please use --host instead."
        )
    return args
# explicitly use pure text format, with a newline at the end
@@ -671,12 +651,9 @@ def make_transcription_wrapper(is_translation: bool) -> WrapperFn:
return wrapper
# NOTE(review): the two signature lines below are the pre- and post-refactor
# versions rendered together by the diff (see the @@ hunk markers); only the
# async variant taking (engine_client, args) should survive.
def build_endpoint_registry(
async def build_endpoint_registry(
    engine_client: EngineClient,
    args: Namespace,
    # NOTE(review): the three parameters below belong to the removed
    # signature; the async version derives them internally instead.
    base_model_paths: list[BaseModelPath],
    request_logger: RequestLogger | None,
    supported_tasks: tuple[SupportedTask, ...],
) -> dict[str, dict[str, Any]]:
    """
    Build the endpoint registry with all serving objects and handler configurations.
@@ -684,90 +661,27 @@ def build_endpoint_registry(
    Args:
        engine_client: The engine client
        args: Command line arguments
        base_model_paths: List of base model paths
        request_logger: Optional request logger
        supported_tasks: Tuple of supported tasks
    Returns:
        Dictionary mapping endpoint keys to their configurations
    """
    # NOTE(review): model_config is only read by the superseded manual
    # construction below; the init_app_state path does not need it.
    model_config = engine_client.model_config
    # Task discovery is awaited here in the new version, which is why the
    # function became async.
    supported_tasks = await engine_client.get_supported_tasks()
    logger.info("Supported tasks: %s", supported_tasks)
    # NOTE(review): from here down to the init_app_state call, the
    # OpenAIServingModels / OpenAIServingChat construction is the removed
    # pre-refactor code; init_app_state builds these objects now.
    # Create the openai serving objects.
    openai_serving_models = OpenAIServingModels(
        engine_client=engine_client,
        base_model_paths=base_model_paths,
        lora_modules=None,
    )
    # Create a state object to hold serving objects
    state = State()
    openai_serving_chat = (
        OpenAIServingChat(
            engine_client,
            openai_serving_models,
            args.response_role,
            request_logger=request_logger,
            chat_template=None,
            chat_template_content_format="auto",
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
            default_chat_template_kwargs=getattr(
                args, "default_chat_template_kwargs", None
            ),
        )
        if "generate" in supported_tasks
        else None
    )
    # Initialize all serving objects using init_app_state
    # This provides full functionality including chat template processing,
    # LoRA support, tool servers, etc.
    await init_app_state(engine_client, state, args, supported_tasks)
    # NOTE(review): the embedding/score/transcription/translation
    # constructions below (down to the getattr block) are likewise the
    # removed pre-refactor code, superseded by init_app_state above.
    openai_serving_embedding = (
        OpenAIServingEmbedding(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
            chat_template=None,
            chat_template_content_format="auto",
        )
        if "embed" in supported_tasks
        else None
    )
    enable_serving_reranking = (
        "classify" in supported_tasks
        and getattr(model_config.hf_config, "num_labels", 0) == 1
    )
    openai_serving_scores = (
        ServingScores(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
            score_template=None,
        )
        if ("embed" in supported_tasks or enable_serving_reranking)
        else None
    )
    openai_serving_transcription = (
        OpenAIServingTranscription(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
            enable_force_include_usage=args.enable_force_include_usage,
        )
        if "transcription" in supported_tasks
        else None
    )
    openai_serving_translation = (
        OpenAIServingTranslation(
            engine_client,
            openai_serving_models,
            request_logger=request_logger,
            enable_force_include_usage=args.enable_force_include_usage,
        )
        if "transcription" in supported_tasks
        else None
    )
    # Get serving objects from state (defaulting to None if not set)
    openai_serving_chat = getattr(state, "openai_serving_chat", None)
    openai_serving_embedding = getattr(state, "openai_serving_embedding", None)
    openai_serving_scores = getattr(state, "openai_serving_scores", None)
    openai_serving_transcription = getattr(state, "openai_serving_transcription", None)
    openai_serving_translation = getattr(state, "openai_serving_translation", None)
    # Registry of endpoint configurations
    # NOTE(review): the registry dict contents are elided by the next @@
    # hunk marker and are not visible in this view.
    endpoint_registry: dict[str, dict[str, Any]] = {
@@ -845,29 +759,9 @@ async def run_batch(
    engine_client: EngineClient,
    args: Namespace,
) -> None:
    # NOTE(review): the served-model-name / request-logger / base-model-path
    # setup below is the pre-refactor body shown alongside the new code by
    # the diff rendering; that setup now happens inside
    # build_endpoint_registry (via init_app_state) and should be removed.
    if args.served_model_name is not None:
        served_model_names = args.served_model_name
    else:
        served_model_names = [args.model]
    if args.enable_log_requests:
        request_logger = RequestLogger(max_log_len=args.max_log_len)
    else:
        request_logger = None
    base_model_paths = [
        BaseModelPath(name=name, model_path=args.model) for name in served_model_names
    ]
    supported_tasks = await engine_client.get_supported_tasks()
    logger.info("Supported tasks: %s", supported_tasks)
    # NOTE(review): old (non-await) call line left in by the diff; only the
    # awaited call with (engine_client, args) should survive.
    endpoint_registry = build_endpoint_registry(
    endpoint_registry = await build_endpoint_registry(
        engine_client=engine_client,
        args=args,
        # NOTE(review): these three keyword args belong to the removed
        # signature and are not accepted by the new async function.
        base_model_paths=base_model_paths,
        request_logger=request_logger,
        supported_tasks=supported_tasks,
    )
    tracker = BatchProgressTracker()
@@ -942,7 +836,7 @@ if __name__ == "__main__":
# to publish metrics at the /metrics endpoint.
if args.enable_metrics:
logger.info("Prometheus metrics enabled")
start_http_server(port=args.port, addr=args.url)
start_http_server(port=args.port, addr=args.host)
else:
logger.info("Prometheus metrics disabled")