feat(frontend): add --default-chat-template-kwargs CLI argument (#31343)

Signed-off-by: effortprogrammer <yhjhoward7@gmail.com>
This commit is contained in:
Hojin Yang
2025-12-30 12:38:47 +09:00
committed by GitHub
parent e54ee3ea33
commit dc837bc23e
7 changed files with 91 additions and 2 deletions

View File

@@ -204,6 +204,42 @@ The reasoning content is also available when both tool calling and the reasoning
For more examples, please refer to [examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py](../../examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py).
## Server-Level Default Chat Template Kwargs
You can set default `chat_template_kwargs` at the server level using the `--default-chat-template-kwargs` CLI argument. This is useful for configuring reasoning behavior across all requests without requiring clients to specify it in each request.
### Disabling Thinking Mode by Default
For models like Qwen3 where thinking is enabled by default, you can disable it server-wide:
```bash
vllm serve Qwen/Qwen3-8B \
--reasoning-parser qwen3 \
--default-chat-template-kwargs '{"enable_thinking": false}'
```
### Enabling Thinking Mode by Default
For models like IBM Granite 3.2 or DeepSeek-V3.1 where thinking is disabled by default, you can enable it server-wide:
```bash
vllm serve ibm-granite/granite-3.2-2b-instruct \
--reasoning-parser granite \
--default-chat-template-kwargs '{"thinking": true}'
```
### Request-Level Override
Request-level `chat_template_kwargs` always take priority over server defaults. For example, if the server is started with `enable_thinking=false`, a client can still enable it for a specific request:
```python
response = client.chat.completions.create(
model=model,
messages=messages,
extra_body={"chat_template_kwargs": {"enable_thinking": True}} # Overrides server default
)
```
## Limitations
- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).

View File

@@ -208,3 +208,36 @@ def test_middleware(serve_parser, cli_args, expected_middleware):
"""Ensure multiple middleware args are parsed properly"""
args = serve_parser.parse_args(args=cli_args)
assert args.middleware == expected_middleware
def test_default_chat_template_kwargs_parsing(serve_parser):
    """Verify --default-chat-template-kwargs parses a simple JSON object into a dict."""
    cli = ["--default-chat-template-kwargs", '{"enable_thinking": false}']
    parsed = serve_parser.parse_args(args=cli)
    # JSON false must come through as Python False, keyed by the original name.
    assert parsed.default_chat_template_kwargs == {"enable_thinking": False}
def test_default_chat_template_kwargs_complex(serve_parser):
    """Verify a multi-key JSON object (bool, str, int) round-trips through the parser."""
    raw = '{"enable_thinking": false, "custom_param": "value", "num": 42}'
    parsed = serve_parser.parse_args(args=["--default-chat-template-kwargs", raw])
    expected = {
        "enable_thinking": False,
        "custom_param": "value",
        "num": 42,
    }
    assert parsed.default_chat_template_kwargs == expected
def test_default_chat_template_kwargs_default_none(serve_parser):
    """Verify the option defaults to None when absent from the command line."""
    parsed = serve_parser.parse_args(args=[])
    # No flag supplied: the attribute must be exactly None, not an empty dict.
    assert parsed.default_chat_template_kwargs is None
def test_default_chat_template_kwargs_invalid_json(serve_parser):
    """Verify malformed JSON makes argparse exit (it reports a type error and raises SystemExit)."""
    bad_value = "not valid json"
    with pytest.raises(SystemExit):
        serve_parser.parse_args(args=["--default-chat-template-kwargs", bad_value])

View File

@@ -1081,6 +1081,7 @@ async def init_app_state(
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
default_chat_template_kwargs=args.default_chat_template_kwargs,
trust_request_chat_template=args.trust_request_chat_template,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,

View File

@@ -11,7 +11,7 @@ import json
import ssl
from collections.abc import Sequence
from dataclasses import field
from typing import Literal
from typing import Any, Literal
from pydantic.dataclasses import dataclass
@@ -114,6 +114,12 @@ class FrontendArgs:
"""Whether to trust the chat template provided in the request. If False,
the server will always use the chat template specified by `--chat-template`
or the ones from tokenizer."""
default_chat_template_kwargs: dict[str, Any] | None = None
"""Default keyword arguments to pass to the chat template renderer.
These will be merged with request-level chat_template_kwargs,
with request values taking precedence. Useful for setting default
behavior for reasoning models. Example: '{"enable_thinking": false}'
to disable thinking mode by default for Qwen3/DeepSeek models."""
response_role: str = "assistant"
"""The role name to return if `request.add_generation_prompt=true`."""
ssl_keyfile: str | None = None
@@ -216,6 +222,9 @@ class FrontendArgs:
del frontend_kwargs["allowed_methods"]["nargs"]
del frontend_kwargs["allowed_headers"]["nargs"]
# Special case: default_chat_template_kwargs needs json.loads type
frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads
# Special case: LoRA modules need custom parser action and
# optional_type(str)
frontend_kwargs["lora_modules"]["type"] = optional_type(str)

View File

@@ -468,6 +468,9 @@ async def run_batch(
reasoning_parser=args.structured_outputs_config.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
default_chat_template_kwargs=getattr(
args, "default_chat_template_kwargs", None
),
)
if "generate" in supported_tasks
else None

View File

@@ -6,7 +6,7 @@ import json
import time
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from typing import Final
from typing import Any, Final
import jinja2
import partial_json_parser
@@ -102,6 +102,7 @@ class OpenAIServingChat(OpenAIServing):
enable_force_include_usage: bool = False,
enable_log_outputs: bool = False,
log_error_stack: bool = False,
default_chat_template_kwargs: dict[str, Any] | None = None,
) -> None:
super().__init__(
engine_client=engine_client,
@@ -115,6 +116,7 @@ class OpenAIServingChat(OpenAIServing):
self.chat_template = chat_template
self.chat_template_content_format: Final = chat_template_content_format
self.trust_request_chat_template = trust_request_chat_template
self.default_chat_template_kwargs = default_chat_template_kwargs or {}
self.enable_log_outputs = enable_log_outputs
# set up logits processors
@@ -203,6 +205,7 @@ class OpenAIServingChat(OpenAIServing):
tool_dicts=None,
documents=None,
chat_template_kwargs=None,
default_chat_template_kwargs=self.default_chat_template_kwargs,
tool_parser=None,
add_special_tokens=False,
)
@@ -310,6 +313,7 @@ class OpenAIServingChat(OpenAIServing):
tool_dicts=tool_dicts,
documents=request.documents,
chat_template_kwargs=request.chat_template_kwargs,
default_chat_template_kwargs=self.default_chat_template_kwargs,
tool_parser=tool_parser,
add_special_tokens=request.add_special_tokens,
)

View File

@@ -1158,6 +1158,7 @@ class OpenAIServing:
tool_dicts: list[dict[str, Any]] | None = None,
documents: list[dict[str, str]] | None = None,
chat_template_kwargs: dict[str, Any] | None = None,
default_chat_template_kwargs: dict[str, Any] | None = None,
tool_parser: Callable[[TokenizerLike], ToolParser] | None = None,
add_special_tokens: bool = False,
) -> tuple[list[ConversationMessage], list[TokensPrompt]]:
@@ -1183,6 +1184,8 @@ class OpenAIServing:
tools=tool_dicts,
documents=documents,
)
if default_chat_template_kwargs:
_chat_template_kwargs.update(default_chat_template_kwargs)
_chat_template_kwargs.update(chat_template_kwargs or {})
request_prompt: str | list[int]