diff --git a/tests/entrypoints/instrumentator/test_metrics.py b/tests/entrypoints/instrumentator/test_metrics.py index 68eefcf12..19d1234c3 100644 --- a/tests/entrypoints/instrumentator/test_metrics.py +++ b/tests/entrypoints/instrumentator/test_metrics.py @@ -447,7 +447,7 @@ def test_metrics_exist_run_batch(): "--model", "intfloat/multilingual-e5-small", "--enable-metrics", - "--url", + "--host", base_url, "--port", port, diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 26b34a924..cf7e2a7b0 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -10,59 +10,361 @@ import pytest from vllm.assets.audio import AudioAsset from vllm.entrypoints.openai.run_batch import BatchRequestOutput -MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" +CHAT_MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" +EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-small" +RERANKER_MODEL_NAME = "BAAI/bge-reranker-v2-m3" +REASONING_MODEL_NAME = "Qwen/Qwen3-0.6B" +SPEECH_LARGE_MODEL_NAME = "openai/whisper-large-v3" +SPEECH_SMALL_MODEL_NAME = "openai/whisper-small" -# ruff: noqa: E501 -INPUT_BATCH = ( - '{{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n' - '{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n' - '{{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "NonExistModel", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n' - 
'{{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n' - '{{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {{"stream": "True", "model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}' -).format(MODEL_NAME) - -INVALID_INPUT_BATCH = ( - '{{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are a helpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}\n' - '{{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {{"model": "{0}", "messages": [{{"role": "system", "content": "You are an unhelpful assistant."}},{{"role": "user", "content": "Hello world!"}}],"max_tokens": 1000}}}}' -).format(MODEL_NAME) - -INPUT_EMBEDDING_BATCH = ( - '{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}\n' - '{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}\n' - '{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}\n' - '{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}' +INPUT_BATCH = "\n".join( + json.dumps(req) + for req in [ + { + "custom_id": "request-1", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "model": CHAT_MODEL_NAME, + "messages": [ + { + "role": "system", + 
"content": "You are a helpful assistant.", + }, + {"role": "user", "content": "Hello world!"}, + ], + "max_tokens": 1000, + }, + }, + { + "custom_id": "request-2", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "model": CHAT_MODEL_NAME, + "messages": [ + { + "role": "system", + "content": "You are an unhelpful assistant.", + }, + {"role": "user", "content": "Hello world!"}, + ], + "max_tokens": 1000, + }, + }, + { + "custom_id": "request-3", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "model": "NonExistModel", + "messages": [ + { + "role": "system", + "content": "You are an unhelpful assistant.", + }, + {"role": "user", "content": "Hello world!"}, + ], + "max_tokens": 1000, + }, + }, + { + "custom_id": "request-4", + "method": "POST", + "url": "/bad_url", + "body": { + "model": CHAT_MODEL_NAME, + "messages": [ + { + "role": "system", + "content": "You are an unhelpful assistant.", + }, + {"role": "user", "content": "Hello world!"}, + ], + "max_tokens": 1000, + }, + }, + { + "custom_id": "request-5", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "stream": "True", + "model": CHAT_MODEL_NAME, + "messages": [ + { + "role": "system", + "content": "You are an unhelpful assistant.", + }, + {"role": "user", "content": "Hello world!"}, + ], + "max_tokens": 1000, + }, + }, + ] ) -INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "queries": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" +INVALID_INPUT_BATCH = "\n".join( + json.dumps(req) + for req in [ + { + "invalid_field": "request-1", + "method": "POST", + "url": 
"/v1/chat/completions", + "body": { + "model": CHAT_MODEL_NAME, + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello world!"}, + ], + "max_tokens": 1000, + }, + }, + { + "custom_id": "request-2", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "model": CHAT_MODEL_NAME, + "messages": [ + {"role": "system", "content": "You are an unhelpful assistant."}, + {"role": "user", "content": "Hello world!"}, + ], + "max_tokens": 1000, + }, + }, + ] +) -INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} -{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" +INPUT_EMBEDDING_BATCH = "\n".join( + json.dumps(req) + for req in [ + { + "custom_id": "request-1", + "method": "POST", + "url": "/v1/embeddings", + "body": { + "model": EMBEDDING_MODEL_NAME, + "input": "You are a helpful assistant.", + }, + }, + { + "custom_id": "request-2", + "method": "POST", + "url": "/v1/embeddings", + "body": { + "model": EMBEDDING_MODEL_NAME, + "input": "You are an unhelpful assistant.", + }, + }, + { + "custom_id": "request-3", + "method": "POST", + "url": "/v1/embeddings", + "body": { + "model": EMBEDDING_MODEL_NAME, + "input": "Hello world!", + }, + }, + { + "custom_id": "request-4", + "method": "POST", + "url": "/v1/embeddings", + "body": { + "model": "NonExistModel", + 
"input": "Hello world!", + }, + }, + ] +) -INPUT_REASONING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Solve this math problem: 2+2=?"}]}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "What is the capital of France?"}]}}""" +_SCORE_RERANK_DOCUMENTS = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", +] + +INPUT_SCORE_BATCH = "\n".join( + json.dumps(req) + for req in [ + { + "custom_id": "request-1", + "method": "POST", + "url": "/score", + "body": { + "model": RERANKER_MODEL_NAME, + "queries": "What is the capital of France?", + "documents": _SCORE_RERANK_DOCUMENTS, + }, + }, + { + "custom_id": "request-2", + "method": "POST", + "url": "/v1/score", + "body": { + "model": RERANKER_MODEL_NAME, + "queries": "What is the capital of France?", + "documents": _SCORE_RERANK_DOCUMENTS, + }, + }, + ] +) + +INPUT_RERANK_BATCH = "\n".join( + json.dumps(req) + for req in [ + { + "custom_id": "request-1", + "method": "POST", + "url": "/rerank", + "body": { + "model": RERANKER_MODEL_NAME, + "query": "What is the capital of France?", + "documents": _SCORE_RERANK_DOCUMENTS, + }, + }, + { + "custom_id": "request-2", + "method": "POST", + "url": "/v1/rerank", + "body": { + "model": RERANKER_MODEL_NAME, + "query": "What is the capital of France?", + "documents": _SCORE_RERANK_DOCUMENTS, + }, + }, + { + "custom_id": "request-2", + "method": "POST", + "url": "/v2/rerank", + "body": { + "model": RERANKER_MODEL_NAME, + "query": "What is the capital of France?", + "documents": _SCORE_RERANK_DOCUMENTS, + }, + }, + ] +) + +INPUT_REASONING_BATCH = "\n".join( + json.dumps(req) + for req in [ + { + "custom_id": 
"request-1", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "model": REASONING_MODEL_NAME, + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Solve this math problem: 2+2=?"}, + ], + }, + }, + { + "custom_id": "request-2", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "model": REASONING_MODEL_NAME, + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + ], + }, + }, + ] +) -# This is a valid but minimal audio file for testing MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=" INPUT_TRANSCRIPTION_BATCH = ( - '{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/transcriptions", ' - '"body": {{"model": "openai/whisper-large-v3", "file_url": "data:audio/wav;base64,{}", ' - '"response_format": "json"}}}}\n' -).format(MINIMAL_WAV_BASE64) + json.dumps( + { + "custom_id": "request-1", + "method": "POST", + "url": "/v1/audio/transcriptions", + "body": { + "model": SPEECH_LARGE_MODEL_NAME, + "file_url": f"data:audio/wav;base64,{MINIMAL_WAV_BASE64}", + "response_format": "json", + }, + } + ) + + "\n" +) INPUT_TRANSCRIPTION_HTTP_BATCH = ( - '{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/transcriptions", ' - '"body": {{"model": "openai/whisper-large-v3", "file_url": "{}", ' - '"response_format": "json"}}}}\n' -).format(AudioAsset("mary_had_lamb").url) + json.dumps( + { + "custom_id": "request-1", + "method": "POST", + "url": "/v1/audio/transcriptions", + "body": { + "model": SPEECH_LARGE_MODEL_NAME, + "file_url": AudioAsset("mary_had_lamb").url, + "response_format": "json", + }, + } + ) + + "\n" +) INPUT_TRANSLATION_BATCH = ( - '{{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/translations", ' - '"body": {{"model": "openai/whisper-small", "file_url": "{}", ' - '"response_format": "text", "language": "it", 
"to_language": "en", ' - '"temperature": 0.0}}}}\n' -).format(AudioAsset("mary_had_lamb").url) + json.dumps( + { + "custom_id": "request-1", + "method": "POST", + "url": "/v1/audio/translations", + "body": { + "model": SPEECH_SMALL_MODEL_NAME, + "file_url": AudioAsset("mary_had_lamb").url, + "response_format": "text", + "language": "it", + "to_language": "en", + "temperature": 0.0, + }, + } + ) + + "\n" +) + +WEATHER_TOOL = { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["location"], + }, + }, +} + +INPUT_TOOL_CALLING_BATCH = json.dumps( + { + "custom_id": "request-1", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "model": REASONING_MODEL_NAME, + "messages": [ + {"role": "user", "content": "What is the weather in San Francisco?"}, + ], + "tools": [WEATHER_TOOL], + "tool_choice": "required", + "max_tokens": 1000, + }, + } +) def test_empty_file(): @@ -81,7 +383,7 @@ def test_empty_file(): "-o", output_file.name, "--model", - "intfloat/multilingual-e5-small", + EMBEDDING_MODEL_NAME, ], ) proc.communicate() @@ -108,7 +410,7 @@ def test_completions(): "-o", output_file.name, "--model", - MODEL_NAME, + CHAT_MODEL_NAME, ], ) proc.communicate() @@ -141,7 +443,7 @@ def test_completions_invalid_input(): "-o", output_file.name, "--model", - MODEL_NAME, + CHAT_MODEL_NAME, ], ) proc.communicate() @@ -165,7 +467,7 @@ def test_embeddings(): "-o", output_file.name, "--model", - "intfloat/multilingual-e5-small", + EMBEDDING_MODEL_NAME, ], ) proc.communicate() @@ -196,7 +498,7 @@ def test_score(input_batch): "-o", output_file.name, "--model", - "BAAI/bge-reranker-v2-m3", + RERANKER_MODEL_NAME, ], ) proc.communicate() @@ 
-234,7 +536,7 @@ def test_reasoning_parser(): "-o", output_file.name, "--model", - "Qwen/Qwen3-0.6B", + REASONING_MODEL_NAME, "--reasoning-parser", "qwen3", ], @@ -278,7 +580,7 @@ def test_transcription(): "-o", output_file.name, "--model", - "openai/whisper-large-v3", + SPEECH_LARGE_MODEL_NAME, ], ) proc.communicate() @@ -316,7 +618,7 @@ def test_transcription_http_url(): "-o", output_file.name, "--model", - "openai/whisper-large-v3", + SPEECH_LARGE_MODEL_NAME, ], ) proc.communicate() @@ -356,7 +658,7 @@ def test_translation(): "-o", output_file.name, "--model", - "openai/whisper-small", + SPEECH_SMALL_MODEL_NAME, ], ) proc.communicate() @@ -378,3 +680,69 @@ def test_translation(): translation_text = response_body["text"] translation_text_lower = str(translation_text).strip().lower() assert "mary" in translation_text_lower or "lamb" in translation_text_lower + + +def test_tool_calling(): + """ + Test that tool calling works correctly in run_batch. + Verifies that requests with tools return tool_calls in the response. + """ + with ( + tempfile.NamedTemporaryFile("w") as input_file, + tempfile.NamedTemporaryFile("r") as output_file, + ): + input_file.write(INPUT_TOOL_CALLING_BATCH) + input_file.flush() + proc = subprocess.Popen( + [ + "vllm", + "run-batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + REASONING_MODEL_NAME, + "--enable-auto-tool-choice", + "--tool-call-parser", + "hermes", + ], + ) + proc.communicate() + proc.wait() + assert proc.returncode == 0, f"{proc=}" + + contents = output_file.read() + for line in contents.strip().split("\n"): + if not line.strip(): # Skip empty lines + continue + # Ensure that the output format conforms to the openai api. + # Validation should throw if the schema is wrong. + BatchRequestOutput.model_validate_json(line) + + # Ensure that there is no error in the response. 
+ line_dict = json.loads(line) + assert isinstance(line_dict, dict) + assert line_dict["error"] is None + + # Check that tool_calls are present in the response + # With tool_choice="required", the model must call a tool + response_body = line_dict["response"]["body"] + assert response_body is not None + message = response_body["choices"][0]["message"] + assert "tool_calls" in message + tool_calls = message.get("tool_calls") + # With tool_choice="required", tool_calls must be present and non-empty + assert tool_calls is not None + assert isinstance(tool_calls, list) + assert len(tool_calls) > 0 + # Verify tool_calls have the expected structure + for tool_call in tool_calls: + assert "id" in tool_call + assert "type" in tool_call + assert tool_call["type"] == "function" + assert "function" in tool_call + assert "name" in tool_call["function"] + assert "arguments" in tool_call["function"] + # Verify the tool name matches our tool definition + assert tool_call["function"]["name"] == "get_current_weather" diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 983040a89..eac581e5d 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -67,7 +67,149 @@ class LoRAParserAction(argparse.Action): @config -class FrontendArgs: +class BaseFrontendArgs: + """Base arguments for the OpenAI-compatible frontend server. + + This base class does not include host, port, and server-specific arguments + like SSL, CORS, and HTTP server settings. Those arguments are added by + the subclasses. + """ + + lora_modules: list[LoRAModulePath] | None = None + """LoRA modules configurations in either 'name=path' format or JSON format + or JSON list format. 
Example (old format): `'name=path'` Example (new + format): `{\"name\": \"name\", \"path\": \"lora_path\", + \"base_model_name\": \"id\"}`""" + chat_template: str | None = None + """The file path to the chat template, or the template in single-line form + for the specified model.""" + chat_template_content_format: ChatTemplateContentFormatOption = "auto" + """The format to render message content within a chat template. + + * "string" will render the content as a string. Example: `"Hello World"` + * "openai" will render the content as a list of dictionaries, similar to + OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" + trust_request_chat_template: bool = False + """Whether to trust the chat template provided in the request. If False, + the server will always use the chat template specified by `--chat-template` + or the ones from tokenizer.""" + default_chat_template_kwargs: dict[str, Any] | None = None + """Default keyword arguments to pass to the chat template renderer. + These will be merged with request-level chat_template_kwargs, + with request values taking precedence. Useful for setting default + behavior for reasoning models. Example: '{"enable_thinking": false}' + to disable thinking mode by default for Qwen3/DeepSeek models.""" + response_role: str = "assistant" + """The role name to return if `request.add_generation_prompt=true`.""" + return_tokens_as_token_ids: bool = False + """When `--max-logprobs` is specified, represents single tokens as + strings of the form 'token_id:{token_id}' so that tokens that are not + JSON-encodable can be identified.""" + disable_frontend_multiprocessing: bool = False + """If specified, will run the OpenAI frontend server in the same process as + the model serving engine.""" + enable_auto_tool_choice: bool = False + """Enable auto tool choice for supported models. 
Use `--tool-call-parser` + to specify which parser to use.""" + exclude_tools_when_tool_choice_none: bool = False + """If specified, exclude tool definitions in prompts when + tool_choice='none'.""" + tool_call_parser: str | None = None + """Select the tool call parser depending on the model that you're using. + This is used to parse the model-generated tool call into OpenAI API format. + Required for `--enable-auto-tool-choice`. You can choose any option from + the built-in parsers or register a plugin via `--tool-parser-plugin`.""" + tool_parser_plugin: str = "" + """Specify the tool parser plugin used to parse model-generated tool calls + into OpenAI API format; parser names registered by this plugin can then be + selected via `--tool-call-parser`.""" + tool_server: str | None = None + """Comma-separated list of host:port pairs (IPv4, IPv6, or hostname). + Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo + purpose.""" + log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH + """Path to logging config JSON file for both vllm and uvicorn""" + max_log_len: int | None = None + """Max number of prompt characters or prompt ID numbers being printed in + log. The default of None means unlimited.""" + enable_prompt_tokens_details: bool = False + """If set to True, enable prompt_tokens_details in usage.""" + enable_server_load_tracking: bool = False + """If set to True, enable tracking server_load_metrics in the app state.""" + enable_force_include_usage: bool = False + """If set to True, include usage on every request.""" + enable_tokenizer_info_endpoint: bool = False + """Enable the `/tokenizer_info` endpoint. May expose chat + templates and other tokenizer configuration.""" + enable_log_outputs: bool = False + """If set to True, log model outputs (generations). + Requires --enable-log-requests.""" + enable_log_deltas: bool = True + """If set to False, output deltas will not be logged. Relevant only if + --enable-log-outputs is set. 
+ """ + log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE + """If set to True, log the stack trace of error responses""" + tokens_only: bool = False + """ + If set to True, only enable the Tokens In<>Out endpoint. + This is intended for use in a Disaggregated Everything setup. + """ + + @classmethod + def _customize_cli_kwargs( + cls, + frontend_kwargs: dict[str, Any], + ) -> dict[str, Any]: + """Customize argparse kwargs before arguments are registered. + + Subclasses should override this and call + ``super()._customize_cli_kwargs(frontend_kwargs)`` first. + """ + # Special case: default_chat_template_kwargs needs json.loads type + frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads + + # Special case: LoRA modules need custom parser action and + # optional_type(str) + frontend_kwargs["lora_modules"]["type"] = optional_type(str) + frontend_kwargs["lora_modules"]["action"] = LoRAParserAction + + # Special case: Tool call parser shows built-in options. + valid_tool_parsers = list(ToolParserManager.list_registered()) + parsers_str = ",".join(valid_tool_parsers) + frontend_kwargs["tool_call_parser"]["metavar"] = ( + f"{{{parsers_str}}} or name registered in --tool-parser-plugin" + ) + return frontend_kwargs + + @classmethod + def add_cli_args(cls, parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + """Register CLI arguments for this frontend class. + + Subclasses should override ``_customize_cli_kwargs`` instead of + this method so that base-class postprocessing is always applied. 
+ """ + from vllm.engine.arg_utils import get_kwargs + + frontend_kwargs = get_kwargs(cls) + frontend_kwargs = cls._customize_cli_kwargs(frontend_kwargs) + + group_name = cls.__name__.replace("Args", "") + frontend_group = parser.add_argument_group( + title=group_name, + description=cls.__doc__, + ) + for key, value in frontend_kwargs.items(): + extra_flags = value.pop("flags", []) + frontend_group.add_argument( + *extra_flags, f"--{key.replace('_', '-')}", **value + ) + + return parser + + +@config +class FrontendArgs(BaseFrontendArgs): """Arguments for the OpenAI-compatible frontend server.""" host: str | None = None @@ -99,32 +241,6 @@ class FrontendArgs: api_key: list[str] | None = None """If provided, the server will require one of these keys to be presented in the header.""" - lora_modules: list[LoRAModulePath] | None = None - """LoRA modules configurations in either 'name=path' format or JSON format - or JSON list format. Example (old format): `'name=path'` Example (new - format): `{\"name\": \"name\", \"path\": \"lora_path\", - \"base_model_name\": \"id\"}`""" - chat_template: str | None = None - """The file path to the chat template, or the template in single-line form - for the specified model.""" - chat_template_content_format: ChatTemplateContentFormatOption = "auto" - """The format to render message content within a chat template. - - * "string" will render the content as a string. Example: `"Hello World"` - * "openai" will render the content as a list of dictionaries, similar to - OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`""" - trust_request_chat_template: bool = False - """Whether to trust the chat template provided in the request. If False, - the server will always use the chat template specified by `--chat-template` - or the ones from tokenizer.""" - default_chat_template_kwargs: dict[str, Any] | None = None - """Default keyword arguments to pass to the chat template renderer. 
- These will be merged with request-level chat_template_kwargs, - with request values taking precedence. Useful for setting default - behavior for reasoning models. Example: '{"enable_thinking": false}' - to disable thinking mode by default for Qwen3/DeepSeek models.""" - response_role: str = "assistant" - """The role name to return if `request.add_generation_prompt=true`.""" ssl_keyfile: str | None = None """The file path to the SSL key file.""" ssl_certfile: str | None = None @@ -146,81 +262,28 @@ class FrontendArgs: is provided, vLLM will add it to the server using `@app.middleware('http')`. If a class is provided, vLLM will add it to the server using `app.add_middleware()`.""" - return_tokens_as_token_ids: bool = False - """When `--max-logprobs` is specified, represents single tokens as - strings of the form 'token_id:{token_id}' so that tokens that are not - JSON-encodable can be identified.""" - disable_frontend_multiprocessing: bool = False - """If specified, will run the OpenAI frontend server in the same process as - the model serving engine.""" enable_request_id_headers: bool = False """If specified, API server will add X-Request-Id header to responses.""" - enable_auto_tool_choice: bool = False - """Enable auto tool choice for supported models. Use `--tool-call-parser` - to specify which parser to use.""" - exclude_tools_when_tool_choice_none: bool = False - """If specified, exclude tool definitions in prompts when - tool_choice='none'.""" - tool_call_parser: str | None = None - """Select the tool call parser depending on the model that you're using. - This is used to parse the model-generated tool call into OpenAI API format. - Required for `--enable-auto-tool-choice`. 
You can choose any option from - the built-in parsers or register a plugin via `--tool-parser-plugin`.""" - tool_parser_plugin: str = "" - """Special the tool parser plugin write to parse the model-generated tool - into OpenAI API format, the name register in this plugin can be used in - `--tool-call-parser`.""" - tool_server: str | None = None - """Comma-separated list of host:port pairs (IPv4, IPv6, or hostname). - Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo - purpose.""" - log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH - """Path to logging config JSON file for both vllm and uvicorn""" - max_log_len: int | None = None - """Max number of prompt characters or prompt ID numbers being printed in - log. The default of None means unlimited.""" disable_fastapi_docs: bool = False """Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint.""" - enable_prompt_tokens_details: bool = False - """If set to True, enable prompt_tokens_details in usage.""" - enable_server_load_tracking: bool = False - """If set to True, enable tracking server_load_metrics in the app state.""" - enable_force_include_usage: bool = False - """If set to True, including usage on every request.""" - enable_tokenizer_info_endpoint: bool = False - """Enable the `/tokenizer_info` endpoint. May expose chat - templates and other tokenizer configuration.""" - enable_log_outputs: bool = False - """If set to True, log model outputs (generations). - Requires --enable-log-requests.""" - enable_log_deltas: bool = True - """If set to False, output deltas will not be logged. Relevant only if - --enable-log-outputs is set. - """ h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT """Maximum size (bytes) of an incomplete HTTP event (header or body) for h11 parser. Helps mitigate header abuse. 
Default: 4194304 (4 MB).""" h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT """Maximum number of HTTP headers allowed in a request for h11 parser. Helps mitigate header abuse. Default: 256.""" - log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE - """If set to True, log the stack trace of error responses""" - tokens_only: bool = False - """ - If set to True, only enable the Tokens In<>Out endpoint. - This is intended for use in a Disaggregated Everything setup. - """ enable_offline_docs: bool = False """ Enable offline FastAPI documentation for air-gapped environments. Uses vendored static assets bundled with vLLM. """ - @staticmethod - def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: - from vllm.engine.arg_utils import get_kwargs - - frontend_kwargs = get_kwargs(FrontendArgs) + @classmethod + def _customize_cli_kwargs( + cls, + frontend_kwargs: dict[str, Any], + ) -> dict[str, Any]: + frontend_kwargs = super()._customize_cli_kwargs(frontend_kwargs) # Special case: allowed_origins, allowed_methods, allowed_headers all # need json.loads type @@ -232,14 +295,6 @@ class FrontendArgs: del frontend_kwargs["allowed_methods"]["nargs"] del frontend_kwargs["allowed_headers"]["nargs"] - # Special case: default_chat_template_kwargs needs json.loads type - frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads - - # Special case: LoRA modules need custom parser action and - # optional_type(str) - frontend_kwargs["lora_modules"]["type"] = optional_type(str) - frontend_kwargs["lora_modules"]["action"] = LoRAParserAction - # Special case: Middleware needs to append action frontend_kwargs["middleware"]["action"] = "append" frontend_kwargs["middleware"]["type"] = str @@ -252,22 +307,7 @@ class FrontendArgs: if "nargs" in frontend_kwargs["disable_access_log_for_endpoints"]: del frontend_kwargs["disable_access_log_for_endpoints"]["nargs"] - # Special case: Tool call parser shows built-in options. 
- valid_tool_parsers = list(ToolParserManager.list_registered()) - parsers_str = ",".join(valid_tool_parsers) - frontend_kwargs["tool_call_parser"]["metavar"] = ( - f"{{{parsers_str}}} or name registered in --tool-parser-plugin" - ) - - frontend_group = parser.add_argument_group( - title="Frontend", - description=FrontendArgs.__doc__, - ) - - for key, value in frontend_kwargs.items(): - frontend_group.add_argument(f"--{key.replace('_', '-')}", **value) - - return parser + return frontend_kwargs def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 747025750..69c326ce1 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -3,6 +3,7 @@ import asyncio import base64 +import sys import tempfile from argparse import Namespace from collections.abc import Awaitable, Callable @@ -17,23 +18,23 @@ from fastapi import UploadFile from prometheus_client import start_http_server from pydantic import Field, TypeAdapter, field_validator, model_validator from pydantic_core.core_schema import ValidationInfo +from starlette.datastructures import State from tqdm import tqdm -from vllm.engine.arg_utils import AsyncEngineArgs, optional_type +from vllm.config import config +from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient -from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.api_server import init_app_state from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ChatCompletionResponse, ) -from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat +from vllm.entrypoints.openai.cli_args import BaseFrontendArgs from vllm.entrypoints.openai.engine.protocol import ( ErrorInfo, ErrorResponse, OpenAIBaseModel, ) -from vllm.entrypoints.openai.models.protocol import BaseModelPath -from 
vllm.entrypoints.openai.models.serving import OpenAIServingModels from vllm.entrypoints.openai.speech_to_text.protocol import ( TranscriptionRequest, TranscriptionResponse, @@ -42,25 +43,18 @@ from vllm.entrypoints.openai.speech_to_text.protocol import ( TranslationResponse, TranslationResponseVerbose, ) -from vllm.entrypoints.openai.speech_to_text.serving import ( - OpenAIServingTranscription, - OpenAIServingTranslation, -) from vllm.entrypoints.pooling.embed.protocol import ( EmbeddingRequest, EmbeddingResponse, ) -from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding from vllm.entrypoints.pooling.score.protocol import ( RerankRequest, RerankResponse, ScoreRequest, ScoreResponse, ) -from vllm.entrypoints.pooling.score.serving import ServingScores from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager -from vllm.tasks import SupportedTask from vllm.utils import random_uuid from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.version import __version__ as VLLM_VERSION @@ -219,87 +213,73 @@ class BatchRequestOutput(OpenAIBaseModel): error: Any | None +@config +class BatchFrontendArgs(BaseFrontendArgs): + """Arguments for the batch runner frontend.""" + + input_file: str | None = None + """The path or url to a single input file. Currently supports local file + paths, or the http protocol (http or https). If a URL is specified, + the file should be available via HTTP GET.""" + output_file: str | None = None + """The path or url to a single output file. Currently supports + local file paths, or web (http or https) urls. 
If a URL is specified, + the file should be available via HTTP PUT.""" + output_tmp_dir: str | None = None + """The directory to store the output file before uploading it + to the output URL.""" + enable_metrics: bool = False + """Enable Prometheus metrics""" + host: str | None = None + """Host name for the Prometheus metrics server + (only needed if enable-metrics is set).""" + port: int = 8000 + """Port number for the Prometheus metrics server + (only needed if enable-metrics is set).""" + url: str = "0.0.0.0" + """[DEPRECATED] Host name for the Prometheus metrics server + (only needed if enable-metrics is set). Use --host instead.""" + + @classmethod + def _customize_cli_kwargs( + cls, + frontend_kwargs: dict[str, Any], + ) -> dict[str, Any]: + frontend_kwargs = super()._customize_cli_kwargs(frontend_kwargs) + + frontend_kwargs["input_file"]["flags"] = ["-i"] + frontend_kwargs["input_file"]["required"] = True + frontend_kwargs["output_file"]["flags"] = ["-o"] + frontend_kwargs["output_file"]["required"] = True + + frontend_kwargs["enable_metrics"]["action"] = "store_true" + + frontend_kwargs["url"]["deprecated"] = True + return frontend_kwargs + + def make_arg_parser(parser: FlexibleArgumentParser): - parser.add_argument( - "-i", - "--input-file", - required=True, - type=str, - help="The path or url to a single input file. Currently supports local file " - "paths, or the http protocol (http or https). If a URL is specified, " - "the file should be available via HTTP GET.", - ) - parser.add_argument( - "-o", - "--output-file", - required=True, - type=str, - help="The path or url to a single output file. Currently supports " - "local file paths, or web (http or https) urls. 
If a URL is specified," - " the file should be available via HTTP PUT.", - ) - parser.add_argument( - "--output-tmp-dir", - type=str, - default=None, - help="The directory to store the output file before uploading it " - "to the output URL.", - ) - parser.add_argument( - "--response-role", - type=optional_type(str), - default="assistant", - help="The role name to return if `request.add_generation_prompt=True`.", - ) - + parser = BatchFrontendArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser) - - parser.add_argument( - "--max-log-len", - type=int, - default=None, - help="Max number of prompt characters or prompt " - "ID numbers being printed in log." - "\n\nDefault: Unlimited", - ) - - parser.add_argument( - "--enable-metrics", action="store_true", help="Enable Prometheus metrics" - ) - parser.add_argument( - "--url", - type=str, - default="0.0.0.0", - help="URL to the Prometheus metrics server " - "(only needed if enable-metrics is set).", - ) - parser.add_argument( - "--port", - type=int, - default=8000, - help="Port number for the Prometheus metrics server " - "(only needed if enable-metrics is set).", - ) - parser.add_argument( - "--enable-prompt-tokens-details", - action="store_true", - default=False, - help="If set to True, enable prompt_tokens_details in usage.", - ) - parser.add_argument( - "--enable-force-include-usage", - action="store_true", - default=False, - help="If set to True, include usage on every request " - "(even when stream_options is not specified)", - ) - return parser def parse_args(): parser = FlexibleArgumentParser(description="vLLM OpenAI-Compatible batch runner.") - return make_arg_parser(parser).parse_args() + args = make_arg_parser(parser).parse_args() + + # Backward compatibility: If --url is set, use it for host + url_explicit = any(arg == "--url" or arg.startswith("--url=") for arg in sys.argv) + host_explicit = any( + arg == "--host" or arg.startswith("--host=") for arg in sys.argv + ) + if url_explicit and 
hasattr(args, "url") and not host_explicit: + args.host = args.url + logger.warning_once( + "Using --url for metrics is deprecated. Please use --host instead." + ) + + return args # explicitly use pure text format, with a newline at the end @@ -671,12 +651,9 @@ def make_transcription_wrapper(is_translation: bool) -> WrapperFn: return wrapper -def build_endpoint_registry( +async def build_endpoint_registry( engine_client: EngineClient, args: Namespace, - base_model_paths: list[BaseModelPath], - request_logger: RequestLogger | None, - supported_tasks: tuple[SupportedTask, ...], ) -> dict[str, dict[str, Any]]: """ Build the endpoint registry with all serving objects and handler configurations. @@ -684,90 +661,27 @@ def build_endpoint_registry( Args: engine_client: The engine client args: Command line arguments - base_model_paths: List of base model paths - request_logger: Optional request logger - supported_tasks: Tuple of supported tasks Returns: Dictionary mapping endpoint keys to their configurations """ - model_config = engine_client.model_config + supported_tasks = await engine_client.get_supported_tasks() + logger.info("Supported tasks: %s", supported_tasks) - # Create the openai serving objects. 
- openai_serving_models = OpenAIServingModels( - engine_client=engine_client, - base_model_paths=base_model_paths, - lora_modules=None, - ) + # Create a state object to hold serving objects + state = State() - openai_serving_chat = ( - OpenAIServingChat( - engine_client, - openai_serving_models, - args.response_role, - request_logger=request_logger, - chat_template=None, - chat_template_content_format="auto", - reasoning_parser=args.structured_outputs_config.reasoning_parser, - enable_prompt_tokens_details=args.enable_prompt_tokens_details, - enable_force_include_usage=args.enable_force_include_usage, - default_chat_template_kwargs=getattr( - args, "default_chat_template_kwargs", None - ), - ) - if "generate" in supported_tasks - else None - ) + # Initialize all serving objects using init_app_state + # This provides full functionality including chat template processing, + # LoRA support, tool servers, etc. + await init_app_state(engine_client, state, args, supported_tasks) - openai_serving_embedding = ( - OpenAIServingEmbedding( - engine_client, - openai_serving_models, - request_logger=request_logger, - chat_template=None, - chat_template_content_format="auto", - ) - if "embed" in supported_tasks - else None - ) - - enable_serving_reranking = ( - "classify" in supported_tasks - and getattr(model_config.hf_config, "num_labels", 0) == 1 - ) - - openai_serving_scores = ( - ServingScores( - engine_client, - openai_serving_models, - request_logger=request_logger, - score_template=None, - ) - if ("embed" in supported_tasks or enable_serving_reranking) - else None - ) - - openai_serving_transcription = ( - OpenAIServingTranscription( - engine_client, - openai_serving_models, - request_logger=request_logger, - enable_force_include_usage=args.enable_force_include_usage, - ) - if "transcription" in supported_tasks - else None - ) - - openai_serving_translation = ( - OpenAIServingTranslation( - engine_client, - openai_serving_models, - request_logger=request_logger, - 
enable_force_include_usage=args.enable_force_include_usage, - ) - if "transcription" in supported_tasks - else None - ) + # Get serving objects from state (defaulting to None if not set) + openai_serving_chat = getattr(state, "openai_serving_chat", None) + openai_serving_embedding = getattr(state, "openai_serving_embedding", None) + openai_serving_scores = getattr(state, "openai_serving_scores", None) + openai_serving_transcription = getattr(state, "openai_serving_transcription", None) + openai_serving_translation = getattr(state, "openai_serving_translation", None) # Registry of endpoint configurations endpoint_registry: dict[str, dict[str, Any]] = { @@ -845,29 +759,9 @@ async def run_batch( engine_client: EngineClient, args: Namespace, ) -> None: - if args.served_model_name is not None: - served_model_names = args.served_model_name - else: - served_model_names = [args.model] - - if args.enable_log_requests: - request_logger = RequestLogger(max_log_len=args.max_log_len) - else: - request_logger = None - - base_model_paths = [ - BaseModelPath(name=name, model_path=args.model) for name in served_model_names - ] - - supported_tasks = await engine_client.get_supported_tasks() - logger.info("Supported tasks: %s", supported_tasks) - - endpoint_registry = build_endpoint_registry( + endpoint_registry = await build_endpoint_registry( engine_client=engine_client, args=args, - base_model_paths=base_model_paths, - request_logger=request_logger, - supported_tasks=supported_tasks, ) tracker = BatchProgressTracker() @@ -942,7 +836,7 @@ if __name__ == "__main__": # to publish metrics at the /metrics endpoint. if args.enable_metrics: logger.info("Prometheus metrics enabled") - start_http_server(port=args.port, addr=args.url) + start_http_server(port=args.port, addr=args.host) else: logger.info("Prometheus metrics disabled")