diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 56004a193..b4c46bb66 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1091,6 +1091,7 @@ async def init_app_state( enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, + exclude_log_deltas=args.exclude_log_deltas, log_error_stack=args.log_error_stack, ) if "generate" in supported_tasks diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 26e52ff9a..413e71ec2 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -187,6 +187,9 @@ class FrontendArgs: enable_log_outputs: bool = False """If True, log model outputs (generations). Requires --enable-log-requests.""" + exclude_log_deltas: bool = False + """If True, model outputs will be logged once streaming is complete. Deltas + will not be logged. Requires --enable-log-outputs.""" h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT """Maximum size (bytes) of an incomplete HTTP event (header or body) for h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB).""" @@ -305,6 +308,8 @@ def validate_parsed_serve_args(args: argparse.Namespace): # Enable auto tool needs a tool call parser to be valid if args.enable_auto_tool_choice and not args.tool_call_parser: raise TypeError("Error: --enable-auto-tool-choice requires --tool-call-parser") + if args.exclude_log_deltas and not args.enable_log_outputs: + raise TypeError("Error: --exclude-log-deltas requires --enable-log-outputs") if args.enable_log_outputs and not args.enable_log_requests: raise TypeError("Error: --enable-log-outputs requires --enable-log-requests") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 5a916f39b..f0d78dace 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -101,6 +101,7 @@ class OpenAIServingChat(OpenAIServing): enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, enable_log_outputs: bool = False, + exclude_log_deltas: bool = False, log_error_stack: bool = False, default_chat_template_kwargs: dict[str, Any] | None = None, ) -> None: @@ -118,6 +119,7 @@ class OpenAIServingChat(OpenAIServing): self.trust_request_chat_template = trust_request_chat_template self.default_chat_template_kwargs = default_chat_template_kwargs or {} self.enable_log_outputs = enable_log_outputs + self.exclude_log_deltas = exclude_log_deltas # set up logits processors self.logits_processors = self.model_config.logits_processors @@ -1135,7 +1137,7 @@ class OpenAIServingChat(OpenAIServing): if tc.function and tc.function.arguments ) - if delta_content: + if delta_content and not self.exclude_log_deltas: self.request_logger.log_outputs( request_id=request_id, outputs=delta_content,