[Bugfix][FE]: Always include usage with --enable-force-include-usage (#20983)

Signed-off-by: Max Wittig <max.wittig@siemens.com>
Signed-off-by: Antoine Auger <antoineauger@users.noreply.github.com>
Co-authored-by: Antoine Auger <antoineauger@users.noreply.github.com>
This commit is contained in:
Max Wittig
2025-10-14 09:17:39 +02:00
committed by GitHub
parent d32c611f45
commit fd85c9f426
11 changed files with 172 additions and 30 deletions

View File

@@ -58,7 +58,7 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_l
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.tool_parsers import ToolParser
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
from vllm.entrypoints.utils import get_max_tokens
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob
@@ -101,7 +101,6 @@ class OpenAIServingChat(OpenAIServing):
models=models,
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids,
enable_force_include_usage=enable_force_include_usage,
log_error_stack=log_error_stack,
)
@@ -352,7 +351,6 @@ class OpenAIServingChat(OpenAIServing):
conversation,
tokenizer,
request_metadata,
enable_force_include_usage=self.enable_force_include_usage,
)
try:
@@ -518,7 +516,6 @@ class OpenAIServingChat(OpenAIServing):
conversation: list[ConversationMessage],
tokenizer: AnyTokenizer,
request_metadata: RequestResponseMetadata,
enable_force_include_usage: bool,
) -> AsyncGenerator[str, None]:
created_time = int(time.time())
chunk_object_type: Final = "chat.completion.chunk"
@@ -596,13 +593,9 @@ class OpenAIServingChat(OpenAIServing):
return
stream_options = request.stream_options
if stream_options:
include_usage = stream_options.include_usage or enable_force_include_usage
include_continuous_usage = (
include_usage and stream_options.continuous_usage_stats
)
else:
include_usage, include_continuous_usage = False, False
include_usage, include_continuous_usage = should_include_usage(
stream_options, self.enable_force_include_usage
)
try:
async for res in result_generator: