[openai api] log exception in exception handler (1/N) (#31164)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
@@ -39,6 +39,7 @@ def chat(request: Request) -> OpenAIServingChat | None:
         HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
         HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
         HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
     },
 )
 @with_cancellation
@@ -54,10 +55,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
             message="The model does not support Chat Completions API"
         )
 
-    try:
-        generator = await handler.create_chat_completion(request, raw_request)
-    except Exception as e:
-        generator = handler.create_error_response(e)
+    generator = await handler.create_chat_completion(request, raw_request)
 
     if isinstance(generator, ErrorResponse):
         return JSONResponse(
@@ -81,6 +79,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
         HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
         HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
         HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
     },
 )
 async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
@@ -93,10 +92,7 @@ async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
             message="The model does not support Chat Completions API"
         )
 
-    try:
-        result = await handler.render_chat_request(request)
-    except Exception as e:
-        result = handler.create_error_response(e)
+    result = await handler.render_chat_request(request)
 
     if isinstance(result, ErrorResponse):
         return JSONResponse(content=result.model_dump(), status_code=result.error.code)
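Taken together, the route hunks above drop the per-endpoint try/except: a failure inside the serving layer now propagates out of the endpoint to a single application-level exception handler, which is where the logging belongs (hence the commit title). A minimal sketch of that pattern; the handler name, the status mapping, and the response body shape here are illustrative assumptions, not the code this PR adds:

import logging
from http import HTTPStatus

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

logger = logging.getLogger(__name__)
app = FastAPI()


@app.exception_handler(Exception)
async def log_uncaught_exception(request: Request, exc: Exception) -> JSONResponse:
    # FastAPI/Starlette invoke this handler from inside an `except` block,
    # so logger.exception picks up the active traceback automatically.
    logger.exception("Uncaught exception while handling %s", request.url.path)
    status = (
        HTTPStatus.NOT_IMPLEMENTED
        if isinstance(exc, NotImplementedError)
        else HTTPStatus.INTERNAL_SERVER_ERROR
    )
    return JSONResponse(
        content={"error": {"message": str(exc), "code": status.value}},
        status_code=status.value,
    )

Logging once at this choke point keeps tracebacks out of every route and guarantees each exception is recorded exactly once.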
@@ -8,7 +8,6 @@ from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
 from typing import Any, Final
 
-import jinja2
 import partial_json_parser
 import regex as re
 from fastapi import Request
@@ -105,7 +104,6 @@ class OpenAIServingChat(OpenAIServing):
         enable_force_include_usage: bool = False,
         enable_log_outputs: bool = False,
         enable_log_deltas: bool = True,
-        log_error_stack: bool = False,
         default_chat_template_kwargs: dict[str, Any] | None = None,
     ) -> None:
         super().__init__(
@@ -113,7 +111,6 @@ class OpenAIServingChat(OpenAIServing):
             models=models,
             request_logger=request_logger,
             return_tokens_as_token_ids=return_tokens_as_token_ids,
-            log_error_stack=log_error_stack,
         )
 
         self.response_role = response_role
@@ -235,81 +232,76 @@ class OpenAIServingChat(OpenAIServing):
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
-        try:
-            tokenizer = self.renderer.tokenizer
+        tokenizer = self.renderer.tokenizer
 
-            tool_parser = self.tool_parser
+        tool_parser = self.tool_parser
 
-            if is_mistral_tokenizer(tokenizer):
-                # because of issues with pydantic we need to potentially
-                # re-serialize the tool_calls field of the request
-                # for more info: see comment in `maybe_serialize_tool_calls`
-                _mt.maybe_serialize_tool_calls(request)  # type: ignore[arg-type]
-                _mt.truncate_tool_call_ids(request)  # type: ignore[arg-type]
-                _mt.validate_request_params(request)
+        if is_mistral_tokenizer(tokenizer):
+            # because of issues with pydantic we need to potentially
+            # re-serialize the tool_calls field of the request
+            # for more info: see comment in `maybe_serialize_tool_calls`
+            _mt.maybe_serialize_tool_calls(request)  # type: ignore[arg-type]
+            _mt.truncate_tool_call_ids(request)  # type: ignore[arg-type]
+            _mt.validate_request_params(request)
 
-            # Check if tool parsing is unavailable (common condition)
-            tool_parsing_unavailable = (
-                tool_parser is None
-                and not is_mistral_tokenizer(tokenizer)
-                and not self.use_harmony
-            )
+        # Check if tool parsing is unavailable (common condition)
+        tool_parsing_unavailable = (
+            tool_parser is None
+            and not is_mistral_tokenizer(tokenizer)
+            and not self.use_harmony
+        )
 
-            # Validate tool_choice when tool parsing is required but unavailable
-            if tool_parsing_unavailable and request.tool_choice not in (
-                None,
-                "none",
-            ):
-                if request.tool_choice == "auto" and not self.enable_auto_tools:
-                    # for hf tokenizers, "auto" tools requires
-                    # --enable-auto-tool-choice and --tool-call-parser
-                    return self.create_error_response(
-                        '"auto" tool choice requires '
-                        "--enable-auto-tool-choice and --tool-call-parser to be set"
-                    )
-                elif request.tool_choice != "auto":
-                    # "required" or named tool requires tool parser
-                    return self.create_error_response(
-                        f'tool_choice="{request.tool_choice}" requires '
-                        "--tool-call-parser to be set"
-                    )
+        # Validate tool_choice when tool parsing is required but unavailable
+        if tool_parsing_unavailable and request.tool_choice not in (
+            None,
+            "none",
+        ):
+            if request.tool_choice == "auto" and not self.enable_auto_tools:
+                # for hf tokenizers, "auto" tools requires
+                # --enable-auto-tool-choice and --tool-call-parser
+                return self.create_error_response(
+                    '"auto" tool choice requires '
+                    "--enable-auto-tool-choice and --tool-call-parser to be set"
+                )
+            elif request.tool_choice != "auto":
+                # "required" or named tool requires tool parser
+                return self.create_error_response(
+                    f'tool_choice="{request.tool_choice}" requires '
+                    "--tool-call-parser to be set"
+                )
 
-            if request.tools is None or (
-                request.tool_choice == "none"
-                and self.exclude_tools_when_tool_choice_none
-            ):
-                tool_dicts = None
-            else:
-                tool_dicts = [tool.model_dump() for tool in request.tools]
+        if request.tools is None or (
+            request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none
+        ):
+            tool_dicts = None
+        else:
+            tool_dicts = [tool.model_dump() for tool in request.tools]
 
-            if not self.use_harmony:
-                # Common case.
-                error_check_ret = self._validate_chat_template(
-                    request_chat_template=request.chat_template,
-                    chat_template_kwargs=request.chat_template_kwargs,
-                    trust_request_chat_template=self.trust_request_chat_template,
-                )
-                if error_check_ret is not None:
-                    return error_check_ret
+        if not self.use_harmony:
+            # Common case.
+            error_check_ret = self._validate_chat_template(
+                request_chat_template=request.chat_template,
+                chat_template_kwargs=request.chat_template_kwargs,
+                trust_request_chat_template=self.trust_request_chat_template,
+            )
+            if error_check_ret is not None:
+                return error_check_ret
 
-                conversation, engine_prompts = await self._preprocess_chat(
-                    request,
-                    request.messages,
-                    default_template=self.chat_template,
-                    default_template_content_format=self.chat_template_content_format,
-                    default_template_kwargs=self.default_chat_template_kwargs,
-                    tool_dicts=tool_dicts,
-                    tool_parser=tool_parser,
-                )
-            else:
-                # For GPT-OSS.
-                should_include_tools = tool_dicts is not None
-                conversation, engine_prompts = self._make_request_with_harmony(
-                    request, should_include_tools
-                )
-        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
-            logger.exception("Error in preprocessing prompt inputs")
-            return self.create_error_response(e)
+            conversation, engine_prompts = await self._preprocess_chat(
+                request,
+                request.messages,
+                default_template=self.chat_template,
+                default_template_content_format=self.chat_template_content_format,
+                default_template_kwargs=self.default_chat_template_kwargs,
+                tool_dicts=tool_dicts,
+                tool_parser=tool_parser,
+            )
+        else:
+            # For GPT-OSS.
+            should_include_tools = tool_dicts is not None
+            conversation, engine_prompts = self._make_request_with_harmony(
+                request, should_include_tools
+            )
 
         return conversation, engine_prompts
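Two things to note about the rewritten hunk: the ValueError/TypeError/RuntimeError/jinja2.TemplateError cases are no longer caught and logged here (they propagate to the entrypoint layer, per the handler sketch above), while the validation failures that remain still go through create_error_response, whose result the routes serialize with result.model_dump() and status_code=result.error.code. A minimal pydantic sketch of the response shape those call sites imply; only error.message and error.code are visible in this diff, the type field and its defaults are assumptions:

from http import HTTPStatus

from pydantic import BaseModel


class ErrorInfo(BaseModel):
    message: str
    type: str = "invalid_request_error"  # assumed field, not visible in the diff
    code: int = HTTPStatus.BAD_REQUEST.value


class ErrorResponse(BaseModel):
    error: ErrorInfo


# A tool_choice validation failure from the hunk above would then serialize as:
resp = ErrorResponse(
    error=ErrorInfo(
        message='"auto" tool choice requires '
        "--enable-auto-tool-choice and --tool-call-parser to be set"
    )
)
assert resp.model_dump()["error"]["code"] == HTTPStatus.BAD_REQUEST.value
# and the route returns:
#   JSONResponse(content=resp.model_dump(), status_code=resp.error.code)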
@@ -329,20 +321,16 @@ class OpenAIServingChat(OpenAIServing):
         tokenizer = self.renderer.tokenizer
         assert tokenizer is not None
         reasoning_parser: ReasoningParser | None = None
-        try:
-            if self.reasoning_parser_cls:
-                # Pass the same chat template kwargs as used in tokenization
-                chat_template_kwargs = self._prepare_extra_chat_template_kwargs(
-                    request.chat_template_kwargs,
-                    self.default_chat_template_kwargs,
-                )
-                reasoning_parser = self.reasoning_parser_cls(
-                    tokenizer,
-                    chat_template_kwargs=chat_template_kwargs,  # type: ignore[call-arg]
-                )
-        except RuntimeError as e:
-            logger.exception("Error in reasoning parser creation.")
-            return self.create_error_response(str(e))
+        if self.reasoning_parser_cls:
+            # Pass the same chat template kwargs as used in tokenization
+            chat_template_kwargs = self._prepare_extra_chat_template_kwargs(
+                request.chat_template_kwargs,
+                self.default_chat_template_kwargs,
+            )
+            reasoning_parser = self.reasoning_parser_cls(
+                tokenizer,
+                chat_template_kwargs=chat_template_kwargs,  # type: ignore[call-arg]
+            )
         result = await self.render_chat_request(request)
         if isinstance(result, ErrorResponse):
             return result
@@ -357,15 +345,9 @@ class OpenAIServingChat(OpenAIServing):
         if raw_request:
             raw_request.state.request_metadata = request_metadata
 
-        try:
-            lora_request = self._maybe_get_adapters(
-                request, supports_default_mm_loras=True
-            )
+        lora_request = self._maybe_get_adapters(request, supports_default_mm_loras=True)
 
-            model_name = self.models.model_name(lora_request)
-        except (ValueError, TypeError, RuntimeError) as e:
-            logger.exception("Error preparing request components")
-            return self.create_error_response(e)
+        model_name = self.models.model_name(lora_request)
 
         # Extract data_parallel_rank from header (router can inject it)
         data_parallel_rank = self._get_data_parallel_rank(raw_request)
@@ -373,81 +355,76 @@ class OpenAIServingChat(OpenAIServing):
         # Schedule the request and get the result generator.
         max_model_len = self.model_config.max_model_len
         generators: list[AsyncGenerator[RequestOutput, None]] = []
-        try:
-            for i, engine_prompt in enumerate(engine_prompts):
-                prompt_token_ids = self._extract_prompt_components(
-                    engine_prompt
-                ).token_ids
+        for i, engine_prompt in enumerate(engine_prompts):
+            prompt_token_ids = self._extract_prompt_components(engine_prompt).token_ids
 
-                # If we are creating sub requests for multiple prompts, ensure that they
-                # have unique request ids.
-                sub_request_id = (
-                    request_id if len(engine_prompts) == 1 else f"{request_id}_{i}"
-                )
+            # If we are creating sub requests for multiple prompts, ensure that they
+            # have unique request ids.
+            sub_request_id = (
+                request_id if len(engine_prompts) == 1 else f"{request_id}_{i}"
+            )
 
-                max_tokens = get_max_tokens(
-                    max_model_len,
-                    request.max_completion_tokens
-                    if request.max_completion_tokens is not None
-                    else request.max_tokens,
-                    self._extract_prompt_len(engine_prompt),
-                    self.default_sampling_params,
-                    self.override_max_tokens,
-                )
+            max_tokens = get_max_tokens(
+                max_model_len,
+                request.max_completion_tokens
+                if request.max_completion_tokens is not None
+                else request.max_tokens,
+                self._extract_prompt_len(engine_prompt),
+                self.default_sampling_params,
+                self.override_max_tokens,
+            )
 
-                sampling_params: SamplingParams | BeamSearchParams
-                if request.use_beam_search:
-                    sampling_params = request.to_beam_search_params(
-                        max_tokens, self.default_sampling_params
-                    )
-                else:
-                    sampling_params = request.to_sampling_params(
-                        max_tokens,
-                        self.default_sampling_params,
-                        self.override_max_tokens,
-                    )
+            sampling_params: SamplingParams | BeamSearchParams
+            if request.use_beam_search:
+                sampling_params = request.to_beam_search_params(
+                    max_tokens, self.default_sampling_params
+                )
+            else:
+                sampling_params = request.to_sampling_params(
+                    max_tokens,
+                    self.default_sampling_params,
+                    self.override_max_tokens,
+                )
 
-                self._log_inputs(
-                    sub_request_id,
-                    engine_prompt,
-                    params=sampling_params,
-                    lora_request=lora_request,
-                )
+            self._log_inputs(
+                sub_request_id,
+                engine_prompt,
+                params=sampling_params,
+                lora_request=lora_request,
+            )
 
-                trace_headers = (
-                    None
-                    if raw_request is None
-                    else await self._get_trace_headers(raw_request.headers)
-                )
+            trace_headers = (
+                None
+                if raw_request is None
+                else await self._get_trace_headers(raw_request.headers)
+            )
 
-                if isinstance(sampling_params, BeamSearchParams):
-                    generator = self.beam_search(
-                        prompt=engine_prompt,
-                        request_id=sub_request_id,
-                        params=sampling_params,
-                        lora_request=lora_request,
-                        trace_headers=trace_headers,
-                    )
-                else:
-                    reasoning_ended = (
-                        reasoning_parser.is_reasoning_end(prompt_token_ids or [])
-                        if reasoning_parser
-                        else None
-                    )
+            if isinstance(sampling_params, BeamSearchParams):
+                generator = self.beam_search(
+                    prompt=engine_prompt,
+                    request_id=sub_request_id,
+                    params=sampling_params,
+                    lora_request=lora_request,
+                    trace_headers=trace_headers,
+                )
+            else:
+                reasoning_ended = (
+                    reasoning_parser.is_reasoning_end(prompt_token_ids or [])
+                    if reasoning_parser
+                    else None
+                )
 
-                    generator = self.engine_client.generate(
-                        engine_prompt,
-                        sampling_params,
-                        sub_request_id,
-                        lora_request=lora_request,
-                        trace_headers=trace_headers,
-                        priority=request.priority,
-                        data_parallel_rank=data_parallel_rank,
-                        reasoning_ended=reasoning_ended,
-                    )
+                generator = self.engine_client.generate(
+                    engine_prompt,
+                    sampling_params,
+                    sub_request_id,
+                    lora_request=lora_request,
+                    trace_headers=trace_headers,
+                    priority=request.priority,
+                    data_parallel_rank=data_parallel_rank,
+                    reasoning_ended=reasoning_ended,
+                )
 
-                generators.append(generator)
-        except ValueError as e:
-            return self.create_error_response(e)
+            generators.append(generator)
 
         assert len(generators) == 1
         (result_generator,) = generators
@@ -464,21 +441,16 @@ class OpenAIServingChat(OpenAIServing):
                 reasoning_parser,
             )
 
-        try:
-            return await self.chat_completion_full_generator(
-                request,
-                result_generator,
-                request_id,
-                model_name,
-                conversation,
-                tokenizer,
-                request_metadata,
-                reasoning_parser,
-            )
-        except GenerationError as e:
-            return self._convert_generation_error_to_response(e)
-        except ValueError as e:
-            return self.create_error_response(e)
+        return await self.chat_completion_full_generator(
+            request,
+            result_generator,
+            request_id,
+            model_name,
+            conversation,
+            tokenizer,
+            request_metadata,
+            reasoning_parser,
+        )
 
     def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
         if request.add_generation_prompt:
@@ -1414,8 +1386,6 @@ class OpenAIServingChat(OpenAIServing):
                 final_res = res
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
-        except ValueError as e:
-            return self.create_error_response(e)
 
         assert final_res is not None
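A quick way to exercise the new contract end to end is an HTTP-level test: raise from inside a route, then assert that the client sees a well-formed error body and that the traceback was logged exactly once by the handler rather than by the route. A hypothetical, self-contained pytest sketch (not part of this commit; the handler mirrors the assumed sketch above, not vLLM's actual implementation):

import logging

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from fastapi.testclient import TestClient

logger = logging.getLogger(__name__)
app = FastAPI()


@app.exception_handler(Exception)
async def log_uncaught(request: Request, exc: Exception) -> JSONResponse:
    logger.exception("Uncaught exception while handling %s", request.url.path)
    return JSONResponse(
        content={"error": {"message": str(exc), "code": 500}}, status_code=500
    )


@app.post("/v1/chat/completions")
async def boom() -> None:
    # Stand-in for a serving-layer failure that is no longer caught per-route.
    raise RuntimeError("engine exploded")


def test_uncaught_error_is_logged_and_wrapped(caplog):
    # raise_server_exceptions=False lets the test observe the 500 response
    # instead of re-raising the error into the test process.
    client = TestClient(app, raise_server_exceptions=False)
    with caplog.at_level(logging.ERROR):
        resp = client.post("/v1/chat/completions", json={})
    assert resp.status_code == 500
    assert "error" in resp.json()
    # The handler, not the route, emitted the traceback, and only once.
    assert sum(1 for r in caplog.records if r.exc_info) == 1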