[CI][Bugfix] Fix 500 errors from priority overflow and TemplateError subclasses in schema fuzz tests (#37127)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
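The failing fuzz cases sent `priority` values outside the signed 64-bit range; pydantic accepted them at the schema layer and they only blew up later inside the engine as a 500. Bounding the field moves the failure to request validation. A minimal sketch of the mechanism, using a stand-in model rather than the vLLM class itself:

```python
# Minimal sketch, assuming pydantic v2: a bare `int` field accepts any
# Python int, so {"priority": 2**63} used to pass schema validation and
# overflow int64 deep in the engine. With ge/le bounds it is rejected
# up front as a ValidationError instead of surfacing as a 500.
from pydantic import BaseModel, Field, ValidationError

_INT64_MIN = -(2**63)
_INT64_MAX = 2**63 - 1

class FuzzedRequest(BaseModel):  # stand-in for ChatCompletionRequest
    priority: int = Field(default=0, ge=_INT64_MIN, le=_INT64_MAX)

try:
    FuzzedRequest(priority=2**63)  # one past _INT64_MAX
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # -> "less_than_equal"
```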
@@ -7,7 +7,6 @@ import json
 import time
 from typing import Annotated, Any, ClassVar, Literal

-import torch
 from openai.types.chat.chat_completion_audio import (
     ChatCompletionAudio as OpenAIChatCompletionAudio,
 )
@@ -48,7 +47,8 @@ from vllm.utils import random_uuid
 logger = init_logger(__name__)


-_LONG_INFO = torch.iinfo(torch.long)
+_INT64_MIN = -(2**63)
+_INT64_MAX = 2**63 - 1


 class ChatMessage(OpenAIBaseModel):
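The new literals appear intended as drop-in replacements for the `torch.iinfo` bounds, which lets the protocol modules stop importing torch just for two constants. A quick equivalence check (assumption: bit-for-bit parity was the intent; the diff itself does not state it):

```python
# The pure-Python literals match torch's int64 range exactly.
import torch

assert -(2**63) == torch.iinfo(torch.long).min   # -9223372036854775808
assert 2**63 - 1 == torch.iinfo(torch.long).max  #  9223372036854775807
```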
@@ -165,7 +165,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     n: int | None = 1
     presence_penalty: float | None = 0.0
     response_format: AnyResponseFormat | None = None
-    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    seed: int | None = Field(None, ge=_INT64_MIN, le=_INT64_MAX)
     stop: str | list[str] | None = []
     stream: bool | None = False
     stream_options: StreamOptions | None = None
@@ -198,9 +198,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     min_tokens: int = 0
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
-        None
-    )
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_INT64_MAX)] | None = None
     prompt_logprobs: int | None = None
     allowed_token_ids: list[int] | None = None
     bad_words: list[str] = Field(default_factory=list)
@@ -285,6 +283,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
     )
     priority: int = Field(
         default=0,
+        ge=_INT64_MIN,
+        le=_INT64_MAX,
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
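For completeness, a hedged end-to-end probe of the new bound. The URL and model name are assumptions, and the exact status code depends on the server's validation-error handler; the point is only that the request no longer reaches the engine and dies as a 500:

```python
# Hypothetical probe against a local OpenAI-compatible vLLM server:
# an out-of-range priority should now fail at schema validation.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "some-model",
        "messages": [{"role": "user", "content": "hi"}],
        "priority": 2**63,  # one past _INT64_MAX
    },
)
assert resp.status_code in (400, 422), resp.status_code  # was 500 before
```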
@@ -6,6 +6,7 @@ import json
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
+from http import HTTPStatus
 from typing import TYPE_CHECKING, Any, Final

 import partial_json_parser
@@ -1289,7 +1290,12 @@ class OpenAIServingChat(OpenAIServing):
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")

-        assert final_res is not None
+        if final_res is None:
+            return self.create_error_response(
+                "No output received from the engine.",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )

         choices: list[ChatCompletionResponseChoice] = []
         if self.tool_call_id_type == "kimi_k2":
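The `assert` → explicit-check swap above follows a general rule for request handlers: an `AssertionError` surfaces as an opaque 500, and the check vanishes entirely under `python -O`. A self-contained sketch of the pattern, with stand-in helpers rather than the vLLM implementations:

```python
from http import HTTPStatus

def error_response(message: str, err_type: str, status_code: HTTPStatus) -> dict:
    # Stand-in for OpenAIServing.create_error_response.
    return {"error": {"message": message, "type": err_type, "code": status_code.value}}

def finalize(final_res: object | None) -> dict:
    if final_res is None:  # explicit guard instead of `assert final_res is not None`
        return error_response(
            "No output received from the engine.",
            err_type="InternalServerError",
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
        )
    return {"choices": [final_res]}  # simplified happy path

print(finalize(None)["error"]["code"])  # -> 500, but as structured JSON
```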
@@ -7,7 +7,6 @@ import json
 import time
 from typing import Annotated, Any, Literal

-import torch
 from pydantic import Field, model_validator

 from vllm.config import ModelConfig
@@ -36,7 +35,8 @@ from vllm.utils import random_uuid
 logger = init_logger(__name__)


-_LONG_INFO = torch.iinfo(torch.long)
+_INT64_MIN = -(2**63)
+_INT64_MAX = 2**63 - 1


 class CompletionRequest(OpenAIBaseModel):
@@ -57,7 +57,7 @@ class CompletionRequest(OpenAIBaseModel):
     max_tokens: int | None = 16
     n: int = 1
     presence_penalty: float | None = 0.0
-    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    seed: int | None = Field(None, ge=_INT64_MIN, le=_INT64_MAX)
     stop: str | list[str] | None = []
     stream: bool | None = False
     stream_options: StreamOptions | None = None
@@ -78,9 +78,7 @@ class CompletionRequest(OpenAIBaseModel):
     min_tokens: int = 0
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
-        None
-    )
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_INT64_MAX)] | None = None
     allowed_token_ids: list[int] | None = None
     prompt_logprobs: int | None = None
     # --8<-- [end:completion-sampling-params]
@@ -108,6 +106,8 @@ class CompletionRequest(OpenAIBaseModel):
     )
     priority: int = Field(
         default=0,
+        ge=_INT64_MIN,
+        le=_INT64_MAX,
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
@@ -6,7 +6,6 @@
 import time
 from typing import Any, Literal, TypeAlias

-import torch
 from openai.types.responses import (
     ResponseCodeInterpreterCallCodeDeltaEvent,
     ResponseCodeInterpreterCallCodeDoneEvent,
@@ -78,7 +77,8 @@ from vllm.utils import random_uuid

 logger = init_logger(__name__)

-_LONG_INFO = torch.iinfo(torch.long)
+_INT64_MIN = -(2**63)
+_INT64_MAX = 2**63 - 1


 class InputTokensDetails(OpenAIBaseModel):
@@ -210,6 +210,8 @@ class ResponsesRequest(OpenAIBaseModel):
     )
     priority: int = Field(
         default=0,
+        ge=_INT64_MIN,
+        le=_INT64_MAX,
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
@@ -246,7 +248,7 @@ class ResponsesRequest(OpenAIBaseModel):
     )

     repetition_penalty: float | None = None
-    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    seed: int | None = Field(None, ge=_INT64_MIN, le=_INT64_MAX)
     stop: str | list[str] | None = []
     ignore_eos: bool = False
     vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
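Finally, a sketch of the kind of schema fuzz test the commit title refers to (the CI test itself is not part of this diff; all names here are illustrative). Hypothesis drives arbitrary integers through a bounded stand-in model and checks that nothing escapes the int64 range:

```python
from hypothesis import given, strategies as st
from pydantic import BaseModel, Field, ValidationError

_INT64_MIN, _INT64_MAX = -(2**63), 2**63 - 1

class FuzzTarget(BaseModel):  # stand-in for the request schemas above
    priority: int = Field(default=0, ge=_INT64_MIN, le=_INT64_MAX)
    seed: int | None = Field(None, ge=_INT64_MIN, le=_INT64_MAX)

@given(st.integers())
def test_priority_stays_in_int64(value: int) -> None:
    try:
        req = FuzzTarget(priority=value)
    except ValidationError:
        return  # rejected cleanly at the schema layer -- the fix's goal
    assert _INT64_MIN <= req.priority <= _INT64_MAX

test_priority_stays_in_int64()
```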