diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index e616a99c5..9014ab1ea 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -45,6 +45,8 @@ pystemmer==3.0.0
     # via mteb
 
 # Multi-modal processing
+av==16.1.0
+    # required for audio_in_video tests
 blobfile==3.0.0 # Multi-Modal Models Test
 decord==0.6.0
 
diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
index a6fef7868..61763a3b6 100644
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
@@ -7,7 +7,6 @@ import json
 import time
 from typing import Annotated, Any, ClassVar, Literal
 
-import torch
 from openai.types.chat.chat_completion_audio import (
     ChatCompletionAudio as OpenAIChatCompletionAudio,
 )
@@ -48,7 +47,8 @@ from vllm.utils import random_uuid
 
 logger = init_logger(__name__)
 
-_LONG_INFO = torch.iinfo(torch.long)
+_INT64_MIN = -(2**63)
+_INT64_MAX = 2**63 - 1
 
 
 class ChatMessage(OpenAIBaseModel):
@@ -165,7 +165,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     n: int | None = 1
     presence_penalty: float | None = 0.0
     response_format: AnyResponseFormat | None = None
-    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    seed: int | None = Field(None, ge=_INT64_MIN, le=_INT64_MAX)
     stop: str | list[str] | None = []
     stream: bool | None = False
     stream_options: StreamOptions | None = None
@@ -198,9 +198,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     min_tokens: int = 0
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
-        None
-    )
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_INT64_MAX)] | None = None
     prompt_logprobs: int | None = None
     allowed_token_ids: list[int] | None = None
     bad_words: list[str] = Field(default_factory=list)
@@ -285,6 +283,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
     )
     priority: int = Field(
         default=0,
+        ge=_INT64_MIN,
+        le=_INT64_MAX,
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
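Note on the constant swap above: dropping the torch import works because plain Python integers can express the int64 range directly. A minimal sketch, assuming pydantic v2 (the `SeedDemo` model is a hypothetical stand-in for the request classes), showing that the pure-Python bounds behave exactly like the old `torch.iinfo(torch.long)` limits:

    from pydantic import BaseModel, Field, ValidationError

    # Same values torch.iinfo(torch.long).min/.max would report, computed
    # without importing torch: -9223372036854775808 and 9223372036854775807.
    _INT64_MIN = -(2**63)
    _INT64_MAX = 2**63 - 1

    class SeedDemo(BaseModel):  # hypothetical stand-in for ChatCompletionRequest
        seed: int | None = Field(None, ge=_INT64_MIN, le=_INT64_MAX)

    SeedDemo(seed=_INT64_MAX)  # accepted: exactly at the upper bound
    try:
        SeedDemo(seed=2**63)   # one past int64 max: rejected at validation time
    except ValidationError as err:
        print(err.errors()[0]["type"])  # -> less_than_equal

Keeping the bounds on the pydantic `Field` means out-of-range seeds and priorities are rejected during request parsing, before they ever reach the engine.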
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index bf8beb9b9..2eb550c3e 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -6,6 +6,7 @@ import json
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
+from http import HTTPStatus
 from typing import TYPE_CHECKING, Any, Final
 
 import partial_json_parser
@@ -1289,7 +1290,12 @@ class OpenAIServingChat(OpenAIServing):
         except asyncio.CancelledError:
             return self.create_error_response("Client disconnected")
 
-        assert final_res is not None
+        if final_res is None:
+            return self.create_error_response(
+                "No output received from the engine.",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )
 
         choices: list[ChatCompletionResponseChoice] = []
         if self.tool_call_id_type == "kimi_k2":
diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py
index 73232ec3a..c785d2540 100644
--- a/vllm/entrypoints/openai/completion/protocol.py
+++ b/vllm/entrypoints/openai/completion/protocol.py
@@ -7,7 +7,6 @@ import json
 import time
 from typing import Annotated, Any, Literal
 
-import torch
 from pydantic import Field, model_validator
 
 from vllm.config import ModelConfig
@@ -36,7 +35,8 @@ from vllm.utils import random_uuid
 
 logger = init_logger(__name__)
 
-_LONG_INFO = torch.iinfo(torch.long)
+_INT64_MIN = -(2**63)
+_INT64_MAX = 2**63 - 1
 
 
 class CompletionRequest(OpenAIBaseModel):
@@ -57,7 +57,7 @@ class CompletionRequest(OpenAIBaseModel):
     max_tokens: int | None = 16
     n: int = 1
     presence_penalty: float | None = 0.0
-    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    seed: int | None = Field(None, ge=_INT64_MIN, le=_INT64_MAX)
     stop: str | list[str] | None = []
     stream: bool | None = False
     stream_options: StreamOptions | None = None
@@ -78,9 +78,7 @@ class CompletionRequest(OpenAIBaseModel):
     min_tokens: int = 0
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
-        None
-    )
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_INT64_MAX)] | None = None
    allowed_token_ids: list[int] | None = None
     prompt_logprobs: int | None = None
     # --8<-- [end:completion-sampling-params]
@@ -108,6 +106,8 @@ class CompletionRequest(OpenAIBaseModel):
     )
     priority: int = Field(
         default=0,
+        ge=_INT64_MIN,
+        le=_INT64_MAX,
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
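On the `serving.py` change above: an `assert` is compiled away entirely under `python -O`, and even when active it surfaces as an `AssertionError` rather than the structured error payload returned here. A rough sketch of the control-flow difference, with a simplified stand-in for `create_error_response` (the real helper lives on `OpenAIServing` and builds an ErrorResponse model):

    from http import HTTPStatus

    def create_error_response(message: str, err_type: str = "BadRequestError",
                              status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> dict:
        # Simplified stand-in for the real OpenAIServing helper.
        return {"error": {"message": message, "type": err_type,
                          "code": status_code.value}}

    def finalize(final_res):
        # Before: `assert final_res is not None` -- crashes (or is silently
        # skipped under `python -O`) when the engine produced no output.
        if final_res is None:
            return create_error_response(
                "No output received from the engine.",
                err_type="InternalServerError",
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            )
        return {"choices": final_res}

    print(finalize(None))  # structured 500 payload instead of an AssertionError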
diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py
index e90d6b746..2adcd9eaa 100644
--- a/vllm/entrypoints/openai/responses/protocol.py
+++ b/vllm/entrypoints/openai/responses/protocol.py
@@ -6,7 +6,6 @@ import time
 
 from typing import Any, Literal, TypeAlias
 
-import torch
 from openai.types.responses import (
     ResponseCodeInterpreterCallCodeDeltaEvent,
     ResponseCodeInterpreterCallCodeDoneEvent,
@@ -78,7 +77,8 @@ from vllm.utils import random_uuid
 
 logger = init_logger(__name__)
 
-_LONG_INFO = torch.iinfo(torch.long)
+_INT64_MIN = -(2**63)
+_INT64_MAX = 2**63 - 1
 
 
 class InputTokensDetails(OpenAIBaseModel):
@@ -210,6 +210,8 @@ class ResponsesRequest(OpenAIBaseModel):
     )
     priority: int = Field(
         default=0,
+        ge=_INT64_MIN,
+        le=_INT64_MAX,
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
@@ -246,7 +248,7 @@ class ResponsesRequest(OpenAIBaseModel):
     )
 
     repetition_penalty: float | None = None
-    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    seed: int | None = Field(None, ge=_INT64_MIN, le=_INT64_MAX)
     stop: str | list[str] | None = []
     ignore_eos: bool = False
     vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
diff --git a/vllm/entrypoints/pooling/base/protocol.py b/vllm/entrypoints/pooling/base/protocol.py
index f4bbf8446..50be58374 100644
--- a/vllm/entrypoints/pooling/base/protocol.py
+++ b/vllm/entrypoints/pooling/base/protocol.py
@@ -34,6 +34,8 @@ class PoolingBasicRequestMixin(OpenAIBaseModel):
     )
     priority: int = Field(
         default=0,
+        ge=-(2**63),
+        le=2**63 - 1,
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
diff --git a/vllm/entrypoints/serve/disagg/protocol.py b/vllm/entrypoints/serve/disagg/protocol.py
index c4d510297..028e8dee7 100644
--- a/vllm/entrypoints/serve/disagg/protocol.py
+++ b/vllm/entrypoints/serve/disagg/protocol.py
@@ -93,6 +93,8 @@ class GenerateRequest(BaseModel):
     )
     priority: int = Field(
         default=0,
+        ge=-(2**63),
+        le=2**63 - 1,
         description=(
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index 9550a41bb..d5ecb7599 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -331,8 +331,8 @@ def create_error_response(
         err_type = "InternalServerError"
         status_code = exc.status_code
         param = None
-    elif exc.__class__.__name__ == "TemplateError":
-        # jinja2.TemplateError (avoid importing jinja2)
+    elif any(cls.__name__ == "TemplateError" for cls in type(exc).__mro__):
+        # jinja2.TemplateError and its subclasses (avoid importing jinja2)
         err_type = "BadRequestError"
         status_code = HTTPStatus.BAD_REQUEST
         param = None
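On the `utils.py` change: the old string comparison only matched `jinja2.TemplateError` itself, so subclasses such as `TemplateSyntaxError` (the usual failure mode for a broken chat template) skipped the 400 branch and fell through to a generic 500. Scanning `type(exc).__mro__` matches the base class by name while still avoiding a jinja2 import. A self-contained sketch with stand-in classes that mirror jinja2's hierarchy:

    # Stand-ins mirroring jinja2's exception hierarchy; only the names matter.
    class TemplateError(Exception): ...
    class TemplateSyntaxError(TemplateError): ...

    def is_template_error(exc: BaseException) -> bool:
        # True for TemplateError and every subclass, matched by class name
        # so the caller never has to import jinja2.
        return any(cls.__name__ == "TemplateError" for cls in type(exc).__mro__)

    exc = TemplateSyntaxError("unexpected end of template")
    print(exc.__class__.__name__ == "TemplateError")  # False: old check misses it
    print(is_template_error(exc))                     # True: MRO walk catches it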