[openapi server] log exception in exception handler(2/N) (#36201)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
This commit is contained in:
@@ -196,7 +196,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
|
||||
invalid_files.mkdir()
|
||||
(invalid_files / "adapter_config.json").write_text("this is not json")
|
||||
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
with pytest.raises(openai.InternalServerError):
|
||||
await client.post(
|
||||
"load_lora_adapter",
|
||||
cast_to=str,
|
||||
@@ -232,7 +232,7 @@ async def test_dynamic_lora_badrequests(
|
||||
json.dump(adapter_config, f)
|
||||
|
||||
# Test loading the adapter
|
||||
with pytest.raises(openai.BadRequestError, match=expected_error):
|
||||
with pytest.raises(openai.InternalServerError, match=expected_error):
|
||||
await client.post(
|
||||
"load_lora_adapter",
|
||||
cast_to=str,
|
||||
@@ -312,7 +312,7 @@ async def test_loading_invalid_adapters_does_not_break_others(
|
||||
body={"lora_name": "notfound", "lora_path": "/not/an/adapter"},
|
||||
)
|
||||
for _ in range(25):
|
||||
with suppress(openai.BadRequestError):
|
||||
with suppress(openai.InternalServerError):
|
||||
await client.post(
|
||||
"load_lora_adapter",
|
||||
cast_to=str,
|
||||
|
||||
@@ -88,7 +88,7 @@ async def test_sagemaker_load_adapter_invalid_files(
|
||||
basic_server_with_lora.url_for("adapters"),
|
||||
json={"name": "invalid-adapter", "src": str(invalid_files)},
|
||||
)
|
||||
assert load_response.status_code == 400
|
||||
assert load_response.status_code == 500
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@@ -62,7 +62,7 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
error = base_server.create_error_response(
|
||||
message="The model does not support Messages API"
|
||||
NotImplementedError("The model does not support Messages API")
|
||||
)
|
||||
return translate_error_response(error)
|
||||
|
||||
@@ -108,7 +108,7 @@ async def count_tokens(request: AnthropicCountTokensRequest, raw_request: Reques
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
error = base_server.create_error_response(
|
||||
message="The model does not support Messages API"
|
||||
NotImplementedError("The model does not support Messages API")
|
||||
)
|
||||
return translate_error_response(error)
|
||||
|
||||
|
||||
@@ -50,10 +50,7 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
|
||||
)
|
||||
handler = chat(raw_request)
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
return base_server.create_error_response(
|
||||
message="The model does not support Chat Completions API"
|
||||
)
|
||||
raise NotImplementedError("The model does not support Chat Completions API")
|
||||
|
||||
generator = await handler.create_chat_completion(request, raw_request)
|
||||
|
||||
|
||||
@@ -49,10 +49,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
|
||||
)
|
||||
handler = completion(raw_request)
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
return base_server.create_error_response(
|
||||
message="The model does not support Completions API"
|
||||
)
|
||||
raise NotImplementedError("The model does not support Completions API")
|
||||
|
||||
generator = await handler.create_completion(request, raw_request)
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@ from http import HTTPStatus
|
||||
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.openai.engine.protocol import (
|
||||
ErrorInfo,
|
||||
ErrorResponse,
|
||||
ModelCard,
|
||||
ModelList,
|
||||
@@ -18,7 +17,8 @@ from vllm.entrypoints.serve.lora.protocol import (
|
||||
LoadLoRAAdapterRequest,
|
||||
UnloadLoRAAdapterRequest,
|
||||
)
|
||||
from vllm.entrypoints.utils import sanitize_message
|
||||
from vllm.entrypoints.utils import create_error_response
|
||||
from vllm.exceptions import LoRAAdapterNotFoundError
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
|
||||
@@ -152,15 +152,15 @@ class OpenAIServingModels:
|
||||
try:
|
||||
await self.engine_client.add_lora(lora_request)
|
||||
except Exception as e:
|
||||
error_type = "BadRequestError"
|
||||
status_code = HTTPStatus.BAD_REQUEST
|
||||
if "No adapter found" in str(e):
|
||||
error_type = "NotFoundError"
|
||||
status_code = HTTPStatus.NOT_FOUND
|
||||
|
||||
return create_error_response(
|
||||
message=str(e), err_type=error_type, status_code=status_code
|
||||
)
|
||||
if str(
|
||||
LoRAAdapterNotFoundError(
|
||||
lora_request.lora_name, lora_request.lora_path
|
||||
)
|
||||
) in str(e):
|
||||
raise LoRAAdapterNotFoundError(
|
||||
lora_request.lora_name, lora_request.lora_path
|
||||
) from e
|
||||
raise
|
||||
|
||||
self.lora_requests[lora_name] = lora_request
|
||||
logger.info(
|
||||
@@ -292,17 +292,3 @@ class OpenAIServingModels:
|
||||
err_type="NotFoundError",
|
||||
status_code=HTTPStatus.NOT_FOUND,
|
||||
)
|
||||
|
||||
|
||||
def create_error_response(
    message: str,
    err_type: str = "BadRequestError",
    status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
) -> ErrorResponse:
    """Build an ``ErrorResponse`` describing a failed request.

    Args:
        message: Error text; it is passed through ``sanitize_message``
            before being stored.
        err_type: Value stored in the error's ``type`` field.
        status_code: HTTP status whose ``value`` is stored in the error's
            ``code`` field.

    Returns:
        An ``ErrorResponse`` wrapping a single ``ErrorInfo``.
    """
    info = ErrorInfo(
        message=sanitize_message(message),
        type=err_type,
        code=status_code.value,
    )
    return ErrorResponse(error=info)
|
||||
|
||||
@@ -59,10 +59,7 @@ async def _convert_stream_to_sse_events(
|
||||
async def create_responses(request: ResponsesRequest, raw_request: Request):
|
||||
handler = responses(raw_request)
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
return base_server.create_error_response(
|
||||
message="The model does not support Responses API"
|
||||
)
|
||||
raise NotImplementedError("The model does not support Responses API")
|
||||
|
||||
generator = await handler.create_responses(request, raw_request)
|
||||
|
||||
@@ -88,10 +85,7 @@ async def retrieve_responses(
|
||||
):
|
||||
handler = responses(raw_request)
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
return base_server.create_error_response(
|
||||
message="The model does not support Responses API"
|
||||
)
|
||||
raise NotImplementedError("The model does not support Responses API")
|
||||
|
||||
response = await handler.retrieve_responses(
|
||||
response_id,
|
||||
@@ -115,10 +109,7 @@ async def retrieve_responses(
|
||||
async def cancel_responses(response_id: str, raw_request: Request):
|
||||
handler = responses(raw_request)
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
return base_server.create_error_response(
|
||||
message="The model does not support Responses API"
|
||||
)
|
||||
raise NotImplementedError("The model does not support Responses API")
|
||||
|
||||
response = await handler.cancel_responses(response_id)
|
||||
|
||||
|
||||
@@ -65,10 +65,7 @@ async def create_transcriptions(
|
||||
):
|
||||
handler = transcription(raw_request)
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
return base_server.create_error_response(
|
||||
message="The model does not support Transcriptions API"
|
||||
)
|
||||
raise NotImplementedError("The model does not support Transcriptions API")
|
||||
|
||||
audio_data = await request.file.read()
|
||||
|
||||
@@ -101,10 +98,7 @@ async def create_translations(
|
||||
):
|
||||
handler = translation(raw_request)
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
return base_server.create_error_response(
|
||||
message="The model does not support Translations API"
|
||||
)
|
||||
raise NotImplementedError("The model does not support Translations API")
|
||||
|
||||
audio_data = await request.file.read()
|
||||
|
||||
|
||||
@@ -2,13 +2,12 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
from fastapi.responses import Response
|
||||
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
|
||||
from vllm.entrypoints.pooling.classify.serving import ServingClassification
|
||||
from vllm.entrypoints.utils import (
|
||||
create_error_response,
|
||||
load_aware_call,
|
||||
with_cancellation,
|
||||
)
|
||||
@@ -28,12 +27,6 @@ async def create_classify(
|
||||
) -> Response:
|
||||
handler = classify(raw_request)
|
||||
if handler is None:
|
||||
error_response = create_error_response(
|
||||
message="The model does not support Classification API"
|
||||
)
|
||||
return JSONResponse(
|
||||
content=error_response.model_dump(),
|
||||
status_code=error_response.error.code,
|
||||
)
|
||||
raise NotImplementedError("The model does not support Classification API")
|
||||
|
||||
return await handler(request, raw_request)
|
||||
|
||||
@@ -4,14 +4,12 @@
|
||||
from http import HTTPStatus
|
||||
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest
|
||||
from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
|
||||
from vllm.entrypoints.utils import (
|
||||
create_error_response,
|
||||
load_aware_call,
|
||||
with_cancellation,
|
||||
)
|
||||
@@ -39,11 +37,6 @@ async def create_embedding(
|
||||
):
|
||||
handler = embedding(raw_request)
|
||||
if handler is None:
|
||||
error_response = create_error_response(
|
||||
message="The model does not support Embeddings API"
|
||||
)
|
||||
return JSONResponse(
|
||||
content=error_response.model_dump(),
|
||||
status_code=error_response.error.code,
|
||||
)
|
||||
raise NotImplementedError("The model does not support Embeddings API")
|
||||
|
||||
return await handler(request, raw_request)
|
||||
|
||||
@@ -37,10 +37,7 @@ def pooling(request: Request) -> OpenAIServingPooling | None:
|
||||
async def create_pooling(request: PoolingRequest, raw_request: Request):
|
||||
handler = pooling(raw_request)
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
return base_server.create_error_response(
|
||||
message="The model does not support Pooling API"
|
||||
)
|
||||
raise NotImplementedError("The model does not support Pooling API")
|
||||
|
||||
generator = await handler.create_pooling(request, raw_request)
|
||||
|
||||
|
||||
@@ -44,10 +44,7 @@ def rerank(request: Request) -> ServingScores | None:
|
||||
async def create_score(request: ScoreRequest, raw_request: Request):
|
||||
handler = score(raw_request)
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
return base_server.create_error_response(
|
||||
message="The model does not support Score API"
|
||||
)
|
||||
raise NotImplementedError("The model does not support Score API")
|
||||
|
||||
generator = await handler.create_score(request, raw_request)
|
||||
|
||||
@@ -93,10 +90,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
|
||||
async def do_rerank(request: RerankRequest, raw_request: Request):
|
||||
handler = rerank(raw_request)
|
||||
if handler is None:
|
||||
base_server = raw_request.app.state.openai_serving_tokenization
|
||||
return base_server.create_error_response(
|
||||
message="The model does not support Rerank (Score) API"
|
||||
)
|
||||
raise NotImplementedError("The model does not support Rerank (Score) API")
|
||||
|
||||
generator = await handler.do_rerank(request, raw_request)
|
||||
|
||||
|
||||
@@ -61,9 +61,7 @@ router = APIRouter()
|
||||
async def generate(request: GenerateRequest, raw_request: Request):
|
||||
handler = generate_tokens(raw_request)
|
||||
if handler is None:
|
||||
return tokenization(raw_request).create_error_response(
|
||||
message="The model does not support generate tokens API"
|
||||
)
|
||||
raise NotImplementedError("The model does not support generate tokens API")
|
||||
|
||||
generator = await handler.serve_tokens(request, raw_request)
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@ from vllm.entrypoints.openai.completion.protocol import CompletionRequest
|
||||
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
|
||||
from vllm.entrypoints.openai.utils import validate_json_request
|
||||
from vllm.entrypoints.serve.render.serving import OpenAIServingRender
|
||||
from vllm.entrypoints.utils import create_error_response
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@@ -36,13 +35,8 @@ def render(request: Request) -> OpenAIServingRender | None:
|
||||
async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
|
||||
handler = render(raw_request)
|
||||
if handler is None:
|
||||
error = create_error_response(
|
||||
message="The model does not support Chat Completions Render API",
|
||||
err_type="NotFoundError",
|
||||
status_code=HTTPStatus.NOT_FOUND,
|
||||
)
|
||||
return JSONResponse(
|
||||
status_code=HTTPStatus.NOT_FOUND, content=error.model_dump()
|
||||
raise NotImplementedError(
|
||||
"The model does not support Chat Completions Render API"
|
||||
)
|
||||
|
||||
result = await handler.render_chat_request(request)
|
||||
@@ -66,14 +60,7 @@ async def render_chat_completion(request: ChatCompletionRequest, raw_request: Re
|
||||
async def render_completion(request: CompletionRequest, raw_request: Request):
|
||||
handler = render(raw_request)
|
||||
if handler is None:
|
||||
error = create_error_response(
|
||||
message="The model does not support Completions Render API",
|
||||
err_type="NotFoundError",
|
||||
status_code=HTTPStatus.NOT_FOUND,
|
||||
)
|
||||
return JSONResponse(
|
||||
status_code=HTTPStatus.NOT_FOUND, content=error.model_dump()
|
||||
)
|
||||
raise NotImplementedError("The model does not support Completions Render API")
|
||||
|
||||
result = await handler.render_completion_request(request)
|
||||
|
||||
|
||||
@@ -36,7 +36,31 @@ class VLLMValidationError(ValueError):
|
||||
return f"{base} ({', '.join(extras)})" if extras else base
|
||||
|
||||
|
||||
class VLLMNotFoundError(Exception):
    """vLLM-specific NotFoundError"""

    # NOTE(review): derives from Exception rather than ValueError; presumably
    # so that handlers for ValueError do not also catch "not found" errors --
    # confirm against the server's exception handlers.


class LoRAAdapterNotFoundError(VLLMNotFoundError):
    """Exception raised when a LoRA adapter is not found.

    This exception is thrown when a requested LoRA adapter does not exist
    in the system.

    Attributes:
        lora_name: Name of the adapter that was requested.
        lora_path: Path at which the adapter was looked up.
        message: The error message string describing the exception.
    """

    lora_name: str
    lora_path: str
    message: str

    def __init__(
        self,
        lora_name: str,
        lora_path: str,
    ) -> None:
        # Forward the constructor arguments unchanged so that ``args`` mirrors
        # the signature. Exception pickling re-creates the object with
        # ``cls(*args)``, so passing only the formatted message here would
        # break a pickle round trip.
        super().__init__(lora_name, lora_path)
        self.lora_name = lora_name
        self.lora_path = lora_path
        self.message = (
            f"Loading lora {lora_name} failed: No adapter "
            f"found for {lora_path}"
        )

    def __str__(self) -> str:
        # ``args`` holds the raw (name, path) pair, so the default rendering
        # would be a tuple; return the human-readable message instead.
        return self.message
|
||||
|
||||
@@ -7,6 +7,7 @@ from typing import Any, Literal
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.exceptions import LoRAAdapterNotFoundError
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.lora_model import LoRAModel
|
||||
from vllm.lora.model_manager import (
|
||||
@@ -147,12 +148,10 @@ class WorkerLoRAManager:
|
||||
# offline mode)
|
||||
# - No local adapter files found at `lora_request.lora_path`
|
||||
# For NotFoundError
|
||||
raise ValueError(
|
||||
f"Loading lora {lora_request.lora_name} failed: No adapter "
|
||||
f"found for {lora_request.lora_path}"
|
||||
raise LoRAAdapterNotFoundError(
|
||||
lora_request.lora_name, lora_request.lora_path
|
||||
) from e
|
||||
except Exception as e:
|
||||
# For BadRequestError
|
||||
raise e
|
||||
|
||||
return lora
|
||||
|
||||
Reference in New Issue
Block a user