Adds Anthropic /v1/messages endpoint to OpenAI api_server (#27882)
Signed-off-by: bbartels <benjamin@bartels.dev> Signed-off-by: Benjamin Bartels <benjamin@bartels.dev>
This commit is contained in:
@@ -41,6 +41,13 @@ import vllm.envs as envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.protocol import Device, EngineClient
|
||||
from vllm.entrypoints.anthropic.protocol import (
|
||||
AnthropicError,
|
||||
AnthropicErrorResponse,
|
||||
AnthropicMessagesRequest,
|
||||
AnthropicMessagesResponse,
|
||||
)
|
||||
from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages
|
||||
from vllm.entrypoints.launcher import serve_http
|
||||
from vllm.entrypoints.logger import RequestLogger
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
|
||||
@@ -308,6 +315,10 @@ def responses(request: Request) -> OpenAIServingResponses | None:
|
||||
return request.app.state.openai_serving_responses
|
||||
|
||||
|
||||
def messages(request: Request) -> AnthropicServingMessages | None:
    """Return the Anthropic Messages handler stored on the app state.

    The value is read from ``request.app.state.anthropic_serving_messages``.

    Returns:
        The handler, or ``None`` when none was configured. ``init_app_state``
        stores ``None`` unless "generate" is among the supported tasks, and
        ``create_messages`` explicitly checks for ``None``, so the annotation
        is optional like the sibling accessors (``responses``, ``chat``).
    """
    return request.app.state.anthropic_serving_messages
|
||||
|
||||
|
||||
def chat(request: Request) -> OpenAIServingChat | None:
    """Fetch the ``openai_serving_chat`` object kept on the application state.

    The annotation allows ``None``; callers are expected to handle that case.
    """
    app_state = request.app.state
    return app_state.openai_serving_chat
|
||||
|
||||
@@ -591,6 +602,63 @@ async def cancel_responses(response_id: str, raw_request: Request):
|
||||
return JSONResponse(content=response.model_dump())
|
||||
|
||||
|
||||
@router.post(
    "/v1/messages",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
        HTTPStatus.BAD_REQUEST.value: {"model": AnthropicErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": AnthropicErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": AnthropicErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_messages(request: AnthropicMessagesRequest, raw_request: Request):
    # Handles POST /v1/messages by delegating to the Anthropic serving handler.
    #
    # Outcomes, in order of evaluation:
    #   * no handler configured      -> error built by the base handler,
    #                                   re-wrapped as an Anthropic error
    #   * handler raised             -> 500 with type "internal_error"
    #   * handler returned an error  -> re-wrapped as an Anthropic error
    #   * handler returned a message -> plain JSON body (None fields dropped)
    #   * anything else              -> streamed back as "text/event-stream"
    #
    # NOTE: documented with comments rather than a docstring on purpose;
    # FastAPI publishes an endpoint docstring as the route description.

    def _anthropic_json(status_code, error_type, message) -> JSONResponse:
        # Single place that shapes an Anthropic-style error payload.
        body = AnthropicErrorResponse(
            error=AnthropicError(type=error_type, message=message)
        )
        return JSONResponse(status_code=status_code, content=body.model_dump())

    def _from_openai_error(err: ErrorResponse) -> JSONResponse:
        # Carry over the code/type/message of an existing ErrorResponse.
        detail = err.error
        return _anthropic_json(detail.code, detail.type, detail.message)

    serving = messages(raw_request)
    if serving is None:
        return _from_openai_error(
            base(raw_request).create_error_response(
                message="The model does not support Messages API"
            )
        )

    try:
        result = await serving.create_messages(request, raw_request)
    except Exception as exc:
        logger.exception("Error in create_messages: %s", exc)
        return _anthropic_json(
            HTTPStatus.INTERNAL_SERVER_ERROR.value, "internal_error", str(exc)
        )

    if isinstance(result, ErrorResponse):
        return _from_openai_error(result)

    if isinstance(result, AnthropicMessagesResponse):
        payload = result.model_dump(exclude_none=True)
        logger.debug("Anthropic Messages Response: %s", payload)
        return JSONResponse(content=payload)

    # Neither an error nor a complete message: hand the value to the client
    # as a server-sent event stream.
    return StreamingResponse(content=result, media_type="text/event-stream")
|
||||
|
||||
|
||||
@router.post(
|
||||
"/v1/chat/completions",
|
||||
dependencies=[Depends(validate_json_request)],
|
||||
@@ -1817,6 +1885,24 @@ async def init_app_state(
|
||||
if "transcription" in supported_tasks
|
||||
else None
|
||||
)
|
||||
state.anthropic_serving_messages = (
|
||||
AnthropicServingMessages(
|
||||
engine_client,
|
||||
state.openai_serving_models,
|
||||
args.response_role,
|
||||
request_logger=request_logger,
|
||||
chat_template=resolved_chat_template,
|
||||
chat_template_content_format=args.chat_template_content_format,
|
||||
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
|
||||
enable_auto_tools=args.enable_auto_tool_choice,
|
||||
tool_parser=args.tool_call_parser,
|
||||
reasoning_parser=args.structured_outputs_config.reasoning_parser,
|
||||
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
|
||||
enable_force_include_usage=args.enable_force_include_usage,
|
||||
)
|
||||
if "generate" in supported_tasks
|
||||
else None
|
||||
)
|
||||
|
||||
state.enable_server_load_tracking = args.enable_server_load_tracking
|
||||
state.server_load_metrics = 0
|
||||
|
||||
Reference in New Issue
Block a user