[Frontend] Add GPU-less render serving path (vllm launch render) (#36166)
@@ -71,5 +71,34 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
     return StreamingResponse(content=generator, media_type="text/event-stream")
+
+
+@router.post(
+    "/v1/chat/completions/render",
+    dependencies=[Depends(validate_json_request)],
+    response_model=list,
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
+    },
+)
+async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
+    """Render chat completion request and return conversation and engine
+    prompts without generating."""
+    handler = chat(raw_request)
+    if handler is None:
+        base_server = raw_request.app.state.openai_serving_tokenization
+        return base_server.create_error_response(
+            message="The model does not support Chat Completions API"
+        )
+
+    result = await handler.render_chat_request(request)
+
+    if isinstance(result, ErrorResponse):
+        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
+
+    return JSONResponse(content=result)
+
+
+def attach_router(app: FastAPI):
+    app.include_router(router)
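For illustration, here is a minimal client sketch against the new route. The host, port, model name, and use of the requests library are assumptions for the example, not part of this diff; the response body is whatever render_chat_request produces, which the route declares only as a JSON list.

    # Minimal client sketch for the render endpoint added in this diff.
    # Host, port, and model name below are hypothetical.
    import requests

    payload = {
        "model": "my-model",  # assumed model name; use the model the server was launched with
        "messages": [
            {"role": "user", "content": "Hello!"},
        ],
    }

    # Unlike POST /v1/chat/completions, this route returns the rendered
    # conversation and engine prompts without running generation.
    resp = requests.post(
        "http://localhost:8000/v1/chat/completions/render",
        json=payload,
        timeout=30,
    )
    resp.raise_for_status()
    print(resp.json())  # declared as response_model=list above

Invalid or unsupported requests come back as the ErrorResponse shapes registered on the route (400, 404, 500, 501).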