[Frontend] Add GPU-less render serving path (vllm launch render) (#36166)

Author: Sage
Date: 2026-03-08 17:35:09 +02:00
Committed by: GitHub
Parent: b7332b058c
Commit: 4497431df6
10 changed files with 712 additions and 273 deletions


@@ -71,5 +71,34 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
    return StreamingResponse(content=generator, media_type="text/event-stream")
@router.post(
    "/v1/chat/completions/render",
    dependencies=[Depends(validate_json_request)],
    response_model=list,
    responses={
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
    },
)
async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
    """Render a chat completion request and return the conversation and
    engine prompts without generating."""
    handler = chat(raw_request)
    if handler is None:
        # No chat handler is registered for this model; fall back to the
        # tokenization server to build the error response.
        base_server = raw_request.app.state.openai_serving_tokenization
        return base_server.create_error_response(
            message="The model does not support Chat Completions API"
        )

    result = await handler.render_chat_request(request)
    if isinstance(result, ErrorResponse):
        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
    return JSONResponse(content=result)
def attach_router(app: FastAPI):
    app.include_router(router)
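
For context, attach_router is the hook a serving entrypoint uses to mount this router on an app. A minimal wiring sketch, assuming the module above is importable standalone; in the real server the entrypoint also populates app.state (for example the openai_serving_tokenization object used in the error path above), so this is illustrative only:

from fastapi import FastAPI

# Hypothetical standalone wiring; vLLM's own entrypoint sets up engine
# and serving state on app.state before serving, which this sketch omits.
app = FastAPI()
attach_router(app)  # registers POST /v1/chat/completions/render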
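
A client can then exercise the render path with a standard Chat Completions payload. A minimal sketch using only the standard library, assuming a server with this router listening on localhost:8000; the model name is a placeholder:

import json
import urllib.request

# The payload shape is the usual Chat Completions request body, but this
# endpoint renders the prompts only; no tokens are generated.
payload = {
    "model": "my-model",  # placeholder model name
    "messages": [{"role": "user", "content": "Hello!"}],
}

req = urllib.request.Request(
    "http://localhost:8000/v1/chat/completions/render",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req) as resp:
    # Per response_model=list above, the body is a JSON array carrying the
    # rendered conversation and engine prompts rather than a completion.
    print(json.dumps(json.load(resp), indent=2))

Because the handler only renders the request, this path needs no GPU, which is what the commit title's "GPU-less render serving path" refers to.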