[Frontend] Add GPU-less render serving path (vllm launch render) (#36166)

Author: Sage (committed by GitHub)
Date: 2026-03-08 17:35:09 +02:00
parent b7332b058c
commit 4497431df6
10 changed files with 712 additions and 273 deletions
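This commit adds a render-only serving path: the server exposes the preprocessing (render) endpoints without instantiating an engine client or touching a GPU. Going by the commit title, it is started via a new `vllm launch render` subcommand; a hypothetical invocation is sketched below (the exact flag spelling is not shown in this excerpt and mirrors `vllm serve` by assumption):

    # Hypothetical usage sketch: start a GPU-less render server.
    # Flag names are assumed; verify against the actual CLI.
    vllm launch render --model meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 8000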


@@ -22,6 +22,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from starlette.datastructures import State

 import vllm.envs as envs
+from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import load_chat_template
@@ -198,7 +199,7 @@ def build_app(
         register_sagemaker_api_router(app, supported_tasks)

-    if any(task in supported_tasks for task in ("generate", "render")):
+    if "generate" in supported_tasks:
         from vllm.entrypoints.openai.generate.api_router import (
             register_generate_api_routers,
         )
@@ -223,6 +224,13 @@ def build_app(
         elastic_ep_attach_router(app)

+    if "generate" in supported_tasks or "render" in supported_tasks:
+        from vllm.entrypoints.serve.render.api_router import (
+            attach_router as attach_render_router,
+        )
+
+        attach_render_router(app)
+
     if "transcription" in supported_tasks:
         from vllm.entrypoints.openai.speech_to_text.api_router import (
             attach_router as register_speech_to_text_api_router,
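The two hunks above split the routing in build_app: the generate API routers are registered only when "generate" is a supported task, while a dedicated render router is attached whenever either "generate" or "render" is supported. The routes themselves live in vllm/entrypoints/serve/render/api_router.py and are not part of this excerpt; the following is only an illustrative, hypothetical shape of such an attach_router helper, not the module's actual contents:

    # Hypothetical sketch only: the real routes in
    # vllm/entrypoints/serve/render/api_router.py are not shown in this diff.
    from fastapi import APIRouter, FastAPI

    render_router = APIRouter()

    @render_router.post("/render")
    async def render_endpoint(payload: dict):
        # Placeholder handler; the actual request/response models are unknown here.
        return payload

    def attach_router(app: FastAPI) -> None:
        # Mount the render routes onto the shared FastAPI app.
        app.include_router(render_router)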
@@ -363,7 +371,7 @@ async def init_app_state(
         trust_request_chat_template=args.trust_request_chat_template,
     )

-    if any(task in supported_tasks for task in ("generate", "render")):
+    if "generate" in supported_tasks:
         from vllm.entrypoints.openai.generate.api_router import init_generate_state

         await init_generate_state(
@@ -393,6 +401,64 @@ async def init_app_state(
     state.server_load_metrics = 0


+async def init_render_app_state(
+    vllm_config: VllmConfig,
+    state: State,
+    args: Namespace,
+) -> None:
+    """Initialise FastAPI app state for a CPU-only render server.
+
+    Unlike :func:`init_app_state`, this function does not require an
+    :class:`~vllm.engine.protocol.EngineClient`; it bootstraps the
+    preprocessing pipeline (renderer, io_processor, input_processor)
+    directly from the :class:`~vllm.config.VllmConfig`.
+    """
+    from vllm.entrypoints.chat_utils import load_chat_template
+    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
+    from vllm.plugins.io_processors import get_io_processor
+    from vllm.renderers import renderer_from_config
+
+    served_model_names = args.served_model_name or [args.model]
+    if args.enable_log_requests:
+        request_logger = RequestLogger(max_log_len=args.max_log_len)
+    else:
+        request_logger = None
+
+    renderer = renderer_from_config(vllm_config)
+    io_processor = get_io_processor(
+        vllm_config, renderer, vllm_config.model_config.io_processor_plugin
+    )
+    resolved_chat_template = load_chat_template(args.chat_template)
+
+    state.openai_serving_render = OpenAIServingRender(
+        model_config=vllm_config.model_config,
+        renderer=renderer,
+        io_processor=io_processor,
+        served_model_names=served_model_names,
+        request_logger=request_logger,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
+        trust_request_chat_template=args.trust_request_chat_template,
+        enable_auto_tools=args.enable_auto_tool_choice,
+        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
+        tool_parser=args.tool_call_parser,
+        default_chat_template_kwargs=args.default_chat_template_kwargs,
+        log_error_stack=args.log_error_stack,
+    )
+    # Expose the models endpoint via the render handler.
+    state.openai_serving_models = state.openai_serving_render
+
+    state.vllm_config = vllm_config
+    # Disable stats logging: there is no engine to poll.
+    state.log_stats = False
+    state.engine_client = None
+    state.args = args
+    state.enable_server_load_tracking = False
+    state.server_load_metrics = 0


 def create_server_socket(addr: tuple[str, int]) -> socket.socket:
     family = socket.AF_INET
     if is_valid_ipv6_address(addr[0]):
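The docstring of init_render_app_state notes that the preprocessing pipeline is bootstrapped straight from a VllmConfig, with no EngineClient. A minimal sketch of what that enables on a GPU-less machine follows; the use of EngineArgs.create_engine_config() to produce the config is an assumption, since that wiring is not shown in this excerpt:

    # Sketch: build a VllmConfig without an engine and construct the renderer
    # directly, as init_render_app_state does above.
    # Assumption: EngineArgs.create_engine_config() is how the config is built.
    from vllm.engine.arg_utils import EngineArgs
    from vllm.renderers import renderer_from_config

    engine_args = EngineArgs(model="facebook/opt-125m")
    vllm_config = engine_args.create_engine_config()
    renderer = renderer_from_config(vllm_config)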
@@ -494,7 +560,6 @@ async def build_and_serve(
     supported_tasks = await engine_client.get_supported_tasks()
     logger.info("Supported tasks: %s", supported_tasks)

     app = build_app(args, supported_tasks)
     await init_app_state(engine_client, app.state, args, supported_tasks)
@@ -522,6 +587,51 @@ async def build_and_serve(
     )


+async def build_and_serve_renderer(
+    vllm_config: VllmConfig,
+    listen_address: str,
+    sock: socket.socket,
+    args: Namespace,
+    **uvicorn_kwargs,
+) -> asyncio.Task:
+    """Build the FastAPI app for a CPU-only render server, initialize
+    its state, and start serving.
+
+    Returns the shutdown task for the caller to await.
+    """
+    # Get uvicorn log config (from file or with endpoint filter)
+    log_config = get_uvicorn_log_config(args)
+    if log_config is not None:
+        uvicorn_kwargs["log_config"] = log_config
+
+    app = build_app(args, ("render",))
+    await init_render_app_state(vllm_config, app.state, args)
+
+    logger.info("Starting vLLM server on %s", listen_address)
+
+    return await serve_http(
+        app,
+        sock=sock,
+        enable_ssl_refresh=args.enable_ssl_refresh,
+        host=args.host,
+        port=args.port,
+        log_level=args.uvicorn_log_level,
+        # NOTE: When the 'disable_uvicorn_access_log' value is True,
+        # no access log will be output.
+        access_log=not args.disable_uvicorn_access_log,
+        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
+        ssl_keyfile=args.ssl_keyfile,
+        ssl_certfile=args.ssl_certfile,
+        ssl_ca_certs=args.ssl_ca_certs,
+        ssl_cert_reqs=args.ssl_cert_reqs,
+        ssl_ciphers=args.ssl_ciphers,
+        h11_max_incomplete_event_size=args.h11_max_incomplete_event_size,
+        h11_max_header_count=args.h11_max_header_count,
+        **uvicorn_kwargs,
+    )


 async def run_server(args, **uvicorn_kwargs) -> None:
     """Run a single-worker API server."""