[Frontend] Add GPU-less render serving path (vllm launch render) (#36166)
This commit is contained in:
@@ -22,6 +22,7 @@ from fastapi.middleware.cors import CORSMiddleware
|
||||
from starlette.datastructures import State
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import load_chat_template
|
||||
@@ -198,7 +199,7 @@ def build_app(
|
||||
|
||||
register_sagemaker_api_router(app, supported_tasks)
|
||||
|
||||
if any(task in supported_tasks for task in ("generate", "render")):
|
||||
if "generate" in supported_tasks:
|
||||
from vllm.entrypoints.openai.generate.api_router import (
|
||||
register_generate_api_routers,
|
||||
)
|
||||
@@ -223,6 +224,13 @@ def build_app(
|
||||
|
||||
elastic_ep_attach_router(app)
|
||||
|
||||
if "generate" in supported_tasks or "render" in supported_tasks:
|
||||
from vllm.entrypoints.serve.render.api_router import (
|
||||
attach_router as attach_render_router,
|
||||
)
|
||||
|
||||
attach_render_router(app)
|
||||
|
||||
if "transcription" in supported_tasks:
|
||||
from vllm.entrypoints.openai.speech_to_text.api_router import (
|
||||
attach_router as register_speech_to_text_api_router,
|
||||
@@ -363,7 +371,7 @@ async def init_app_state(
|
||||
trust_request_chat_template=args.trust_request_chat_template,
|
||||
)
|
||||
|
||||
if any(task in supported_tasks for task in ("generate", "render")):
|
||||
if "generate" in supported_tasks:
|
||||
from vllm.entrypoints.openai.generate.api_router import init_generate_state
|
||||
|
||||
await init_generate_state(
|
||||
@@ -393,6 +401,64 @@ async def init_app_state(
|
||||
state.server_load_metrics = 0
|
||||
|
||||
|
||||
async def init_render_app_state(
    vllm_config: VllmConfig,
    state: State,
    args: Namespace,
) -> None:
    """Populate FastAPI app state for a GPU-less, render-only server.

    Contrary to :func:`init_app_state`, no
    :class:`~vllm.engine.protocol.EngineClient` is needed here: the
    preprocessing stack (renderer, io_processor, input_processor) is built
    straight from the given :class:`~vllm.config.VllmConfig`.
    """
    from vllm.entrypoints.chat_utils import load_chat_template
    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
    from vllm.plugins.io_processors import get_io_processor
    from vllm.renderers import renderer_from_config

    model_names = args.served_model_name or [args.model]

    # Request logging is opt-in via --enable-log-requests.
    req_logger = (
        RequestLogger(max_log_len=args.max_log_len)
        if args.enable_log_requests
        else None
    )

    chat_template = load_chat_template(args.chat_template)
    render_pipeline = renderer_from_config(vllm_config)
    processor = get_io_processor(
        vllm_config, render_pipeline, vllm_config.model_config.io_processor_plugin
    )

    state.openai_serving_render = OpenAIServingRender(
        model_config=vllm_config.model_config,
        renderer=render_pipeline,
        io_processor=processor,
        served_model_names=model_names,
        request_logger=req_logger,
        chat_template=chat_template,
        chat_template_content_format=args.chat_template_content_format,
        trust_request_chat_template=args.trust_request_chat_template,
        enable_auto_tools=args.enable_auto_tool_choice,
        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
        tool_parser=args.tool_call_parser,
        default_chat_template_kwargs=args.default_chat_template_kwargs,
        log_error_stack=args.log_error_stack,
    )

    # The render handler doubles as the models endpoint backend.
    state.openai_serving_models = state.openai_serving_render

    state.vllm_config = vllm_config
    # No engine exists to poll, so stats logging stays off.
    state.log_stats = False
    state.engine_client = None
    state.args = args
    state.enable_server_load_tracking = False
    state.server_load_metrics = 0
|
||||
|
||||
|
||||
def create_server_socket(addr: tuple[str, int]) -> socket.socket:
|
||||
family = socket.AF_INET
|
||||
if is_valid_ipv6_address(addr[0]):
|
||||
@@ -494,7 +560,6 @@ async def build_and_serve(
|
||||
|
||||
supported_tasks = await engine_client.get_supported_tasks()
|
||||
logger.info("Supported tasks: %s", supported_tasks)
|
||||
|
||||
app = build_app(args, supported_tasks)
|
||||
await init_app_state(engine_client, app.state, args, supported_tasks)
|
||||
|
||||
@@ -522,6 +587,51 @@ async def build_and_serve(
|
||||
)
|
||||
|
||||
|
||||
async def build_and_serve_renderer(
    vllm_config: VllmConfig,
    listen_address: str,
    sock: socket.socket,
    args: Namespace,
    **uvicorn_kwargs,
) -> asyncio.Task:
    """Assemble and launch the CPU-only render server.

    Builds the FastAPI app restricted to the "render" task, initializes its
    state without an engine client, and begins serving over HTTP.

    Returns:
        The shutdown task, which the caller is expected to await.
    """
    # Resolve the uvicorn log config (from a file, or with the endpoint
    # filter applied) and pass it through only when one is configured.
    if (uvicorn_log_cfg := get_uvicorn_log_config(args)) is not None:
        uvicorn_kwargs["log_config"] = uvicorn_log_cfg

    app = build_app(args, ("render",))
    await init_render_app_state(vllm_config, app.state, args)

    logger.info("Starting vLLM server on %s", listen_address)

    return await serve_http(
        app,
        sock=sock,
        enable_ssl_refresh=args.enable_ssl_refresh,
        host=args.host,
        port=args.port,
        log_level=args.uvicorn_log_level,
        # NOTE: setting 'disable_uvicorn_access_log' suppresses the access
        # log entirely.
        access_log=not args.disable_uvicorn_access_log,
        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
        ssl_ca_certs=args.ssl_ca_certs,
        ssl_cert_reqs=args.ssl_cert_reqs,
        ssl_ciphers=args.ssl_ciphers,
        h11_max_incomplete_event_size=args.h11_max_incomplete_event_size,
        h11_max_header_count=args.h11_max_header_count,
        **uvicorn_kwargs,
    )
|
||||
|
||||
|
||||
async def run_server(args, **uvicorn_kwargs) -> None:
|
||||
"""Run a single-worker API server."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user