[Frontend] Add vllm launch command for GPU-less preprocessing serving (#34551)

Signed-off-by: HyunKyun Moon <mhg5303@gmail.com>
This commit is contained in:
Hyunkyun Moon
2026-03-05 03:41:52 +09:00
committed by GitHub
parent 32224f568a
commit bc6be89d16
10 changed files with 776 additions and 65 deletions

View File

@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import importlib
import inspect
import multiprocessing
@@ -194,7 +195,7 @@ def build_app(
register_sagemaker_api_router(app, supported_tasks)
if "generate" in supported_tasks:
if any(task in supported_tasks for task in ("generate", "render")):
from vllm.entrypoints.openai.generate.api_router import (
register_generate_api_routers,
)
@@ -357,7 +358,7 @@ async def init_app_state(
log_error_stack=args.log_error_stack,
)
if "generate" in supported_tasks:
if any(task in supported_tasks for task in ("generate", "render")):
from vllm.entrypoints.openai.generate.api_router import init_generate_state
await init_generate_state(
@@ -469,6 +470,53 @@ def setup_server(args):
return listen_address, sock
async def build_and_serve(
engine_client: EngineClient,
listen_address: str,
sock: socket.socket,
args: Namespace,
**uvicorn_kwargs,
) -> asyncio.Task:
"""Build FastAPI app, initialize state, and start serving.
Returns the shutdown task for the caller to await.
"""
# Get uvicorn log config (from file or with endpoint filter)
log_config = get_uvicorn_log_config(args)
if log_config is not None:
uvicorn_kwargs["log_config"] = log_config
supported_tasks = await engine_client.get_supported_tasks()
logger.info("Supported tasks: %s", supported_tasks)
app = build_app(args, supported_tasks)
await init_app_state(engine_client, app.state, args, supported_tasks)
logger.info("Starting vLLM server on %s", listen_address)
return await serve_http(
app,
sock=sock,
enable_ssl_refresh=args.enable_ssl_refresh,
host=args.host,
port=args.port,
log_level=args.uvicorn_log_level,
# NOTE: When the 'disable_uvicorn_access_log' value is True,
# no access log will be output.
access_log=not args.disable_uvicorn_access_log,
timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs,
ssl_ciphers=args.ssl_ciphers,
h11_max_incomplete_event_size=args.h11_max_incomplete_event_size,
h11_max_header_count=args.h11_max_header_count,
**uvicorn_kwargs,
)
async def run_server(args, **uvicorn_kwargs) -> None:
"""Run a single-worker API server."""
@@ -490,47 +538,13 @@ async def run_server_worker(
if args.reasoning_parser_plugin and len(args.reasoning_parser_plugin) > 3:
ReasoningParserManager.import_reasoning_parser(args.reasoning_parser_plugin)
# Get uvicorn log config (from file or with endpoint filter)
log_config = get_uvicorn_log_config(args)
if log_config is not None:
uvicorn_kwargs["log_config"] = log_config
async with build_async_engine_client(
args,
client_config=client_config,
) as engine_client:
supported_tasks = await engine_client.get_supported_tasks()
logger.info("Supported tasks: %s", supported_tasks)
app = build_app(args, supported_tasks)
await init_app_state(engine_client, app.state, args, supported_tasks)
logger.info(
"Starting vLLM API server %d on %s",
engine_client.vllm_config.parallel_config._api_process_rank,
listen_address,
shutdown_task = await build_and_serve(
engine_client, listen_address, sock, args, **uvicorn_kwargs
)
shutdown_task = await serve_http(
app,
sock=sock,
enable_ssl_refresh=args.enable_ssl_refresh,
host=args.host,
port=args.port,
log_level=args.uvicorn_log_level,
# NOTE: When the 'disable_uvicorn_access_log' value is True,
# no access log will be output.
access_log=not args.disable_uvicorn_access_log,
timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs,
ssl_ciphers=args.ssl_ciphers,
h11_max_incomplete_event_size=args.h11_max_incomplete_event_size,
h11_max_header_count=args.h11_max_header_count,
**uvicorn_kwargs,
)
# NB: Await server shutdown only after the backend context is exited
try:
await shutdown_task