diff --git a/vllm/entrypoints/cli/launch.py b/vllm/entrypoints/cli/launch.py
index f04a77d48..6afa24353 100644
--- a/vllm/entrypoints/cli/launch.py
+++ b/vllm/entrypoints/cli/launch.py
@@ -8,7 +8,7 @@ import uvloop
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.cli.types import CLISubcommand
 from vllm.entrypoints.openai.api_server import (
-    build_and_serve,
+    build_and_serve_renderer,
     setup_server,
 )
 from vllm.entrypoints.openai.cli_args import (
@@ -109,19 +109,17 @@ def cmd_init() -> list[CLISubcommand]:
 async def run_launch_fastapi(args: argparse.Namespace) -> None:
     """Run the online serving layer with FastAPI (no GPU inference)."""
     from vllm.config import VllmConfig
-    from vllm.v1.engine.launch import LaunchEngineClient
 
     # 1. Socket binding
     listen_address, sock = setup_server(args)
 
-    # 2. Create LaunchEngineClient (no GPU)
+    # 2. Build and serve the API server
     engine_args = AsyncEngineArgs.from_cli_args(args)
     model_config = engine_args.create_model_config()
     vllm_config = VllmConfig(model_config=model_config)
-    engine_client = LaunchEngineClient.from_vllm_config(vllm_config)
-
-    # 3. Build app, initialize state, and start serving
-    shutdown_task = await build_and_serve(engine_client, listen_address, sock, args)
+    shutdown_task = await build_and_serve_renderer(
+        vllm_config, listen_address, sock, args
+    )
     try:
         await shutdown_task
     finally:
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index ee0b7115d..7961daf16 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -22,6 +22,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from starlette.datastructures import State
 
 import vllm.envs as envs
+from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import load_chat_template
@@ -198,7 +199,7 @@ def build_app(
     register_sagemaker_api_router(app, supported_tasks)
 
-    if any(task in supported_tasks for task in ("generate", "render")):
+    if "generate" in supported_tasks:
         from vllm.entrypoints.openai.generate.api_router import (
             register_generate_api_routers,
         )
@@ -223,6 +224,13 @@ def build_app(
     elastic_ep_attach_router(app)
 
+    if "generate" in supported_tasks or "render" in supported_tasks:
+        from vllm.entrypoints.serve.render.api_router import (
+            attach_router as attach_render_router,
+        )
+
+        attach_render_router(app)
+
     if "transcription" in supported_tasks:
         from vllm.entrypoints.openai.speech_to_text.api_router import (
             attach_router as register_speech_to_text_api_router,
@@ -363,7 +371,7 @@ async def init_app_state(
         trust_request_chat_template=args.trust_request_chat_template,
     )
 
-    if any(task in supported_tasks for task in ("generate", "render")):
+    if "generate" in supported_tasks:
         from vllm.entrypoints.openai.generate.api_router import init_generate_state
 
         await init_generate_state(
@@ -393,6 +401,64 @@ async def init_app_state(
     state.server_load_metrics = 0
 
+
+async def init_render_app_state(
+    vllm_config: VllmConfig,
+    state: State,
+    args: Namespace,
+) -> None:
+    """Initialize FastAPI app state for a CPU-only render server.
+
+    Unlike :func:`init_app_state`, this function does not require an
+    :class:`~vllm.engine.protocol.EngineClient`; it bootstraps the
+    preprocessing pipeline (renderer, io_processor, input_processor)
+    directly from the :class:`~vllm.config.VllmConfig`.
+ """ + from vllm.entrypoints.chat_utils import load_chat_template + from vllm.entrypoints.serve.render.serving import OpenAIServingRender + from vllm.plugins.io_processors import get_io_processor + from vllm.renderers import renderer_from_config + + served_model_names = args.served_model_name or [args.model] + + if args.enable_log_requests: + request_logger = RequestLogger(max_log_len=args.max_log_len) + else: + request_logger = None + + renderer = renderer_from_config(vllm_config) + io_processor = get_io_processor( + vllm_config, renderer, vllm_config.model_config.io_processor_plugin + ) + resolved_chat_template = load_chat_template(args.chat_template) + + state.openai_serving_render = OpenAIServingRender( + model_config=vllm_config.model_config, + renderer=renderer, + io_processor=io_processor, + served_model_names=served_model_names, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + enable_auto_tools=args.enable_auto_tool_choice, + exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, + tool_parser=args.tool_call_parser, + default_chat_template_kwargs=args.default_chat_template_kwargs, + log_error_stack=args.log_error_stack, + ) + + # Expose models endpoint via the render handler. + state.openai_serving_models = state.openai_serving_render + + state.vllm_config = vllm_config + # Disable stats logging — there is no engine to poll. + state.log_stats = False + state.engine_client = None + state.args = args + state.enable_server_load_tracking = False + state.server_load_metrics = 0 + + def create_server_socket(addr: tuple[str, int]) -> socket.socket: family = socket.AF_INET if is_valid_ipv6_address(addr[0]): @@ -494,7 +560,6 @@ async def build_and_serve( supported_tasks = await engine_client.get_supported_tasks() logger.info("Supported tasks: %s", supported_tasks) - app = build_app(args, supported_tasks) await init_app_state(engine_client, app.state, args, supported_tasks) @@ -522,6 +587,51 @@ async def build_and_serve( ) +async def build_and_serve_renderer( + vllm_config: VllmConfig, + listen_address: str, + sock: socket.socket, + args: Namespace, + **uvicorn_kwargs, +) -> asyncio.Task: + """Build FastAPI app for a CPU-only render server, initialize state, and + start serving. + + Returns the shutdown task for the caller to await. + """ + + # Get uvicorn log config (from file or with endpoint filter) + log_config = get_uvicorn_log_config(args) + if log_config is not None: + uvicorn_kwargs["log_config"] = log_config + + app = build_app(args, ("render",)) + await init_render_app_state(vllm_config, app.state, args) + + logger.info("Starting vLLM server on %s", listen_address) + + return await serve_http( + app, + sock=sock, + enable_ssl_refresh=args.enable_ssl_refresh, + host=args.host, + port=args.port, + log_level=args.uvicorn_log_level, + # NOTE: When the 'disable_uvicorn_access_log' value is True, + # no access log will be output. 
+        access_log=not args.disable_uvicorn_access_log,
+        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
+        ssl_keyfile=args.ssl_keyfile,
+        ssl_certfile=args.ssl_certfile,
+        ssl_ca_certs=args.ssl_ca_certs,
+        ssl_cert_reqs=args.ssl_cert_reqs,
+        ssl_ciphers=args.ssl_ciphers,
+        h11_max_incomplete_event_size=args.h11_max_incomplete_event_size,
+        h11_max_header_count=args.h11_max_header_count,
+        **uvicorn_kwargs,
+    )
+
+
 async def run_server(args, **uvicorn_kwargs) -> None:
     """Run a single-worker API server."""
diff --git a/vllm/entrypoints/openai/chat_completion/api_router.py b/vllm/entrypoints/openai/chat_completion/api_router.py
index 8f2c5c14f..f5569f5ab 100644
--- a/vllm/entrypoints/openai/chat_completion/api_router.py
+++ b/vllm/entrypoints/openai/chat_completion/api_router.py
@@ -71,34 +71,5 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
-@router.post(
-    "/v1/chat/completions/render",
-    dependencies=[Depends(validate_json_request)],
-    response_model=list,
-    responses={
-        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
-        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
-        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
-    },
-)
-async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
-    """Render chat completion request and return conversation and engine
-    prompts without generating."""
-    handler = chat(raw_request)
-    if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Chat Completions API"
-        )
-
-    result = await handler.render_chat_request(request)
-
-    if isinstance(result, ErrorResponse):
-        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
-
-    return JSONResponse(content=result)
-
-
 def attach_router(app: FastAPI):
     app.include_router(router)
diff --git a/vllm/entrypoints/openai/completion/api_router.py b/vllm/entrypoints/openai/completion/api_router.py
index 466c059aa..56e961bef 100644
--- a/vllm/entrypoints/openai/completion/api_router.py
+++ b/vllm/entrypoints/openai/completion/api_router.py
@@ -69,32 +69,5 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     return StreamingResponse(content=generator, media_type="text/event-stream")
 
 
-@router.post(
-    "/v1/completions/render",
-    dependencies=[Depends(validate_json_request)],
-    response_model=list,
-    responses={
-        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
-        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
-    },
-)
-async def render_completion(request: CompletionRequest, raw_request: Request):
-    """render completion request and return engine prompts without generating."""
-    handler = completion(raw_request)
-    if handler is None:
-        base_server = raw_request.app.state.openai_serving_tokenization
-        return base_server.create_error_response(
-            message="The model does not support Completions API"
-        )
-
-    result = await handler.render_completion_request(request)
-
-    if isinstance(result, ErrorResponse):
-        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
-
-    return JSONResponse(content=result)
-
-
 def attach_router(app: FastAPI):
     app.include_router(router)
diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py
index 5e4f184a0..f07f42f0c 100644
--- a/vllm/entrypoints/openai/generate/api_router.py
+++ b/vllm/entrypoints/openai/generate/api_router.py
@@ -111,7 +111,7 @@ async def init_generate_state(
             enable_log_outputs=args.enable_log_outputs,
             enable_log_deltas=args.enable_log_deltas,
         )
-        if any(task in supported_tasks for task in ("generate", "render"))
+        if "generate" in supported_tasks
         else None
     )
     # Warm up chat template processing to avoid first-request latency
@@ -126,7 +126,7 @@ async def init_generate_state(
             enable_prompt_tokens_details=args.enable_prompt_tokens_details,
             enable_force_include_usage=args.enable_force_include_usage,
         )
-        if any(task in supported_tasks for task in ("generate", "render"))
+        if "generate" in supported_tasks
         else None
     )
     state.anthropic_serving_messages = (
@@ -160,3 +160,26 @@ async def init_generate_state(
         if "generate" in supported_tasks
         else None
     )
+
+    # Render endpoints are always backed by OpenAIServingRender so that
+    # /v1/chat/completions/render and /v1/completions/render work on both
+    # generate-mode and render-only servers.
+    from vllm.entrypoints.serve.render.serving import OpenAIServingRender
+
+    state.openai_serving_render = OpenAIServingRender(
+        model_config=engine_client.model_config,
+        renderer=engine_client.renderer,
+        io_processor=engine_client.io_processor,
+        served_model_names=[
+            mp.name for mp in state.openai_serving_models.base_model_paths
+        ],
+        request_logger=request_logger,
+        chat_template=resolved_chat_template,
+        chat_template_content_format=args.chat_template_content_format,
+        trust_request_chat_template=args.trust_request_chat_template,
+        enable_auto_tools=args.enable_auto_tool_choice,
+        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
+        tool_parser=args.tool_call_parser,
+        default_chat_template_kwargs=args.default_chat_template_kwargs,
+        log_error_stack=args.log_error_stack,
+    )
diff --git a/vllm/entrypoints/serve/instrumentator/health.py b/vllm/entrypoints/serve/instrumentator/health.py
index 8b079ce31..5c0b2d185 100644
--- a/vllm/entrypoints/serve/instrumentator/health.py
+++ b/vllm/entrypoints/serve/instrumentator/health.py
@@ -22,8 +22,12 @@ def engine_client(request: Request) -> EngineClient:
 @router.get("/health", response_class=Response)
 async def health(raw_request: Request) -> Response:
     """Health check."""
+    client = engine_client(raw_request)
+    if client is None:
+        # Render-only servers have no engine; they are always healthy.
+        return Response(status_code=200)
     try:
-        await engine_client(raw_request).check_health()
+        await client.check_health()
         return Response(status_code=200)
     except EngineDeadError:
         return Response(status_code=503)
diff --git a/vllm/entrypoints/serve/render/__init__.py b/vllm/entrypoints/serve/render/__init__.py
new file mode 100644
index 000000000..208f01a7c
--- /dev/null
+++ b/vllm/entrypoints/serve/render/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/vllm/entrypoints/serve/render/api_router.py b/vllm/entrypoints/serve/render/api_router.py
new file mode 100644
index 000000000..a9f62e450
--- /dev/null
+++ b/vllm/entrypoints/serve/render/api_router.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from http import HTTPStatus
+
+from fastapi import APIRouter, Depends, FastAPI, Request
+from fastapi.responses import JSONResponse
+
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.completion.protocol import CompletionRequest
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
+from vllm.entrypoints.openai.utils import validate_json_request
+from vllm.entrypoints.serve.render.serving import OpenAIServingRender
+from vllm.entrypoints.utils import create_error_response
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+
+
+def render(request: Request) -> OpenAIServingRender | None:
+    return getattr(request.app.state, "openai_serving_render", None)
+
+
+@router.post(
+    "/v1/chat/completions/render",
+    dependencies=[Depends(validate_json_request)],
+    response_model=list,
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+async def render_chat_completion(request: ChatCompletionRequest, raw_request: Request):
+    handler = render(raw_request)
+    if handler is None:
+        error = create_error_response(
+            message="The model does not support Chat Completions Render API",
+            err_type="NotFoundError",
+            status_code=HTTPStatus.NOT_FOUND,
+        )
+        return JSONResponse(
+            status_code=HTTPStatus.NOT_FOUND, content=error.model_dump()
+        )
+
+    result = await handler.render_chat_request(request)
+
+    if isinstance(result, ErrorResponse):
+        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
+
+    return JSONResponse(content=result)
+
+
+@router.post(
+    "/v1/completions/render",
+    dependencies=[Depends(validate_json_request)],
+    response_model=list,
+    responses={
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+async def render_completion(request: CompletionRequest, raw_request: Request):
+    handler = render(raw_request)
+    if handler is None:
+        error = create_error_response(
+            message="The model does not support Completions Render API",
+            err_type="NotFoundError",
+            status_code=HTTPStatus.NOT_FOUND,
+        )
+        return JSONResponse(
+            status_code=HTTPStatus.NOT_FOUND, content=error.model_dump()
+        )
+
+    result = await handler.render_completion_request(request)
+
+    if isinstance(result, ErrorResponse):
+        return JSONResponse(content=result.model_dump(), status_code=result.error.code)
+
+    return JSONResponse(content=result)
+
+
+def attach_router(app: FastAPI) -> None:
+    app.include_router(router)
diff --git a/vllm/entrypoints/serve/render/serving.py b/vllm/entrypoints/serve/render/serving.py
new file mode 100644
index 000000000..c0e32be7e
--- /dev/null
+++ b/vllm/entrypoints/serve/render/serving.py
@@ -0,0 +1,475 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import sys
+import traceback
+from collections.abc import Callable, Sequence
+from http import HTTPStatus
+from typing import Any
+
+import jinja2
+from openai_harmony import Message as OpenAIMessage
+
+from vllm.config import ModelConfig
+from vllm.entrypoints.chat_utils import (
+    ChatTemplateContentFormatOption,
+    ConversationMessage,
+)
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.completion.protocol import CompletionRequest
+from vllm.entrypoints.openai.engine.protocol import (
+    ErrorInfo,
+    ErrorResponse,
+    ModelCard,
+    ModelList,
+    ModelPermission,
+)
+from vllm.entrypoints.openai.parser.harmony_utils import (
+    get_developer_message,
+    get_system_message,
+    parse_chat_inputs_to_harmony_messages,
+    render_for_completion,
+)
+from vllm.entrypoints.utils import sanitize_message
+from vllm.inputs.data import ProcessorInputs, PromptType, SingletonPrompt, TokensPrompt
+from vllm.logger import init_logger
+from vllm.parser import ParserManager
+from vllm.renderers import BaseRenderer, merge_kwargs
+from vllm.renderers.inputs.preprocess import parse_model_prompt, prompt_to_seq
+from vllm.tokenizers import TokenizerLike
+from vllm.tool_parsers import ToolParser
+from vllm.utils.mistral import is_mistral_tokenizer
+from vllm.utils.mistral import mt as _mt
+
+logger = init_logger(__name__)
+
+
+class OpenAIServingRender:
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        renderer: BaseRenderer,
+        io_processor: Any,
+        served_model_names: list[str],
+        *,
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
+        chat_template_content_format: ChatTemplateContentFormatOption,
+        trust_request_chat_template: bool = False,
+        enable_auto_tools: bool = False,
+        exclude_tools_when_tool_choice_none: bool = False,
+        tool_parser: str | None = None,
+        default_chat_template_kwargs: dict[str, Any] | None = None,
+        log_error_stack: bool = False,
+    ) -> None:
+        self.model_config = model_config
+        self.renderer = renderer
+        self.io_processor = io_processor
+        self.served_model_names = served_model_names
+        self.request_logger = request_logger
+        self.chat_template = chat_template
+        self.chat_template_content_format: ChatTemplateContentFormatOption = (
+            chat_template_content_format
+        )
+        self.trust_request_chat_template = trust_request_chat_template
+        self.enable_auto_tools = enable_auto_tools
+        self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none
+        self.tool_parser: Callable[[TokenizerLike], ToolParser] | None = (
+            ParserManager.get_tool_parser(
+                tool_parser_name=tool_parser,
+                enable_auto_tools=enable_auto_tools,
+                model_name=model_config.model,
+            )
+        )
+        self.default_chat_template_kwargs: dict[str, Any] = (
+            default_chat_template_kwargs or {}
+        )
+        self.log_error_stack = log_error_stack
+        self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
+        self.supports_browsing = False
+        self.supports_code_interpreter = False
+
+    async def render_chat_request(
+        self,
+        request: ChatCompletionRequest,
+    ) -> tuple[list[ConversationMessage], list[ProcessorInputs]] | ErrorResponse:
+        """Copied from OpenAIServingChat.render_chat_request.
+
+        Differences: engine_client.errored check removed (no engine client).
+        """
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            logger.error("Error with model %s", error_check_ret)
+            return error_check_ret
+
+        try:
+            tokenizer = self.renderer.tokenizer
+
+            tool_parser = self.tool_parser
+
+            if is_mistral_tokenizer(tokenizer):
+                # because of issues with pydantic we need to potentially
+                # re-serialize the tool_calls field of the request
+                # for more info: see comment in `maybe_serialize_tool_calls`
+                _mt.maybe_serialize_tool_calls(request)  # type: ignore[arg-type]
+                _mt.truncate_tool_call_ids(request)  # type: ignore[arg-type]
+                _mt.validate_request_params(request)
+
+            # Check if tool parsing is unavailable (common condition)
+            tool_parsing_unavailable = (
+                tool_parser is None
+                and not is_mistral_tokenizer(tokenizer)
+                and not self.use_harmony
+            )
+
+            # Validate tool_choice when tool parsing is required but unavailable
+            if tool_parsing_unavailable and request.tool_choice not in (
+                None,
+                "none",
+            ):
+                if request.tool_choice == "auto" and not self.enable_auto_tools:
+                    # for hf tokenizers, "auto" tools requires
+                    # --enable-auto-tool-choice and --tool-call-parser
+                    return self.create_error_response(
+                        '"auto" tool choice requires '
+                        "--enable-auto-tool-choice and --tool-call-parser to be set"
+                    )
+                elif request.tool_choice != "auto":
+                    # "required" or named tool requires tool parser
+                    return self.create_error_response(
+                        f'tool_choice="{request.tool_choice}" requires '
+                        "--tool-call-parser to be set"
+                    )
+
+            if request.tools is None or (
+                request.tool_choice == "none"
+                and self.exclude_tools_when_tool_choice_none
+            ):
+                tool_dicts = None
+            else:
+                tool_dicts = [tool.model_dump() for tool in request.tools]
+
+            if not self.use_harmony:
+                # Common case.
+                error_check_ret = self._validate_chat_template(
+                    request_chat_template=request.chat_template,
+                    chat_template_kwargs=request.chat_template_kwargs,
+                    trust_request_chat_template=self.trust_request_chat_template,
+                )
+                if error_check_ret is not None:
+                    return error_check_ret
+
+                conversation, engine_prompts = await self._preprocess_chat(
+                    request,
+                    request.messages,
+                    default_template=self.chat_template,
+                    default_template_content_format=self.chat_template_content_format,
+                    default_template_kwargs=self.default_chat_template_kwargs,
+                    tool_dicts=tool_dicts,
+                    tool_parser=tool_parser,
+                )
+            else:
+                # For GPT-OSS.
+                should_include_tools = tool_dicts is not None
+                conversation, engine_prompts = self._make_request_with_harmony(
+                    request, should_include_tools
+                )
+        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(e)
+
+        return conversation, engine_prompts
+
+    async def render_completion_request(
+        self,
+        request: CompletionRequest,
+    ) -> list[ProcessorInputs] | ErrorResponse:
+        """Copied from OpenAIServingCompletion.render_completion_request.
+
+        Differences: engine_client.errored check removed (no engine client).
+        """
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+
+        # Return error for unsupported features.
+        if request.suffix is not None:
+            return self.create_error_response("suffix is not currently supported")
+
+        if request.echo and request.prompt_embeds is not None:
+            return self.create_error_response("Echo is unsupported with prompt embeds.")
+
+        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
+            return self.create_error_response(
+                "prompt_logprobs is not compatible with prompt embeds."
+            )
+
+        try:
+            engine_prompts = await self._preprocess_completion(
+                request,
+                prompt_input=request.prompt,
+                prompt_embeds=request.prompt_embeds,
+            )
+        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(e)
+
+        return engine_prompts
+
+    def _make_request_with_harmony(
+        self,
+        request: ChatCompletionRequest,
+        should_include_tools: bool = True,
+    ):
+        """Copied from OpenAIServingChat._make_request_with_harmony."""
+        messages: list[OpenAIMessage] = []
+
+        # because of issues with pydantic we need to potentially
+        # re-serialize the tool_calls field of the request
+        # for more info: see comment in `maybe_serialize_tool_calls`
+        _mt.maybe_serialize_tool_calls(request)  # type: ignore[arg-type]
+
+        # Add system message.
+        # NOTE: In Chat Completion API, browsing is enabled by default
+        # if the model supports it. TODO: Support browsing.
+        assert not self.supports_browsing
+        assert not self.supports_code_interpreter
+        sys_msg = get_system_message(
+            reasoning_effort=request.reasoning_effort,
+            browser_description=None,
+            python_description=None,
+            with_custom_tools=should_include_tools,
+        )
+        messages.append(sys_msg)
+
+        # Add developer message.
+        if request.tools:
+            dev_msg = get_developer_message(
+                tools=request.tools if should_include_tools else None  # type: ignore[arg-type]
+            )
+            messages.append(dev_msg)
+
+        # Add user message.
+        messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))
+
+        # Render prompt token ids.
+        prompt_token_ids = render_for_completion(messages)
+        engine_prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)
+
+        # Add cache_salt if provided in the request
+        if request.cache_salt is not None:
+            engine_prompt["cache_salt"] = request.cache_salt
+
+        return messages, [engine_prompt]
+
+    async def show_available_models(self) -> ModelList:
+        """Returns the models served by this render server."""
+        max_model_len = self.model_config.max_model_len
+        return ModelList(
+            data=[
+                ModelCard(
+                    id=name,
+                    max_model_len=max_model_len,
+                    root=self.model_config.model,
+                    permission=[ModelPermission()],
+                )
+                for name in self.served_model_names
+            ]
+        )
+
+    def create_error_response(
+        self,
+        message: str | Exception,
+        err_type: str = "BadRequestError",
+        status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+        param: str | None = None,
+    ) -> ErrorResponse:
+        """Copied from OpenAIServing.create_error_response."""
+        exc: Exception | None = None
+
+        if isinstance(message, Exception):
+            exc = message
+
+            from vllm.exceptions import VLLMValidationError
+
+            if isinstance(exc, VLLMValidationError):
+                err_type = "BadRequestError"
+                status_code = HTTPStatus.BAD_REQUEST
+                param = exc.parameter
+            elif isinstance(exc, (ValueError, TypeError, RuntimeError, OverflowError)):
+                # Common validation errors from user input
+                err_type = "BadRequestError"
+                status_code = HTTPStatus.BAD_REQUEST
+                param = None
+            elif isinstance(exc, NotImplementedError):
+                err_type = "NotImplementedError"
+                status_code = HTTPStatus.NOT_IMPLEMENTED
+                param = None
+            elif exc.__class__.__name__ == "TemplateError":
+                # jinja2.TemplateError (avoid importing jinja2)
+                err_type = "BadRequestError"
+                status_code = HTTPStatus.BAD_REQUEST
+                param = None
+            else:
+                err_type = "InternalServerError"
+                status_code = HTTPStatus.INTERNAL_SERVER_ERROR
+                param = None
+
+            message = str(exc)
+
+        if self.log_error_stack:
+            exc_type, _, _ = sys.exc_info()
+            if exc_type is not None:
+                traceback.print_exc()
+            else:
+                traceback.print_stack()
+
+        return ErrorResponse(
+            error=ErrorInfo(
+                message=sanitize_message(message),
+                type=err_type,
+                code=status_code.value,
+                param=param,
+            )
+        )
+
+    def _is_model_supported(self, model_name: str) -> bool:
+        """Simplified from OpenAIServing._is_model_supported (no LoRA support)."""
+        return model_name in self.served_model_names
+
+    async def _check_model(
+        self,
+        request: Any,
+    ) -> ErrorResponse | None:
+        """Simplified from OpenAIServing._check_model (no LoRA support)."""
+        if self._is_model_supported(request.model):
+            return None
+        return self.create_error_response(
+            message=f"The model `{request.model}` does not exist.",
+            err_type="NotFoundError",
+            status_code=HTTPStatus.NOT_FOUND,
+            param="model",
+        )
+
+    def _validate_chat_template(
+        self,
+        request_chat_template: str | None,
+        chat_template_kwargs: dict[str, Any] | None,
+        trust_request_chat_template: bool,
+    ) -> ErrorResponse | None:
+        """Copied from OpenAIServing._validate_chat_template."""
+        if not trust_request_chat_template and (
+            request_chat_template is not None
+            or (
+                chat_template_kwargs
+                and chat_template_kwargs.get("chat_template") is not None
+            )
+        ):
+            return self.create_error_response(
+                "Chat template is passed with request, but "
+                "--trust-request-chat-template is not set. "
+                "Refused request with untrusted chat template."
+            )
+        return None
+
+    async def _preprocess_completion(
+        self,
+        request: Any,
+        prompt_input: str | list[str] | list[int] | list[list[int]] | None,
+        prompt_embeds: bytes | list[bytes] | None,
+    ) -> list[ProcessorInputs]:
+        """Copied from OpenAIServing._preprocess_completion."""
+        prompts = list[SingletonPrompt | bytes]()
+        if prompt_embeds is not None:  # embeds take higher priority
+            prompts.extend(prompt_to_seq(prompt_embeds))
+        if prompt_input is not None:
+            prompts.extend(prompt_to_seq(prompt_input))
+        return await self._preprocess_cmpl(request, prompts)
+
+    async def _preprocess_cmpl(
+        self,
+        request: Any,
+        prompts: Sequence[PromptType | bytes],
+    ) -> list[ProcessorInputs]:
+        """Copied from OpenAIServing._preprocess_cmpl."""
+        renderer = self.renderer
+        model_config = self.model_config
+
+        parsed_prompts = [
+            (
+                prompt
+                if isinstance(prompt, bytes)
+                else parse_model_prompt(model_config, prompt)
+            )
+            for prompt in prompts
+        ]
+        tok_params = request.build_tok_params(model_config)
+
+        return await renderer.render_cmpl_async(
+            parsed_prompts,
+            tok_params,
+            prompt_extras={
+                k: v
+                for k in ("mm_processor_kwargs", "cache_salt")
+                if (v := getattr(request, k, None)) is not None
+            },
+        )
+
+    async def _preprocess_chat(
+        self,
+        request: Any,
+        messages: list[Any],
+        default_template: str | None,
+        default_template_content_format: ChatTemplateContentFormatOption,
+        default_template_kwargs: dict[str, Any] | None,
+        tool_dicts: list[dict[str, Any]] | None = None,
+        tool_parser: Callable[[TokenizerLike], ToolParser] | None = None,
+    ) -> tuple[list[ConversationMessage], list[ProcessorInputs]]:
+        """Copied from OpenAIServing._preprocess_chat.
+
+        Differences: isinstance check is ChatCompletionRequest-only
+        (ResponsesRequest not supported here); TODO comment dropped accordingly.
+        """
+        renderer = self.renderer
+
+        default_template_kwargs = merge_kwargs(
+            default_template_kwargs,
+            dict(
+                tools=tool_dicts,
+                tokenize=is_mistral_tokenizer(renderer.tokenizer),
+            ),
+        )
+
+        tok_params = request.build_tok_params(self.model_config)
+        chat_params = request.build_chat_params(
+            default_template, default_template_content_format
+        ).with_defaults(default_template_kwargs)
+
+        (conversation,), (engine_prompt,) = await renderer.render_chat_async(
+            [messages],
+            chat_params,
+            tok_params,
+            prompt_extras={
+                k: v
+                for k in ("mm_processor_kwargs", "cache_salt")
+                if (v := getattr(request, k, None)) is not None
+            },
+        )
+
+        # Tool parsing is done only if a tool_parser has been set and if
+        # tool_choice is not "none" (if tool_choice is "none" but a tool_parser
+        # is set, we want to prevent parsing a tool_call hallucinated by the LLM).
+        if tool_parser is not None:
+            tool_choice = getattr(request, "tool_choice", "none")
+            if tool_choice != "none":
+                if not isinstance(request, ChatCompletionRequest):
+                    msg = (
+                        "Tool usage is only supported "
+                        "for ChatCompletionRequest, but got "
+                        f"{type(request).__name__}"
+                    )
+                    raise NotImplementedError(msg)
+                tokenizer = renderer.get_tokenizer()
+                request = tool_parser(tokenizer).adjust_request(request=request)  # type: ignore[arg-type]
+
+        return conversation, [engine_prompt]
diff --git a/vllm/v1/engine/launch.py b/vllm/v1/engine/launch.py
deleted file mode 100644
index 2d92db4c9..000000000
--- a/vllm/v1/engine/launch.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-LaunchEngineClient: A lightweight EngineClient for GPU-less online serving.
-
-This implements the EngineClient protocol without AsyncLLM or EngineCore,
-enabling preprocessing (tokenization, rendering) and postprocessing
-(detokenization) without GPU inference.
-"""
-
-from collections.abc import AsyncGenerator, Iterable, Mapping
-from typing import Any
-
-from vllm.config import VllmConfig
-from vllm.engine.protocol import EngineClient, StreamingInput
-from vllm.inputs import ProcessorInputs, PromptType
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.outputs import PoolingRequestOutput, RequestOutput
-from vllm.plugins.io_processors import get_io_processor
-from vllm.pooling_params import PoolingParams
-from vllm.renderers import renderer_from_config
-from vllm.sampling_params import SamplingParams
-from vllm.tasks import SupportedTask
-from vllm.v1.engine import EngineCoreRequest, PauseMode
-from vllm.v1.engine.input_processor import InputProcessor
-
-logger = init_logger(__name__)
-
-
-class LaunchEngineClient(EngineClient):
-    """GPU-less EngineClient that only supports preprocessing/postprocessing.
-
-    This is a Null Object at the EngineClient level, bypassing AsyncLLM
-    entirely. It initializes renderer, io_processor, and input_processor
-    for tokenization and rendering, but raises NotImplementedError for
-    any inference-related operations.
-    """
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-    ) -> None:
-        self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_config
-
-        self.renderer = renderer = renderer_from_config(self.vllm_config)
-        self.io_processor = get_io_processor(
-            self.vllm_config,
-            self.renderer,
-            self.model_config.io_processor_plugin,
-        )
-
-        # Convert TokPrompt --> EngineCoreRequest.
-        self.input_processor = InputProcessor(self.vllm_config, renderer)
-
-    @classmethod
-    def from_vllm_config(
-        cls,
-        vllm_config: VllmConfig,
-    ) -> "LaunchEngineClient":
-        """Create a LaunchEngineClient from a VllmConfig without GPU."""
-        return cls(
-            vllm_config=vllm_config,
-        )
-
-    # -- Task support --
-
-    async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
-        return ("render",)
-
-    # -- Inference (not supported) --
-
-    async def generate(
-        self,
-        prompt: EngineCoreRequest
-        | PromptType
-        | ProcessorInputs
-        | AsyncGenerator[StreamingInput, None],
-        sampling_params: SamplingParams,
-        request_id: str,
-        *,
-        prompt_text: str | None = None,
-        lora_request: LoRARequest | None = None,
-        tokenization_kwargs: dict[str, Any] | None = None,
-        trace_headers: Mapping[str, str] | None = None,
-        priority: int = 0,
-        data_parallel_rank: int | None = None,
-        reasoning_ended: bool | None = None,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        raise NotImplementedError(
-            "LaunchEngineClient does not support inference. "
-            "Use vllm serve for generation requests."
-        )
-        # yield is needed to make this an async generator
-        yield  # type: ignore[misc]  # pragma: no cover
-
-    # -- Request management (no-op) --
-
-    async def abort(
-        self, request_id: str | Iterable[str], internal: bool = False
-    ) -> None:
-        pass
-
-    # -- Generation control (no-op) --
-
-    async def pause_generation(
-        self,
-        *,
-        mode: PauseMode = "abort",
-        wait_for_inflight_requests: bool | None = None,
-        clear_cache: bool = True,
-    ) -> None:
-        pass
-
-    async def resume_generation(self) -> None:
-        pass
-
-    async def is_paused(self) -> bool:
-        return False
-
-    def shutdown(self, timeout: float | None = None) -> None:
-        pass
-
-    async def encode(
-        self,
-        prompt: PromptType | ProcessorInputs,
-        pooling_params: PoolingParams,
-        request_id: str,
-        lora_request: LoRARequest | None = None,
-        trace_headers: Mapping[str, str] | None = None,
-        priority: int = 0,
-        tokenization_kwargs: dict[str, Any] | None = None,
-        reasoning_ended: bool | None = None,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        raise NotImplementedError(
-            "LaunchEngineClient does not support inference. "
-            "Use vllm serve for encoding requests."
-        )
-        yield  # type: ignore[misc]  # pragma: no cover
-
-    # -- Observability (no-op / defaults) --
-
-    async def is_tracing_enabled(self) -> bool:
-        return False
-
-    async def do_log_stats(self) -> None:
-        pass
-
-    async def check_health(self) -> None:
-        pass
-
-    async def start_profile(self) -> None:
-        pass
-
-    async def stop_profile(self) -> None:
-        pass
-
-    # -- Cache management (no-op) --
-
-    async def reset_mm_cache(self) -> None:
-        pass
-
-    async def reset_prefix_cache(
-        self, reset_running_requests: bool = False, reset_connector: bool = False
-    ) -> bool:
-        return True
-
-    async def reset_encoder_cache(self) -> None:
-        pass
-
-    # -- Power management (no-op) --
-
-    async def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
-        pass
-
-    async def wake_up(self, tags: list[str] | None = None) -> None:
-        pass
-
-    async def is_sleeping(self) -> bool:
-        return False
-
-    # -- LoRA (not supported) --
-
-    async def add_lora(self, lora_request: LoRARequest) -> bool:
-        return False
-
-    # -- Status properties --
-
-    @property
-    def is_running(self) -> bool:
-        return True
-
-    @property
-    def is_stopped(self) -> bool:
-        return False
-
-    @property
-    def errored(self) -> bool:
-        return False
-
-    @property
-    def dead_error(self) -> BaseException:
-        return RuntimeError("LaunchEngineClient does not support inference")
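
As a quick end-to-end check of what this patch wires up, the snippet below (not part of the diff) posts to the two routes registered by attach_render_router. It is a minimal sketch: it assumes a render-only server is already listening, started via the FastAPI-only launch path changed above, and both the address http://localhost:8000 and the model name "my-model" are placeholders for whatever the server was actually configured with.

import json
from urllib.request import Request, urlopen

BASE = "http://localhost:8000"  # assumed render-only server address


def post(path: str, payload: dict) -> object:
    """POST a JSON payload and decode the JSON response."""
    req = Request(
        BASE + path,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urlopen(req) as resp:
        return json.load(resp)


# /v1/chat/completions/render applies the chat template and tokenizes,
# returning rendered engine prompts as a JSON list without running inference.
print(post(
    "/v1/chat/completions/render",
    {"model": "my-model", "messages": [{"role": "user", "content": "Hello!"}]},
))

# /v1/completions/render does the same for a plain completion prompt.
print(post(
    "/v1/completions/render",
    {"model": "my-model", "prompt": "Once upon a time"},
))

If no render handler is registered on the app state (i.e. the server was built without the "generate" or "render" task), both routes return the 404 ErrorResponse body produced by create_error_response, so the snippet would raise an HTTPError in that case.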