diff --git a/tests/entrypoints/openai/test_render.py b/tests/entrypoints/openai/cpu/test_render.py similarity index 99% rename from tests/entrypoints/openai/test_render.py rename to tests/entrypoints/openai/cpu/test_render.py index 2f506b950..11389a2e4 100644 --- a/tests/entrypoints/openai/test_render.py +++ b/tests/entrypoints/openai/cpu/test_render.py @@ -7,7 +7,7 @@ import httpx import pytest import pytest_asyncio -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" diff --git a/tests/entrypoints/openai/test_launch_render.py b/tests/entrypoints/openai/test_launch_render.py new file mode 100644 index 000000000..069e61f84 --- /dev/null +++ b/tests/entrypoints/openai/test_launch_render.py @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""E2E tests for render endpoints via `vllm launch` (GPU-less serving).""" + +import httpx +import pytest +import pytest_asyncio + +from ...utils import RemoteLaunchRenderServer + +MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" + + +@pytest.fixture(scope="module") +def server(): + args: list[str] = [] + with RemoteLaunchRenderServer(MODEL_NAME, args, max_wait_seconds=120) as srv: + yield srv + + +@pytest_asyncio.fixture +async def client(server): + async with httpx.AsyncClient( + base_url=server.url_for(""), timeout=30.0 + ) as http_client: + yield http_client + + +# -- Chat Completion Render -- + + +@pytest.mark.asyncio +async def test_chat_render_basic(client): + response = await client.post( + "/v1/chat/completions/render", + json={ + "model": MODEL_NAME, + "messages": [{"role": "user", "content": "Hello, how are you?"}], + }, + ) + + assert response.status_code == 200 + data = response.json() + + assert isinstance(data, list) + assert len(data) == 2 + + conversation, engine_prompts = data + + assert isinstance(conversation, list) + assert conversation[0]["role"] == "user" + + assert isinstance(engine_prompts, list) + assert len(engine_prompts) > 0 + first_prompt = engine_prompts[0] + assert "prompt_token_ids" in first_prompt + assert "prompt" in first_prompt + assert isinstance(first_prompt["prompt_token_ids"], list) + assert all(isinstance(t, int) for t in first_prompt["prompt_token_ids"]) + + +@pytest.mark.asyncio +async def test_chat_render_multi_turn(client): + response = await client.post( + "/v1/chat/completions/render", + json={ + "model": MODEL_NAME, + "messages": [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + {"role": "user", "content": "How are you?"}, + ], + }, + ) + + assert response.status_code == 200 + conversation, engine_prompts = response.json() + + assert len(conversation) == 3 + assert conversation[0]["role"] == "user" + assert conversation[1]["role"] == "assistant" + assert conversation[2]["role"] == "user" + assert len(engine_prompts) > 0 + assert len(engine_prompts[0]["prompt_token_ids"]) > 0 + + +@pytest.mark.asyncio +async def test_chat_render_invalid_model(client): + response = await client.post( + "/v1/chat/completions/render", + json={ + "model": "nonexistent-model", + "messages": [{"role": "user", "content": "Hello"}], + }, + ) + + assert response.status_code == 404 + assert "error" in response.json() + + +# -- Completion Render -- + + +@pytest.mark.asyncio +async def test_completion_render_basic(client): + response = await client.post( + "/v1/completions/render", + json={ + "model": MODEL_NAME, + "prompt": "Once upon a time", + }, + ) + + assert response.status_code == 200 + data = response.json() + + assert isinstance(data, list) + assert len(data) > 0 + + first_prompt = data[0] + assert "prompt_token_ids" in first_prompt + assert "prompt" in first_prompt + assert isinstance(first_prompt["prompt_token_ids"], list) + assert len(first_prompt["prompt_token_ids"]) > 0 + assert "Once upon a time" in first_prompt["prompt"] + + +@pytest.mark.asyncio +async def test_completion_render_multiple_prompts(client): + response = await client.post( + "/v1/completions/render", + json={ + "model": MODEL_NAME, + "prompt": ["Hello world", "Goodbye world"], + }, + ) + + assert response.status_code == 200 + data = response.json() + + assert isinstance(data, list) + assert len(data) == 2 + + for prompt in data: + assert "prompt_token_ids" in prompt + assert "prompt" in prompt + assert len(prompt["prompt_token_ids"]) > 0 + + +@pytest.mark.asyncio +async def test_completion_render_invalid_model(client): + response = await client.post( + "/v1/completions/render", + json={ + "model": "nonexistent-model", + "prompt": "Hello", + }, + ) + + assert response.status_code == 404 + assert "error" in response.json() + + +@pytest.mark.asyncio +async def test_render_is_fast(client): + """Render should complete quickly since there is no inference.""" + import time + + start = time.perf_counter() + response = await client.post( + "/v1/completions/render", + json={ + "model": MODEL_NAME, + "prompt": "Tell me a very long story about " * 10, + }, + ) + elapsed = time.perf_counter() - start + + assert response.status_code == 200 + assert elapsed < 2.0 + + +# -- Health & Models -- + + +@pytest.mark.asyncio +async def test_health_endpoint(client): + response = await client.get("/health") + assert response.status_code == 200 + + +@pytest.mark.asyncio +async def test_models_endpoint(client): + response = await client.get("/v1/models") + assert response.status_code == 200 + data = response.json() + assert "data" in data + model_ids = [m["id"] for m in data["data"]] + assert MODEL_NAME in model_ids diff --git a/tests/entrypoints/test_launch_cli.py b/tests/entrypoints/test_launch_cli.py new file mode 100644 index 000000000..443dd82fd --- /dev/null +++ b/tests/entrypoints/test_launch_cli.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for the `vllm launch` CLI subcommand.""" + +import argparse +from unittest.mock import patch + +import pytest + +from vllm.entrypoints.cli.launch import ( + LaunchSubcommand, + RenderSubcommand, + cmd_init, +) +from vllm.utils.argparse_utils import FlexibleArgumentParser + + +@pytest.fixture +def launch_parser(): + parser = FlexibleArgumentParser(description="test") + subparsers = parser.add_subparsers(required=False, dest="subparser") + LaunchSubcommand().subparser_init(subparsers) + return parser + + +def test_subcommand_name(): + assert LaunchSubcommand().name == "launch" + + +def test_cmd_init_returns_subcommand(): + result = cmd_init() + assert len(result) == 1 + assert isinstance(result[0], LaunchSubcommand) + + +# -- Parsing: `vllm launch render` -- + + +def test_parse_launch_render(launch_parser): + args = launch_parser.parse_args(["launch", "render", "--model", "test-model"]) + assert args.launch_component == "render" + + +def test_parse_launch_requires_component(launch_parser): + with pytest.raises(SystemExit): + launch_parser.parse_args(["launch", "--model", "test-model"]) + + +def test_parse_launch_invalid_component(launch_parser): + with pytest.raises(SystemExit): + launch_parser.parse_args(["launch", "unknown", "--model", "test-model"]) + + +# -- Dispatch -- + + +def test_cmd_launch_render_calls_run(): + args = argparse.Namespace(model_tag=None, model="test-model") + with patch("vllm.entrypoints.cli.launch.uvloop.run") as mock_uvloop_run: + RenderSubcommand.cmd(args) + mock_uvloop_run.assert_called_once() + + +def test_cmd_launch_model_tag_overrides(): + args = argparse.Namespace( + model_tag="tag-model", + model="original-model", + launch_command=lambda a: None, + ) + LaunchSubcommand.cmd(args) + assert args.model == "tag-model" + + +def test_cmd_launch_model_tag_none(): + args = argparse.Namespace( + model_tag=None, + model="original-model", + launch_command=lambda a: None, + ) + LaunchSubcommand.cmd(args) + assert args.model == "original-model" + + +def test_cmd_dispatches(): + called = {} + + def fake_dispatch(args): + called["args"] = args + + args = argparse.Namespace(launch_command=fake_dispatch) + LaunchSubcommand.cmd(args) + assert "args" in called + + +# -- Module registration -- + + +def test_subparser_init_returns_parser(): + parser = FlexibleArgumentParser(description="test") + subparsers = parser.add_subparsers(required=False, dest="subparser") + result = LaunchSubcommand().subparser_init(subparsers) + assert isinstance(result, FlexibleArgumentParser) + + +def test_launch_registered_in_main(): + """Verify that launch module is importable as a CLI module.""" + import vllm.entrypoints.cli.launch as launch_module + + assert hasattr(launch_module, "cmd_init") + subcmds = launch_module.cmd_init() + assert any(s.name == "launch" for s in subcmds) diff --git a/tests/utils.py b/tests/utils.py index 03e5ccadb..94d694971 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -110,31 +110,25 @@ VLLM_PATH = Path(__file__).parent.parent """Path to root of the vLLM repository.""" -class RemoteOpenAIServer: +class RemoteVLLMServer: + """Base class for launching vLLM server subprocesses for testing. + + Subclasses must override ``_create_cli_subcommand`` and + ``_start_server``. + """ + DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key + proc: subprocess.Popen + + def _create_cli_subcommand(self): + """Return a CLISubcommand instance used to parse CLI args.""" + raise NotImplementedError def _start_server( self, model: str, vllm_serve_args: list[str], env_dict: dict[str, str] | None ) -> None: """Subclasses override this method to customize server process launch""" - env = os.environ.copy() - # the current process might initialize cuda, - # to be safe, we should use spawn method - env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - if env_dict is not None: - env.update(env_dict) - serve_cmd = ["vllm", "serve", model, *vllm_serve_args] - print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}") - print(f"Environment variables: {env}") - self.proc: subprocess.Popen = subprocess.Popen( - serve_cmd, - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - # Create a dedicated process group so we can kill - # the entire tree (parent + EngineCore + workers) at once. - start_new_session=True, - ) + raise NotImplementedError def __init__( self, @@ -171,9 +165,9 @@ class RemoteOpenAIServer: json.dumps(override_hf_configs), ] - parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.") + parser = FlexibleArgumentParser(description="vLLM's remote server.") subparsers = parser.add_subparsers(required=False, dest="subparser") - parser = ServeSubcommand().subparser_init(subparsers) + parser = self._create_cli_subcommand().subparser_init(subparsers) args = parser.parse_args(["--model", model, *vllm_serve_args]) self.uds = args.uds if args.uds: @@ -183,7 +177,9 @@ class RemoteOpenAIServer: self.host = str(args.host or "127.0.0.1") self.port = int(args.port) - self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None + self.show_hidden_metrics = ( + getattr(args, "show_hidden_metrics_for_version", None) is not None + ) # download the model before starting the server to avoid timeout is_local = os.path.isdir(model) @@ -201,7 +197,8 @@ class RemoteOpenAIServer: if self._pre_server_gpu_memory is not None: pre_gb = self._pre_server_gpu_memory / 1e9 print( - f"[RemoteOpenAIServer] GPU memory before server start: {pre_gb:.2f} GB" + f"[{type(self).__name__}] GPU memory before server start: " + f"{pre_gb:.2f} GB" ) self._start_server(model, vllm_serve_args, env_dict) @@ -452,6 +449,62 @@ class RemoteOpenAIServer: ) +class RemoteOpenAIServer(RemoteVLLMServer): + """Launches ``vllm serve`` for testing OpenAI-compatible endpoints.""" + + def _create_cli_subcommand(self): + return ServeSubcommand() + + def _start_server( + self, model: str, vllm_serve_args: list[str], env_dict: dict[str, str] | None + ) -> None: + env = os.environ.copy() + # the current process might initialize cuda, + # to be safe, we should use spawn method + env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + if env_dict is not None: + env.update(env_dict) + serve_cmd = ["vllm", "serve", model, *vllm_serve_args] + print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}") + print(f"Environment variables: {env}") + self.proc: subprocess.Popen = subprocess.Popen( + serve_cmd, + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + # Create a dedicated process group so we can kill + # the entire tree (parent + EngineCore + workers) at once. + start_new_session=True, + ) + + +class RemoteLaunchRenderServer(RemoteVLLMServer): + """Launches ``vllm launch render`` for GPU-less serving tests.""" + + def _create_cli_subcommand(self): + return ServeSubcommand() + + def _start_server( + self, model: str, vllm_serve_args: list[str], env_dict: dict[str, str] | None + ) -> None: + env = os.environ.copy() + env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + if env_dict is not None: + env.update(env_dict) + serve_cmd = ["vllm", "launch", "render", model, *vllm_serve_args] + print(f"Launching RemoteLaunchRenderServer with: {' '.join(serve_cmd)}") + self.proc: subprocess.Popen = subprocess.Popen( + serve_cmd, + env=env, + stdout=sys.stdout, + stderr=sys.stderr, + start_new_session=True, + ) + + def _wait_for_gpu_memory_release(self, timeout: float = 30.0): + pass # No GPU used + + class RemoteOpenAIServerCustom(RemoteOpenAIServer): """Launch test server with custom child process""" diff --git a/vllm/entrypoints/cli/launch.py b/vllm/entrypoints/cli/launch.py new file mode 100644 index 000000000..f04a77d48 --- /dev/null +++ b/vllm/entrypoints/cli/launch.py @@ -0,0 +1,128 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse + +import uvloop + +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.entrypoints.cli.types import CLISubcommand +from vllm.entrypoints.openai.api_server import ( + build_and_serve, + setup_server, +) +from vllm.entrypoints.openai.cli_args import ( + make_arg_parser, + validate_parsed_serve_args, +) +from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG +from vllm.logger import init_logger +from vllm.utils.argparse_utils import FlexibleArgumentParser + +logger = init_logger(__name__) + +DESCRIPTION = "Launch individual vLLM components." + + +class LaunchSubcommandBase(CLISubcommand): + """The base class of subcommands for `vllm launch`.""" + + help: str + + @classmethod + def add_cli_args(cls, parser: FlexibleArgumentParser) -> None: + """Add the CLI arguments to the parser. + + By default, adds the standard vLLM serving arguments. + Subclasses can override to add component-specific arguments. + """ + make_arg_parser(parser) + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + raise NotImplementedError + + +class RenderSubcommand(LaunchSubcommandBase): + """The `render` subcommand for `vllm launch`.""" + + name = "render" + help = "Launch a GPU-less rendering server (preprocessing and postprocessing only)." + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + uvloop.run(run_launch_fastapi(args)) + + +class LaunchSubcommand(CLISubcommand): + """The `launch` subcommand for the vLLM CLI. + + Uses nested sub-subcommands so each component can define its own + arguments independently (e.g. ``vllm launch render``). + """ + + name = "launch" + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + if hasattr(args, "model_tag") and args.model_tag is not None: + args.model = args.model_tag + + args.launch_command(args) + + def validate(self, args: argparse.Namespace) -> None: + validate_parsed_serve_args(args) + + def subparser_init( + self, subparsers: argparse._SubParsersAction + ) -> FlexibleArgumentParser: + launch_parser = subparsers.add_parser( + self.name, + help=DESCRIPTION, + description=DESCRIPTION, + usage=f"vllm {self.name} [options]", + ) + launch_subparsers = launch_parser.add_subparsers( + required=True, dest="launch_component" + ) + + for cmd_cls in LaunchSubcommandBase.__subclasses__(): + cmd_subparser = launch_subparsers.add_parser( + cmd_cls.name, + help=cmd_cls.help, + description=cmd_cls.help, + usage=f"vllm {self.name} {cmd_cls.name} [options]", + ) + cmd_subparser.set_defaults(launch_command=cmd_cls.cmd) + cmd_cls.add_cli_args(cmd_subparser) + cmd_subparser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format( + subcmd=f"{self.name} {cmd_cls.name}" + ) + + return launch_parser + + +def cmd_init() -> list[CLISubcommand]: + return [LaunchSubcommand()] + + +async def run_launch_fastapi(args: argparse.Namespace) -> None: + """Run the online serving layer with FastAPI (no GPU inference).""" + from vllm.config import VllmConfig + from vllm.v1.engine.launch import LaunchEngineClient + + # 1. Socket binding + listen_address, sock = setup_server(args) + + # 2. Create LaunchEngineClient (no GPU) + engine_args = AsyncEngineArgs.from_cli_args(args) + model_config = engine_args.create_model_config() + vllm_config = VllmConfig(model_config=model_config) + engine_client = LaunchEngineClient.from_vllm_config(vllm_config) + + # 3. Build app, initialize state, and start serving + shutdown_task = await build_and_serve(engine_client, listen_address, sock, args) + try: + await shutdown_task + finally: + sock.close() diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py index a3e73eb7a..2261ef233 100644 --- a/vllm/entrypoints/cli/main.py +++ b/vllm/entrypoints/cli/main.py @@ -16,6 +16,7 @@ logger = init_logger(__name__) def main(): import vllm.entrypoints.cli.benchmark.main import vllm.entrypoints.cli.collect_env + import vllm.entrypoints.cli.launch import vllm.entrypoints.cli.openai import vllm.entrypoints.cli.run_batch import vllm.entrypoints.cli.serve @@ -25,6 +26,7 @@ def main(): CMD_MODULES = [ vllm.entrypoints.cli.openai, vllm.entrypoints.cli.serve, + vllm.entrypoints.cli.launch, vllm.entrypoints.cli.benchmark.main, vllm.entrypoints.cli.collect_env, vllm.entrypoints.cli.run_batch, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index e9356b7d9..61095035f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio import importlib import inspect import multiprocessing @@ -194,7 +195,7 @@ def build_app( register_sagemaker_api_router(app, supported_tasks) - if "generate" in supported_tasks: + if any(task in supported_tasks for task in ("generate", "render")): from vllm.entrypoints.openai.generate.api_router import ( register_generate_api_routers, ) @@ -357,7 +358,7 @@ async def init_app_state( log_error_stack=args.log_error_stack, ) - if "generate" in supported_tasks: + if any(task in supported_tasks for task in ("generate", "render")): from vllm.entrypoints.openai.generate.api_router import init_generate_state await init_generate_state( @@ -469,6 +470,53 @@ def setup_server(args): return listen_address, sock +async def build_and_serve( + engine_client: EngineClient, + listen_address: str, + sock: socket.socket, + args: Namespace, + **uvicorn_kwargs, +) -> asyncio.Task: + """Build FastAPI app, initialize state, and start serving. + + Returns the shutdown task for the caller to await. + """ + + # Get uvicorn log config (from file or with endpoint filter) + log_config = get_uvicorn_log_config(args) + if log_config is not None: + uvicorn_kwargs["log_config"] = log_config + + supported_tasks = await engine_client.get_supported_tasks() + logger.info("Supported tasks: %s", supported_tasks) + + app = build_app(args, supported_tasks) + await init_app_state(engine_client, app.state, args, supported_tasks) + + logger.info("Starting vLLM server on %s", listen_address) + + return await serve_http( + app, + sock=sock, + enable_ssl_refresh=args.enable_ssl_refresh, + host=args.host, + port=args.port, + log_level=args.uvicorn_log_level, + # NOTE: When the 'disable_uvicorn_access_log' value is True, + # no access log will be output. + access_log=not args.disable_uvicorn_access_log, + timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs, + ssl_ciphers=args.ssl_ciphers, + h11_max_incomplete_event_size=args.h11_max_incomplete_event_size, + h11_max_header_count=args.h11_max_header_count, + **uvicorn_kwargs, + ) + + async def run_server(args, **uvicorn_kwargs) -> None: """Run a single-worker API server.""" @@ -490,47 +538,13 @@ async def run_server_worker( if args.reasoning_parser_plugin and len(args.reasoning_parser_plugin) > 3: ReasoningParserManager.import_reasoning_parser(args.reasoning_parser_plugin) - # Get uvicorn log config (from file or with endpoint filter) - log_config = get_uvicorn_log_config(args) - if log_config is not None: - uvicorn_kwargs["log_config"] = log_config - async with build_async_engine_client( args, client_config=client_config, ) as engine_client: - supported_tasks = await engine_client.get_supported_tasks() - logger.info("Supported tasks: %s", supported_tasks) - - app = build_app(args, supported_tasks) - await init_app_state(engine_client, app.state, args, supported_tasks) - - logger.info( - "Starting vLLM API server %d on %s", - engine_client.vllm_config.parallel_config._api_process_rank, - listen_address, + shutdown_task = await build_and_serve( + engine_client, listen_address, sock, args, **uvicorn_kwargs ) - shutdown_task = await serve_http( - app, - sock=sock, - enable_ssl_refresh=args.enable_ssl_refresh, - host=args.host, - port=args.port, - log_level=args.uvicorn_log_level, - # NOTE: When the 'disable_uvicorn_access_log' value is True, - # no access log will be output. - access_log=not args.disable_uvicorn_access_log, - timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile, - ssl_ca_certs=args.ssl_ca_certs, - ssl_cert_reqs=args.ssl_cert_reqs, - ssl_ciphers=args.ssl_ciphers, - h11_max_incomplete_event_size=args.h11_max_incomplete_event_size, - h11_max_header_count=args.h11_max_header_count, - **uvicorn_kwargs, - ) - # NB: Await server shutdown only after the backend context is exited try: await shutdown_task diff --git a/vllm/entrypoints/openai/generate/api_router.py b/vllm/entrypoints/openai/generate/api_router.py index ac74c7582..e4049331e 100644 --- a/vllm/entrypoints/openai/generate/api_router.py +++ b/vllm/entrypoints/openai/generate/api_router.py @@ -113,7 +113,7 @@ async def init_generate_state( enable_log_deltas=args.enable_log_deltas, log_error_stack=args.log_error_stack, ) - if "generate" in supported_tasks + if any(task in supported_tasks for task in ("generate", "render")) else None ) # Warm up chat template processing to avoid first-request latency @@ -129,7 +129,7 @@ async def init_generate_state( enable_force_include_usage=args.enable_force_include_usage, log_error_stack=args.log_error_stack, ) - if "generate" in supported_tasks + if any(task in supported_tasks for task in ("generate", "render")) else None ) state.anthropic_serving_messages = ( diff --git a/vllm/tasks.py b/vllm/tasks.py index b898bba69..3a64e462e 100644 --- a/vllm/tasks.py +++ b/vllm/tasks.py @@ -10,4 +10,7 @@ PoolingTask = Literal[ ] POOLING_TASKS: tuple[PoolingTask, ...] = get_args(PoolingTask) -SupportedTask = Literal[GenerationTask, PoolingTask] +FrontendTask = Literal["render"] +FRONTEND_TASKS: tuple[FrontendTask, ...] = get_args(FrontendTask) + +SupportedTask = Literal[GenerationTask, PoolingTask, FrontendTask] diff --git a/vllm/v1/engine/launch.py b/vllm/v1/engine/launch.py new file mode 100644 index 000000000..c3d9f32f3 --- /dev/null +++ b/vllm/v1/engine/launch.py @@ -0,0 +1,201 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +LaunchEngineClient: A lightweight EngineClient for GPU-less online serving. + +This implements the EngineClient protocol without AsyncLLM or EngineCore, +enabling preprocessing (tokenization, rendering) and postprocessing +(detokenization) without GPU inference. +""" + +from collections.abc import AsyncGenerator, Iterable, Mapping +from typing import Any + +from vllm.config import VllmConfig +from vllm.engine.protocol import EngineClient, StreamingInput +from vllm.inputs import ProcessorInputs, PromptType +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import PoolingRequestOutput, RequestOutput +from vllm.plugins.io_processors import get_io_processor +from vllm.pooling_params import PoolingParams +from vllm.renderers import renderer_from_config +from vllm.sampling_params import SamplingParams +from vllm.tasks import SupportedTask +from vllm.v1.engine import EngineCoreRequest, PauseMode +from vllm.v1.engine.input_processor import InputProcessor + +logger = init_logger(__name__) + + +class LaunchEngineClient(EngineClient): + """GPU-less EngineClient that only supports preprocessing/postprocessing. + + This is a Null Object at the EngineClient level, bypassing AsyncLLM + entirely. It initializes renderer, io_processor, and input_processor + for tokenization and rendering, but raises NotImplementedError for + any inference-related operations. + """ + + def __init__( + self, + vllm_config: VllmConfig, + ) -> None: + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + + self.renderer = renderer = renderer_from_config(self.vllm_config) + self.io_processor = get_io_processor( + self.vllm_config, + self.renderer, + self.model_config.io_processor_plugin, + ) + + # Convert TokPrompt --> EngineCoreRequest. + self.input_processor = InputProcessor(self.vllm_config, renderer) + + @classmethod + def from_vllm_config( + cls, + vllm_config: VllmConfig, + ) -> "LaunchEngineClient": + """Create a LaunchEngineClient from a VllmConfig without GPU.""" + return cls( + vllm_config=vllm_config, + ) + + # -- Task support -- + + async def get_supported_tasks(self) -> tuple[SupportedTask, ...]: + return ("render",) + + # -- Inference (not supported) -- + + async def generate( + self, + prompt: EngineCoreRequest + | PromptType + | ProcessorInputs + | AsyncGenerator[StreamingInput, None], + sampling_params: SamplingParams, + request_id: str, + *, + prompt_text: str | None = None, + lora_request: LoRARequest | None = None, + tokenization_kwargs: dict[str, Any] | None = None, + trace_headers: Mapping[str, str] | None = None, + priority: int = 0, + data_parallel_rank: int | None = None, + reasoning_ended: bool | None = None, + ) -> AsyncGenerator[RequestOutput, None]: + raise NotImplementedError( + "LaunchEngineClient does not support inference. " + "Use vllm serve for generation requests." + ) + # yield is needed to make this an async generator + yield # type: ignore[misc] # pragma: no cover + + # -- Request management (no-op) -- + + async def abort( + self, request_id: str | Iterable[str], internal: bool = False + ) -> None: + pass + + # -- Generation control (no-op) -- + + async def pause_generation( + self, + *, + mode: PauseMode = "abort", + wait_for_inflight_requests: bool | None = None, + clear_cache: bool = True, + ) -> None: + pass + + async def resume_generation(self) -> None: + pass + + async def is_paused(self) -> bool: + return False + + async def encode( + self, + prompt: PromptType | ProcessorInputs, + pooling_params: PoolingParams, + request_id: str, + lora_request: LoRARequest | None = None, + trace_headers: Mapping[str, str] | None = None, + priority: int = 0, + tokenization_kwargs: dict[str, Any] | None = None, + reasoning_ended: bool | None = None, + ) -> AsyncGenerator[PoolingRequestOutput, None]: + raise NotImplementedError( + "LaunchEngineClient does not support inference. " + "Use vllm serve for encoding requests." + ) + yield # type: ignore[misc] # pragma: no cover + + # -- Observability (no-op / defaults) -- + + async def is_tracing_enabled(self) -> bool: + return False + + async def do_log_stats(self) -> None: + pass + + async def check_health(self) -> None: + pass + + async def start_profile(self) -> None: + pass + + async def stop_profile(self) -> None: + pass + + # -- Cache management (no-op) -- + + async def reset_mm_cache(self) -> None: + pass + + async def reset_prefix_cache( + self, reset_running_requests: bool = False, reset_connector: bool = False + ) -> bool: + return True + + async def reset_encoder_cache(self) -> None: + pass + + # -- Power management (no-op) -- + + async def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None: + pass + + async def wake_up(self, tags: list[str] | None = None) -> None: + pass + + async def is_sleeping(self) -> bool: + return False + + # -- LoRA (not supported) -- + + async def add_lora(self, lora_request: LoRARequest) -> bool: + return False + + # -- Status properties -- + + @property + def is_running(self) -> bool: + return True + + @property + def is_stopped(self) -> bool: + return False + + @property + def errored(self) -> bool: + return False + + @property + def dead_error(self) -> BaseException: + return RuntimeError("LaunchEngineClient does not support inference")