[Frontend] Frontend will only attach entrypoints corresponding to supported tasks. (#33139)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
wang.yuqi
2026-01-27 20:15:43 +08:00
committed by GitHub
parent da8d0c441a
commit 76139d0801
10 changed files with 467 additions and 345 deletions

@@ -33,14 +33,10 @@ from starlette.types import ASGIApp, Message, Receive, Scope, Send
import vllm.envs as envs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.launcher import serve_http
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.mcp.tool_server import DemoToolServer, MCPToolServer, ToolServer
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
@@ -50,12 +46,6 @@ from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import (
OpenAIServingModels,
)
from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses
from vllm.entrypoints.openai.translations.serving import (
OpenAIServingTranscription,
OpenAIServingTranslation,
)
from vllm.entrypoints.serve.disagg.serving import ServingTokens
from vllm.entrypoints.serve.elastic_ep.middleware import (
ScalingMiddleware,
)
@@ -70,6 +60,7 @@ from vllm.entrypoints.utils import (
from vllm.exceptions import VLLMValidationError
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.tasks import POOLING_TASKS, SupportedTask
from vllm.tool_parsers import ToolParserManager
from vllm.usage.usage_lib import UsageContext
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -513,7 +504,7 @@ def _log_non_streaming_response(response_body: list) -> None:
logger.info("response_body={<binary_data>}")
def build_app(args: Namespace) -> FastAPI:
def build_app(args: Namespace, supported_tasks: tuple["SupportedTask", ...]) -> FastAPI:
if args.disable_fastapi_docs:
app = FastAPI(
openapi_url=None, docs_url=None, redoc_url=None, lifespan=lifespan
@@ -523,52 +514,44 @@ def build_app(args: Namespace) -> FastAPI:
else:
app = FastAPI(lifespan=lifespan)
app.state.args = args
app.include_router(router)
from vllm.entrypoints.serve import register_vllm_serve_api_routers
register_vllm_serve_api_routers(app)
from vllm.entrypoints.openai.chat_completion.api_router import (
attach_router as register_chat_api_router,
)
register_chat_api_router(app)
from vllm.entrypoints.openai.responses.api_router import (
attach_router as register_responses_api_router,
)
register_responses_api_router(app)
from vllm.entrypoints.openai.translations.api_router import (
attach_router as register_translations_api_router,
)
register_translations_api_router(app)
from vllm.entrypoints.openai.completion.api_router import (
attach_router as register_completion_api_router,
)
register_completion_api_router(app)
from vllm.entrypoints.anthropic.api_router import (
attach_router as register_anthropic_api_router,
)
register_anthropic_api_router(app)
from vllm.entrypoints.openai.models.api_router import (
attach_router as register_models_api_router,
)
register_models_api_router(app)
from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes
register_sagemaker_routes(router)
app.include_router(router)
from vllm.entrypoints.sagemaker.api_router import (
attach_router as register_sagemaker_api_router,
)
register_sagemaker_api_router(app, supported_tasks)
if "generate" in supported_tasks:
from vllm.entrypoints.openai.generate.api_router import (
register_generate_api_routers,
)
register_generate_api_routers(app)
if "transcription" in supported_tasks:
from vllm.entrypoints.openai.translations.api_router import (
attach_router as register_translations_api_router,
)
register_translations_api_router(app)
if any(task in POOLING_TASKS for task in supported_tasks):
from vllm.entrypoints.pooling import register_pooling_api_routers
register_pooling_api_routers(app, supported_tasks)
app.root_path = args.root_path
from vllm.entrypoints.pooling import register_pooling_api_routers
register_pooling_api_routers(app)
app.add_middleware(
CORSMiddleware,
allow_origins=args.allowed_origins,
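
To make the new registration flow concrete outside the diff: each entrypoint module exposes an attach_router-style helper, and build_app now only invokes the helpers whose task the engine reports. A minimal, self-contained sketch of that pattern, using hypothetical routers and a stand-in POOLING_TASKS constant (the real helpers and task constants are the vllm.entrypoints.* and vllm.tasks imports shown in the hunk above):

from fastapi import APIRouter, FastAPI


# Hypothetical stand-ins for the per-entrypoint attach_router helpers.
def attach_chat_router(app: FastAPI) -> None:
    router = APIRouter()

    @router.post("/v1/chat/completions")
    async def chat_completions() -> dict:
        return {"object": "chat.completion"}

    app.include_router(router)


def attach_embeddings_router(app: FastAPI) -> None:
    router = APIRouter()

    @router.post("/v1/embeddings")
    async def embeddings() -> dict:
        return {"object": "list"}

    app.include_router(router)


# Assumption: a stand-in for vllm.tasks.POOLING_TASKS.
POOLING_TASKS = ("embed", "classify", "score")


def build_app_sketch(supported_tasks: tuple[str, ...]) -> FastAPI:
    app = FastAPI()
    # Attach only the routers whose task the engine reports as supported,
    # so unsupported endpoints are simply absent from the route table.
    if "generate" in supported_tasks:
        attach_chat_router(app)
    if any(task in POOLING_TASKS for task in supported_tasks):
        attach_embeddings_router(app)
    return app


app = build_app_sketch(("embed",))  # embedding-only model: no /v1/chat/completions route
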
@@ -673,6 +656,7 @@ async def init_app_state(
engine_client: EngineClient,
state: State,
args: Namespace,
supported_tasks: tuple["SupportedTask", ...],
) -> None:
vllm_config = engine_client.vllm_config
@@ -694,28 +678,9 @@ async def init_app_state(
state.log_stats = not args.disable_log_stats
state.vllm_config = vllm_config
state.args = args
supported_tasks = await engine_client.get_supported_tasks()
logger.info("Supported tasks: %s", supported_tasks)
resolved_chat_template = load_chat_template(args.chat_template)
if args.tool_server == "demo":
tool_server: ToolServer | None = DemoToolServer()
assert isinstance(tool_server, DemoToolServer)
await tool_server.init_and_validate()
elif args.tool_server:
tool_server = MCPToolServer()
await tool_server.add_tool_server(args.tool_server)
else:
tool_server = None
# Merge default_mm_loras into the static lora_modules
default_mm_loras = (
vllm_config.lora_config.default_mm_loras
if vllm_config.lora_config is not None
else {}
)
default_mm_loras = (
vllm_config.lora_config.default_mm_loras
if vllm_config.lora_config is not None
@@ -729,66 +694,6 @@ async def init_app_state(
lora_modules=lora_modules,
)
await state.openai_serving_models.init_static_loras()
state.openai_serving_responses = (
OpenAIServingResponses(
engine_client,
state.openai_serving_models,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
tool_parser=args.tool_call_parser,
tool_server=tool_server,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
enable_log_outputs=args.enable_log_outputs,
log_error_stack=args.log_error_stack,
)
if "generate" in supported_tasks
else None
)
state.openai_serving_chat = (
OpenAIServingChat(
engine_client,
state.openai_serving_models,
args.response_role,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
default_chat_template_kwargs=args.default_chat_template_kwargs,
trust_request_chat_template=args.trust_request_chat_template,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
enable_log_outputs=args.enable_log_outputs,
enable_log_deltas=args.enable_log_deltas,
log_error_stack=args.log_error_stack,
)
if "generate" in supported_tasks
else None
)
# Warm up chat template processing to avoid first-request latency
if state.openai_serving_chat is not None:
await state.openai_serving_chat.warmup()
state.openai_serving_completion = (
OpenAIServingCompletion(
engine_client,
state.openai_serving_models,
request_logger=request_logger,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
log_error_stack=args.log_error_stack,
)
if "generate" in supported_tasks
else None
)
state.openai_serving_tokenization = OpenAIServingTokenization(
engine_client,
state.openai_serving_models,
@@ -798,64 +703,27 @@ async def init_app_state(
trust_request_chat_template=args.trust_request_chat_template,
log_error_stack=args.log_error_stack,
)
state.openai_serving_transcription = (
OpenAIServingTranscription(
engine_client,
state.openai_serving_models,
request_logger=request_logger,
log_error_stack=args.log_error_stack,
enable_force_include_usage=args.enable_force_include_usage,
)
if "transcription" in supported_tasks
else None
)
state.openai_serving_translation = (
OpenAIServingTranslation(
engine_client,
state.openai_serving_models,
request_logger=request_logger,
log_error_stack=args.log_error_stack,
enable_force_include_usage=args.enable_force_include_usage,
)
if "transcription" in supported_tasks
else None
)
state.anthropic_serving_messages = (
AnthropicServingMessages(
engine_client,
state.openai_serving_models,
args.response_role,
request_logger=request_logger,
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
enable_auto_tools=args.enable_auto_tool_choice,
tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_force_include_usage=args.enable_force_include_usage,
)
if "generate" in supported_tasks
else None
)
state.serving_tokens = (
ServingTokens(
engine_client,
state.openai_serving_models,
request_logger=request_logger,
return_tokens_as_token_ids=args.return_tokens_as_token_ids,
log_error_stack=args.log_error_stack,
enable_prompt_tokens_details=args.enable_prompt_tokens_details,
enable_log_outputs=args.enable_log_outputs,
force_no_detokenize=args.tokens_only,
)
if "generate" in supported_tasks
else None
)
from vllm.entrypoints.pooling import init_pooling_state
if "generate" in supported_tasks:
from vllm.entrypoints.openai.generate.api_router import init_generate_state
await init_pooling_state(engine_client, state, args)
await init_generate_state(
engine_client, state, args, request_logger, supported_tasks
)
if "transcription" in supported_tasks:
from vllm.entrypoints.openai.translations.api_router import (
init_transcription_state,
)
init_transcription_state(
engine_client, state, args, request_logger, supported_tasks
)
if any(task in POOLING_TASKS for task in supported_tasks):
from vllm.entrypoints.pooling import init_pooling_state
init_pooling_state(engine_client, state, args, request_logger, supported_tasks)
state.enable_server_load_tracking = args.enable_server_load_tracking
state.server_load_metrics = 0
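
On the state side, init_app_state no longer constructs every serving object inline; it delegates to per-family init functions (init_generate_state, init_transcription_state, init_pooling_state) and only imports and calls them when the matching task is supported. A hedged sketch of that delegation shape, with placeholder bodies rather than the real serving classes:

from argparse import Namespace


async def init_generate_state(engine_client, state, args: Namespace,
                              request_logger, supported_tasks) -> None:
    # Placeholder bodies; the real function builds OpenAIServingChat,
    # OpenAIServingCompletion, OpenAIServingResponses, and friends.
    state.openai_serving_chat = object()
    state.openai_serving_completion = object()


async def init_app_state_sketch(engine_client, state, args: Namespace,
                                supported_tasks: tuple[str, ...]) -> None:
    # Modules for unsupported task families are never imported, so e.g. a
    # pooling-only model never touches the generate or transcription paths.
    if "generate" in supported_tasks:
        await init_generate_state(engine_client, state, args, None, supported_tasks)
    # "transcription" and the pooling tasks follow the same conditional shape.
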
@@ -972,9 +840,11 @@ async def run_server_worker(
args,
client_config=client_config,
) as engine_client:
app = build_app(args)
supported_tasks = await engine_client.get_supported_tasks()
logger.info("Supported tasks: %s", supported_tasks)
await init_app_state(engine_client, app.state, args)
app = build_app(args, supported_tasks)
await init_app_state(engine_client, app.state, args, supported_tasks)
logger.info(
"Starting vLLM API server %d on %s",