# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import asyncio
import functools
import os
import sys
from typing import Any, Optional, Union

from fastapi import Request
from fastapi.responses import JSONResponse, StreamingResponse
from starlette.background import BackgroundTask, BackgroundTasks

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              CompletionRequest)
from vllm.logger import init_logger
from vllm.platforms import current_platform

logger = init_logger(__name__)

VLLM_SUBCMD_PARSER_EPILOG = (
    "Tip: Use `vllm [serve|run-batch|bench <bench_type>] "
    "--help=<keyword>` to explore arguments from help.\n"
    "   - To view an argument group:    --help=ModelConfig\n"
    "   - To view a single argument:    --help=max-num-seqs\n"
    "   - To search by keyword:         --help=max\n"
    "   - To list all groups:           --help=listgroup")


async def listen_for_disconnect(request: Request) -> None:
    """Returns if a disconnect message is received"""
    while True:
        message = await request.receive()
        if message["type"] == "http.disconnect":
            if request.app.state.enable_server_load_tracking:
                # on timeout/cancellation the BackgroundTask in
                # load_aware_call cannot decrement the server load metrics.
                # Must be decremented by with_cancellation instead.
                request.app.state.server_load_metrics -= 1
            break


def with_cancellation(handler_func):
    """Decorator that allows a route handler to be cancelled by client
    disconnections.

    This does _not_ use request.is_disconnected, which does not work with
    middleware. Instead this follows the pattern from
    starlette.StreamingResponse, which simultaneously awaits on two tasks:
    one to wait for an http disconnect message, and the other to do the work
    that we want done. When the first task finishes, the other is cancelled.

    A core assumption of this method is that the body of the request has
    already been read. This is a safe assumption to make for fastapi handlers
    that have already parsed the body of the request into a pydantic model
    for us.
    This decorator is unsafe to use elsewhere, as it will consume and throw
    away all incoming messages for the request while it looks for a
    disconnect message.

    In the case where a `StreamingResponse` is returned by the handler, this
    wrapper will stop listening for disconnects and instead the response
    object will start listening for disconnects.
    """

    # Functools.wraps is required for this wrapper to appear to fastapi as a
    # normal route handler, with the correct request type hinting.
    @functools.wraps(handler_func)
    async def wrapper(*args, **kwargs):
        # The request is either the second positional arg or `raw_request`
        request = args[1] if len(args) > 1 else kwargs["raw_request"]

        handler_task = asyncio.create_task(handler_func(*args, **kwargs))
        cancellation_task = asyncio.create_task(listen_for_disconnect(request))

        done, pending = await asyncio.wait([handler_task, cancellation_task],
                                           return_when=asyncio.FIRST_COMPLETED)
        for task in pending:
            task.cancel()

        if handler_task in done:
            return handler_task.result()
        return None

    return wrapper
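

# Illustrative sketch (hypothetical route and handler names, not part of this
# module): the decorator expects the raw `Request` either as the second
# positional argument or as the `raw_request` keyword argument, and it should
# sit below the route decorator so that FastAPI registers the wrapped,
# cancellable coroutine:
#
#     @app.post("/v1/chat/completions")
#     @with_cancellation
#     async def create_chat_completion(request: ChatCompletionRequest,
#                                      raw_request: Request):
#         ...  # body already parsed into `request` by FastAPI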


def decrement_server_load(request: Request):
    request.app.state.server_load_metrics -= 1


def load_aware_call(func):

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        raw_request = kwargs.get("raw_request",
                                 args[1] if len(args) > 1 else None)
        if raw_request is None:
            raise ValueError(
                "raw_request required when server load tracking is enabled")

        if not raw_request.app.state.enable_server_load_tracking:
            return await func(*args, **kwargs)

        raw_request.app.state.server_load_metrics += 1
        try:
            response = await func(*args, **kwargs)
        except Exception:
            raw_request.app.state.server_load_metrics -= 1
            raise

        if isinstance(response, (JSONResponse, StreamingResponse)):
            if response.background is None:
                response.background = BackgroundTask(decrement_server_load,
                                                     raw_request)
            elif isinstance(response.background, BackgroundTasks):
                response.background.add_task(decrement_server_load,
                                             raw_request)
            elif isinstance(response.background, BackgroundTask):
                # Convert the single BackgroundTask to BackgroundTasks
                # and chain the decrement_server_load task to it
                tasks = BackgroundTasks()
                tasks.add_task(response.background.func,
                               *response.background.args,
                               **response.background.kwargs)
                tasks.add_task(decrement_server_load, raw_request)
                response.background = tasks
        else:
            raw_request.app.state.server_load_metrics -= 1

        return response

    return wrapper
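

# Illustrative sketch (hypothetical handler, not part of this module): like
# with_cancellation, this decorator looks for the raw `Request` as the second
# positional argument or as `raw_request`, and it only tracks load when
# `app.state.enable_server_load_tracking` is set. The two decorators can be
# stacked on one handler:
#
#     @app.post("/v1/completions")
#     @with_cancellation
#     @load_aware_call
#     async def create_completion(request: CompletionRequest,
#                                 raw_request: Request):
#         ...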


def cli_env_setup():
    # The safest multiprocessing method is `spawn`, as the default `fork`
    # method is not compatible with some accelerators. The default method
    # will be changing in future versions of Python, so we should use it
    # explicitly when possible.
    #
    # We only set it here in the CLI entrypoint, because changing to `spawn`
    # could break some existing code using vLLM as a library. `spawn` will
    # cause unexpected behavior if the code is not protected by
    # `if __name__ == "__main__":`.
    #
    # References:
    # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
    # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
    # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
    # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders
    if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ:
        logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'")
        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
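

# Illustrative note (a sketch, not part of this module): code that embeds
# vLLM as a library and wants the same `spawn` behavior can set the variable
# itself, as long as its entrypoint is guarded:
#
#     import os
#     os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
#
#     if __name__ == "__main__":
#         main()  # hypothetical entrypoint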


def _validate_truncation_size(
    max_model_len: int,
    truncate_prompt_tokens: Optional[int],
    tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> Optional[int]:

    if truncate_prompt_tokens is not None:
        if truncate_prompt_tokens <= -1:
            truncate_prompt_tokens = max_model_len

        if truncate_prompt_tokens > max_model_len:
            raise ValueError(
                f"truncate_prompt_tokens value ({truncate_prompt_tokens}) "
                f"is greater than max_model_len ({max_model_len})."
                f" Please select a smaller truncation size.")

        if tokenization_kwargs is not None:
            tokenization_kwargs["truncation"] = True
            tokenization_kwargs["max_length"] = truncate_prompt_tokens
    else:
        if tokenization_kwargs is not None:
            tokenization_kwargs["truncation"] = False

    return truncate_prompt_tokens
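

# Illustrative sketch of the contract (values are made up): a value of -1
# means "truncate to the full context window", and the kwargs dict, when
# provided, is filled in place for the tokenizer:
#
#     tok_kwargs: dict[str, Any] = {}
#     _validate_truncation_size(max_model_len=4096,
#                               truncate_prompt_tokens=-1,
#                               tokenization_kwargs=tok_kwargs)
#     # returns 4096; tok_kwargs == {"truncation": True, "max_length": 4096}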


def show_filtered_argument_or_group_from_help(parser: argparse.ArgumentParser,
                                              subcommand_name: list[str]):
    # Only handle --help=<keyword> for the current subcommand.
    # Since subparser_init() runs for all subcommands during CLI setup,
    # we skip processing if the subcommand name is not in sys.argv.
    # sys.argv[0] is the program name. The subcommand follows.
    # e.g., for `vllm bench latency`,
    # sys.argv is `['vllm', 'bench', 'latency', ...]`
    # and subcommand_name is `["bench", "latency"]`.
    if len(sys.argv) <= len(subcommand_name) or sys.argv[
            1:1 + len(subcommand_name)] != subcommand_name:
        return

    for arg in sys.argv:
        if arg.startswith('--help='):
            search_keyword = arg.split('=', 1)[1]

            # List available groups
            if search_keyword == 'listgroup':
                print("\nAvailable argument groups:")
                for group in parser._action_groups:
                    if group.title and not group.title.startswith(
                            "positional arguments"):
                        print(f"  - {group.title}")
                        if group.description:
                            print("    " + group.description.strip())
                        print()
                sys.exit(0)

            # For group search
            formatter = parser._get_formatter()
            for group in parser._action_groups:
                if group.title and group.title.lower() == search_keyword.lower(
                ):
                    formatter.start_section(group.title)
                    formatter.add_text(group.description)
                    formatter.add_arguments(group._group_actions)
                    formatter.end_section()
                    print(formatter.format_help())
                    sys.exit(0)

            # For single arg
            matched_actions = []

            for group in parser._action_groups:
                for action in group._group_actions:
                    # search option name
                    if any(search_keyword.lower() in opt.lower()
                           for opt in action.option_strings):
                        matched_actions.append(action)

            if matched_actions:
                print(f"\nParameters matching '{search_keyword}':\n")
                formatter = parser._get_formatter()
                formatter.add_arguments(matched_actions)
                print(formatter.format_help())
                sys.exit(0)

            print(f"\nNo group or parameter matching '{search_keyword}'")
            print("Tip: use `--help=listgroup` to view all groups.")
            sys.exit(1)
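

# Illustrative CLI usage (the exact output depends on the parser passed in):
#
#     vllm serve --help=listgroup      # list all argument groups
#     vllm serve --help=ModelConfig    # show every argument in one group
#     vllm serve --help=max-num-seqs   # show a single argument
#     vllm serve --help=max            # keyword search over option names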


def get_max_tokens(max_model_len: int, request: Union[ChatCompletionRequest,
                                                      CompletionRequest],
                   input_length: int, default_sampling_params: dict) -> int:

    max_tokens = getattr(request, "max_completion_tokens",
                         None) or request.max_tokens
    default_max_tokens = max_model_len - input_length
    max_output_tokens = current_platform.get_max_output_tokens(input_length)

    return min(val
               for val in (default_max_tokens, max_tokens, max_output_tokens,
                           default_sampling_params.get("max_tokens"))
               if val is not None)
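

# Illustrative worked example (numbers made up): with max_model_len=8192 and
# input_length=1000, default_max_tokens is 7192. If the request asks for
# max_tokens=512, the platform cap is larger than both, and
# default_sampling_params has no "max_tokens" entry, the smallest non-None
# candidate, and therefore the result, is 512.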