[Doc] Add engine args back in to the docs (#20674)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-07-10 16:02:40 +01:00
Committed by: GitHub
Parent: 77f77a951e
Commit: 3482fd7e4e
14 changed files with 218 additions and 40 deletions


@@ -12,8 +12,9 @@ import threading
 import warnings
 from dataclasses import MISSING, dataclass, fields, is_dataclass
 from itertools import permutations
-from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional,
-                    Type, TypeVar, Union, cast, get_args, get_origin)
+from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
+                    Literal, Optional, Type, TypeVar, Union, cast, get_args,
+                    get_origin)
 
 import regex as re
 import torch
@@ -33,20 +34,26 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
                         SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
                         TaskOption, TokenizerMode, TokenizerPoolConfig,
                         VllmConfig, get_attr_docs, get_field)
-from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
 from vllm.reasoning import ReasoningParserManager
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
-from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
                         GiB_bytes, get_ip, is_in_ray_actor)
 
 # yapf: enable
 
+if TYPE_CHECKING:
+    from vllm.executor.executor_base import ExecutorBase
+    from vllm.model_executor.layers.quantization import QuantizationMethods
+    from vllm.usage.usage_lib import UsageContext
+else:
+    ExecutorBase = Any
+    QuantizationMethods = Any
+    UsageContext = Any
+
 logger = init_logger(__name__)
 
 # object is used to allow for special typing forms
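The moved imports follow the standard `typing.TYPE_CHECKING` pattern: the modules are imported only while a static type checker runs, and at runtime the names fall back to `Any` so annotations still resolve by name. A minimal, runnable sketch of the same pattern (using stdlib `Decimal` as a stand-in for a heavyweight import):

```python
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Evaluated only by static type checkers (mypy, pyright); never at runtime.
    from decimal import Decimal  # stand-in for a heavyweight import
else:
    # Runtime fallback so annotations that name Decimal still resolve.
    Decimal = Any


def to_decimal(value: str) -> "Decimal":
    # Defer the real import to the call site that needs the object,
    # mirroring the local UsageContext import later in this commit.
    from decimal import Decimal
    return Decimal(value)


print(to_decimal("1.25"))  # -> Decimal('1.25')
```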
@@ -200,14 +207,17 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
         kwargs[name] = {"default": default, "help": help}
 
         # Set other kwargs based on the type hints
-        json_tip = """\n\nShould either be a valid JSON string or JSON keys
-        passed individually. For example, the following sets of arguments are
-        equivalent:\n\n
-        - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
-        - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n
-        Additionally, list elements can be passed individually using '+':
-        - `--json-arg '{"key4": ["value3", "value4", "value5"]}'`\n
-        - `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`\n\n"""
+        json_tip = """Should either be a valid JSON string or JSON keys
+passed individually. For example, the following sets of arguments are
+equivalent:
+
+- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
+- `--json-arg.key1 value1 --json-arg.key2.key3 value2`
+
+Additionally, list elements can be passed individually using `+`:
+
+- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`\n
+- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`"""
         if dataclass_cls is not None:
 
             def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
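The dotted-key syntax described by `json_tip` folds flags like `--json-arg.key2.key3 value2` into a nested dict. A simplified sketch of that merging behavior (not vLLM's actual parser, which also handles repeated `+` flags and JSON-typed values):

```python
import json


def merge_dotted(args: dict[str, str]) -> dict:
    """Fold {'key2.key3': 'value2'} style keys into a nested dict."""
    result: dict = {}
    for dotted, value in args.items():
        node = result
        *parents, leaf = dotted.split(".")
        for part in parents:
            node = node.setdefault(part, {})
        if leaf.endswith("+"):  # `--json-arg.key4+='value4,value5'` appends
            node.setdefault(leaf[:-1], []).extend(value.split(","))
        else:
            node[leaf] = value
    return result


flags = {"key1": "value1", "key2.key3": "value2", "key4+": "value4,value5"}
assert merge_dotted(flags) == json.loads(
    '{"key1": "value1", "key2": {"key3": "value2"}, '
    '"key4": ["value4", "value5"]}')
```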
@@ -219,7 +229,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
                     raise argparse.ArgumentTypeError(repr(e)) from e
 
             kwargs[name]["type"] = parse_dataclass
-            kwargs[name]["help"] += json_tip
+            kwargs[name]["help"] += f"\n\n{json_tip}"
         elif contains_type(type_hints, bool):
             # Creates --no-<name> and --<name> flags
             kwargs[name]["action"] = argparse.BooleanOptionalAction
@@ -255,7 +265,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
             kwargs[name]["type"] = union_dict_and_str
         elif contains_type(type_hints, dict):
             kwargs[name]["type"] = parse_type(json.loads)
-            kwargs[name]["help"] += json_tip
+            kwargs[name]["help"] += f"\n\n{json_tip}"
         elif (contains_type(type_hints, str)
               or any(is_not_builtin(th) for th in type_hints)):
             kwargs[name]["type"] = str
@@ -1545,7 +1555,6 @@ class EngineArgs:
             # Enable chunked prefill by default for long context (> 32K)
             # models to avoid OOM errors in initial memory profiling phase.
             elif use_long_context:
-                from vllm.platforms import current_platform
                 is_gpu = current_platform.is_cuda()
                 use_sliding_window = (model_config.get_sliding_window()
                                       is not None)
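The local `current_platform` import is dropped because the module-level import above now covers it. The decision this hunk participates in, reduced to a hedged sketch (the 32K threshold comes from the comment; the real code also weighs scheduler settings and platform capabilities):

```python
def default_chunked_prefill(max_model_len: int, is_cuda: bool,
                            has_sliding_window: bool) -> bool:
    """Simplified sketch of the long-context chunked-prefill default."""
    use_long_context = max_model_len > 32_768
    # Mirrors `is_gpu = current_platform.is_cuda()` and the
    # sliding-window check from the hunk above.
    return use_long_context and is_cuda and not has_sliding_window


assert default_chunked_prefill(131_072, is_cuda=True,
                               has_sliding_window=False)
assert not default_chunked_prefill(8_192, is_cuda=True,
                                   has_sliding_window=False)
```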
@@ -1653,6 +1662,7 @@ class EngineArgs:
         # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
         # throughput, see PR #17885 for more details.
         # So here we do an extra device name check to prevent such regression.
+        from vllm.usage.usage_lib import UsageContext
         if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
             # For GPUs like H100 and MI300x, use larger default values.
             default_max_num_batched_tokens = {
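Since `UsageContext` is now only `Any` at runtime, this local import restores the real enum where it is actually used. The gate itself, sketched with illustrative token counts (the real lookup table is truncated in this hunk; only the 70 GiB / "a100" condition comes from the diff):

```python
GiB_bytes = 1 << 30  # matches the GiB_bytes constant from vllm.utils


def pick_default_max_num_batched_tokens(device_memory: int,
                                        device_name: str) -> int:
    # The 70 GiB / "a100" gate is from the hunk above; the token counts
    # are placeholders, since the real table is cut off in this diff.
    if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
        return 16_384  # larger default for H100/MI300x-class devices
    return 8_192


assert pick_default_max_num_batched_tokens(80 * GiB_bytes,
                                           "nvidia h100") == 16_384
assert pick_default_max_num_batched_tokens(80 * GiB_bytes,
                                           "nvidia a100") == 8_192
```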