Remove need for explicit \n in docstring lists for --help formatting (#38350)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-03-27 15:38:00 +00:00
parent 98e7f223b9
commit 21d2b53f88
10 changed files with 89 additions and 82 deletions
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -66,22 +66,21 @@ class CacheConfig:
    enable_prefix_caching: bool = True
    """Whether to enable prefix caching."""
    prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
-    """Set the hash algorithm for prefix caching:\n
-    - "sha256" uses Pickle for object serialization before hashing. This is the
-    current default, as SHA256 is the most secure choice to avoid potential
-    hash collisions.\n
+    """Set the hash algorithm for prefix caching:
+
+    - "sha256" uses Pickle for object serialization before hashing. This is the current
+      default, as SHA256 is the most secure choice to avoid potential hash collisions.
    - "sha256_cbor" provides a reproducible, cross-language compatible hash. It
-    serializes objects using canonical CBOR and hashes them with SHA-256.\n
+      serializes objects using canonical CBOR and hashes them with SHA-256.
    - "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
-    non-cryptographic hashing. Requires the optional ``xxhash`` package.
-    IMPORTANT: Use of a hashing algorithm that is not considered 
-    cryptographically secure theoretically increases the risk of hash collisions,
-    which can cause undefined behavior or even leak private information in
-    multi-tenant environments. Even if collisions are still very unlikely, it is
-    important to consider your security risk tolerance against the performance
-    benefits before turning this on.\n
+      non-cryptographic hashing. Requires the optional ``xxhash`` package.
+      IMPORTANT: Use of a hashing algorithm that is not considered cryptographically
+      secure theoretically increases the risk of hash collisions, which can cause
+      undefined behavior or even leak private information in multi-tenant environments.
+      Even if collisions are still very unlikely, it is important to consider your
+      security risk tolerance against the performance benefits before turning this on.
    - "xxhash_cbor" combines canonical CBOR serialization with xxHash for
-    reproducible hashing. Requires the optional ``xxhash`` package."""
+      reproducible hashing. Requires the optional ``xxhash`` package."""
    calculate_kv_scales: bool = False
    """Deprecated: This option is deprecated and will be removed in v0.19.
    It enables dynamic calculation of `k_scale` and `v_scale` when
--- a/vllm/config/kernel.py
+++ b/vllm/config/kernel.py
@@ -32,14 +32,14 @@ class KernelConfig:
    moe_backend: MoEBackend = "auto"
    """Backend for MoE expert computation kernels. Available options:

-    - "auto": Automatically select the best backend based on model and hardware\n
-    - "triton": Use Triton-based fused MoE kernels\n
-    - "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)\n
-    - "cutlass": Use vLLM CUTLASS kernels\n
-    - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels\n
-    - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels\n
-    - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)\n
-    - "marlin": Use Marlin kernels (weight-only quantization)\n
+    - "auto": Automatically select the best backend based on model and hardware
+    - "triton": Use Triton-based fused MoE kernels
+    - "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
+    - "cutlass": Use vLLM CUTLASS kernels
+    - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
+    - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
+    - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
+    - "marlin": Use Marlin kernels (weight-only quantization)
    - "aiter": Use AMD AITer kernels (ROCm only)"""

    @field_validator("moe_backend", mode="before")
--- a/vllm/config/load.py
+++ b/vllm/config/load.py
@@ -51,7 +51,7 @@ class LoadConfig:
    - "gguf" will load weights from GGUF format files (details specified in
      https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).
    - "mistral" will load weights from consolidated safetensors files used by
-      Mistral models.\n
+      Mistral models.
    - Other custom values can be supported via plugins.
    """
    download_dir: str | None = None
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -125,26 +125,28 @@ class ModelConfig:
    """Name or path of the Hugging Face tokenizer to use. If unspecified, model
    name or path will be used."""
    tokenizer_mode: TokenizerMode | str = "auto"
-    """Tokenizer mode:\n
+    """Tokenizer mode:
+
    - "auto" will use the tokenizer from `mistral_common` for Mistral models
-    if available, otherwise it will use the "hf" tokenizer.\n
-    - "hf" will use the fast tokenizer if available.\n
-    - "slow" will always use the slow tokenizer.\n
-    - "mistral" will always use the tokenizer from `mistral_common`.\n
-    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
-    - "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
+      if available, otherwise it will use the "hf" tokenizer.
+    - "hf" will use the fast tokenizer if available.
+    - "slow" will always use the slow tokenizer.
+    - "mistral" will always use the tokenizer from `mistral_common`.
+    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
+    - "qwen_vl" will always use the tokenizer from `qwen_vl`.
    - Other custom values can be supported via plugins."""
    trust_remote_code: bool = False
    """Trust remote code (e.g., from HuggingFace) when downloading the model
    and tokenizer."""
    dtype: ModelDType | torch.dtype = "auto"
-    """Data type for model weights and activations:\n
+    """Data type for model weights and activations:
+
    - "auto" will use FP16 precision for FP32 and FP16 models, and BF16
-    precision for BF16 models.\n
-    - "half" for FP16. Recommended for AWQ quantization.\n
-    - "float16" is the same as "half".\n
-    - "bfloat16" for a balance between precision and range.\n
-    - "float" is shorthand for FP32 precision.\n
+      precision for BF16 models.
+    - "half" for FP16. Recommended for AWQ quantization.
+    - "float16" is the same as "half".
+    - "bfloat16" for a balance between precision and range.
+    - "float" is shorthand for FP32 precision.
    - "float32" for FP32 precision."""
    seed: int = 0
    """Random seed for reproducibility.
@@ -182,13 +184,14 @@ class ModelConfig:
    automatically derived from the model config.

    When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
-    format. Examples:\n
-    - 1k -> 1000\n
-    - 1K -> 1024\n
-    - 25.6k -> 25,600\n
+    format. Examples:
+
+    - 1k -> 1000
+    - 1K -> 1024
+    - 25.6k -> 25,600
    - -1 or 'auto' -> Automatically choose the maximum model length that fits in
-    GPU memory. This will use the model's maximum context length if it fits,
-    otherwise it will find the largest length that can be accommodated."""
+      GPU memory. This will use the model's maximum context length if it fits,
+      otherwise it will find the largest length that can be accommodated."""
    spec_target_max_model_len: int | None = None
    """Specify the maximum length for spec decoding draft models."""
    quantization: QuantizationMethods | str | None = None
@@ -248,10 +251,11 @@ class ModelConfig:
    prometheus metrics, if multiple names provided, metrics tag will take the
    first one."""
    config_format: str | ConfigFormat = "auto"
-    """The format of the model config to load:\n
+    """The format of the model config to load:
+
    - "auto" will try to load the config in hf format if available after trying
-    to load in mistral format.\n
-    - "hf" will load the config in hf format.\n
+      to load in mistral format.
+    - "hf" will load the config in hf format.
    - "mistral" will load the config in mistral format."""
    hf_token: bool | str | None = None
    """The token to use as HTTP bearer authorization for remote files . If
@@ -276,12 +280,12 @@ class ModelConfig:
    """Enable sleep mode for the engine (only cuda and
    hip platforms are supported)."""
    model_impl: str | ModelImpl = "auto"
-    """Which implementation of the model to use:\n
-    - "auto" will try to use the vLLM implementation, if it exists, and fall
-    back to the Transformers implementation if no vLLM implementation is
-    available.\n
-    - "vllm" will use the vLLM model implementation.\n
-    - "transformers" will use the Transformers model implementation.\n
+    """Which implementation of the model to use:
+
+    - "auto" will try to use the vLLM implementation, if it exists, and fall back to the
+      Transformers implementation if no vLLM implementation is available.
+    - "vllm" will use the vLLM model implementation.
+    - "transformers" will use the Transformers model implementation.
    - "terratorch" will use the TerraTorch model implementation.
    """
    override_attention_dtype: str | None = None
@@ -1512,10 +1516,11 @@ class ModelConfig:
    @property
    def score_type(self) -> ScoreType:
        """
-        Scoring API handles score/rerank for:\n
-        - "classify" task (score_type: cross-encoder models)\n
-        - "embed" task (score_type: bi-encoder models)\n
-        - "token_embed" task (score_type: late interaction models)\n
+        Scoring API handles score/rerank for:
+
+        - "classify" task (score_type: cross-encoder models)
+        - "embed" task (score_type: bi-encoder models)
+        - "token_embed" task (score_type: late interaction models)
        """
        # fixme: self._model_info.score_type is the score type before
        #  as_seq_cls_model, which is "bi-encoder", rather than the
@@ -1593,9 +1598,10 @@ class ModelConfig:
        such as the lm_head in a generation model,
        or the score or classifier in a classification model.

-        `head_dtype` currently only supports pooling models.\n
-        - The pooling model defaults to using fp32 head,
-        you can use --hf-overrides '{"head_dtype": "model"}' to disable it.
+        `head_dtype` currently only supports pooling models.
+
+        - The pooling model defaults to using fp32 head, you can use
+          --hf-overrides '{"head_dtype": "model"}' to disable it.
        """

        head_dtype = _get_head_dtype(
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -146,14 +146,14 @@ class MultiModalConfig:
    parallelism (TP).

    - `"weights"`: Within the same vLLM engine, split the weights of
-        each layer across TP ranks. (default TP behavior)\n
+      each layer across TP ranks. (default TP behavior)
    - `"data"`: Within the same vLLM engine, split the batched input data
-        across TP ranks to process the data in parallel, while hosting
-        the full weights on each TP rank.
-        This batch-level DP is not to be confused with API request-level
-        DP (which is controlled by `--data-parallel-size`).
-        This is only supported on a per-model basis and falls back to
-        `"weights"` if the encoder does not support DP."""
+      across TP ranks to process the data in parallel, while hosting
+      the full weights on each TP rank.
+      This batch-level DP is not to be confused with API request-level
+      DP (which is controlled by `--data-parallel-size`).
+      This is only supported on a per-model basis and falls back to
+      `"weights"` if the encoder does not support DP."""
    mm_encoder_attn_backend: AttentionBackendEnum | None = None
    """Optional override for the multi-modal encoder attention backend when
    using vision transformers. Accepts any value from
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -148,10 +148,11 @@ class ParallelConfig:
    eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
    """Expert parallelism configuration."""
    expert_placement_strategy: ExpertPlacementStrategy = "linear"
-    """The expert placement strategy for MoE layers:\n
+    """The expert placement strategy for MoE layers:
+
    - "linear": Experts are placed in a contiguous manner. For example, with 4
      experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have
-      experts [2, 3].\n
+      experts [2, 3].
    - "round_robin": Experts are placed in a round-robin manner. For example,
      with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
      will have experts [1, 3]. This strategy can help improve load balancing
@@ -159,11 +160,11 @@ class ParallelConfig:
    all2all_backend: All2AllBackend = "allgather_reducescatter"
    """All2All backend for MoE expert parallel communication. Available options:

-    - "allgather_reducescatter": All2all based on allgather and reducescatter\n
-    - "deepep_high_throughput": Use deepep high-throughput kernels\n
-    - "deepep_low_latency": Use deepep low-latency kernels\n
-    - "mori": Use mori kernels\n
-    - "nixl_ep": Use nixl-ep kernels\n
+    - "allgather_reducescatter": All2all based on allgather and reducescatter
+    - "deepep_high_throughput": Use deepep high-throughput kernels
+    - "deepep_low_latency": Use deepep low-latency kernels
+    - "mori": Use mori kernels
+    - "nixl_ep": Use nixl-ep kernels
    - "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl
    - "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels"""

--- a/vllm/config/profiler.py
+++ b/vllm/config/profiler.py
@@ -37,7 +37,7 @@ class ProfilerConfig:
    profiler: ProfilerKind | None = None
    """Which profiler to use. Defaults to None. Options are:

-    - 'torch': Use PyTorch profiler.\n
+    - 'torch': Use PyTorch profiler.
    - 'cuda': Use CUDA profiler."""

    torch_profiler_dir: str = ""
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -106,11 +106,12 @@ class SchedulerConfig:
    max_num_batched_tokens in case max multimodal embedding size is larger."""

    policy: SchedulerPolicy = "fcfs"
-    """The scheduling policy to use:\n
-    - "fcfs" means first come first served, i.e. requests are handled in order
-    of arrival.\n
+    """The scheduling policy to use:
+
+    - "fcfs" means first come first served, i.e. requests are handled in order
+      of arrival.
    - "priority" means requests are handled based on given priority (lower
-    value means earlier handling) and time of arrival deciding any ties)."""
+      value means earlier handling) and time of arrival deciding any ties."""

    disable_chunked_mm_input: bool = False
    """If set to true and chunked prefill is enabled, we do not want to
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -37,10 +37,12 @@ ConfigT = TypeVar("ConfigT", bound=DataclassInstance)


@overload
+@dataclass_transform(field_specifiers=(PydanticField,))
 def config(cls: type[ConfigT]) -> type[ConfigT]: ...


@overload
+@dataclass_transform(field_specifiers=(PydanticField,))
 def config(
    *, config: ConfigDict | None = None, **kwargs: Any
 ) -> Callable[[type[ConfigT]], type[ConfigT]]: ...
--- a/vllm/utils/argparse_utils.py
+++ b/vllm/utils/argparse_utils.py
@@ -31,14 +31,12 @@ class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpForma
    def _split_lines(self, text, width):
        """
        1. Sentences split across lines have their single newlines removed.
-        2. Paragraphs and explicit newlines are split into separate lines.
+        2. Paragraphs and lists are split into separate lines.
        3. Each line is wrapped to the specified width (width of terminal).
        """
-        # The patterns also include whitespace after the newline
-        single_newline = re.compile(r"(?<!\n)\n(?!\n)\s*")
-        multiple_newlines = re.compile(r"\n{2,}\s*")
-        text = single_newline.sub(" ", text)
-        lines = re.split(multiple_newlines, text)
+        # The pattern also includes whitespace after the newline
+        newlines_to_remove = re.compile(r"(?<!\n)\n(?!\n)(?!\s*(-|\*|\+|\d+\.))\s*")
+        lines = newlines_to_remove.sub(" ", text).splitlines()
        return sum([textwrap.wrap(line, width) for line in lines], [])

    def add_arguments(self, actions):