Remove need for explicit \n in docstring lists for --help formatting (#38350)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2026-03-27 15:38:00 +00:00
committed by GitHub
parent 98e7f223b9
commit 21d2b53f88
10 changed files with 89 additions and 82 deletions

View File

@@ -66,22 +66,21 @@ class CacheConfig:
enable_prefix_caching: bool = True
"""Whether to enable prefix caching."""
prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
"""Set the hash algorithm for prefix caching:\n
- "sha256" uses Pickle for object serialization before hashing. This is the
current default, as SHA256 is the most secure choice to avoid potential
hash collisions.\n
"""Set the hash algorithm for prefix caching:
- "sha256" uses Pickle for object serialization before hashing. This is the current
default, as SHA256 is the most secure choice to avoid potential hash collisions.
- "sha256_cbor" provides a reproducible, cross-language compatible hash. It
serializes objects using canonical CBOR and hashes them with SHA-256.\n
serializes objects using canonical CBOR and hashes them with SHA-256.
- "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
non-cryptographic hashing. Requires the optional ``xxhash`` package.
IMPORTANT: Use of a hashing algorithm that is not considered
cryptographically secure theoretically increases the risk of hash collisions,
which can cause undefined behavior or even leak private information in
multi-tenant environments. Even if collisions are still very unlikely, it is
important to consider your security risk tolerance against the performance
benefits before turning this on.\n
non-cryptographic hashing. Requires the optional ``xxhash`` package.
IMPORTANT: Use of a hashing algorithm that is not considered cryptographically
secure theoretically increases the risk of hash collisions, which can cause
undefined behavior or even leak private information in multi-tenant environments.
Even if collisions are still very unlikely, it is important to consider your
security risk tolerance against the performance benefits before turning this on.
- "xxhash_cbor" combines canonical CBOR serialization with xxHash for
reproducible hashing. Requires the optional ``xxhash`` package."""
reproducible hashing. Requires the optional ``xxhash`` package."""
calculate_kv_scales: bool = False
"""Deprecated: This option is deprecated and will be removed in v0.19.
It enables dynamic calculation of `k_scale` and `v_scale` when

View File

@@ -32,14 +32,14 @@ class KernelConfig:
moe_backend: MoEBackend = "auto"
"""Backend for MoE expert computation kernels. Available options:
- "auto": Automatically select the best backend based on model and hardware\n
- "triton": Use Triton-based fused MoE kernels\n
- "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)\n
- "cutlass": Use vLLM CUTLASS kernels\n
- "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels\n
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels\n
- "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)\n
- "marlin": Use Marlin kernels (weight-only quantization)\n
- "auto": Automatically select the best backend based on model and hardware
- "triton": Use Triton-based fused MoE kernels
- "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
- "cutlass": Use vLLM CUTLASS kernels
- "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
- "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
- "marlin": Use Marlin kernels (weight-only quantization)
- "aiter": Use AMD AITer kernels (ROCm only)"""
@field_validator("moe_backend", mode="before")

View File

@@ -51,7 +51,7 @@ class LoadConfig:
- "gguf" will load weights from GGUF format files (details specified in
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).
- "mistral" will load weights from consolidated safetensors files used by
Mistral models.\n
Mistral models.
- Other custom values can be supported via plugins.
"""
download_dir: str | None = None

View File

@@ -125,26 +125,28 @@ class ModelConfig:
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode: TokenizerMode | str = "auto"
"""Tokenizer mode:\n
"""Tokenizer mode:
- "auto" will use the tokenizer from `mistral_common` for Mistral models
if available, otherwise it will use the "hf" tokenizer.\n
- "hf" will use the fast tokenizer if available.\n
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
if available, otherwise it will use the "hf" tokenizer.
- "hf" will use the fast tokenizer if available.
- "slow" will always use the slow tokenizer.
- "mistral" will always use the tokenizer from `mistral_common`.
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
- "qwen_vl" will always use the tokenizer from `qwen_vl`.
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
and tokenizer."""
dtype: ModelDType | torch.dtype = "auto"
"""Data type for model weights and activations:\n
"""Data type for model weights and activations:
- "auto" will use FP16 precision for FP32 and FP16 models, and BF16
precision for BF16 models.\n
- "half" for FP16. Recommended for AWQ quantization.\n
- "float16" is the same as "half".\n
- "bfloat16" for a balance between precision and range.\n
- "float" is shorthand for FP32 precision.\n
precision for BF16 models.
- "half" for FP16. Recommended for AWQ quantization.
- "float16" is the same as "half".
- "bfloat16" for a balance between precision and range.
- "float" is shorthand for FP32 precision.
- "float32" for FP32 precision."""
seed: int = 0
"""Random seed for reproducibility.
@@ -182,13 +184,14 @@ class ModelConfig:
automatically derived from the model config.
When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
format. Examples:\n
- 1k -> 1000\n
- 1K -> 1024\n
- 25.6k -> 25,600\n
format. Examples:
- 1k -> 1000
- 1K -> 1024
- 25.6k -> 25,600
- -1 or 'auto' -> Automatically choose the maximum model length that fits in
GPU memory. This will use the model's maximum context length if it fits,
otherwise it will find the largest length that can be accommodated."""
GPU memory. This will use the model's maximum context length if it fits,
otherwise it will find the largest length that can be accommodated."""
spec_target_max_model_len: int | None = None
"""Specify the maximum length for spec decoding draft models."""
quantization: QuantizationMethods | str | None = None
@@ -248,10 +251,11 @@ class ModelConfig:
prometheus metrics, if multiple names provided, metrics tag will take the
first one."""
config_format: str | ConfigFormat = "auto"
"""The format of the model config to load:\n
"""The format of the model config to load:
- "auto" will try to load the config in hf format if available after trying
to load in mistral format.\n
- "hf" will load the config in hf format.\n
to load in mistral format.
- "hf" will load the config in hf format.
- "mistral" will load the config in mistral format."""
hf_token: bool | str | None = None
"""The token to use as HTTP bearer authorization for remote files. If
@@ -276,12 +280,12 @@ class ModelConfig:
"""Enable sleep mode for the engine (only cuda and
hip platforms are supported)."""
model_impl: str | ModelImpl = "auto"
"""Which implementation of the model to use:\n
- "auto" will try to use the vLLM implementation, if it exists, and fall
back to the Transformers implementation if no vLLM implementation is
available.\n
- "vllm" will use the vLLM model implementation.\n
- "transformers" will use the Transformers model implementation.\n
"""Which implementation of the model to use:
- "auto" will try to use the vLLM implementation, if it exists, and fall back to the
Transformers implementation if no vLLM implementation is available.
- "vllm" will use the vLLM model implementation.
- "transformers" will use the Transformers model implementation.
- "terratorch" will use the TerraTorch model implementation.
"""
override_attention_dtype: str | None = None
@@ -1512,10 +1516,11 @@ class ModelConfig:
@property
def score_type(self) -> ScoreType:
"""
Scoring API handles score/rerank for:\n
- "classify" task (score_type: cross-encoder models)\n
- "embed" task (score_type: bi-encoder models)\n
- "token_embed" task (score_type: late interaction models)\n
Scoring API handles score/rerank for:
- "classify" task (score_type: cross-encoder models)
- "embed" task (score_type: bi-encoder models)
- "token_embed" task (score_type: late interaction models)
"""
# fixme: self._model_info.score_type is the score type before
# as_seq_cls_model, which is "bi-encoder", rather than the
@@ -1593,9 +1598,10 @@ class ModelConfig:
such as the lm_head in a generation model,
or the score or classifier in a classification model.
`head_dtype` currently only supports pooling models.\n
- The pooling model defaults to using fp32 head,
you can use --hf-overrides '{"head_dtype": "model"}' to disable it.
`head_dtype` currently only supports pooling models.
- The pooling model defaults to using fp32 head, you can use
--hf-overrides '{"head_dtype": "model"}' to disable it.
"""
head_dtype = _get_head_dtype(

View File

@@ -146,14 +146,14 @@ class MultiModalConfig:
parallelism (TP).
- `"weights"`: Within the same vLLM engine, split the weights of
each layer across TP ranks. (default TP behavior)\n
each layer across TP ranks. (default TP behavior)
- `"data"`: Within the same vLLM engine, split the batched input data
across TP ranks to process the data in parallel, while hosting
the full weights on each TP rank.
This batch-level DP is not to be confused with API request-level
DP (which is controlled by `--data-parallel-size`).
This is only supported on a per-model basis and falls back to
`"weights"` if the encoder does not support DP."""
across TP ranks to process the data in parallel, while hosting
the full weights on each TP rank.
This batch-level DP is not to be confused with API request-level
DP (which is controlled by `--data-parallel-size`).
This is only supported on a per-model basis and falls back to
`"weights"` if the encoder does not support DP."""
mm_encoder_attn_backend: AttentionBackendEnum | None = None
"""Optional override for the multi-modal encoder attention backend when
using vision transformers. Accepts any value from

View File

@@ -148,10 +148,11 @@ class ParallelConfig:
eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
"""Expert parallelism configuration."""
expert_placement_strategy: ExpertPlacementStrategy = "linear"
"""The expert placement strategy for MoE layers:\n
"""The expert placement strategy for MoE layers:
- "linear": Experts are placed in a contiguous manner. For example, with 4
experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have
experts [2, 3].\n
experts [2, 3].
- "round_robin": Experts are placed in a round-robin manner. For example,
with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
will have experts [1, 3]. This strategy can help improve load balancing
@@ -159,11 +160,11 @@ class ParallelConfig:
all2all_backend: All2AllBackend = "allgather_reducescatter"
"""All2All backend for MoE expert parallel communication. Available options:
- "allgather_reducescatter": All2all based on allgather and reducescatter\n
- "deepep_high_throughput": Use deepep high-throughput kernels\n
- "deepep_low_latency": Use deepep low-latency kernels\n
- "mori": Use mori kernels\n
- "nixl_ep": Use nixl-ep kernels\n
- "allgather_reducescatter": All2all based on allgather and reducescatter
- "deepep_high_throughput": Use deepep high-throughput kernels
- "deepep_low_latency": Use deepep low-latency kernels
- "mori": Use mori kernels
- "nixl_ep": Use nixl-ep kernels
- "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl
- "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels"""

View File

@@ -37,7 +37,7 @@ class ProfilerConfig:
profiler: ProfilerKind | None = None
"""Which profiler to use. Defaults to None. Options are:
- 'torch': Use PyTorch profiler.\n
- 'torch': Use PyTorch profiler.
- 'cuda': Use CUDA profiler."""
torch_profiler_dir: str = ""

View File

@@ -106,11 +106,12 @@ class SchedulerConfig:
max_num_batched_tokens in case max multimodal embedding size is larger."""
policy: SchedulerPolicy = "fcfs"
"""The scheduling policy to use:\n
- "fcfs" means first come first served, i.e. requests are handled in order
of arrival.\n
"""The scheduling policy to use:
- "fcfs" means first come first served, i.e. requests are handled in order
of arrival.
- "priority" means requests are handled based on given priority (lower
value means earlier handling), with time of arrival deciding any ties."""
value means earlier handling), with time of arrival deciding any ties."""
disable_chunked_mm_input: bool = False
"""If set to true and chunked prefill is enabled, we do not want to

View File

@@ -37,10 +37,12 @@ ConfigT = TypeVar("ConfigT", bound=DataclassInstance)
@overload
@dataclass_transform(field_specifiers=(PydanticField,))
def config(cls: type[ConfigT]) -> type[ConfigT]: ...
@overload
@dataclass_transform(field_specifiers=(PydanticField,))
def config(
*, config: ConfigDict | None = None, **kwargs: Any
) -> Callable[[type[ConfigT]], type[ConfigT]]: ...

View File

@@ -31,14 +31,12 @@ class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpForma
def _split_lines(self, text, width):
"""
1. Sentences split across lines have their single newlines removed.
2. Paragraphs and explicit newlines are split into separate lines.
2. Paragraphs and lists are split into separate lines.
3. Each line is wrapped to the specified width (width of terminal).
"""
# The patterns also include whitespace after the newline
single_newline = re.compile(r"(?<!\n)\n(?!\n)\s*")
multiple_newlines = re.compile(r"\n{2,}\s*")
text = single_newline.sub(" ", text)
lines = re.split(multiple_newlines, text)
# The pattern also includes whitespace after the newline
newlines_to_remove = re.compile(r"(?<!\n)\n(?!\n)(?!\s*(-|\*|\+|\d+\.))\s*")
lines = newlines_to_remove.sub(" ", text).splitlines()
return sum([textwrap.wrap(line, width) for line in lines], [])
def add_arguments(self, actions):