Remove need for explicit \n in docstring lists for --help formatting (#38350)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -66,22 +66,21 @@ class CacheConfig:
|
||||
enable_prefix_caching: bool = True
|
||||
"""Whether to enable prefix caching."""
|
||||
prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
|
||||
"""Set the hash algorithm for prefix caching:\n
|
||||
- "sha256" uses Pickle for object serialization before hashing. This is the
|
||||
current default, as SHA256 is the most secure choice to avoid potential
|
||||
hash collisions.\n
|
||||
"""Set the hash algorithm for prefix caching:
|
||||
|
||||
- "sha256" uses Pickle for object serialization before hashing. This is the current
|
||||
default, as SHA256 is the most secure choice to avoid potential hash collisions.
|
||||
- "sha256_cbor" provides a reproducible, cross-language compatible hash. It
|
||||
serializes objects using canonical CBOR and hashes them with SHA-256.\n
|
||||
serializes objects using canonical CBOR and hashes them with SHA-256.
|
||||
- "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
|
||||
non-cryptographic hashing. Requires the optional ``xxhash`` package.
|
||||
IMPORTANT: Use of a hashing algorithm that is not considered
|
||||
cryptographically secure theoretically increases the risk of hash collisions,
|
||||
which can cause undefined behavior or even leak private information in
|
||||
multi-tenant environments. Even if collisions are still very unlikely, it is
|
||||
important to consider your security risk tolerance against the performance
|
||||
benefits before turning this on.\n
|
||||
non-cryptographic hashing. Requires the optional ``xxhash`` package.
|
||||
IMPORTANT: Use of a hashing algorithm that is not considered cryptographically
|
||||
secure theoretically increases the risk of hash collisions, which can cause
|
||||
undefined behavior or even leak private information in multi-tenant environments.
|
||||
Even if collisions are still very unlikely, it is important to consider your
|
||||
security risk tolerance against the performance benefits before turning this on.
|
||||
- "xxhash_cbor" combines canonical CBOR serialization with xxHash for
|
||||
reproducible hashing. Requires the optional ``xxhash`` package."""
|
||||
reproducible hashing. Requires the optional ``xxhash`` package."""
|
||||
calculate_kv_scales: bool = False
|
||||
"""Deprecated: This option is deprecated and will be removed in v0.19.
|
||||
It enables dynamic calculation of `k_scale` and `v_scale` when
|
||||
|
||||
@@ -32,14 +32,14 @@ class KernelConfig:
|
||||
moe_backend: MoEBackend = "auto"
|
||||
"""Backend for MoE expert computation kernels. Available options:
|
||||
|
||||
- "auto": Automatically select the best backend based on model and hardware\n
|
||||
- "triton": Use Triton-based fused MoE kernels\n
|
||||
- "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)\n
|
||||
- "cutlass": Use vLLM CUTLASS kernels\n
|
||||
- "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels\n
|
||||
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels\n
|
||||
- "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)\n
|
||||
- "marlin": Use Marlin kernels (weight-only quantization)\n
|
||||
- "auto": Automatically select the best backend based on model and hardware
|
||||
- "triton": Use Triton-based fused MoE kernels
|
||||
- "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
|
||||
- "cutlass": Use vLLM CUTLASS kernels
|
||||
- "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
|
||||
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
|
||||
- "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
|
||||
- "marlin": Use Marlin kernels (weight-only quantization)
|
||||
- "aiter": Use AMD AITer kernels (ROCm only)"""
|
||||
|
||||
@field_validator("moe_backend", mode="before")
|
||||
|
||||
@@ -51,7 +51,7 @@ class LoadConfig:
|
||||
- "gguf" will load weights from GGUF format files (details specified in
|
||||
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).
|
||||
- "mistral" will load weights from consolidated safetensors files used by
|
||||
Mistral models.\n
|
||||
Mistral models.
|
||||
- Other custom values can be supported via plugins.
|
||||
"""
|
||||
download_dir: str | None = None
|
||||
|
||||
@@ -125,26 +125,28 @@ class ModelConfig:
|
||||
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
|
||||
name or path will be used."""
|
||||
tokenizer_mode: TokenizerMode | str = "auto"
|
||||
"""Tokenizer mode:\n
|
||||
"""Tokenizer mode:
|
||||
|
||||
- "auto" will use the tokenizer from `mistral_common` for Mistral models
|
||||
if available, otherwise it will use the "hf" tokenizer.\n
|
||||
- "hf" will use the fast tokenizer if available.\n
|
||||
- "slow" will always use the slow tokenizer.\n
|
||||
- "mistral" will always use the tokenizer from `mistral_common`.\n
|
||||
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
|
||||
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
|
||||
if available, otherwise it will use the "hf" tokenizer.
|
||||
- "hf" will use the fast tokenizer if available.
|
||||
- "slow" will always use the slow tokenizer.
|
||||
- "mistral" will always use the tokenizer from `mistral_common`.
|
||||
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
|
||||
- "qwen_vl" will always use the tokenizer from `qwen_vl`.
|
||||
- Other custom values can be supported via plugins."""
|
||||
trust_remote_code: bool = False
|
||||
"""Trust remote code (e.g., from HuggingFace) when downloading the model
|
||||
and tokenizer."""
|
||||
dtype: ModelDType | torch.dtype = "auto"
|
||||
"""Data type for model weights and activations:\n
|
||||
"""Data type for model weights and activations:
|
||||
|
||||
- "auto" will use FP16 precision for FP32 and FP16 models, and BF16
|
||||
precision for BF16 models.\n
|
||||
- "half" for FP16. Recommended for AWQ quantization.\n
|
||||
- "float16" is the same as "half".\n
|
||||
- "bfloat16" for a balance between precision and range.\n
|
||||
- "float" is shorthand for FP32 precision.\n
|
||||
precision for BF16 models.
|
||||
- "half" for FP16. Recommended for AWQ quantization.
|
||||
- "float16" is the same as "half".
|
||||
- "bfloat16" for a balance between precision and range.
|
||||
- "float" is shorthand for FP32 precision.
|
||||
- "float32" for FP32 precision."""
|
||||
seed: int = 0
|
||||
"""Random seed for reproducibility.
|
||||
@@ -182,13 +184,14 @@ class ModelConfig:
|
||||
automatically derived from the model config.
|
||||
|
||||
When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
|
||||
format. Examples:\n
|
||||
- 1k -> 1000\n
|
||||
- 1K -> 1024\n
|
||||
- 25.6k -> 25,600\n
|
||||
format. Examples:
|
||||
|
||||
- 1k -> 1000
|
||||
- 1K -> 1024
|
||||
- 25.6k -> 25,600
|
||||
- -1 or 'auto' -> Automatically choose the maximum model length that fits in
|
||||
GPU memory. This will use the model's maximum context length if it fits,
|
||||
otherwise it will find the largest length that can be accommodated."""
|
||||
GPU memory. This will use the model's maximum context length if it fits,
|
||||
otherwise it will find the largest length that can be accommodated."""
|
||||
spec_target_max_model_len: int | None = None
|
||||
"""Specify the maximum length for spec decoding draft models."""
|
||||
quantization: QuantizationMethods | str | None = None
|
||||
@@ -248,10 +251,11 @@ class ModelConfig:
|
||||
prometheus metrics, if multiple names provided, metrics tag will take the
|
||||
first one."""
|
||||
config_format: str | ConfigFormat = "auto"
|
||||
"""The format of the model config to load:\n
|
||||
"""The format of the model config to load:
|
||||
|
||||
- "auto" will try to load the config in hf format if available after trying
|
||||
to load in mistral format.\n
|
||||
- "hf" will load the config in hf format.\n
|
||||
to load in mistral format.
|
||||
- "hf" will load the config in hf format.
|
||||
- "mistral" will load the config in mistral format."""
|
||||
hf_token: bool | str | None = None
|
||||
"""The token to use as HTTP bearer authorization for remote files . If
|
||||
@@ -276,12 +280,12 @@ class ModelConfig:
|
||||
"""Enable sleep mode for the engine (only cuda and
|
||||
hip platforms are supported)."""
|
||||
model_impl: str | ModelImpl = "auto"
|
||||
"""Which implementation of the model to use:\n
|
||||
- "auto" will try to use the vLLM implementation, if it exists, and fall
|
||||
back to the Transformers implementation if no vLLM implementation is
|
||||
available.\n
|
||||
- "vllm" will use the vLLM model implementation.\n
|
||||
- "transformers" will use the Transformers model implementation.\n
|
||||
"""Which implementation of the model to use:
|
||||
|
||||
- "auto" will try to use the vLLM implementation, if it exists, and fall back to the
|
||||
Transformers implementation if no vLLM implementation is available.
|
||||
- "vllm" will use the vLLM model implementation.
|
||||
- "transformers" will use the Transformers model implementation.
|
||||
- "terratorch" will use the TerraTorch model implementation.
|
||||
"""
|
||||
override_attention_dtype: str | None = None
|
||||
@@ -1512,10 +1516,11 @@ class ModelConfig:
|
||||
@property
|
||||
def score_type(self) -> ScoreType:
|
||||
"""
|
||||
Scoring API handles score/rerank for:\n
|
||||
- "classify" task (score_type: cross-encoder models)\n
|
||||
- "embed" task (score_type: bi-encoder models)\n
|
||||
- "token_embed" task (score_type: late interaction models)\n
|
||||
Scoring API handles score/rerank for:
|
||||
|
||||
- "classify" task (score_type: cross-encoder models)
|
||||
- "embed" task (score_type: bi-encoder models)
|
||||
- "token_embed" task (score_type: late interaction models)
|
||||
"""
|
||||
# fixme: self._model_info.score_type is the score type before
|
||||
# as_seq_cls_model, which is "bi-encoder", rather than the
|
||||
@@ -1593,9 +1598,10 @@ class ModelConfig:
|
||||
such as the lm_head in a generation model,
|
||||
or the score or classifier in a classification model.
|
||||
|
||||
`head_dtype` currently only supports pooling models.\n
|
||||
- The pooling model defaults to using fp32 head,
|
||||
you can use --hf-overrides '{"head_dtype": "model"}' to disable it.
|
||||
`head_dtype` currently only supports pooling models.
|
||||
|
||||
- The pooling model defaults to using fp32 head, you can use
|
||||
--hf-overrides '{"head_dtype": "model"}' to disable it.
|
||||
"""
|
||||
|
||||
head_dtype = _get_head_dtype(
|
||||
|
||||
@@ -146,14 +146,14 @@ class MultiModalConfig:
|
||||
parallelism (TP).
|
||||
|
||||
- `"weights"`: Within the same vLLM engine, split the weights of
|
||||
each layer across TP ranks. (default TP behavior)\n
|
||||
each layer across TP ranks. (default TP behavior)
|
||||
- `"data"`: Within the same vLLM engine, split the batched input data
|
||||
across TP ranks to process the data in parallel, while hosting
|
||||
the full weights on each TP rank.
|
||||
This batch-level DP is not to be confused with API request-level
|
||||
DP (which is controlled by `--data-parallel-size`).
|
||||
This is only supported on a per-model basis and falls back to
|
||||
`"weights"` if the encoder does not support DP."""
|
||||
across TP ranks to process the data in parallel, while hosting
|
||||
the full weights on each TP rank.
|
||||
This batch-level DP is not to be confused with API request-level
|
||||
DP (which is controlled by `--data-parallel-size`).
|
||||
This is only supported on a per-model basis and falls back to
|
||||
`"weights"` if the encoder does not support DP."""
|
||||
mm_encoder_attn_backend: AttentionBackendEnum | None = None
|
||||
"""Optional override for the multi-modal encoder attention backend when
|
||||
using vision transformers. Accepts any value from
|
||||
|
||||
@@ -148,10 +148,11 @@ class ParallelConfig:
|
||||
eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
|
||||
"""Expert parallelism configuration."""
|
||||
expert_placement_strategy: ExpertPlacementStrategy = "linear"
|
||||
"""The expert placement strategy for MoE layers:\n
|
||||
"""The expert placement strategy for MoE layers:
|
||||
|
||||
- "linear": Experts are placed in a contiguous manner. For example, with 4
|
||||
experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have
|
||||
experts [2, 3].\n
|
||||
experts [2, 3].
|
||||
- "round_robin": Experts are placed in a round-robin manner. For example,
|
||||
with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
|
||||
will have experts [1, 3]. This strategy can help improve load balancing
|
||||
@@ -159,11 +160,11 @@ class ParallelConfig:
|
||||
all2all_backend: All2AllBackend = "allgather_reducescatter"
|
||||
"""All2All backend for MoE expert parallel communication. Available options:
|
||||
|
||||
- "allgather_reducescatter": All2all based on allgather and reducescatter\n
|
||||
- "deepep_high_throughput": Use deepep high-throughput kernels\n
|
||||
- "deepep_low_latency": Use deepep low-latency kernels\n
|
||||
- "mori": Use mori kernels\n
|
||||
- "nixl_ep": Use nixl-ep kernels\n
|
||||
- "allgather_reducescatter": All2all based on allgather and reducescatter
|
||||
- "deepep_high_throughput": Use deepep high-throughput kernels
|
||||
- "deepep_low_latency": Use deepep low-latency kernels
|
||||
- "mori": Use mori kernels
|
||||
- "nixl_ep": Use nixl-ep kernels
|
||||
- "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl
|
||||
- "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels"""
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ class ProfilerConfig:
|
||||
profiler: ProfilerKind | None = None
|
||||
"""Which profiler to use. Defaults to None. Options are:
|
||||
|
||||
- 'torch': Use PyTorch profiler.\n
|
||||
- 'torch': Use PyTorch profiler.
|
||||
- 'cuda': Use CUDA profiler."""
|
||||
|
||||
torch_profiler_dir: str = ""
|
||||
|
||||
@@ -106,11 +106,12 @@ class SchedulerConfig:
|
||||
max_num_batched_tokens in case max multimodal embedding size is larger."""
|
||||
|
||||
policy: SchedulerPolicy = "fcfs"
|
||||
"""The scheduling policy to use:\n
|
||||
- "fcfs" means first come first served, i.e. requests are handled in order
|
||||
of arrival.\n
|
||||
"""The scheduling policy to use:
|
||||
|
||||
- "fcfs" means first come first served, i.e. requests are handled in order
|
||||
of arrival.
|
||||
- "priority" means requests are handled based on given priority (lower
|
||||
value means earlier handling, with time of arrival deciding any ties)."""
|
||||
value means earlier handling, with time of arrival deciding any ties)."""
|
||||
|
||||
disable_chunked_mm_input: bool = False
|
||||
"""If set to true and chunked prefill is enabled, we do not want to
|
||||
|
||||
@@ -37,10 +37,12 @@ ConfigT = TypeVar("ConfigT", bound=DataclassInstance)
|
||||
|
||||
|
||||
@overload
|
||||
@dataclass_transform(field_specifiers=(PydanticField,))
|
||||
def config(cls: type[ConfigT]) -> type[ConfigT]: ...
|
||||
|
||||
|
||||
@overload
|
||||
@dataclass_transform(field_specifiers=(PydanticField,))
|
||||
def config(
|
||||
*, config: ConfigDict | None = None, **kwargs: Any
|
||||
) -> Callable[[type[ConfigT]], type[ConfigT]]: ...
|
||||
|
||||
@@ -31,14 +31,12 @@ class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpForma
|
||||
def _split_lines(self, text, width):
    """
    Split help text into the lines argparse should display.

    1. Sentences split across lines have their single newlines removed.
    2. Paragraphs and lists are split into separate lines.
    3. Each line is wrapped to the specified width (width of terminal).

    A line only counts as a list item when it starts with a marker
    (`-`, `*`, `+` or `1.`) that is followed by whitespace. Wrapped
    continuation lines that merely begin with such a character, for
    example `--some-flag`, `-1` or `1.5k`, are therefore still joined to
    the sentence they belong to.

    Args:
        text: The help text to format.
        width: Maximum width of each returned line.

    Returns:
        A flat list of wrapped lines; blank lines produce no entries.
    """
    # Matches a lone newline plus the whitespace (indentation) after it,
    # unless the following line starts a list item. The trailing `\s` in
    # the lookahead is what distinguishes "- item" from "--flag" or "-1".
    newlines_to_remove = re.compile(
        r"(?<!\n)\n(?!\n)(?!\s*(?:[-*+]|\d+\.)\s)\s*"
    )
    lines = newlines_to_remove.sub(" ", text).splitlines()
    # textwrap.wrap returns [] for empty/whitespace-only lines, so paragraph
    # separators vanish here instead of producing empty output lines.
    return [piece for line in lines for piece in textwrap.wrap(line, width)]
|
||||
|
||||
def add_arguments(self, actions):
|
||||
|
||||
Reference in New Issue
Block a user