diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 50fe82eb1..dcc93d987 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -66,22 +66,21 @@ class CacheConfig: enable_prefix_caching: bool = True """Whether to enable prefix caching.""" prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256" - """Set the hash algorithm for prefix caching:\n - - "sha256" uses Pickle for object serialization before hashing. This is the - current default, as SHA256 is the most secure choice to avoid potential - hash collisions.\n + """Set the hash algorithm for prefix caching: + + - "sha256" uses Pickle for object serialization before hashing. This is the current + default, as SHA256 is the most secure choice to avoid potential hash collisions. - "sha256_cbor" provides a reproducible, cross-language compatible hash. It - serializes objects using canonical CBOR and hashes them with SHA-256.\n + serializes objects using canonical CBOR and hashes them with SHA-256. - "xxhash" uses Pickle serialization with xxHash (128-bit) for faster, - non-cryptographic hashing. Requires the optional ``xxhash`` package. - IMPORTANT: Use of a hashing algorithm that is not considered - cryptographically secure theoretically increases the risk of hash collisions, - which can cause undefined behavior or even leak private information in - multi-tenant environments. Even if collisions are still very unlikely, it is - important to consider your security risk tolerance against the performance - benefits before turning this on.\n + non-cryptographic hashing. Requires the optional ``xxhash`` package. + IMPORTANT: Use of a hashing algorithm that is not considered cryptographically + secure theoretically increases the risk of hash collisions, which can cause + undefined behavior or even leak private information in multi-tenant environments. 
+ Even if collisions are still very unlikely, it is important to consider your + security risk tolerance against the performance benefits before turning this on. - "xxhash_cbor" combines canonical CBOR serialization with xxHash for - reproducible hashing. Requires the optional ``xxhash`` package.""" + reproducible hashing. Requires the optional ``xxhash`` package.""" calculate_kv_scales: bool = False """Deprecated: This option is deprecated and will be removed in v0.19. It enables dynamic calculation of `k_scale` and `v_scale` when diff --git a/vllm/config/kernel.py b/vllm/config/kernel.py index 2ec18289d..4476cd125 100644 --- a/vllm/config/kernel.py +++ b/vllm/config/kernel.py @@ -32,14 +32,14 @@ class KernelConfig: moe_backend: MoEBackend = "auto" """Backend for MoE expert computation kernels. Available options: - - "auto": Automatically select the best backend based on model and hardware\n - - "triton": Use Triton-based fused MoE kernels\n - - "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)\n - - "cutlass": Use vLLM CUTLASS kernels\n - - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels\n - - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels\n - - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)\n - - "marlin": Use Marlin kernels (weight-only quantization)\n + - "auto": Automatically select the best backend based on model and hardware + - "triton": Use Triton-based fused MoE kernels + - "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only) + - "cutlass": Use vLLM CUTLASS kernels + - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels + - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels + - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only) + - "marlin": Use Marlin kernels (weight-only quantization) - "aiter": Use AMD AITer kernels (ROCm only)""" @field_validator("moe_backend", mode="before") diff --git a/vllm/config/load.py b/vllm/config/load.py index 
e77d9b378..93240ec5f 100644 --- a/vllm/config/load.py +++ b/vllm/config/load.py @@ -51,7 +51,7 @@ class LoadConfig: - "gguf" will load weights from GGUF format files (details specified in https://github.com/ggml-org/ggml/blob/master/docs/gguf.md). - "mistral" will load weights from consolidated safetensors files used by - Mistral models.\n + Mistral models. - Other custom values can be supported via plugins. """ download_dir: str | None = None diff --git a/vllm/config/model.py b/vllm/config/model.py index 225ee119a..acb43a04b 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -125,26 +125,28 @@ class ModelConfig: """Name or path of the Hugging Face tokenizer to use. If unspecified, model name or path will be used.""" tokenizer_mode: TokenizerMode | str = "auto" - """Tokenizer mode:\n + """Tokenizer mode: + - "auto" will use the tokenizer from `mistral_common` for Mistral models - if available, otherwise it will use the "hf" tokenizer.\n - - "hf" will use the fast tokenizer if available.\n - - "slow" will always use the slow tokenizer.\n - - "mistral" will always use the tokenizer from `mistral_common`.\n - - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n - - "qwen_vl" will always use the tokenizer from `qwen_vl`.\n + if available, otherwise it will use the "hf" tokenizer. + - "hf" will use the fast tokenizer if available. + - "slow" will always use the slow tokenizer. + - "mistral" will always use the tokenizer from `mistral_common`. + - "deepseek_v32" will always use the tokenizer from `deepseek_v32`. + - "qwen_vl" will always use the tokenizer from `qwen_vl`. 
- Other custom values can be supported via plugins.""" trust_remote_code: bool = False """Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.""" dtype: ModelDType | torch.dtype = "auto" - """Data type for model weights and activations:\n + """Data type for model weights and activations: + - "auto" will use FP16 precision for FP32 and FP16 models, and BF16 - precision for BF16 models.\n - - "half" for FP16. Recommended for AWQ quantization.\n - - "float16" is the same as "half".\n - - "bfloat16" for a balance between precision and range.\n - - "float" is shorthand for FP32 precision.\n + precision for BF16 models. + - "half" for FP16. Recommended for AWQ quantization. + - "float16" is the same as "half". + - "bfloat16" for a balance between precision and range. + - "float" is shorthand for FP32 precision. - "float32" for FP32 precision.""" seed: int = 0 """Random seed for reproducibility. @@ -182,13 +184,14 @@ class ModelConfig: automatically derived from the model config. When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable - format. Examples:\n - - 1k -> 1000\n - - 1K -> 1024\n - - 25.6k -> 25,600\n + format. Examples: + + - 1k -> 1000 + - 1K -> 1024 + - 25.6k -> 25,600 - -1 or 'auto' -> Automatically choose the maximum model length that fits in - GPU memory. This will use the model's maximum context length if it fits, - otherwise it will find the largest length that can be accommodated.""" + GPU memory. 
This will use the model's maximum context length if it fits, + otherwise it will find the largest length that can be accommodated.""" spec_target_max_model_len: int | None = None """Specify the maximum length for spec decoding draft models.""" quantization: QuantizationMethods | str | None = None @@ -248,10 +251,11 @@ class ModelConfig: prometheus metrics, if multiple names provided, metrics tag will take the first one.""" config_format: str | ConfigFormat = "auto" - """The format of the model config to load:\n + """The format of the model config to load: + - "auto" will try to load the config in hf format if available after trying - to load in mistral format.\n - - "hf" will load the config in hf format.\n + to load in mistral format. + - "hf" will load the config in hf format. - "mistral" will load the config in mistral format.""" hf_token: bool | str | None = None """The token to use as HTTP bearer authorization for remote files . If @@ -276,12 +280,12 @@ class ModelConfig: """Enable sleep mode for the engine (only cuda and hip platforms are supported).""" model_impl: str | ModelImpl = "auto" - """Which implementation of the model to use:\n - - "auto" will try to use the vLLM implementation, if it exists, and fall - back to the Transformers implementation if no vLLM implementation is - available.\n - - "vllm" will use the vLLM model implementation.\n - - "transformers" will use the Transformers model implementation.\n + """Which implementation of the model to use: + + - "auto" will try to use the vLLM implementation, if it exists, and fall back to the + Transformers implementation if no vLLM implementation is available. + - "vllm" will use the vLLM model implementation. + - "transformers" will use the Transformers model implementation. - "terratorch" will use the TerraTorch model implementation. 
""" override_attention_dtype: str | None = None @@ -1512,10 +1516,11 @@ class ModelConfig: @property def score_type(self) -> ScoreType: """ - Scoring API handles score/rerank for:\n - - "classify" task (score_type: cross-encoder models)\n - - "embed" task (score_type: bi-encoder models)\n - - "token_embed" task (score_type: late interaction models)\n + Scoring API handles score/rerank for: + + - "classify" task (score_type: cross-encoder models) + - "embed" task (score_type: bi-encoder models) + - "token_embed" task (score_type: late interaction models) """ # fixme: self._model_info.score_type is the score type before # as_seq_cls_model, which is "bi-encoder", rather than the @@ -1593,9 +1598,10 @@ class ModelConfig: such as the lm_head in a generation model, or the score or classifier in a classification model. - `head_dtype` currently only supports pooling models.\n - - The pooling model defaults to using fp32 head, - you can use --hf-overrides '{"head_dtype": "model"}' to disable it. + `head_dtype` currently only supports pooling models. + + - The pooling model defaults to using fp32 head, you can use + --hf-overrides '{"head_dtype": "model"}' to disable it. """ head_dtype = _get_head_dtype( diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 1c9bc43b0..e66511c92 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -146,14 +146,14 @@ class MultiModalConfig: parallelism (TP). - `"weights"`: Within the same vLLM engine, split the weights of - each layer across TP ranks. (default TP behavior)\n + each layer across TP ranks. (default TP behavior) - `"data"`: Within the same vLLM engine, split the batched input data - across TP ranks to process the data in parallel, while hosting - the full weights on each TP rank. - This batch-level DP is not to be confused with API request-level - DP (which is controlled by `--data-parallel-size`). 
- This is only supported on a per-model basis and falls back to - `"weights"` if the encoder does not support DP.""" + across TP ranks to process the data in parallel, while hosting + the full weights on each TP rank. + This batch-level DP is not to be confused with API request-level + DP (which is controlled by `--data-parallel-size`). + This is only supported on a per-model basis and falls back to + `"weights"` if the encoder does not support DP.""" mm_encoder_attn_backend: AttentionBackendEnum | None = None """Optional override for the multi-modal encoder attention backend when using vision transformers. Accepts any value from diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 7dd9c5bb5..8afff3af2 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -148,10 +148,11 @@ class ParallelConfig: eplb_config: EPLBConfig = Field(default_factory=EPLBConfig) """Expert parallelism configuration.""" expert_placement_strategy: ExpertPlacementStrategy = "linear" - """The expert placement strategy for MoE layers:\n + """The expert placement strategy for MoE layers: + - "linear": Experts are placed in a contiguous manner. For example, with 4 experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have - experts [2, 3].\n + experts [2, 3]. - "round_robin": Experts are placed in a round-robin manner. For example, with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1 will have experts [1, 3]. This strategy can help improve load balancing @@ -159,11 +160,11 @@ class ParallelConfig: all2all_backend: All2AllBackend = "allgather_reducescatter" """All2All backend for MoE expert parallel communication. 
Available options: - - "allgather_reducescatter": All2all based on allgather and reducescatter\n - - "deepep_high_throughput": Use deepep high-throughput kernels\n - - "deepep_low_latency": Use deepep low-latency kernels\n - - "mori": Use mori kernels\n - - "nixl_ep": Use nixl-ep kernels\n + - "allgather_reducescatter": All2all based on allgather and reducescatter + - "deepep_high_throughput": Use deepep high-throughput kernels + - "deepep_low_latency": Use deepep low-latency kernels + - "mori": Use mori kernels + - "nixl_ep": Use nixl-ep kernels - "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl - "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels""" diff --git a/vllm/config/profiler.py b/vllm/config/profiler.py index e79e21310..68fa78854 100644 --- a/vllm/config/profiler.py +++ b/vllm/config/profiler.py @@ -37,7 +37,7 @@ class ProfilerConfig: profiler: ProfilerKind | None = None """Which profiler to use. Defaults to None. Options are: - - 'torch': Use PyTorch profiler.\n + - 'torch': Use PyTorch profiler. - 'cuda': Use CUDA profiler.""" torch_profiler_dir: str = "" diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index f988c1086..3cd99bb08 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -106,11 +106,12 @@ class SchedulerConfig: max_num_batched_tokens in case max multimodal embedding size is larger.""" policy: SchedulerPolicy = "fcfs" - """The scheduling policy to use:\n - - "fcfs" means first come first served, i.e. requests are handled in order - of arrival.\n + """The scheduling policy to use: + + - "fcfs" means first come first served, i.e. requests are handled in order + of arrival. 
- "priority" means requests are handled based on given priority (lower - value means earlier handling) and time of arrival deciding any ties).""" + value means earlier handling) and time of arrival deciding any ties.""" disable_chunked_mm_input: bool = False """If set to true and chunked prefill is enabled, we do not want to diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 73abd7865..a953fcb46 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -37,10 +37,12 @@ ConfigT = TypeVar("ConfigT", bound=DataclassInstance) @overload +@dataclass_transform(field_specifiers=(PydanticField,)) def config(cls: type[ConfigT]) -> type[ConfigT]: ... @overload +@dataclass_transform(field_specifiers=(PydanticField,)) def config( *, config: ConfigDict | None = None, **kwargs: Any ) -> Callable[[type[ConfigT]], type[ConfigT]]: ... diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py index e4482d4fb..c48edb68f 100644 --- a/vllm/utils/argparse_utils.py +++ b/vllm/utils/argparse_utils.py @@ -31,14 +31,12 @@ class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpForma def _split_lines(self, text, width): """ 1. Sentences split across lines have their single newlines removed. - 2. Paragraphs and explicit newlines are split into separate lines. + 2. Paragraphs and lists are split into separate lines. 3. Each line is wrapped to the specified width (width of terminal). """ - # The patterns also include whitespace after the newline - single_newline = re.compile(r"(?