Update Optional[x] -> x | None and Union[x, y] -> x | y (#26633)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor committed (via GitHub) on 2025-10-12 17:51:31 +01:00
parent 9bb38130cb
commit 8fcaaf6a16
944 changed files with 9490 additions and 10121 deletions
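
The change is a mechanical migration to PEP 604 union syntax, which Python evaluates natively from 3.10 onward (pydantic resolves these annotations at runtime, so the rewrite presumes a 3.10+ floor). Rewrites like this are commonly automated, e.g. with pyupgrade's --py310-plus mode or Ruff's pyupgrade-derived rules; the tooling used for this commit is not shown in this excerpt. The two spellings produce equal runtime objects, so neither type checkers nor pydantic validation see any difference, as this quick sketch verifies:

    from typing import Optional, Union

    # PEP 604 unions compare equal to their typing.Optional / typing.Union
    # spellings, so the rewrite changes spelling only, not semantics.
    assert int | None == Optional[int]
    assert int | str == Union[int, str]
    # The equivalence also holds when the union is nested inside a generic:
    assert (list[int | str] | None) == Optional[list[Union[int, str]]]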

View File

@@ -3,7 +3,7 @@
 import hashlib
 from dataclasses import field
-from typing import TYPE_CHECKING, Any, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal
 from pydantic import Field, SkipValidation, field_validator
 from pydantic.dataclasses import dataclass
@@ -58,13 +58,13 @@ class CacheConfig:
 is_attention_free: bool = False
 """Whether the model is attention-free. This is primarily set in
 `ModelConfig` and that value should be manually duplicated here."""
-num_gpu_blocks_override: Optional[int] = None
+num_gpu_blocks_override: int | None = None
 """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks`
 if specified. Does nothing if `None`. Used for testing preemption."""
-sliding_window: Optional[int] = None
+sliding_window: int | None = None
 """Sliding window size for the KV cache. This is primarily set in
 `ModelConfig` and that value should be manually duplicated here."""
-enable_prefix_caching: Optional[bool] = None
+enable_prefix_caching: bool | None = None
 """Whether to enable prefix caching. Enabled by default for V1."""
 prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
 """Set the hash algorithm for prefix caching:\n
@@ -84,12 +84,12 @@
 """This enables dynamic calculation of `k_scale` and `v_scale` when
 kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
 checkpoint if available. Otherwise, the scales will default to 1.0."""
-cpu_kvcache_space_bytes: Optional[int] = None
+cpu_kvcache_space_bytes: int | None = None
 """(CPU backend only) CPU key-value cache space."""
-mamba_page_size_padded: Optional[int] = None
+mamba_page_size_padded: int | None = None
 """ Optional override for mamba page size; used by hybrid mamba/attention
 models to ensure exact alignment with attention page size."""
-mamba_block_size: Optional[int] = None
+mamba_block_size: int | None = None
 """Size of a contiguous cache block in number of tokens for mamba cache."""
 mamba_cache_dtype: MambaDType = "auto"
 """The data type to use for the Mamba cache (both the conv as well as the
@@ -101,9 +101,9 @@
 for the ssm state will be determined by mamba_cache_dtype."""
 # Will be set after profiling.
-num_gpu_blocks: Optional[int] = field(default=None, init=False)
+num_gpu_blocks: int | None = field(default=None, init=False)
 """The number of blocks to allocate for GPU memory."""
-num_cpu_blocks: Optional[int] = field(default=None, init=False)
+num_cpu_blocks: int | None = field(default=None, init=False)
 """The number of blocks to allocate for CPU memory."""
 kv_sharing_fast_prefill: bool = False
@@ -116,7 +116,7 @@
 necessary for implementing this optimization in some models (e.g. Gemma3n)
 """
-kv_cache_memory_bytes: Optional[int] = None
+kv_cache_memory_bytes: int | None = None
 """Size of KV Cache per GPU in bytes. By default, this is set to None
 and vllm can automatically infer the kv cache size based on
 gpu_memory_utilization. However, users may want to manually specify
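
Since `CacheConfig` is a pydantic dataclass, the fields above are validated at runtime; `int | None` resolves exactly as `Optional[int]` did. A reduced, hypothetical config (not the real `CacheConfig`) demonstrating that behavior is unchanged:

    from pydantic.dataclasses import dataclass

    @dataclass
    class MiniCacheConfig:  # hypothetical stand-in for CacheConfig
        num_gpu_blocks_override: int | None = None
        enable_prefix_caching: bool | None = None

    cfg = MiniCacheConfig(num_gpu_blocks_override=1024)
    assert cfg.num_gpu_blocks_override == 1024
    assert cfg.enable_prefix_caching is None  # None remains a valid value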

View File

@@ -4,9 +4,10 @@
 import enum
 import hashlib
 from collections import Counter
+from collections.abc import Callable
 from dataclasses import asdict, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar
 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
@@ -168,7 +169,7 @@ class CompilationConfig:
 """
 # Top-level Compilation control
-level: Optional[int] = None
+level: int | None = None
 """The level of compilation:
 - None: If None, we will select the default compilation level.
@@ -177,7 +178,7 @@
 - 1: dynamo as is.
 - 2: dynamo once.
 - 3: piecewise compilation."""
-debug_dump_path: Optional[Path] = None
+debug_dump_path: Path | None = None
 """The path to dump the debug information."""
 cache_dir: str = ""
 """The directory to store the compiled graph, to accelerate Inductor
@@ -208,7 +209,7 @@
 By default, all custom ops are enabled when running without Inductor and
 disabled when running with Inductor: level>=PIECEWISE and use_inductor=True.
 Inductor generates (fused) Triton kernels for disabled custom ops."""
-splitting_ops: Optional[list[str]] = None
+splitting_ops: list[str] | None = None
 """A list of ops to exclude from cudagraphs, used in piecewise compilation.
 The behavior depends on use_inductor_graph_partition:
@@ -238,7 +239,7 @@
 are compiled using configurations in inductor_compile_config.
 This setting is ignored if level<PIECEWISE."""
-compile_sizes: Optional[list[Union[int, str]]] = None
+compile_sizes: list[int | str] | None = None
 """Sizes to compile for inductor. In addition
 to integers, it also supports "cudagraph_capture_sizes" to
 specify the sizes for cudagraph capture."""
@@ -253,7 +254,7 @@
 constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 # CudaGraph compilation
-cudagraph_mode: Optional[CUDAGraphMode] = None
+cudagraph_mode: CUDAGraphMode | None = None
 """
 The mode of the cudagraph:
@@ -308,7 +309,7 @@
 It means the first several runs will be treated as warmup runs.
 Only after that, the execution will be recorded, and the recorded
 cudagraph will be used for subsequent runs."""
-cudagraph_capture_sizes: Optional[list[int]] = None
+cudagraph_capture_sizes: list[int] | None = None
 """Sizes to capture cudagraph.
 - None (default): capture sizes are inferred from vllm config.
 - list[int]: capture sizes are specified as given."""
@@ -320,7 +321,7 @@
 internally managed buffer. Default is False.
 Note that this flag is only effective when cudagraph_mode is PIECEWISE.
 """
-full_cuda_graph: Optional[bool] = False
+full_cuda_graph: bool | None = False
 """whether to use a full cuda graph for the entire forward pass rather than
 splitting certain operations such as attention into subgraphs. Thus this
 flag cannot be used together with splitting_ops. This may provide
@@ -544,7 +545,7 @@
 "(where 'op' is the registered op name)"
 )
-def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
+def init_backend(self, vllm_config: "VllmConfig") -> str | Callable:
 if self.level == CompilationLevel.NO_COMPILATION:
 raise ValueError("No compilation level is set.")
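
Besides the union rewrite, this file now imports `Callable` from `collections.abc`: PEP 585 deprecated the `typing` aliases of the collections ABCs in Python 3.9, and the two are interchangeable in annotations. A standalone sketch of the updated return annotation (simplified; not the real method body):

    from collections.abc import Callable  # preferred over typing.Callable since 3.9

    def init_backend(use_eager: bool) -> str | Callable:
        # Return either a named backend or a compiled-backend callable.
        return "eager" if use_eager else (lambda graph: graph)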

View File

@@ -3,7 +3,7 @@
 import hashlib
 from dataclasses import field
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal
 import torch
 from pydantic import ConfigDict, SkipValidation
@@ -19,7 +19,7 @@ Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
 class DeviceConfig:
 """Configuration for the device to use for vLLM execution."""
-device: SkipValidation[Optional[Union[Device, torch.device]]] = "auto"
+device: SkipValidation[Device | torch.device | None] = "auto"
 """Device type for vLLM execution.
 This parameter is deprecated and will be
 removed in a future release.

View File

@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 from pydantic.dataclasses import dataclass
@@ -26,7 +25,7 @@
 """The zmq endpoint to use for publishing kv events.
 """
-replay_endpoint: Optional[str] = None
+replay_endpoint: str | None = None
 """The zmq endpoint to use for replaying kv events.
 """

View File

@@ -4,7 +4,7 @@
 import hashlib
 import uuid
 from dataclasses import field
-from typing import Any, Literal, Optional, get_args
+from typing import Any, Literal, get_args
 from pydantic.dataclasses import dataclass
@@ -20,14 +20,14 @@ KVRole = Literal[KVProducer, KVConsumer]
 class KVTransferConfig:
 """Configuration for distributed KV cache transfer."""
-kv_connector: Optional[str] = None
+kv_connector: str | None = None
 """The KV connector for vLLM to transmit KV caches between vLLM instances.
 """
-engine_id: Optional[str] = None
+engine_id: str | None = None
 """The engine id for KV transfers."""
-kv_buffer_device: Optional[str] = "cuda"
+kv_buffer_device: str | None = "cuda"
 """The device used by kv connector to buffer the KV cache. Choices are
 'cuda' and 'cpu'."""
@@ -35,11 +35,11 @@
 """The buffer size for TorchDistributedConnector. Measured in number of
 bytes. Recommended value: 1e9 (about 1GB)."""
-kv_role: Optional[KVRole] = None
+kv_role: KVRole | None = None
 """Whether this vLLM instance produces, consumes KV cache, or both. Choices
 are 'kv_producer', 'kv_consumer', and 'kv_both'."""
-kv_rank: Optional[int] = None
+kv_rank: int | None = None
 """The rank of this vLLM instance in the KV cache transfer. Typical value:
 0 for prefill instance, 1 for decode instance.
 Currently only 1P1D is supported."""
@@ -57,7 +57,7 @@
 kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
 """any extra config that the connector may need."""
-kv_connector_module_path: Optional[str] = None
+kv_connector_module_path: str | None = None
 """The Python module path to dynamically load the KV connector from.
 Only supported in V1."""

View File

@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import hashlib
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 from pydantic import Field, field_validator
 from pydantic.dataclasses import dataclass
@@ -25,7 +25,7 @@ logger = init_logger(__name__)
 class LoadConfig:
 """Configuration for loading the model weights."""
-load_format: Union[str, LoadFormats] = "auto"
+load_format: str | LoadFormats = "auto"
 """The format of the model weights to load:\n
 - "auto" will try to load the weights in the safetensors format and fall
 back to the pytorch bin format if safetensors format is not available.\n
@@ -48,7 +48,7 @@
 - "mistral" will load weights from consolidated safetensors files used by
 Mistral models.
 - Other custom values can be supported via plugins."""
-download_dir: Optional[str] = None
+download_dir: str | None = None
 """Directory to download and load the weights, default to the default
 cache directory of Hugging Face."""
 safetensors_load_strategy: str = "lazy"
@@ -64,23 +64,19 @@
 was quantized using torchao and saved using safetensors.
 Needs torchao >= 0.14.0
 """
-model_loader_extra_config: Union[dict, TensorizerConfig] = Field(
-default_factory=dict
-)
+model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
 """Extra config for model loader. This will be passed to the model loader
 corresponding to the chosen load_format."""
-device: Optional[str] = None
+device: str | None = None
 """Device to which model weights will be loaded, default to
 device_config.device"""
-ignore_patterns: Union[list[str], str] = Field(
-default_factory=lambda: ["original/**/*"]
-)
+ignore_patterns: list[str] | str = Field(default_factory=lambda: ["original/**/*"])
 """The list of patterns to ignore when loading the model. Default to
 "original/**/*" to avoid repeated loading of llama's checkpoints."""
 use_tqdm_on_load: bool = True
 """Whether to enable tqdm for showing progress bar when loading model
 weights."""
-pt_load_map_location: Union[str, dict[str, str]] = "cpu"
+pt_load_map_location: str | dict[str, str] = "cpu"
 """
 pt_load_map_location: the map location for loading pytorch checkpoint, to
 support loading checkpoints can only be loaded on certain devices like
@@ -115,8 +111,8 @@
 @field_validator("ignore_patterns", mode="after")
 def _validate_ignore_patterns(
-cls, ignore_patterns: Union[list[str], str]
-) -> Union[list[str], str]:
+cls, ignore_patterns: list[str] | str
+) -> list[str] | str:
 if ignore_patterns != ["original/**/*"] and len(ignore_patterns) > 0:
 logger.info(
 "Ignoring the following patterns when downloading weights: %s",

View File

@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import hashlib
-from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 import torch
 from pydantic import ConfigDict, Field, model_validator
@@ -42,10 +42,10 @@ class LoRAConfig:
 parallelism. Enabling this will use the fully sharded layers. At high
 sequence length, max rank or tensor parallel size, this is likely faster.
 """
-max_cpu_loras: Optional[int] = None
+max_cpu_loras: int | None = None
 """Maximum number of LoRAs to store in CPU memory. Must be >= than
 `max_loras`."""
-lora_dtype: Union[torch.dtype, LoRADType] = "auto"
+lora_dtype: torch.dtype | LoRADType = "auto"
 """Data type for LoRA. If auto, will default to base model dtype."""
 lora_extra_vocab_size: LoRAExtraVocabSize = Field(
 default=256,
@@ -60,7 +60,7 @@
 lora_vocab_padding_size: ClassVar[int] = (
 current_platform.get_lora_vocab_padding_size()
 )
-default_mm_loras: Optional[dict[str, str]] = None
+default_mm_loras: dict[str, str] | None = None
 """Dictionary mapping specific modalities to LoRA model paths; this field
 is only applicable to multimodal models and should be leveraged when a
 model always expects a LoRA to be active when a given modality is present.

View File

@@ -4,18 +4,10 @@
 import hashlib
 import json
 import warnings
+from collections.abc import Callable
 from dataclasses import InitVar, field
 from importlib.util import find_spec
-from typing import (
-TYPE_CHECKING,
-Any,
-Callable,
-Literal,
-Optional,
-Union,
-cast,
-get_args,
-)
+from typing import TYPE_CHECKING, Any, Literal, cast, get_args
 import torch
 from pydantic import ConfigDict, SkipValidation, field_validator, model_validator
@@ -89,7 +81,7 @@ ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 LogprobsMode = Literal[
 "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
 ]
-HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]]
+HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig]
 ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"]
 _RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = {
@@ -121,7 +113,7 @@ class ModelConfig:
 """Convert the model using adapters defined in
 [vllm.model_executor.models.adapters][]. The most common use case is to
 adapt a text generation model to be used for pooling tasks."""
-task: Optional[TaskOption] = None
+task: TaskOption | None = None
 """[DEPRECATED] The task to use the model for. If the model supports more
 than one model runner, this is used to select which model runner to run.
@@ -139,7 +131,7 @@
 trust_remote_code: bool = False
 """Trust remote code (e.g., from HuggingFace) when downloading the model
 and tokenizer."""
-dtype: Union[ModelDType, torch.dtype] = "auto"
+dtype: ModelDType | torch.dtype = "auto"
 """Data type for model weights and activations:\n
 - "auto" will use FP16 precision for FP32 and FP16 models, and BF16
 precision for BF16 models.\n
@@ -148,33 +140,33 @@
 - "bfloat16" for a balance between precision and range.\n
 - "float" is shorthand for FP32 precision.\n
 - "float32" for FP32 precision."""
-seed: Optional[int] = None
+seed: int | None = None
 """Random seed for reproducibility. Initialized to None in V0, but
 initialized to 0 in V1."""
-hf_config_path: Optional[str] = None
+hf_config_path: str | None = None
 """Name or path of the Hugging Face config to use. If unspecified, model
 name or path will be used."""
 allowed_local_media_path: str = ""
 """Allowing API requests to read local images or videos from directories
 specified by the server file system. This is a security risk. Should only
 be enabled in trusted environments."""
-allowed_media_domains: Optional[list[str]] = None
+allowed_media_domains: list[str] | None = None
 """If set, only media URLs that belong to this domain can be used for
 multi-modal inputs. """
-revision: Optional[str] = None
+revision: str | None = None
 """The specific model version to use. It can be a branch name, a tag name,
 or a commit id. If unspecified, will use the default version."""
-code_revision: Optional[str] = None
+code_revision: str | None = None
 """The specific revision to use for the model code on the Hugging Face Hub.
 It can be a branch name, a tag name, or a commit id. If unspecified, will
 use the default version."""
 rope_scaling: dict[str, Any] = field(default_factory=dict)
 """RoPE scaling configuration. For example,
 `{"rope_type":"dynamic","factor":2.0}`."""
-rope_theta: Optional[float] = None
+rope_theta: float | None = None
 """RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE
 theta improves the performance of the scaled model."""
-tokenizer_revision: Optional[str] = None
+tokenizer_revision: str | None = None
 """The specific revision to use for the tokenizer on the Hugging Face Hub.
 It can be a branch name, a tag name, or a commit id. If unspecified, will
 use the default version."""
@@ -187,9 +179,9 @@
 - 1k -> 1000\n
 - 1K -> 1024\n
 - 25.6k -> 25,600"""
-spec_target_max_model_len: Optional[int] = None
+spec_target_max_model_len: int | None = None
 """Specify the maximum length for spec decoding draft models."""
-quantization: SkipValidation[Optional[QuantizationMethods]] = None
+quantization: SkipValidation[QuantizationMethods | None] = None
 """Method used to quantize the weights. If `None`, we first check the
 `quantization_config` attribute in the model config file. If that is
 `None`, we assume the model weights are not quantized and use `dtype` to
@@ -230,7 +222,7 @@
 """If `True`, enables passing text embeddings as inputs via the
 `prompt_embeds` key. Note that enabling this will double the time required
 for graph compilation."""
-served_model_name: Optional[Union[str, list[str]]] = None
+served_model_name: str | list[str] | None = None
 """The model name(s) used in the API. If multiple names are provided, the
 server will respond to any of the provided names. The model name in the
 model field of a response will be the first name in this list. If not
@@ -238,20 +230,20 @@
 that this name(s) will also be used in `model_name` tag content of
 prometheus metrics, if multiple names provided, metrics tag will take the
 first one."""
-config_format: Union[str, ConfigFormat] = "auto"
+config_format: str | ConfigFormat = "auto"
 """The format of the model config to load:\n
 - "auto" will try to load the config in hf format if available else it
 will try to load in mistral format.\n
 - "hf" will load the config in hf format.\n
 - "mistral" will load the config in mistral format."""
-hf_token: Optional[Union[bool, str]] = None
+hf_token: bool | str | None = None
 """The token to use as HTTP bearer authorization for remote files . If
 `True`, will use the token generated when running `huggingface-cli login`
 (stored in `~/.huggingface`)."""
 hf_overrides: HfOverrides = field(default_factory=dict)
 """If a dictionary, contains arguments to be forwarded to the Hugging Face
 config. If a callable, it is called to update the HuggingFace config."""
-logits_processor_pattern: Optional[str] = None
+logits_processor_pattern: str | None = None
 """Optional regex pattern specifying valid logits processor qualified names
 that can be passed with the `logits_processors` extra completion argument.
 Defaults to `None`, which allows no processors."""
@@ -269,7 +261,7 @@
 `--generation-config vllm`, only the override parameters are used."""
 enable_sleep_mode: bool = False
 """Enable sleep mode for the engine (only cuda platform is supported)."""
-model_impl: Union[str, ModelImpl] = "auto"
+model_impl: str | ModelImpl = "auto"
 """Which implementation of the model to use:\n
 - "auto" will try to use the vLLM implementation, if it exists, and fall
 back to the Transformers implementation if no vLLM implementation is
@@ -278,36 +270,36 @@
 - "transformers" will use the Transformers model implementation.\n
 - "terratorch" will use the TerraTorch model implementation.
 """
-override_attention_dtype: Optional[str] = None
+override_attention_dtype: str | None = None
 """Override dtype for attention"""
-logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None
+logits_processors: list[str | type[LogitsProcessor]] | None = None
 """One or more logits processors' fully-qualified class names or class
 definitions"""
-io_processor_plugin: Optional[str] = None
+io_processor_plugin: str | None = None
 """IOProcessor plugin name to load at model startup"""
 # Pooler config
-pooler_config: Optional[PoolerConfig] = None
+pooler_config: PoolerConfig | None = None
 """Pooler config which controls the behaviour of output pooling in pooling
 models."""
-override_pooler_config: Optional[Union[dict, PoolerConfig]] = None
+override_pooler_config: dict | PoolerConfig | None = None
 """[DEPRECATED] Use `pooler_config` instead. This field will be removed in
 v0.12.0 or v1.0.0, whichever is sooner."""
 # Multimodal config and init vars
-multimodal_config: Optional[MultiModalConfig] = None
+multimodal_config: MultiModalConfig | None = None
 """Configuration for multimodal model. If `None`, this will be inferred
 from the architecture of `self.model`."""
-limit_mm_per_prompt: InitVar[Optional[dict[str, Union[int, dict[str, int]]]]] = None
-media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
-mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
-mm_processor_cache_gb: InitVar[Optional[float]] = None
-mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None
-mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None
-mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None
-interleave_mm_strings: InitVar[Optional[bool]] = None
-skip_mm_profiling: InitVar[Optional[bool]] = None
-video_pruning_rate: InitVar[Optional[float]] = None
+limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
+media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
+mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
+mm_processor_cache_gb: InitVar[float | None] = None
+mm_processor_cache_type: InitVar[MMCacheType | None] = None
+mm_shm_cache_max_object_size_mb: InitVar[int | None] = None
+mm_encoder_tp_mode: InitVar[MMEncoderTPMode | None] = None
+interleave_mm_strings: InitVar[bool | None] = None
+skip_mm_profiling: InitVar[bool | None] = None
+video_pruning_rate: InitVar[float | None] = None
 def compute_hash(self) -> str:
 """
@@ -369,7 +361,7 @@
 def _update_nested(
 self,
-target: Union["PretrainedConfig", dict[str, Any]],
+target: PretrainedConfig | dict[str, Any],
 updates: dict[str, Any],
 ) -> None:
 """Recursively updates a config or dict with nested updates."""
@@ -397,7 +389,7 @@
 def _apply_dict_overrides(
 self,
-config: "PretrainedConfig",
+config: PretrainedConfig,
 overrides: dict[str, Any],
 ) -> None:
 """Apply dict overrides, handling both nested configs and dict values."""
@@ -415,16 +407,16 @@
 def __post_init__(
 self,
 # Multimodal config init vars
-limit_mm_per_prompt: Optional[dict[str, int]],
-media_io_kwargs: Optional[dict[str, dict[str, Any]]],
-mm_processor_kwargs: Optional[dict[str, Any]],
-mm_processor_cache_gb: Optional[float],
-mm_processor_cache_type: Optional[MMCacheType],
-mm_shm_cache_max_object_size_mb: Optional[int],
-mm_encoder_tp_mode: Optional[MMEncoderTPMode],
-interleave_mm_strings: Optional[bool],
-skip_mm_profiling: Optional[bool],
-video_pruning_rate: Optional[float],
+limit_mm_per_prompt: dict[str, int] | None,
+media_io_kwargs: dict[str, dict[str, Any]] | None,
+mm_processor_kwargs: dict[str, Any] | None,
+mm_processor_cache_gb: float | None,
+mm_processor_cache_type: MMCacheType | None,
+mm_shm_cache_max_object_size_mb: int | None,
+mm_encoder_tp_mode: MMEncoderTPMode | None,
+interleave_mm_strings: bool | None,
+skip_mm_profiling: bool | None,
+video_pruning_rate: float | None,
 ) -> None:
 # Set the default seed to 0 in V1.
 # NOTE(woosuk): In V0, we set the default seed to None because the
@@ -1209,7 +1201,7 @@
 "Supported models implement the `SupportsPP` interface."
 )
-def get_sliding_window(self) -> Optional[int]:
+def get_sliding_window(self) -> int | None:
 """Get the sliding window size from the HF text config if present."""
 return getattr(self.hf_text_config, "sliding_window", None)
@@ -1479,7 +1471,7 @@
 f"{block_type.value} layers"
 )
-def get_mamba_chunk_size(self) -> Optional[int]:
+def get_mamba_chunk_size(self) -> int | None:
 """
 Returns the mamba chunk size if it exists
 """
@@ -1715,9 +1707,7 @@
 return max_model_len
-def get_served_model_name(
-model: str, served_model_name: Optional[Union[str, list[str]]]
-):
+def get_served_model_name(model: str, served_model_name: str | list[str] | None):
 """
 If the input is a non-empty list, the first model_name in
 `served_model_name` is taken.
@@ -1761,9 +1751,9 @@ def iter_architecture_defaults():
 def try_match_architecture_defaults(
 architecture: str,
 *,
-runner_type: Optional[RunnerType] = None,
-convert_type: Optional[ConvertType] = None,
-) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]:
+runner_type: RunnerType | None = None,
+convert_type: ConvertType | None = None,
+) -> tuple[str, tuple[RunnerType, ConvertType]] | None:
 for suffix, (
 default_runner_type,
 default_convert_type,
@@ -1817,7 +1807,7 @@ def _find_dtype(
 model_id: str,
 config: PretrainedConfig,
 *,
-revision: Optional[str],
+revision: str | None,
 ):
 # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
 # because config.torch_dtype can be None.
@@ -1902,10 +1892,10 @@ def _resolve_auto_dtype(
 def _get_and_verify_dtype(
 model_id: str,
 config: PretrainedConfig,
-dtype: Union[str, torch.dtype],
+dtype: str | torch.dtype,
 *,
 is_pooling_model: bool,
-revision: Optional[str] = None,
+revision: str | None = None,
 ) -> torch.dtype:
 config_dtype = _find_dtype(model_id, config, revision=revision)
 model_type = config.model_type
@@ -1947,7 +1937,7 @@ def _get_and_verify_dtype(
 def _get_head_dtype(
 config: PretrainedConfig, dtype: torch.dtype, runner_type: str
 ) -> torch.dtype:
-head_dtype: Optional[Union[str, torch.dtype]] = getattr(config, "head_dtype", None)
+head_dtype: str | torch.dtype | None = getattr(config, "head_dtype", None)
 if head_dtype == "model":
 return dtype
@@ -1970,12 +1960,12 @@
 def _get_and_verify_max_len(
 hf_config: PretrainedConfig,
-tokenizer_config: Optional[dict],
-max_model_len: Optional[int],
+tokenizer_config: dict | None,
+max_model_len: int | None,
 disable_sliding_window: bool,
-sliding_window: Optional[int],
-spec_target_max_model_len: Optional[int] = None,
-encoder_config: Optional[Any] = None,
+sliding_window: int | None,
+spec_target_max_model_len: int | None = None,
+encoder_config: Any | None = None,
 ) -> int:
 """Get and verify the model's maximum length."""
 derived_max_model_len = float("inf")
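
One detail worth noting for module-level aliases like `HfOverrides`: a bare `X | Y` assignment is evaluated eagerly at import time, so it needs Python 3.10+ even where quoted annotations would not. A minimal sketch (with a stand-in class, since `transformers.PretrainedConfig` is not imported here):

    from collections.abc import Callable
    from typing import Any

    class PretrainedConfig: ...  # stand-in for transformers.PretrainedConfig

    # Evaluated at import time: requires 3.10+, unlike a string annotation.
    HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig]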

View File

@@ -4,7 +4,7 @@
 import hashlib
 from collections.abc import Mapping
 from dataclasses import field
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal, TypeAlias
 from pydantic import ConfigDict, Field, field_validator
 from pydantic.dataclasses import dataclass
@@ -23,31 +23,31 @@ class BaseDummyOptions:
 class VideoDummyOptions(BaseDummyOptions):
 """Options for generating dummy video data during profiling."""
-num_frames: Optional[int] = Field(None, gt=0)
-width: Optional[int] = Field(None, gt=0)
-height: Optional[int] = Field(None, gt=0)
+num_frames: int | None = Field(None, gt=0)
+width: int | None = Field(None, gt=0)
+height: int | None = Field(None, gt=0)
 @dataclass(config=ConfigDict(extra="forbid"))
 class ImageDummyOptions(BaseDummyOptions):
 """Options for generating dummy image data during profiling."""
-width: Optional[int] = Field(None, gt=0)
-height: Optional[int] = Field(None, gt=0)
+width: int | None = Field(None, gt=0)
+height: int | None = Field(None, gt=0)
 @dataclass(config=ConfigDict(extra="forbid"))
 class AudioDummyOptions(BaseDummyOptions):
 """Options for generating dummy audio data during profiling."""
-length: Optional[int] = Field(None, gt=0)
+length: int | None = Field(None, gt=0)
 MMEncoderTPMode = Literal["weights", "data"]
 MMCacheType = Literal["shm", "lru"]
-DummyOptions = Union[
-BaseDummyOptions, VideoDummyOptions, ImageDummyOptions, AudioDummyOptions
-]
+DummyOptions: TypeAlias = (
+BaseDummyOptions | VideoDummyOptions | ImageDummyOptions | AudioDummyOptions
+)
 @config
@@ -75,7 +75,7 @@
 """Additional args passed to process media inputs, keyed by modalities.
 For example, to set num_frames for video, set
 `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
-mm_processor_kwargs: Optional[dict[str, object]] = None
+mm_processor_kwargs: dict[str, object] | None = None
 """Arguments to be forwarded to the model's processor for multi-modal data,
 e.g., image processor. Overrides for the multi-modal processor obtained
 from `transformers.AutoProcessor.from_pretrained`.
@@ -123,7 +123,7 @@
 This reduces engine startup time but shifts the responsibility to users for
 estimating the peak memory usage of the activation of multimodal encoder and
 embedding cache."""
-video_pruning_rate: Optional[float] = None
+video_pruning_rate: float | None = None
 """Sets pruning rate for video pruning via Efficient Video Sampling.
 Value sits in range [0;1) and determines fraction of media tokens
 from each video to be pruned.
@@ -132,7 +132,7 @@
 @field_validator("limit_per_prompt", mode="before")
 @classmethod
 def _validate_limit_per_prompt(
-cls, value: dict[str, Union[int, dict[str, int]]]
+cls, value: dict[str, int | dict[str, int]]
 ) -> dict[str, DummyOptions]:
 for k, v in value.items():
 # Handle legacy format where only count is specified
@@ -179,7 +179,7 @@
 return 999
 return limit_data.count
-def get_dummy_options(self, modality: str) -> Optional[BaseDummyOptions]:
+def get_dummy_options(self, modality: str) -> BaseDummyOptions | None:
 """
 Get the configurable dummy data options for a modality.
 Returns None if no options are configured for this modality.
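
Note that `DummyOptions` gains an explicit `TypeAlias` annotation in the rewrite. That marker is what tells type checkers a module-level assignment defines a type rather than a plain value, which becomes ambiguous once the right-hand side is a bare `A | B` expression. Sketched with empty stand-in classes:

    from typing import TypeAlias

    class BaseDummyOptions: ...  # stand-ins for the real dummy-option classes
    class VideoDummyOptions(BaseDummyOptions): ...
    class AudioDummyOptions(BaseDummyOptions): ...

    DummyOptions: TypeAlias = (
        BaseDummyOptions | VideoDummyOptions | AudioDummyOptions
    )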

View File

@@ -3,7 +3,7 @@
 import hashlib
 from functools import cached_property
-from typing import Any, Literal, Optional, cast
+from typing import Any, Literal, cast
 from pydantic.dataclasses import dataclass
@@ -18,7 +18,7 @@ DetailedTraceModules = Literal["model", "worker", "all"]
 class ObservabilityConfig:
 """Configuration for observability - metrics and tracing."""
-show_hidden_metrics_for_version: Optional[str] = None
+show_hidden_metrics_for_version: str | None = None
 """Enable deprecated Prometheus metrics that have been hidden since the
 specified version. For example, if a previously deprecated metric has been
 hidden since the v0.7.0 release, you use
@@ -33,10 +33,10 @@
 return False
 return version._prev_minor_version_was(self.show_hidden_metrics_for_version)
-otlp_traces_endpoint: Optional[str] = None
+otlp_traces_endpoint: str | None = None
 """Target URL to which OpenTelemetry traces will be sent."""
-collect_detailed_traces: Optional[list[DetailedTraceModules]] = None
+collect_detailed_traces: list[DetailedTraceModules] | None = None
 """It makes sense to set this only if `--otlp-traces-endpoint` is set. If
 set, it will collect detailed traces for the specified modules. This
 involves use of possibly costly and or blocking operations and hence might

View File

@@ -3,7 +3,7 @@
 import hashlib
 import os
-from typing import TYPE_CHECKING, Any, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, Literal
 import torch
 from pydantic import Field, model_validator
@@ -75,7 +75,7 @@
 """Number of local data parallel groups."""
 data_parallel_rank: int = 0
 """Rank of the data parallel group."""
-data_parallel_rank_local: Optional[int] = None
+data_parallel_rank_local: int | None = None
 """Local rank of the data parallel group,
 set only in SPMD mode."""
 data_parallel_master_ip: str = "127.0.0.1"
@@ -113,24 +113,24 @@
 with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
 will have experts [1, 3]. This strategy can help improve load balancing
 for grouped expert models with no redundant experts."""
-num_redundant_experts: Optional[int] = None
+num_redundant_experts: int | None = None
 """`num_redundant_experts` is deprecated and has been replaced with
 `eplb_config.num_redundant_experts`. This will be removed in v0.12.0.
 Please use `eplb_config.num_redundant_experts` instead."""
-eplb_window_size: Optional[int] = None
+eplb_window_size: int | None = None
 """`eplb_window_size` is deprecated and has been replaced with
 `eplb_config.window_size`. This will be removed in v0.12.0.
 Please use `eplb_config.window_size` instead."""
-eplb_step_interval: Optional[int] = None
+eplb_step_interval: int | None = None
 """`eplb_step_interval` is deprecated and has been replaced with
 `eplb_config.step_interval`. This will be removed in v0.12.0.
 Please use `eplb_config.step_interval` instead."""
-eplb_log_balancedness: Optional[bool] = None
+eplb_log_balancedness: bool | None = None
 """`eplb_log_balancedness` is deprecated and has been replaced with
 `eplb_config.log_balancedness`. This will be removed in v0.12.0.
 Please use `eplb_config.log_balancedness` instead."""
-max_parallel_loading_workers: Optional[int] = None
+max_parallel_loading_workers: int | None = None
 """Maximum number of parallel loading workers when loading model
 sequentially in multiple batches. To avoid RAM OOM when using tensor
 parallel and large models."""
@@ -159,15 +159,15 @@
 ray_workers_use_nsight: bool = False
 """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
-ray_runtime_env: Optional[RuntimeEnv] = None
+ray_runtime_env: RuntimeEnv | None = None
 """Ray runtime environment to pass to distributed workers."""
-placement_group: Optional[PlacementGroup] = None
+placement_group: PlacementGroup | None = None
 """ray distributed model workers placement group."""
-distributed_executor_backend: Optional[
-Union[str, DistributedExecutorBackend, type[ExecutorBase]]
-] = None
+distributed_executor_backend: (
+str | DistributedExecutorBackend | type[ExecutorBase] | None
+) = None
 """Backend to use for distributed model
 workers, either "ray" or "mp" (multiprocessing). If the product
 of pipeline_parallel_size and tensor_parallel_size is less than
@@ -306,7 +306,7 @@
 )
 max_retries = 5
-last_exc: Optional[Exception] = None
+last_exc: Exception | None = None
 for _ in range(max_retries):
 try:
 # use gloo since the engine process might not have cuda device
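
Where a `Union[...]` previously spanned several lines, as `distributed_executor_backend` did above, the `|` form wraps in ordinary parentheses instead of brackets. The pattern, sketched with a hypothetical stand-in for the executor base class:

    class ExecutorBase: ...  # stand-in for vLLM's executor base class

    distributed_executor_backend: (
        str | type[ExecutorBase] | None
    ) = None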

View File

@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import hashlib
-from typing import Any, Optional
+from typing import Any
 from pydantic.dataclasses import dataclass
@@ -14,23 +14,23 @@ from vllm.config.utils import config
 class PoolerConfig:
 """Controls the behavior of output pooling in pooling models."""
-pooling_type: Optional[str] = None
+pooling_type: str | None = None
 """
 The pooling method of the pooling model. This should be a key in
 [`vllm.model_executor.layers.pooler.PoolingType`][].
 """
 ## for embeddings models
-normalize: Optional[bool] = None
+normalize: bool | None = None
 """
 Whether to normalize the embeddings outputs. Defaults to True.
 """
-dimensions: Optional[int] = None
+dimensions: int | None = None
 """
 Reduce the dimensions of embeddings if model
 support matryoshka representation. Defaults to None.
 """
-enable_chunked_processing: Optional[bool] = None
+enable_chunked_processing: bool | None = None
 """
 Whether to enable chunked processing for long inputs that exceed the model's
 maximum position embeddings. When enabled, long inputs will be split into
@@ -38,7 +38,7 @@
 This allows embedding models to handle arbitrarily long text without CUDA
 errors. Defaults to False.
 """
-max_embed_len: Optional[int] = None
+max_embed_len: int | None = None
 """
 Maximum input length allowed for embedding generation. When set, allows
 inputs longer than max_embed_len to be accepted for embedding models.
@@ -48,29 +48,29 @@
 """
 ## for classification models
-activation: Optional[bool] = None
+activation: bool | None = None
 """
 Whether to apply activation function to the classification outputs.
 Defaults to True.
 """
-logit_bias: Optional[float] = None
+logit_bias: float | None = None
 """
 If provided, apply classification logit biases. Defaults to None.
 """
 ## for reward models
-softmax: Optional[bool] = None
+softmax: bool | None = None
 """
 Whether to apply softmax to the reward outputs.
 Defaults to True.
 """
-step_tag_id: Optional[int] = None
+step_tag_id: int | None = None
 """
 If set, only the score corresponding to the ``step_tag_id`` in the
 generated sentence should be returned. Otherwise, the scores for all tokens
 are returned.
 """
-returned_token_ids: Optional[list[int]] = None
+returned_token_ids: list[int] | None = None
 """
 A list of indices for the vocabulary dimensions to be extracted,
 such as the token IDs of ``good_token`` and ``bad_token`` in the

View File

@@ -3,7 +3,7 @@
 import hashlib
 from dataclasses import InitVar, field
-from typing import Any, Literal, Union
+from typing import Any, Literal
 from pydantic import SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
@@ -133,7 +133,7 @@
 # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
 # or "mod.custom_class".
-scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
+scheduler_cls: str | type[object] = "vllm.core.scheduler.Scheduler"
 """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
 default scheduler. Can be a class directly or the path to a class of form
 "mod.custom_class"."""

View File

@@ -3,7 +3,7 @@
 import ast
 import hashlib
-from typing import TYPE_CHECKING, Any, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal
 from pydantic import SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
@@ -59,16 +59,16 @@ MTP_MODEL_TYPES = (
 class SpeculativeConfig:
 """Configuration for speculative decoding."""
-enforce_eager: Optional[bool] = None
+enforce_eager: bool | None = None
 """Override the default enforce_eager from model_config"""
 # General speculative decoding control
 num_speculative_tokens: SkipValidation[int] = None # type: ignore
 """The number of speculative tokens, if provided. It will default to the
 number in the draft model config if present, otherwise, it is required."""
-model: Optional[str] = None
+model: str | None = None
 """The name of the draft model, eagle head, or additional weights, if
 provided."""
-method: Optional[SpeculativeMethod] = None
+method: SpeculativeMethod | None = None
 """The name of the speculative method to use. If users provide and set the
 `model` param, the speculative method type will be detected automatically
 if possible, if `model` param is not provided, the method name must be
@@ -76,7 +76,7 @@
 If using `ngram` method, the related configuration `prompt_lookup_max` and
 `prompt_lookup_min` should be considered."""
-draft_tensor_parallel_size: Optional[int] = None
+draft_tensor_parallel_size: int | None = None
 """The degree of the tensor parallelism for the draft model. Can only be 1
 or the same as the target model's tensor parallel size."""
 disable_logprobs: bool = True
@@ -85,24 +85,24 @@
 according to the log probability settings in SamplingParams."""
 # Draft model configuration
-quantization: Optional[me_quant.QuantizationMethods] = None
+quantization: me_quant.QuantizationMethods | None = None
 """Quantization method that was used to quantize the draft model weights.
 If `None`, we assume the model weights are not quantized. Note that it only
 takes effect when using the draft model-based speculative method."""
-max_model_len: Optional[int] = None
+max_model_len: int | None = None
 """The maximum model length of the draft model. Used when testing the
 ability to skip speculation for some sequences."""
-revision: Optional[str] = None
+revision: str | None = None
 """The specific model version to use for the draft model. It can be a
 branch name, a tag name, or a commit id. If unspecified, will use the
 default version."""
-code_revision: Optional[str] = None
+code_revision: str | None = None
 """The specific revision to use for the draft model code on Hugging Face
 Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
 will use the default version."""
 # Advanced control
-disable_by_batch_size: Optional[int] = None
+disable_by_batch_size: int | None = None
 """Disable speculative decoding for new incoming requests when the number
 of enqueued requests is larger than this value, if provided."""
 disable_padded_drafter_batch: bool = False
@@ -112,14 +112,14 @@
 only affects the EAGLE method of speculation."""
 # Ngram proposer configuration
-prompt_lookup_max: Optional[int] = None
+prompt_lookup_max: int | None = None
 """Maximum size of ngram token window when using Ngram proposer, required
 when method is set to ngram."""
-prompt_lookup_min: Optional[int] = None
+prompt_lookup_min: int | None = None
 """Minimum size of ngram token window when using Ngram proposer, if
 provided. Defaults to 1."""
-speculative_token_tree: Optional[str] = None
+speculative_token_tree: str | None = None
 """Specifies the tree structure for speculative token generation.
 """
 # required configuration params passed from engine
@@ -449,7 +449,7 @@
 @staticmethod
 def _maybe_override_draft_max_model_len(
-speculative_max_model_len: Optional[int],
+speculative_max_model_len: int | None,
 draft_max_model_len: int,
 target_max_model_len: int,
 ) -> int:
@@ -488,7 +488,7 @@
 @staticmethod
 def _verify_and_get_draft_tp(
 target_parallel_config: ParallelConfig,
-speculative_draft_tensor_parallel_size: Optional[int],
+speculative_draft_tensor_parallel_size: int | None,
 draft_hf_config: PretrainedConfig,
 ) -> int:
 """

View File

@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 from pydantic.dataclasses import dataclass
@@ -28,7 +27,7 @@
 splitting long audio. This helps maintain context across chunk boundaries
 and improves transcription quality at split points."""
-min_energy_split_window_size: Optional[int] = 1600
+min_energy_split_window_size: int | None = 1600
 """Window size in samples for finding low-energy (quiet) regions to split
 audio chunks. The algorithm looks for the quietest moment within this
 window to minimize cutting through speech. Default 1600 samples ≈ 100ms
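
`min_energy_split_window_size` is a good reminder that `Optional[int]` never meant "optional argument", only "int or None": the field keeps a real default of 1600 while still accepting None (presumably to disable energy-based splitting). Reduced, hypothetical sketch:

    from pydantic.dataclasses import dataclass

    @dataclass
    class MiniSpeechToTextConfig:  # hypothetical reduction
        # `int | None` with a non-None default: None is permitted, not implied.
        min_energy_split_window_size: int | None = 1600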

View File

@@ -10,7 +10,7 @@ from contextlib import contextmanager
 from dataclasses import field, replace
 from functools import lru_cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Any, TypeVar
 import torch
 from pydantic import ConfigDict
@@ -69,17 +69,17 @@
 """Device configuration."""
 load_config: LoadConfig = field(default_factory=LoadConfig)
 """Load configuration."""
-lora_config: Optional[LoRAConfig] = None
+lora_config: LoRAConfig | None = None
 """LoRA configuration."""
-speculative_config: Optional[SpeculativeConfig] = None
+speculative_config: SpeculativeConfig | None = None
 """Speculative decoding configuration."""
 structured_outputs_config: StructuredOutputsConfig = field(
 default_factory=StructuredOutputsConfig
 )
 """Structured outputs configuration."""
-observability_config: Optional[ObservabilityConfig] = None
+observability_config: ObservabilityConfig | None = None
 """Observability configuration."""
-quant_config: Optional[QuantizationConfig] = None
+quant_config: QuantizationConfig | None = None
 """Quantization configuration."""
 compilation_config: CompilationConfig = field(default_factory=CompilationConfig)
 """`torch.compile` and cudagraph capture configuration for the model.
@@ -96,14 +96,14 @@
 You can specify the full compilation config like so:
 `{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
 """
-kv_transfer_config: Optional[KVTransferConfig] = None
+kv_transfer_config: KVTransferConfig | None = None
 """The configurations for distributed KV cache transfer."""
-kv_events_config: Optional[KVEventsConfig] = None
+kv_events_config: KVEventsConfig | None = None
 """The configurations for event publishing."""
 # some opaque config, only used to provide additional information
 # for the hash computation, mainly used for testing, debugging or out of
 # tree config registration.
-additional_config: Union[dict, SupportsHash] = field(default_factory=dict)
+additional_config: dict | SupportsHash = field(default_factory=dict)
 """Additional config for specified platform. Different platforms may
 support different configs. Make sure the configs are valid for the platform
 you are using. Contents must be hashable."""
@@ -212,7 +212,7 @@
 @staticmethod
 def _get_quantization_config(
 model_config: ModelConfig, load_config: LoadConfig
-) -> Optional[QuantizationConfig]:
+) -> QuantizationConfig | None:
 """Get the quantization config."""
 from vllm.platforms import current_platform
@@ -245,7 +245,7 @@
 @staticmethod
 def get_quantization_config(
 model_config: ModelConfig, load_config: LoadConfig
-) -> Optional[QuantizationConfig]:
+) -> QuantizationConfig | None:
 import copy
 # For some reason, the _ version of this modifies the model_config
@@ -257,7 +257,7 @@
 def with_hf_config(
 self,
 hf_config: PretrainedConfig,
-architectures: Optional[list[str]] = None,
+architectures: list[str] | None = None,
 ) -> "VllmConfig":
 if architectures is not None:
 hf_config = copy.deepcopy(hf_config)
@@ -740,7 +740,7 @@
 f"Model: {self.model_config.model}"
 )
-def compile_debug_dump_path(self) -> Optional[Path]:
+def compile_debug_dump_path(self) -> Path | None:
 """Returns a rank-aware path for dumping
 torch.compile debug information.
 """
@@ -790,13 +790,13 @@
 )
-_current_vllm_config: Optional[VllmConfig] = None
-_current_prefix: Optional[str] = None
+_current_vllm_config: VllmConfig | None = None
+_current_prefix: str | None = None
 @contextmanager
 def set_current_vllm_config(
-vllm_config: VllmConfig, check_compile=False, prefix: Optional[str] = None
+vllm_config: VllmConfig, check_compile=False, prefix: str | None = None
 ):
 """
 Temporarily set the current vLLM config.
@@ -866,7 +866,7 @@ T = TypeVar("T")
 def get_layers_from_vllm_config(
 vllm_config: VllmConfig,
 layer_type: type[T],
-layer_names: Optional[list[str]] = None,
+layer_names: list[str] | None = None,
 ) -> dict[str, T]:
 """
 Get layers from the vLLM config.