Update Optional[x] to x | None and Union[x, y] to x | y (#26633)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-12 17:51:31 +01:00
parent 9bb38130cb
commit 8fcaaf6a16
944 changed files with 9490 additions and 10121 deletions
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -3,7 +3,7 @@

 import hashlib
 from dataclasses import field
-from typing import TYPE_CHECKING, Any, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal

 from pydantic import Field, SkipValidation, field_validator
 from pydantic.dataclasses import dataclass
@@ -58,13 +58,13 @@ class CacheConfig:
    is_attention_free: bool = False
    """Whether the model is attention-free. This is primarily set in
    `ModelConfig` and that value should be manually duplicated here."""
-    num_gpu_blocks_override: Optional[int] = None
+    num_gpu_blocks_override: int | None = None
    """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks`
    if specified. Does nothing if `None`. Used for testing preemption."""
-    sliding_window: Optional[int] = None
+    sliding_window: int | None = None
    """Sliding window size for the KV cache. This is primarily set in
    `ModelConfig` and that value should be manually duplicated here."""
-    enable_prefix_caching: Optional[bool] = None
+    enable_prefix_caching: bool | None = None
    """Whether to enable prefix caching. Enabled by default for V1."""
    prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
    """Set the hash algorithm for prefix caching:\n
@@ -84,12 +84,12 @@ class CacheConfig:
    """This enables dynamic calculation of `k_scale` and `v_scale` when
    kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
    checkpoint if available. Otherwise, the scales will default to 1.0."""
-    cpu_kvcache_space_bytes: Optional[int] = None
+    cpu_kvcache_space_bytes: int | None = None
    """(CPU backend only) CPU key-value cache space."""
-    mamba_page_size_padded: Optional[int] = None
+    mamba_page_size_padded: int | None = None
    """ Optional override for mamba page size; used by hybrid mamba/attention
    models to ensure exact alignment with attention page size."""
-    mamba_block_size: Optional[int] = None
+    mamba_block_size: int | None = None
    """Size of a contiguous cache block in number of tokens for mamba cache."""
    mamba_cache_dtype: MambaDType = "auto"
    """The data type to use for the Mamba cache (both the conv as well as the
@@ -101,9 +101,9 @@ class CacheConfig:
    for the ssm state will be determined by mamba_cache_dtype."""

    # Will be set after profiling.
-    num_gpu_blocks: Optional[int] = field(default=None, init=False)
+    num_gpu_blocks: int | None = field(default=None, init=False)
    """The number of blocks to allocate for GPU memory."""
-    num_cpu_blocks: Optional[int] = field(default=None, init=False)
+    num_cpu_blocks: int | None = field(default=None, init=False)
    """The number of blocks to allocate for CPU memory."""

    kv_sharing_fast_prefill: bool = False
@@ -116,7 +116,7 @@ class CacheConfig:
    necessary for implementing this optimization in some models (e.g. Gemma3n)
    """

-    kv_cache_memory_bytes: Optional[int] = None
+    kv_cache_memory_bytes: int | None = None
    """Size of KV Cache per GPU in bytes. By default, this is set to None
    and vllm can automatically infer the kv cache size based on
    gpu_memory_utilization. However, users may want to manually specify
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -4,9 +4,10 @@
 import enum
 import hashlib
 from collections import Counter
+from collections.abc import Callable
 from dataclasses import asdict, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar

 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
@@ -168,7 +169,7 @@ class CompilationConfig:
    """

    # Top-level Compilation control
-    level: Optional[int] = None
+    level: int | None = None
    """The level of compilation:

    - None: If None, we will select the default compilation level.
@@ -177,7 +178,7 @@ class CompilationConfig:
    - 1: dynamo as is.
    - 2: dynamo once.
    - 3: piecewise compilation."""
-    debug_dump_path: Optional[Path] = None
+    debug_dump_path: Path | None = None
    """The path to dump the debug information."""
    cache_dir: str = ""
    """The directory to store the compiled graph, to accelerate Inductor
@@ -208,7 +209,7 @@ class CompilationConfig:
    By default, all custom ops are enabled when running without Inductor and
    disabled when running with Inductor: level>=PIECEWISE and use_inductor=True.
    Inductor generates (fused) Triton kernels for disabled custom ops."""
-    splitting_ops: Optional[list[str]] = None
+    splitting_ops: list[str] | None = None
    """A list of ops to exclude from cudagraphs, used in piecewise compilation.

    The behavior depends on use_inductor_graph_partition:
@@ -238,7 +239,7 @@ class CompilationConfig:
        are compiled using configurations in inductor_compile_config.

    This setting is ignored if level<PIECEWISE."""
-    compile_sizes: Optional[list[Union[int, str]]] = None
+    compile_sizes: list[int | str] | None = None
    """Sizes to compile for inductor. In addition
    to integers, it also supports "cudagraph_capture_sizes" to
    specify the sizes for cudagraph capture."""
@@ -253,7 +254,7 @@ class CompilationConfig:
    constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

    # CudaGraph compilation
-    cudagraph_mode: Optional[CUDAGraphMode] = None
+    cudagraph_mode: CUDAGraphMode | None = None
    """
    The mode of the cudagraph:

@@ -308,7 +309,7 @@ class CompilationConfig:
    It means the first several runs will be treated as warmup runs.
    Only after that, the execution will be recorded, and the recorded
    cudagraph will be used for subsequent runs."""
-    cudagraph_capture_sizes: Optional[list[int]] = None
+    cudagraph_capture_sizes: list[int] | None = None
    """Sizes to capture cudagraph.
    - None (default): capture sizes are inferred from vllm config.
    - list[int]: capture sizes are specified as given."""
@@ -320,7 +321,7 @@ class CompilationConfig:
    internally managed buffer. Default is False. 
    Note that this flag is only effective when cudagraph_mode is PIECEWISE.
    """
-    full_cuda_graph: Optional[bool] = False
+    full_cuda_graph: bool | None = False
    """whether to use a full cuda graph for the entire forward pass rather than
    splitting certain operations such as attention into subgraphs. Thus this
    flag cannot be used together with splitting_ops. This may provide
@@ -544,7 +545,7 @@ class CompilationConfig:
                    "(where 'op' is the registered op name)"
                )

-    def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
+    def init_backend(self, vllm_config: "VllmConfig") -> str | Callable:
        if self.level == CompilationLevel.NO_COMPILATION:
            raise ValueError("No compilation level is set.")

--- a/vllm/config/device.py
+++ b/vllm/config/device.py
@@ -3,7 +3,7 @@

 import hashlib
 from dataclasses import field
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal

 import torch
 from pydantic import ConfigDict, SkipValidation
@@ -19,7 +19,7 @@ Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
 class DeviceConfig:
    """Configuration for the device to use for vLLM execution."""

-    device: SkipValidation[Optional[Union[Device, torch.device]]] = "auto"
+    device: SkipValidation[Device | torch.device | None] = "auto"
    """Device type for vLLM execution.
    This parameter is deprecated and will be
    removed in a future release.
--- a/vllm/config/kv_events.py
+++ b/vllm/config/kv_events.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional

 from pydantic.dataclasses import dataclass

@@ -26,7 +25,7 @@ class KVEventsConfig:
    """The zmq endpoint to use for publishing kv events.
    """

-    replay_endpoint: Optional[str] = None
+    replay_endpoint: str | None = None
    """The zmq endpoint to use for replaying kv events.
    """

--- a/vllm/config/kv_transfer.py
+++ b/vllm/config/kv_transfer.py
@@ -4,7 +4,7 @@
 import hashlib
 import uuid
 from dataclasses import field
-from typing import Any, Literal, Optional, get_args
+from typing import Any, Literal, get_args

 from pydantic.dataclasses import dataclass

@@ -20,14 +20,14 @@ KVRole = Literal[KVProducer, KVConsumer]
 class KVTransferConfig:
    """Configuration for distributed KV cache transfer."""

-    kv_connector: Optional[str] = None
+    kv_connector: str | None = None
    """The KV connector for vLLM to transmit KV caches between vLLM instances.
    """

-    engine_id: Optional[str] = None
+    engine_id: str | None = None
    """The engine id for KV transfers."""

-    kv_buffer_device: Optional[str] = "cuda"
+    kv_buffer_device: str | None = "cuda"
    """The device used by kv connector to buffer the KV cache. Choices are 
    'cuda' and 'cpu'."""

@@ -35,11 +35,11 @@ class KVTransferConfig:
    """The buffer size for TorchDistributedConnector. Measured in number of
    bytes. Recommended value: 1e9 (about 1GB)."""

-    kv_role: Optional[KVRole] = None
+    kv_role: KVRole | None = None
    """Whether this vLLM instance produces, consumes KV cache, or both. Choices
    are 'kv_producer', 'kv_consumer', and 'kv_both'."""

-    kv_rank: Optional[int] = None
+    kv_rank: int | None = None
    """The rank of this vLLM instance in the KV cache transfer. Typical value:
    0 for prefill instance, 1 for decode instance.
    Currently only 1P1D is supported."""
@@ -57,7 +57,7 @@ class KVTransferConfig:
    kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
    """any extra config that the connector may need."""

-    kv_connector_module_path: Optional[str] = None
+    kv_connector_module_path: str | None = None
    """The Python module path to dynamically load the KV connector from.
    Only supported in V1."""

--- a/vllm/config/load.py
+++ b/vllm/config/load.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import hashlib
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any

 from pydantic import Field, field_validator
 from pydantic.dataclasses import dataclass
@@ -25,7 +25,7 @@ logger = init_logger(__name__)
 class LoadConfig:
    """Configuration for loading the model weights."""

-    load_format: Union[str, LoadFormats] = "auto"
+    load_format: str | LoadFormats = "auto"
    """The format of the model weights to load:\n
    - "auto" will try to load the weights in the safetensors format and fall
    back to the pytorch bin format if safetensors format is not available.\n
@@ -48,7 +48,7 @@ class LoadConfig:
    - "mistral" will load weights from consolidated safetensors files used by
    Mistral models.
    - Other custom values can be supported via plugins."""
-    download_dir: Optional[str] = None
+    download_dir: str | None = None
    """Directory to download and load the weights, default to the default
    cache directory of Hugging Face."""
    safetensors_load_strategy: str = "lazy"
@@ -64,23 +64,19 @@ class LoadConfig:
      was quantized using torchao and saved using safetensors.
      Needs torchao >= 0.14.0
    """
-    model_loader_extra_config: Union[dict, TensorizerConfig] = Field(
-        default_factory=dict
-    )
+    model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
    """Extra config for model loader. This will be passed to the model loader
    corresponding to the chosen load_format."""
-    device: Optional[str] = None
+    device: str | None = None
    """Device to which model weights will be loaded, default to
    device_config.device"""
-    ignore_patterns: Union[list[str], str] = Field(
-        default_factory=lambda: ["original/**/*"]
-    )
+    ignore_patterns: list[str] | str = Field(default_factory=lambda: ["original/**/*"])
    """The list of patterns to ignore when loading the model. Default to
    "original/**/*" to avoid repeated loading of llama's checkpoints."""
    use_tqdm_on_load: bool = True
    """Whether to enable tqdm for showing progress bar when loading model
    weights."""
-    pt_load_map_location: Union[str, dict[str, str]] = "cpu"
+    pt_load_map_location: str | dict[str, str] = "cpu"
    """
    pt_load_map_location: the map location for loading pytorch checkpoint, to
    support loading checkpoints can only be loaded on certain devices like
@@ -115,8 +111,8 @@ class LoadConfig:

    @field_validator("ignore_patterns", mode="after")
    def _validate_ignore_patterns(
-        cls, ignore_patterns: Union[list[str], str]
-    ) -> Union[list[str], str]:
+        cls, ignore_patterns: list[str] | str
+    ) -> list[str] | str:
        if ignore_patterns != ["original/**/*"] and len(ignore_patterns) > 0:
            logger.info(
                "Ignoring the following patterns when downloading weights: %s",
--- a/vllm/config/lora.py
+++ b/vllm/config/lora.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import hashlib
-from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Literal

 import torch
 from pydantic import ConfigDict, Field, model_validator
@@ -42,10 +42,10 @@ class LoRAConfig:
    parallelism. Enabling this will use the fully sharded layers. At high
    sequence length, max rank or tensor parallel size, this is likely faster.
    """
-    max_cpu_loras: Optional[int] = None
+    max_cpu_loras: int | None = None
    """Maximum number of LoRAs to store in CPU memory. Must be >= than
    `max_loras`."""
-    lora_dtype: Union[torch.dtype, LoRADType] = "auto"
+    lora_dtype: torch.dtype | LoRADType = "auto"
    """Data type for LoRA. If auto, will default to base model dtype."""
    lora_extra_vocab_size: LoRAExtraVocabSize = Field(
        default=256,
@@ -60,7 +60,7 @@ class LoRAConfig:
    lora_vocab_padding_size: ClassVar[int] = (
        current_platform.get_lora_vocab_padding_size()
    )
-    default_mm_loras: Optional[dict[str, str]] = None
+    default_mm_loras: dict[str, str] | None = None
    """Dictionary mapping specific modalities to LoRA model paths; this field
    is only applicable to multimodal models and should be leveraged when a
    model always expects a LoRA to be active when a given modality is present.
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -4,18 +4,10 @@
 import hashlib
 import json
 import warnings
+from collections.abc import Callable
 from dataclasses import InitVar, field
 from importlib.util import find_spec
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Literal,
-    Optional,
-    Union,
-    cast,
-    get_args,
-)
+from typing import TYPE_CHECKING, Any, Literal, cast, get_args

 import torch
 from pydantic import ConfigDict, SkipValidation, field_validator, model_validator
@@ -89,7 +81,7 @@ ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 LogprobsMode = Literal[
    "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
 ]
-HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]]
+HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig]
 ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"]

 _RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = {
@@ -121,7 +113,7 @@ class ModelConfig:
    """Convert the model using adapters defined in
    [vllm.model_executor.models.adapters][]. The most common use case is to
    adapt a text generation model to be used for pooling tasks."""
-    task: Optional[TaskOption] = None
+    task: TaskOption | None = None
    """[DEPRECATED] The task to use the model for. If the model supports more
    than one model runner, this is used to select which model runner to run.

@@ -139,7 +131,7 @@ class ModelConfig:
    trust_remote_code: bool = False
    """Trust remote code (e.g., from HuggingFace) when downloading the model
    and tokenizer."""
-    dtype: Union[ModelDType, torch.dtype] = "auto"
+    dtype: ModelDType | torch.dtype = "auto"
    """Data type for model weights and activations:\n
    - "auto" will use FP16 precision for FP32 and FP16 models, and BF16
    precision for BF16 models.\n
@@ -148,33 +140,33 @@ class ModelConfig:
    - "bfloat16" for a balance between precision and range.\n
    - "float" is shorthand for FP32 precision.\n
    - "float32" for FP32 precision."""
-    seed: Optional[int] = None
+    seed: int | None = None
    """Random seed for reproducibility. Initialized to None in V0, but
    initialized to 0 in V1."""
-    hf_config_path: Optional[str] = None
+    hf_config_path: str | None = None
    """Name or path of the Hugging Face config to use. If unspecified, model
    name or path will be used."""
    allowed_local_media_path: str = ""
    """Allowing API requests to read local images or videos from directories
    specified by the server file system. This is a security risk. Should only
    be enabled in trusted environments."""
-    allowed_media_domains: Optional[list[str]] = None
+    allowed_media_domains: list[str] | None = None
    """If set, only media URLs that belong to this domain can be used for 
    multi-modal inputs. """
-    revision: Optional[str] = None
+    revision: str | None = None
    """The specific model version to use. It can be a branch name, a tag name,
    or a commit id. If unspecified, will use the default version."""
-    code_revision: Optional[str] = None
+    code_revision: str | None = None
    """The specific revision to use for the model code on the Hugging Face Hub.
    It can be a branch name, a tag name, or a commit id. If unspecified, will
    use the default version."""
    rope_scaling: dict[str, Any] = field(default_factory=dict)
    """RoPE scaling configuration. For example,
    `{"rope_type":"dynamic","factor":2.0}`."""
-    rope_theta: Optional[float] = None
+    rope_theta: float | None = None
    """RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE
    theta improves the performance of the scaled model."""
-    tokenizer_revision: Optional[str] = None
+    tokenizer_revision: str | None = None
    """The specific revision to use for the tokenizer on the Hugging Face Hub.
    It can be a branch name, a tag name, or a commit id. If unspecified, will
    use the default version."""
@@ -187,9 +179,9 @@ class ModelConfig:
    - 1k -> 1000\n
    - 1K -> 1024\n
    - 25.6k -> 25,600"""
-    spec_target_max_model_len: Optional[int] = None
+    spec_target_max_model_len: int | None = None
    """Specify the maximum length for spec decoding draft models."""
-    quantization: SkipValidation[Optional[QuantizationMethods]] = None
+    quantization: SkipValidation[QuantizationMethods | None] = None
    """Method used to quantize the weights. If `None`, we first check the
    `quantization_config` attribute in the model config file. If that is
    `None`, we assume the model weights are not quantized and use `dtype` to
@@ -230,7 +222,7 @@ class ModelConfig:
    """If `True`, enables passing text embeddings as inputs via the
    `prompt_embeds` key. Note that enabling this will double the time required
    for graph compilation."""
-    served_model_name: Optional[Union[str, list[str]]] = None
+    served_model_name: str | list[str] | None = None
    """The model name(s) used in the API. If multiple names are provided, the
    server will respond to any of the provided names. The model name in the
    model field of a response will be the first name in this list. If not
@@ -238,20 +230,20 @@ class ModelConfig:
    that this name(s) will also be used in `model_name` tag content of
    prometheus metrics, if multiple names provided, metrics tag will take the
    first one."""
-    config_format: Union[str, ConfigFormat] = "auto"
+    config_format: str | ConfigFormat = "auto"
    """The format of the model config to load:\n
    - "auto" will try to load the config in hf format if available else it
    will try to load in mistral format.\n
    - "hf" will load the config in hf format.\n
    - "mistral" will load the config in mistral format."""
-    hf_token: Optional[Union[bool, str]] = None
+    hf_token: bool | str | None = None
    """The token to use as HTTP bearer authorization for remote files . If
    `True`, will use the token generated when running `huggingface-cli login`
    (stored in `~/.huggingface`)."""
    hf_overrides: HfOverrides = field(default_factory=dict)
    """If a dictionary, contains arguments to be forwarded to the Hugging Face
    config. If a callable, it is called to update the HuggingFace config."""
-    logits_processor_pattern: Optional[str] = None
+    logits_processor_pattern: str | None = None
    """Optional regex pattern specifying valid logits processor qualified names
    that can be passed with the `logits_processors` extra completion argument.
    Defaults to `None`, which allows no processors."""
@@ -269,7 +261,7 @@ class ModelConfig:
    `--generation-config vllm`, only the override parameters are used."""
    enable_sleep_mode: bool = False
    """Enable sleep mode for the engine (only cuda platform is supported)."""
-    model_impl: Union[str, ModelImpl] = "auto"
+    model_impl: str | ModelImpl = "auto"
    """Which implementation of the model to use:\n
    - "auto" will try to use the vLLM implementation, if it exists, and fall
    back to the Transformers implementation if no vLLM implementation is
@@ -278,36 +270,36 @@ class ModelConfig:
    - "transformers" will use the Transformers model implementation.\n
    - "terratorch" will use the TerraTorch model implementation.
    """
-    override_attention_dtype: Optional[str] = None
+    override_attention_dtype: str | None = None
    """Override dtype for attention"""
-    logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None
+    logits_processors: list[str | type[LogitsProcessor]] | None = None
    """One or more logits processors' fully-qualified class names or class
    definitions"""
-    io_processor_plugin: Optional[str] = None
+    io_processor_plugin: str | None = None
    """IOProcessor plugin name to load at model startup"""

    # Pooler config
-    pooler_config: Optional[PoolerConfig] = None
+    pooler_config: PoolerConfig | None = None
    """Pooler config which controls the behaviour of output pooling in pooling
    models."""
-    override_pooler_config: Optional[Union[dict, PoolerConfig]] = None
+    override_pooler_config: dict | PoolerConfig | None = None
    """[DEPRECATED] Use `pooler_config` instead. This field will be removed in
    v0.12.0 or v1.0.0, whichever is sooner."""

    # Multimodal config and init vars
-    multimodal_config: Optional[MultiModalConfig] = None
+    multimodal_config: MultiModalConfig | None = None
    """Configuration for multimodal model. If `None`, this will be inferred
    from the architecture of `self.model`."""
-    limit_mm_per_prompt: InitVar[Optional[dict[str, Union[int, dict[str, int]]]]] = None
-    media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
-    mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
-    mm_processor_cache_gb: InitVar[Optional[float]] = None
-    mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None
-    mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None
-    mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None
-    interleave_mm_strings: InitVar[Optional[bool]] = None
-    skip_mm_profiling: InitVar[Optional[bool]] = None
-    video_pruning_rate: InitVar[Optional[float]] = None
+    limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
+    media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
+    mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
+    mm_processor_cache_gb: InitVar[float | None] = None
+    mm_processor_cache_type: InitVar[MMCacheType | None] = None
+    mm_shm_cache_max_object_size_mb: InitVar[int | None] = None
+    mm_encoder_tp_mode: InitVar[MMEncoderTPMode | None] = None
+    interleave_mm_strings: InitVar[bool | None] = None
+    skip_mm_profiling: InitVar[bool | None] = None
+    video_pruning_rate: InitVar[float | None] = None

    def compute_hash(self) -> str:
        """
@@ -369,7 +361,7 @@ class ModelConfig:

    def _update_nested(
        self,
-        target: Union["PretrainedConfig", dict[str, Any]],
+        target: PretrainedConfig | dict[str, Any],
        updates: dict[str, Any],
    ) -> None:
        """Recursively updates a config or dict with nested updates."""
@@ -397,7 +389,7 @@ class ModelConfig:

    def _apply_dict_overrides(
        self,
-        config: "PretrainedConfig",
+        config: PretrainedConfig,
        overrides: dict[str, Any],
    ) -> None:
        """Apply dict overrides, handling both nested configs and dict values."""
@@ -415,16 +407,16 @@ class ModelConfig:
    def __post_init__(
        self,
        # Multimodal config init vars
-        limit_mm_per_prompt: Optional[dict[str, int]],
-        media_io_kwargs: Optional[dict[str, dict[str, Any]]],
-        mm_processor_kwargs: Optional[dict[str, Any]],
-        mm_processor_cache_gb: Optional[float],
-        mm_processor_cache_type: Optional[MMCacheType],
-        mm_shm_cache_max_object_size_mb: Optional[int],
-        mm_encoder_tp_mode: Optional[MMEncoderTPMode],
-        interleave_mm_strings: Optional[bool],
-        skip_mm_profiling: Optional[bool],
-        video_pruning_rate: Optional[float],
+        limit_mm_per_prompt: dict[str, int] | None,
+        media_io_kwargs: dict[str, dict[str, Any]] | None,
+        mm_processor_kwargs: dict[str, Any] | None,
+        mm_processor_cache_gb: float | None,
+        mm_processor_cache_type: MMCacheType | None,
+        mm_shm_cache_max_object_size_mb: int | None,
+        mm_encoder_tp_mode: MMEncoderTPMode | None,
+        interleave_mm_strings: bool | None,
+        skip_mm_profiling: bool | None,
+        video_pruning_rate: float | None,
    ) -> None:
        # Set the default seed to 0 in V1.
        # NOTE(woosuk): In V0, we set the default seed to None because the
@@ -1209,7 +1201,7 @@ class ModelConfig:
                "Supported models implement the `SupportsPP` interface."
            )

-    def get_sliding_window(self) -> Optional[int]:
+    def get_sliding_window(self) -> int | None:
        """Get the sliding window size from the HF text config if present."""
        return getattr(self.hf_text_config, "sliding_window", None)

@@ -1479,7 +1471,7 @@ class ModelConfig:
                    f"{block_type.value} layers"
                )

-    def get_mamba_chunk_size(self) -> Optional[int]:
+    def get_mamba_chunk_size(self) -> int | None:
        """
        Returns the mamba chunk size if it exists
        """
@@ -1715,9 +1707,7 @@ class ModelConfig:
        return max_model_len


-def get_served_model_name(
-    model: str, served_model_name: Optional[Union[str, list[str]]]
-):
+def get_served_model_name(model: str, served_model_name: str | list[str] | None):
    """
    If the input is a non-empty list, the first model_name in
    `served_model_name` is taken.
@@ -1761,9 +1751,9 @@ def iter_architecture_defaults():
 def try_match_architecture_defaults(
    architecture: str,
    *,
-    runner_type: Optional[RunnerType] = None,
-    convert_type: Optional[ConvertType] = None,
-) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]:
+    runner_type: RunnerType | None = None,
+    convert_type: ConvertType | None = None,
+) -> tuple[str, tuple[RunnerType, ConvertType]] | None:
    for suffix, (
        default_runner_type,
        default_convert_type,
@@ -1817,7 +1807,7 @@ def _find_dtype(
    model_id: str,
    config: PretrainedConfig,
    *,
-    revision: Optional[str],
+    revision: str | None,
 ):
    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
    # because config.torch_dtype can be None.
@@ -1902,10 +1892,10 @@ def _resolve_auto_dtype(
 def _get_and_verify_dtype(
    model_id: str,
    config: PretrainedConfig,
-    dtype: Union[str, torch.dtype],
+    dtype: str | torch.dtype,
    *,
    is_pooling_model: bool,
-    revision: Optional[str] = None,
+    revision: str | None = None,
 ) -> torch.dtype:
    config_dtype = _find_dtype(model_id, config, revision=revision)
    model_type = config.model_type
@@ -1947,7 +1937,7 @@ def _get_and_verify_dtype(
 def _get_head_dtype(
    config: PretrainedConfig, dtype: torch.dtype, runner_type: str
 ) -> torch.dtype:
-    head_dtype: Optional[Union[str, torch.dtype]] = getattr(config, "head_dtype", None)
+    head_dtype: str | torch.dtype | None = getattr(config, "head_dtype", None)

    if head_dtype == "model":
        return dtype
@@ -1970,12 +1960,12 @@ def _get_head_dtype(

 def _get_and_verify_max_len(
    hf_config: PretrainedConfig,
-    tokenizer_config: Optional[dict],
-    max_model_len: Optional[int],
+    tokenizer_config: dict | None,
+    max_model_len: int | None,
    disable_sliding_window: bool,
-    sliding_window: Optional[int],
-    spec_target_max_model_len: Optional[int] = None,
-    encoder_config: Optional[Any] = None,
+    sliding_window: int | None,
+    spec_target_max_model_len: int | None = None,
+    encoder_config: Any | None = None,
 ) -> int:
    """Get and verify the model's maximum length."""
    derived_max_model_len = float("inf")
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -4,7 +4,7 @@
 import hashlib
 from collections.abc import Mapping
 from dataclasses import field
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal, TypeAlias

 from pydantic import ConfigDict, Field, field_validator
 from pydantic.dataclasses import dataclass
@@ -23,31 +23,31 @@ class BaseDummyOptions:
 class VideoDummyOptions(BaseDummyOptions):
    """Options for generating dummy video data during profiling."""

-    num_frames: Optional[int] = Field(None, gt=0)
-    width: Optional[int] = Field(None, gt=0)
-    height: Optional[int] = Field(None, gt=0)
+    num_frames: int | None = Field(None, gt=0)
+    width: int | None = Field(None, gt=0)
+    height: int | None = Field(None, gt=0)


@dataclass(config=ConfigDict(extra="forbid"))
 class ImageDummyOptions(BaseDummyOptions):
    """Options for generating dummy image data during profiling."""

-    width: Optional[int] = Field(None, gt=0)
-    height: Optional[int] = Field(None, gt=0)
+    width: int | None = Field(None, gt=0)
+    height: int | None = Field(None, gt=0)


@dataclass(config=ConfigDict(extra="forbid"))
 class AudioDummyOptions(BaseDummyOptions):
    """Options for generating dummy audio data during profiling."""

-    length: Optional[int] = Field(None, gt=0)
+    length: int | None = Field(None, gt=0)


 MMEncoderTPMode = Literal["weights", "data"]
 MMCacheType = Literal["shm", "lru"]
-DummyOptions = Union[
-    BaseDummyOptions, VideoDummyOptions, ImageDummyOptions, AudioDummyOptions
-]
+DummyOptions: TypeAlias = (
+    BaseDummyOptions | VideoDummyOptions | ImageDummyOptions | AudioDummyOptions
+)


@config
@@ -75,7 +75,7 @@ class MultiModalConfig:
    """Additional args passed to process media inputs, keyed by modalities.
    For example, to set num_frames for video, set
    `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
-    mm_processor_kwargs: Optional[dict[str, object]] = None
+    mm_processor_kwargs: dict[str, object] | None = None
    """Arguments to be forwarded to the model's processor for multi-modal data,
    e.g., image processor. Overrides for the multi-modal processor obtained
    from `transformers.AutoProcessor.from_pretrained`.
@@ -123,7 +123,7 @@ class MultiModalConfig:
    This reduces engine startup time but shifts the responsibility to users for
    estimating the peak memory usage of the activation of multimodal encoder and
    embedding cache."""
-    video_pruning_rate: Optional[float] = None
+    video_pruning_rate: float | None = None
    """Sets pruning rate for video pruning via Efficient Video Sampling.
    Value sits in range [0;1) and determines fraction of media tokens
    from each video to be pruned.
@@ -132,7 +132,7 @@ class MultiModalConfig:
    @field_validator("limit_per_prompt", mode="before")
    @classmethod
    def _validate_limit_per_prompt(
-        cls, value: dict[str, Union[int, dict[str, int]]]
+        cls, value: dict[str, int | dict[str, int]]
    ) -> dict[str, DummyOptions]:
        for k, v in value.items():
            # Handle legacy format where only count is specified
@@ -179,7 +179,7 @@ class MultiModalConfig:
            return 999
        return limit_data.count

-    def get_dummy_options(self, modality: str) -> Optional[BaseDummyOptions]:
+    def get_dummy_options(self, modality: str) -> BaseDummyOptions | None:
        """
        Get the configurable dummy data options for a modality.
        Returns None if no options are configured for this modality.
--- a/vllm/config/observability.py
+++ b/vllm/config/observability.py
@@ -3,7 +3,7 @@

 import hashlib
 from functools import cached_property
-from typing import Any, Literal, Optional, cast
+from typing import Any, Literal, cast

 from pydantic.dataclasses import dataclass

@@ -18,7 +18,7 @@ DetailedTraceModules = Literal["model", "worker", "all"]
 class ObservabilityConfig:
    """Configuration for observability - metrics and tracing."""

-    show_hidden_metrics_for_version: Optional[str] = None
+    show_hidden_metrics_for_version: str | None = None
    """Enable deprecated Prometheus metrics that have been hidden since the
    specified version. For example, if a previously deprecated metric has been
    hidden since the v0.7.0 release, you use
@@ -33,10 +33,10 @@ class ObservabilityConfig:
            return False
        return version._prev_minor_version_was(self.show_hidden_metrics_for_version)

-    otlp_traces_endpoint: Optional[str] = None
+    otlp_traces_endpoint: str | None = None
    """Target URL to which OpenTelemetry traces will be sent."""

-    collect_detailed_traces: Optional[list[DetailedTraceModules]] = None
+    collect_detailed_traces: list[DetailedTraceModules] | None = None
    """It makes sense to set this only if `--otlp-traces-endpoint` is set. If
    set, it will collect detailed traces for the specified modules. This
    involves use of possibly costly and or blocking operations and hence might
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -3,7 +3,7 @@

 import hashlib
 import os
-from typing import TYPE_CHECKING, Any, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, Literal

 import torch
 from pydantic import Field, model_validator
@@ -75,7 +75,7 @@ class ParallelConfig:
    """Number of local data parallel groups."""
    data_parallel_rank: int = 0
    """Rank of the data parallel group."""
-    data_parallel_rank_local: Optional[int] = None
+    data_parallel_rank_local: int | None = None
    """Local rank of the data parallel group,
    set only in SPMD mode."""
    data_parallel_master_ip: str = "127.0.0.1"
@@ -113,24 +113,24 @@ class ParallelConfig:
      with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
      will have experts [1, 3]. This strategy can help improve load balancing
      for grouped expert models with no redundant experts."""
-    num_redundant_experts: Optional[int] = None
+    num_redundant_experts: int | None = None
    """`num_redundant_experts` is deprecated and has been replaced with
    `eplb_config.num_redundant_experts`. This will be removed in v0.12.0.
    Please use `eplb_config.num_redundant_experts` instead."""
-    eplb_window_size: Optional[int] = None
+    eplb_window_size: int | None = None
    """`eplb_window_size` is deprecated and has been replaced with
    `eplb_config.window_size`. This will be removed in v0.12.0.
    Please use `eplb_config.window_size` instead."""
-    eplb_step_interval: Optional[int] = None
+    eplb_step_interval: int | None = None
    """`eplb_step_interval` is deprecated and has been replaced with
    `eplb_config.step_interval`. This will be removed in v0.12.0.
    Please use `eplb_config.step_interval` instead."""
-    eplb_log_balancedness: Optional[bool] = None
+    eplb_log_balancedness: bool | None = None
    """`eplb_log_balancedness` is deprecated and has been replaced with
    `eplb_config.log_balancedness`. This will be removed in v0.12.0.
    Please use `eplb_config.log_balancedness` instead."""

-    max_parallel_loading_workers: Optional[int] = None
+    max_parallel_loading_workers: int | None = None
    """Maximum number of parallel loading workers when loading model
    sequentially in multiple batches. To avoid RAM OOM when using tensor
    parallel and large models."""
@@ -159,15 +159,15 @@ class ParallelConfig:
    ray_workers_use_nsight: bool = False
    """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""

-    ray_runtime_env: Optional[RuntimeEnv] = None
+    ray_runtime_env: RuntimeEnv | None = None
    """Ray runtime environment to pass to distributed workers."""

-    placement_group: Optional[PlacementGroup] = None
+    placement_group: PlacementGroup | None = None
    """ray distributed model workers placement group."""

-    distributed_executor_backend: Optional[
-        Union[str, DistributedExecutorBackend, type[ExecutorBase]]
-    ] = None
+    distributed_executor_backend: (
+        str | DistributedExecutorBackend | type[ExecutorBase] | None
+    ) = None
    """Backend to use for distributed model
    workers, either "ray" or "mp" (multiprocessing). If the product
    of pipeline_parallel_size and tensor_parallel_size is less than
@@ -306,7 +306,7 @@ class ParallelConfig:
        )

        max_retries = 5
-        last_exc: Optional[Exception] = None
+        last_exc: Exception | None = None
        for _ in range(max_retries):
            try:
                # use gloo since the engine process might not have cuda device
--- a/vllm/config/pooler.py
+++ b/vllm/config/pooler.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import hashlib
-from typing import Any, Optional
+from typing import Any

 from pydantic.dataclasses import dataclass

@@ -14,23 +14,23 @@ from vllm.config.utils import config
 class PoolerConfig:
    """Controls the behavior of output pooling in pooling models."""

-    pooling_type: Optional[str] = None
+    pooling_type: str | None = None
    """
    The pooling method of the pooling model. This should be a key in
    [`vllm.model_executor.layers.pooler.PoolingType`][].
    """

    ## for embeddings models
-    normalize: Optional[bool] = None
+    normalize: bool | None = None
    """
    Whether to normalize the embeddings outputs. Defaults to True.
    """
-    dimensions: Optional[int] = None
+    dimensions: int | None = None
    """
    Reduce the dimensions of embeddings if model
    support matryoshka representation. Defaults to None.
    """
-    enable_chunked_processing: Optional[bool] = None
+    enable_chunked_processing: bool | None = None
    """
    Whether to enable chunked processing for long inputs that exceed the model's
    maximum position embeddings. When enabled, long inputs will be split into
@@ -38,7 +38,7 @@ class PoolerConfig:
    This allows embedding models to handle arbitrarily long text without CUDA
    errors. Defaults to False.
    """
-    max_embed_len: Optional[int] = None
+    max_embed_len: int | None = None
    """
    Maximum input length allowed for embedding generation. When set, allows
    inputs longer than max_embed_len to be accepted for embedding models.
@@ -48,29 +48,29 @@ class PoolerConfig:
    """

    ## for classification models
-    activation: Optional[bool] = None
+    activation: bool | None = None
    """
    Whether to apply activation function to the classification outputs.
    Defaults to True.
    """
-    logit_bias: Optional[float] = None
+    logit_bias: float | None = None
    """
    If provided, apply classification logit biases. Defaults to None.
    """

    ## for reward models
-    softmax: Optional[bool] = None
+    softmax: bool | None = None
    """
    Whether to apply softmax to the reward outputs.
    Defaults to True.
    """
-    step_tag_id: Optional[int] = None
+    step_tag_id: int | None = None
    """
    If set, only the score corresponding to the ``step_tag_id`` in the
    generated sentence should be returned. Otherwise, the scores for all tokens
    are returned.
    """
-    returned_token_ids: Optional[list[int]] = None
+    returned_token_ids: list[int] | None = None
    """
    A list of indices for the vocabulary dimensions to be extracted,
    such as the token IDs of ``good_token`` and ``bad_token`` in the
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -3,7 +3,7 @@

 import hashlib
 from dataclasses import InitVar, field
-from typing import Any, Literal, Union
+from typing import Any, Literal

 from pydantic import SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
@@ -133,7 +133,7 @@ class SchedulerConfig:

    # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
    # or "mod.custom_class".
-    scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
+    scheduler_cls: str | type[object] = "vllm.core.scheduler.Scheduler"
    """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
    default scheduler. Can be a class directly or the path to a class of form
    "mod.custom_class"."""
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -3,7 +3,7 @@

 import ast
 import hashlib
-from typing import TYPE_CHECKING, Any, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal

 from pydantic import SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
@@ -59,16 +59,16 @@ MTP_MODEL_TYPES = (
 class SpeculativeConfig:
    """Configuration for speculative decoding."""

-    enforce_eager: Optional[bool] = None
+    enforce_eager: bool | None = None
    """Override the default enforce_eager from model_config"""
    # General speculative decoding control
    num_speculative_tokens: SkipValidation[int] = None  # type: ignore
    """The number of speculative tokens, if provided. It will default to the
    number in the draft model config if present, otherwise, it is required."""
-    model: Optional[str] = None
+    model: str | None = None
    """The name of the draft model, eagle head, or additional weights, if
    provided."""
-    method: Optional[SpeculativeMethod] = None
+    method: SpeculativeMethod | None = None
    """The name of the speculative method to use. If users provide and set the
    `model` param, the speculative method type will be detected automatically
    if possible, if `model` param is not provided, the method name must be
@@ -76,7 +76,7 @@ class SpeculativeConfig:

    If using `ngram` method, the related configuration `prompt_lookup_max` and
    `prompt_lookup_min` should be considered."""
-    draft_tensor_parallel_size: Optional[int] = None
+    draft_tensor_parallel_size: int | None = None
    """The degree of the tensor parallelism for the draft model. Can only be 1
    or the same as the target model's tensor parallel size."""
    disable_logprobs: bool = True
@@ -85,24 +85,24 @@ class SpeculativeConfig:
    according to the log probability settings in SamplingParams."""

    # Draft model configuration
-    quantization: Optional[me_quant.QuantizationMethods] = None
+    quantization: me_quant.QuantizationMethods | None = None
    """Quantization method that was used to quantize the draft model weights.
    If `None`, we assume the model weights are not quantized. Note that it only
    takes effect when using the draft model-based speculative method."""
-    max_model_len: Optional[int] = None
+    max_model_len: int | None = None
    """The maximum model length of the draft model. Used when testing the
    ability to skip speculation for some sequences."""
-    revision: Optional[str] = None
+    revision: str | None = None
    """The specific model version to use for the draft model. It can be a
    branch name, a tag name, or a commit id. If unspecified, will use the
    default version."""
-    code_revision: Optional[str] = None
+    code_revision: str | None = None
    """The specific revision to use for the draft model code on Hugging Face
    Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
    will use the default version."""

    # Advanced control
-    disable_by_batch_size: Optional[int] = None
+    disable_by_batch_size: int | None = None
    """Disable speculative decoding for new incoming requests when the number
    of enqueued requests is larger than this value, if provided."""
    disable_padded_drafter_batch: bool = False
@@ -112,14 +112,14 @@ class SpeculativeConfig:
    only affects the EAGLE method of speculation."""

    # Ngram proposer configuration
-    prompt_lookup_max: Optional[int] = None
+    prompt_lookup_max: int | None = None
    """Maximum size of ngram token window when using Ngram proposer, required
    when method is set to ngram."""
-    prompt_lookup_min: Optional[int] = None
+    prompt_lookup_min: int | None = None
    """Minimum size of ngram token window when using Ngram proposer, if
    provided. Defaults to 1."""

-    speculative_token_tree: Optional[str] = None
+    speculative_token_tree: str | None = None
    """Specifies the tree structure for speculative token generation.
    """
    # required configuration params passed from engine
@@ -449,7 +449,7 @@ class SpeculativeConfig:

    @staticmethod
    def _maybe_override_draft_max_model_len(
-        speculative_max_model_len: Optional[int],
+        speculative_max_model_len: int | None,
        draft_max_model_len: int,
        target_max_model_len: int,
    ) -> int:
@@ -488,7 +488,7 @@ class SpeculativeConfig:
    @staticmethod
    def _verify_and_get_draft_tp(
        target_parallel_config: ParallelConfig,
-        speculative_draft_tensor_parallel_size: Optional[int],
+        speculative_draft_tensor_parallel_size: int | None,
        draft_hf_config: PretrainedConfig,
    ) -> int:
        """
--- a/vllm/config/speech_to_text.py
+++ b/vllm/config/speech_to_text.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional

 from pydantic.dataclasses import dataclass

@@ -28,7 +27,7 @@ class SpeechToTextConfig:
    splitting long audio. This helps maintain context across chunk boundaries
    and improves transcription quality at split points."""

-    min_energy_split_window_size: Optional[int] = 1600
+    min_energy_split_window_size: int | None = 1600
    """Window size in samples for finding low-energy (quiet) regions to split
    audio chunks. The algorithm looks for the quietest moment within this
    window to minimize cutting through speech. Default 1600 samples ≈ 100ms
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -10,7 +10,7 @@ from contextlib import contextmanager
 from dataclasses import field, replace
 from functools import lru_cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Any, TypeVar

 import torch
 from pydantic import ConfigDict
@@ -69,17 +69,17 @@ class VllmConfig:
    """Device configuration."""
    load_config: LoadConfig = field(default_factory=LoadConfig)
    """Load configuration."""
-    lora_config: Optional[LoRAConfig] = None
+    lora_config: LoRAConfig | None = None
    """LoRA configuration."""
-    speculative_config: Optional[SpeculativeConfig] = None
+    speculative_config: SpeculativeConfig | None = None
    """Speculative decoding configuration."""
    structured_outputs_config: StructuredOutputsConfig = field(
        default_factory=StructuredOutputsConfig
    )
    """Structured outputs configuration."""
-    observability_config: Optional[ObservabilityConfig] = None
+    observability_config: ObservabilityConfig | None = None
    """Observability configuration."""
-    quant_config: Optional[QuantizationConfig] = None
+    quant_config: QuantizationConfig | None = None
    """Quantization configuration."""
    compilation_config: CompilationConfig = field(default_factory=CompilationConfig)
    """`torch.compile` and cudagraph capture configuration for the model.
@@ -96,14 +96,14 @@ class VllmConfig:
    You can specify the full compilation config like so:
    `{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
    """
-    kv_transfer_config: Optional[KVTransferConfig] = None
+    kv_transfer_config: KVTransferConfig | None = None
    """The configurations for distributed KV cache transfer."""
-    kv_events_config: Optional[KVEventsConfig] = None
+    kv_events_config: KVEventsConfig | None = None
    """The configurations for event publishing."""
    # some opaque config, only used to provide additional information
    # for the hash computation, mainly used for testing, debugging or out of
    # tree config registration.
-    additional_config: Union[dict, SupportsHash] = field(default_factory=dict)
+    additional_config: dict | SupportsHash = field(default_factory=dict)
    """Additional config for specified platform. Different platforms may
    support different configs. Make sure the configs are valid for the platform
    you are using. Contents must be hashable."""
@@ -212,7 +212,7 @@ class VllmConfig:
    @staticmethod
    def _get_quantization_config(
        model_config: ModelConfig, load_config: LoadConfig
-    ) -> Optional[QuantizationConfig]:
+    ) -> QuantizationConfig | None:
        """Get the quantization config."""
        from vllm.platforms import current_platform

@@ -245,7 +245,7 @@ class VllmConfig:
    @staticmethod
    def get_quantization_config(
        model_config: ModelConfig, load_config: LoadConfig
-    ) -> Optional[QuantizationConfig]:
+    ) -> QuantizationConfig | None:
        import copy

        # For some reason, the _ version of this modifies the model_config
@@ -257,7 +257,7 @@ class VllmConfig:
    def with_hf_config(
        self,
        hf_config: PretrainedConfig,
-        architectures: Optional[list[str]] = None,
+        architectures: list[str] | None = None,
    ) -> "VllmConfig":
        if architectures is not None:
            hf_config = copy.deepcopy(hf_config)
@@ -740,7 +740,7 @@ class VllmConfig:
                    f"Model: {self.model_config.model}"
                )

-    def compile_debug_dump_path(self) -> Optional[Path]:
+    def compile_debug_dump_path(self) -> Path | None:
        """Returns a rank-aware path for dumping
        torch.compile debug information.
        """
@@ -790,13 +790,13 @@ class VllmConfig:
        )


-_current_vllm_config: Optional[VllmConfig] = None
-_current_prefix: Optional[str] = None
+_current_vllm_config: VllmConfig | None = None
+_current_prefix: str | None = None


 @contextmanager
 def set_current_vllm_config(
-    vllm_config: VllmConfig, check_compile=False, prefix: Optional[str] = None
+    vllm_config: VllmConfig, check_compile=False, prefix: str | None = None
 ):
    """
    Temporarily set the current vLLM config.
@@ -866,7 +866,7 @@ T = TypeVar("T")
 def get_layers_from_vllm_config(
    vllm_config: VllmConfig,
    layer_type: type[T],
-    layer_names: Optional[list[str]] = None,
+    layer_names: list[str] | None = None,
 ) -> dict[str, T]:
    """
    Get layers from the vLLM config.