diff --git a/vllm/beam_search.py b/vllm/beam_search.py
index fcd2d1f0e..d0ebd2d9c 100644
--- a/vllm/beam_search.py
+++ b/vllm/beam_search.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 from vllm.logprobs import Logprob
 from vllm.lora.request import LoRARequest
@@ -27,7 +27,7 @@ class BeamSearchSequence:
     text: str | None = None
     finish_reason: str | None = None
     stop_reason: int | str | None = None
-    multi_modal_data: Optional["MultiModalDataDict"] = None
+    multi_modal_data: "MultiModalDataDict | None" = None
     mm_processor_kwargs: dict[str, Any] | None = None
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index f4113c91b..3933f6d65 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -3,7 +3,7 @@
 
 import importlib
 from collections.abc import Callable
-from typing import TYPE_CHECKING, Optional, cast
+from typing import TYPE_CHECKING, cast
 
 from vllm.distributed.kv_transfer.kv_connector.base import (
     KVConnectorBase,
@@ -44,7 +44,7 @@ class KVConnectorFactory:
         cls,
         config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: Optional["KVCacheConfig"] = None,
+        kv_cache_config: "KVCacheConfig | None" = None,
     ) -> KVConnectorBase:
         kv_transfer_config = config.kv_transfer_config
         if kv_transfer_config is None:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index 01b606b28..a0e03b002 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -41,7 +41,7 @@ The class provides the following primitives:
 
 import enum
 from abc import ABC, abstractmethod
 from collections.abc import Callable, Iterable
-from typing import TYPE_CHECKING, Any, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal
 
 import torch
 
@@ -161,7 +161,7 @@ class KVConnectorBase_V1(ABC):
         self,
         vllm_config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: Optional["KVCacheConfig"] = None,
+        kv_cache_config: "KVCacheConfig | None" = None,
     ):
         logger.warning(
             "Initializing KVConnectorBase_V1. This API is experimental and "
@@ -383,13 +383,13 @@ class KVConnectorBase_V1(ABC):
         """
         return None
 
-    def get_kv_connector_stats(self) -> Optional["KVConnectorStats"]:
+    def get_kv_connector_stats(self) -> "KVConnectorStats | None":
         """
         Get the KV connector stats collected during the last interval.
         """
         return None
 
-    def get_kv_connector_kv_cache_events(self) -> Optional["KVConnectorKVEvents"]:
+    def get_kv_connector_kv_cache_events(self) -> "KVConnectorKVEvents | None":
         """
         Get the KV connector kv cache events collected during the last interval.
         This function should be called by the model runner every time after the
@@ -558,7 +558,7 @@ class KVConnectorBase_V1(ABC):
     @classmethod
     def build_kv_connector_stats(
         cls, data: dict[str, Any] | None = None
-    ) -> Optional["KVConnectorStats"]:
+    ) -> "KVConnectorStats | None":
         """
         KVConnectorStats resolution method. This method allows dynamically
         registered connectors to return their own KVConnectorStats object,
@@ -584,7 +584,7 @@ class KVConnectorBase_V1(ABC):
         metric_types: dict[type["PromMetric"], type["PromMetricT"]],
         labelnames: list[str],
         per_engine_labelvalues: dict[int, list[object]],
-    ) -> Optional["KVConnectorPromMetrics"]:
+    ) -> "KVConnectorPromMetrics | None":
         """
         Create a KVConnectorPromMetrics subclass which should register
         per-connector Prometheus metrics and implement observe() to
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
index 525061fc0..6e9e757ff 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
@@ -32,7 +32,7 @@ Usage:
 """
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 
@@ -84,7 +84,7 @@ class DecodeBenchConnector(KVConnectorBase_V1):
         self,
         vllm_config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: Optional["KVCacheConfig"] = None,
+        kv_cache_config: "KVCacheConfig | None" = None,
     ):
         super().__init__(vllm_config, role, kv_cache_config)
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
index b1b7ee348..19d62fecd 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import safetensors
 import torch
@@ -91,7 +91,7 @@ class ExampleConnector(KVConnectorBase_V1):
         self,
         vllm_config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: Optional["KVCacheConfig"] = None,
+        kv_cache_config: "KVCacheConfig | None" = None,
     ):
         super().__init__(
             vllm_config=vllm_config,
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
index 8159832cc..ee475e16a 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
@@ -5,7 +5,7 @@
 import os
 import uuid
 from collections.abc import Generator
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 from lmcache import utils
@@ -274,7 +274,7 @@ class ReqMeta:
         load_spec: LoadSpec | None = None,
         discard_partial_chunks: bool = True,
         save_decode_cache: bool = False,
-    ) -> Optional["ReqMeta"]:
+    ) -> "ReqMeta | None":
         """Create the request metadata from a request tracker.
 
         Args:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
index 629170615..b542265dd 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
@@ -3,7 +3,7 @@
 import enum
 from collections.abc import Iterable
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Literal, Optional, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 import torch
 import zmq
@@ -385,7 +385,7 @@ class LMCacheMPConnector(KVConnectorBase_V1):
         self,
         vllm_config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: Optional["KVCacheConfig"] = None,
+        kv_cache_config: "KVCacheConfig | None" = None,
     ):
         super().__init__(vllm_config, role, kv_cache_config)
@@ -595,7 +595,7 @@ class LMCacheMPConnector(KVConnectorBase_V1):
         self.worker_adapter.shutdown()
         return None
 
-    def get_kv_connector_stats(self) -> Optional["KVConnectorStats"]:
+    def get_kv_connector_stats(self) -> "KVConnectorStats | None":
         """
         Get the KV connector stats collected during the last interval.
         """
@@ -810,7 +810,7 @@ class LMCacheMPConnector(KVConnectorBase_V1):
     @classmethod
     def build_kv_connector_stats(
         cls, data: dict[str, Any] | None = None
-    ) -> Optional["KVConnectorStats"]:
+    ) -> "KVConnectorStats | None":
         """
         KVConnectorStats resolution method. This method allows dynamically
         registered connectors to return their own KVConnectorStats object,
@@ -825,7 +825,7 @@ class LMCacheMPConnector(KVConnectorBase_V1):
         metric_types: dict[type["PromMetric"], type["PromMetricT"]],
         labelnames: list[str],
         per_engine_labelvalues: dict[int, list[object]],
-    ) -> Optional["KVConnectorPromMetrics"]:
+    ) -> "KVConnectorPromMetrics | None":
         """
         Create a KVConnectorPromMetrics subclass which should register
         per-connector Prometheus metrics and implement observe() to
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
index ef0268b9a..b2b6411f0 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
@@ -6,7 +6,7 @@
 import time
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import msgspec
 import numpy as np
@@ -115,7 +115,7 @@ class MooncakeConnector(KVConnectorBase_V1):
         self,
         vllm_config: VllmConfig,
         role: KVConnectorRole,
-        kv_cache_config: Optional["KVCacheConfig"] = None,
+        kv_cache_config: "KVCacheConfig | None" = None,
     ):
         super().__init__(vllm_config, role, kv_cache_config)
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
index 026e7faf5..f73f5b2cd 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
@@ -5,7 +5,7 @@
 import threading
 import time
 from collections.abc import Iterator
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import msgspec
 import torch
@@ -101,7 +101,7 @@ class MoRIIOAgentMetadata(
 class RoleManager:
     """Manages role state across the connector."""
 
-    _instance: Optional["RoleManager"] = None
+    _instance: "RoleManager | None" = None
     _lock = threading.Lock()
 
     def __init__(self) -> None:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
index abdbeb9e4..2494857c6 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
@@ -7,7 +7,7 @@
 import threading
 import time
 from collections import defaultdict
 from concurrent.futures import Future, ThreadPoolExecutor
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import msgpack
 import msgspec
@@ -90,7 +90,7 @@ class MoRIIOConnector(KVConnectorBase_V1):
         self,
         vllm_config: VllmConfig,
         role: KVConnectorRole,
-        kv_cache_config: Optional["KVCacheConfig"] = None,
+        kv_cache_config: "KVCacheConfig | None" = None,
     ):
         super().__init__(vllm_config, role)
         assert vllm_config.kv_transfer_config is not None, (
@@ -333,7 +333,7 @@ class MoRIIOConnectorScheduler:
         request: "Request",
         blocks: "KVCacheBlocks",
         num_external_tokens: int,
-        connector_worker: Optional["MoRIIOConnectorWorker"] = None,
+        connector_worker: "MoRIIOConnectorWorker | None" = None,
     ):
         params = request.kv_transfer_params
         if not params:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
index 3a35c2262..e6d177d8a 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 from weakref import ref as weakref_ref
 
 import msgpack
@@ -340,7 +340,7 @@ class MoRIIOWrapper:
     def __init__(
         self,
-        moriio_engine: Optional["IOEngine"] = None,
+        moriio_engine: "IOEngine | None" = None,
         tp_rank: int = 0,
         dp_rank: int = 0,
     ):
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 8e0651053..d03d70860 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -14,7 +14,7 @@
 from collections import defaultdict
 from collections.abc import Iterator
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import msgspec
 import numpy as np
@@ -302,7 +302,7 @@ class NixlConnector(KVConnectorBase_V1):
         self,
         vllm_config: VllmConfig,
         role: KVConnectorRole,
-        kv_cache_config: Optional["KVCacheConfig"] = None,
+        kv_cache_config: "KVCacheConfig | None" = None,
     ):
         super().__init__(vllm_config, role, kv_cache_config)
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
index 5e601c63c..3be1be18e 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import regex as re
 import torch
@@ -76,7 +76,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
         self,
         vllm_config: "VllmConfig",
         role: KVConnectorRole,
-        kv_cache_config: Optional["KVCacheConfig"] = None,
+        kv_cache_config: "KVCacheConfig | None" = None,
     ):
         super().__init__(
             vllm_config=vllm_config,
diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py
index 54b46d988..2cc074bde 100644
--- a/vllm/distributed/kv_transfer/kv_transfer_state.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_state.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
@@ -49,7 +49,7 @@ def is_v1_kv_transfer_group(connector: KVConnectorBaseType | None = None) -> boo
 
 def ensure_kv_transfer_initialized(
-    vllm_config: "VllmConfig", kv_cache_config: Optional["KVCacheConfig"] = None
+    vllm_config: "VllmConfig", kv_cache_config: "KVCacheConfig | None" = None
 ) -> None:
     """
     Initialize KV cache transfer parallel group.
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 8f9dc0354..00366f96c 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -33,7 +33,7 @@
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 from datetime import timedelta
 from multiprocessing import shared_memory
-from typing import Any, Optional
+from typing import Any
 from unittest.mock import patch
 
 import torch
@@ -106,7 +106,7 @@ def _get_unique_name(name: str) -> str:
     return newname
 
 
-_groups: dict[str, Callable[[], Optional["GroupCoordinator"]]] = {}
+_groups: dict[str, Callable[[], "GroupCoordinator | None"]] = {}
 
 
 def _register_group(group: "GroupCoordinator") -> None:
@@ -784,7 +784,7 @@ class GroupCoordinator:
         self,
         tensor_dict: dict[str, torch.Tensor | Any],
         dst: int | None = None,
-        all_gather_group: Optional["GroupCoordinator"] = None,
+        all_gather_group: "GroupCoordinator | None" = None,
         all_gather_tensors: dict[str, bool] | None = None,
     ) -> dict[str, torch.Tensor | Any] | None:
         """Send the input tensor dictionary.
@@ -871,7 +871,7 @@ class GroupCoordinator:
     def recv_tensor_dict(
         self,
         src: int | None = None,
-        all_gather_group: Optional["GroupCoordinator"] = None,
+        all_gather_group: "GroupCoordinator | None" = None,
         all_gather_tensors: dict[str, bool] | None = None,
     ) -> dict[str, torch.Tensor | Any] | None:
         """Recv the input tensor dictionary.
diff --git a/vllm/entrypoints/anthropic/protocol.py b/vllm/entrypoints/anthropic/protocol.py
index 626ca7472..5ced67d4c 100644
--- a/vllm/entrypoints/anthropic/protocol.py
+++ b/vllm/entrypoints/anthropic/protocol.py
@@ -3,7 +3,7 @@
 """Pydantic models for Anthropic API protocol"""
 
 import time
-from typing import Any, Literal, Optional
+from typing import Any, Literal
 
 from pydantic import BaseModel, field_validator
 
@@ -135,7 +135,7 @@ class AnthropicStreamEvent(BaseModel):
         "ping",
         "error",
     ]
-    message: Optional["AnthropicMessagesResponse"] = None
+    message: "AnthropicMessagesResponse | None" = None
     delta: AnthropicDelta | None = None
     content_block: AnthropicContentBlock | None = None
     index: int | None = None
diff --git a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py
index 125201a5a..90b7df818 100644
--- a/vllm/lora/lora_weights.py
+++ b/vllm/lora/lora_weights.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence as GenericSequence
-from typing import Optional
 
 import torch
 import torch.types
@@ -126,7 +125,7 @@ class PackedLoRALayerWeights(LoRALayerWeights):
     @classmethod
     def pack(
-        cls, loras: GenericSequence[Optional["LoRALayerWeights"]]
+        cls, loras: GenericSequence["LoRALayerWeights | None"]
     ) -> "PackedLoRALayerWeights":
         """Pack a list of LoRAs into a single LoRA.
 
@@ -155,7 +154,7 @@ class PackedLoRALayerWeights(LoRALayerWeights):
     @classmethod
     def pack_moe(
         cls,
-        loras: GenericSequence[Optional["LoRALayerWeights"]],
+        loras: GenericSequence["LoRALayerWeights | None"],
         module_name: str,
         is_non_gated_moe: bool = False,
     ) -> "PackedLoRALayerWeights":
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 510be672c..2840d5eda 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import huggingface_hub
 from huggingface_hub.utils import HfHubHTTPError, HFValidationError
@@ -131,7 +131,7 @@ def replace_submodule(
 
 def parse_fine_tuned_lora_name(
-    name: str, weights_mapper: Optional["WeightsMapper"] = None
+    name: str, weights_mapper: "WeightsMapper | None" = None
 ) -> tuple[str, bool]:
     """Parse the name of lora weights.
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index b275bf414..6650367da 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
 from enum import IntEnum
-from typing import Optional, Union
+from typing import Union
 
 import torch
 
@@ -284,7 +284,7 @@ class FusedMoEQuantConfig:
         return self._w1.bias
 
     @property
-    def w1_precision(self) -> Optional["PrecisionConfig"]:
+    def w1_precision(self) -> "PrecisionConfig | None":
         assert self._w1.scale is None or isinstance(self._w1.scale, PrecisionConfig)
         return self._w1.scale
 
@@ -306,7 +306,7 @@ class FusedMoEQuantConfig:
         return self._w2.bias
 
     @property
-    def w2_precision(self) -> Optional["PrecisionConfig"]:
+    def w2_precision(self) -> "PrecisionConfig | None":
         assert self._w2.scale is None or isinstance(self._w2.scale, PrecisionConfig)
         return self._w2.scale
 
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index e55697458..163ee78a7 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
@@ -148,7 +148,7 @@ class AWQMarlinConfig(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional["QuantizationMethods"]:
+    ) -> "QuantizationMethods | None":
         can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg)
         is_valid_user_quant = (
             user_quant is None or user_quant == "marlin" or user_quant == "awq_marlin"
@@ -173,7 +173,7 @@ class AWQMarlinConfig(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if isinstance(layer, LinearBase) or (
             isinstance(layer, ParallelLMHead) and self.lm_head_quantized
         ):
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 330d79110..a0eebb9c0 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -3,7 +3,7 @@
 
 from contextlib import suppress
 from functools import partial
-from typing import TYPE_CHECKING, Any, Literal, Optional, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 import torch
 from compressed_tensors.config import (
@@ -160,7 +160,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         self,
         layer: torch.nn.Module,
         prefix: str,
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if isinstance(layer, LinearBase):
             # collect schemes
             quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
@@ -691,7 +691,7 @@ class CompressedTensorsConfig(QuantizationConfig):
     def get_scheme(
         self, layer: torch.nn.Module, layer_name: str | None = None
-    ) -> Optional["CompressedTensorsScheme"]:
+    ) -> "CompressedTensorsScheme | None":
         """
         compressed-tensors supports non uniform in the following way:
 
diff --git a/vllm/model_executor/layers/quantization/cpu_wna16.py b/vllm/model_executor/layers/quantization/cpu_wna16.py
index dd985c2ce..406b86ab2 100644
--- a/vllm/model_executor/layers/quantization/cpu_wna16.py
+++ b/vllm/model_executor/layers/quantization/cpu_wna16.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
@@ -105,7 +105,7 @@ class CPUAWQConfig(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional["QuantizationMethods"]:
+    ) -> "QuantizationMethods | None":
         quant_method = hf_quant_cfg.get("quant_method", "").lower()
         if current_platform.is_cpu() and (quant_method == "awq"):
             return cls.get_name()
@@ -113,7 +113,7 @@ class CPUAWQConfig(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if isinstance(layer, LinearBase) or (
             isinstance(layer, ParallelLMHead) and self.lm_head_quantized
         ):
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 974c45614..5a0bb5d30 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -52,7 +52,7 @@ class ExpertsInt8Config(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if isinstance(layer, LinearBase):
             return UnquantizedLinearMethod()
         elif isinstance(layer, FusedMoE):
diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
index 45d2e4e33..03a2d786a 100644
--- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py
+++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch.nn import Module
@@ -78,7 +78,7 @@ class FBGEMMFp8Config(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if isinstance(layer, LinearBase):
             if is_layer_skipped(
                 prefix=prefix,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index b2c2a0cff..779f9a0d7 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch.nn import Module
@@ -182,7 +182,7 @@ class Fp8Config(QuantizationConfig):
     def get_xpu_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         from vllm.model_executor.layers.quantization.ipex_quant import (
             XPUFp8LinearMethod,
             XPUFp8MoEMethod,
@@ -218,7 +218,7 @@ class Fp8Config(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if current_platform.is_xpu():
             return self.get_xpu_quant_method(layer, prefix)
         if isinstance(layer, LinearBase):
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index b40aebaa5..ce84d2521 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -3,7 +3,7 @@
 
 from collections.abc import Mapping
 from types import MappingProxyType
-from typing import Any, Optional
+from typing import Any
 
 import gguf
 import torch
@@ -77,7 +77,7 @@ class GGUFConfig(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if isinstance(layer, LinearBase):
             if is_layer_skipped_gguf(
                 prefix, self.unquantized_modules, self.packed_modules_mapping
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 45c71f366..698855c09 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import deepcopy
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
@@ -240,7 +240,7 @@ class GPTQMarlinConfig(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if isinstance(layer, FusedMoE):
             from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config
 
diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py
index 63392b763..f68fd9578 100644
--- a/vllm/model_executor/layers/quantization/inc.py
+++ b/vllm/model_executor/layers/quantization/inc.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from fractions import Fraction
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import regex as re
 import torch
@@ -456,7 +456,7 @@ class INCConfig(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional["QuantizationMethods"]:
+    ) -> "QuantizationMethods | None":
         """Override the `auto-round` method to `inc`."""
         is_auto_round_format = hf_quant_cfg.get("quant_method", None) == "auto-round"
         if is_auto_round_format:
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index 119fb2ef8..f957b3991 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from packaging import version
@@ -144,7 +144,7 @@ class IPEXConfig(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["LinearMethodBase"]:
+    ) -> "LinearMethodBase | None":
         if isinstance(layer, LinearBase):
             if self.method == "awq":
                 if is_layer_skipped(
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 2046561fb..9a9480ffe 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from fnmatch import fnmatch
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch.nn import Module
@@ -181,7 +181,7 @@ class ModelOptQuantConfigBase(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         # handle kv-cache first so we can focus only on weight quantization thereafter
         if isinstance(layer, Attention):
             return self.KVCacheMethodCls(self)
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 38340404d..34628591f 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -163,7 +163,7 @@ class MoeWNA16Config(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
             if isinstance(layer, FusedMoE):
                 return UnquantizedFusedMoEMethod(layer.moe_config)
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index b966f007c..a50fa4bee 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from enum import Enum
-from typing import Optional
 
 import torch
 from torch.nn.parameter import Parameter
@@ -197,7 +196,7 @@ class Mxfp4Config(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if isinstance(layer, LinearBase):
             if self.ignored_layers and is_layer_skipped(
                 prefix=prefix,
diff --git a/vllm/model_executor/layers/quantization/petit.py b/vllm/model_executor/layers/quantization/petit.py
index e97fac80f..71bd9c80c 100644
--- a/vllm/model_executor/layers/quantization/petit.py
+++ b/vllm/model_executor/layers/quantization/petit.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
 
-from typing import Any, Optional
+from typing import Any
 
 import regex as re
 import torch
@@ -159,7 +159,7 @@ class PetitNvFp4Config(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         exclude = self.require_exclude_modules()
 
         if isinstance(layer, LinearBase):
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
index 1f433e07e..7ae732513 100644
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch.nn.parameter import Parameter
@@ -67,7 +67,7 @@ class PTPCFp8Config(Fp8Config):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if isinstance(layer, LinearBase):
             if is_layer_skipped(prefix, self.ignored_layers):
                 return UnquantizedLinearMethod()
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index 8fd7b875f..dd6db7193 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import fnmatch
-from typing import TYPE_CHECKING, Any, Optional, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import torch
 
@@ -102,7 +102,7 @@ class QuarkConfig(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         # Check if the layer is skipped for quantization.
         exclude_layers = cast(list[str], self.quant_config.get("exclude"))
         if should_ignore_layer(
diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py
index 2783895dd..f195efbbc 100644
--- a/vllm/model_executor/layers/quantization/torchao.py
+++ b/vllm/model_executor/layers/quantization/torchao.py
@@ -4,7 +4,7 @@
 import importlib
 import json
 import types
 from importlib.util import find_spec
-from typing import Any, Optional
+from typing import Any
 
 import regex as re
 import torch
@@ -209,7 +209,7 @@ class TorchAOConfig(QuantizationConfig):
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> "QuantizeMethodBase | None":
         if not isinstance(layer, LinearBase):
             return None
 
diff --git a/vllm/model_executor/layers/quantization/utils/petit_utils.py b/vllm/model_executor/layers/quantization/utils/petit_utils.py
index 0df9748b0..2bed68c1b 100644
--- a/vllm/model_executor/layers/quantization/utils/petit_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/petit_utils.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
     from types import ModuleType
 
 # 1. Create a global variable as a placeholder for the module
-_petit_kernel: Optional["ModuleType"] = None
+_petit_kernel: "ModuleType | None" = None
 
 _PETIT_INSTALL_MSG = (
     "Petit is not installed. Please install it with `pip install petit-kernel`."
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 5160d4842..6e8aee8bc 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -12,7 +12,7 @@
 import threading
 import time
 from collections.abc import Generator, MutableMapping
 from dataclasses import asdict, dataclass, field, fields
-from typing import TYPE_CHECKING, Any, ClassVar, Optional
+from typing import TYPE_CHECKING, Any, ClassVar
 
 import regex as re
 import torch
@@ -323,7 +323,7 @@ class TensorizerConfig(MutableMapping):
             " is unstable and may lead to errors."
         )
 
-    def open_stream(self, tensorizer_args: Optional["TensorizerArgs"] = None):
+    def open_stream(self, tensorizer_args: "TensorizerArgs | None" = None):
         if tensorizer_args is None:
             tensorizer_args = self._construct_tensorizer_args()
 
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index a5da362cd..e50771e99 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -11,7 +11,6 @@ from typing import (
     TYPE_CHECKING,
     Any,
     Literal,
-    Optional,
     TypeAlias,
     TypedDict,
     Union,
@@ -186,7 +185,7 @@ class PlaceholderRange:
     length: int
     """The length of the placeholder."""
 
-    is_embed: Optional["torch.Tensor"] = None
+    is_embed: "torch.Tensor | None" = None
     """
     A boolean mask of shape `(length,)` indicating which positions
     between `offset` and `offset + length` to assign embeddings to.
@@ -341,7 +340,7 @@ class MultiModalFeatureSpec:
     `MultiModalFeatureSpec` per item.
     """
 
-    data: Optional["MultiModalKwargsItem"]
+    data: "MultiModalKwargsItem | None"
     """
     Represents multimodal data for this feature.
 
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 725bf6506..dbbd0876e 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -7,7 +7,7 @@ pynvml. However, it should not initialize cuda context.
 
 import os
 from collections.abc import Callable
 from functools import cache, wraps
-from typing import TYPE_CHECKING, Optional, TypeVar
+from typing import TYPE_CHECKING, TypeVar
 
 import torch
 from typing_extensions import ParamSpec
@@ -382,7 +382,7 @@ class CudaPlatformBase(Platform):
         cls,
         head_size: int,
         dtype: torch.dtype,
-        backend: Optional["AttentionBackendEnum"] = None,
+        backend: "AttentionBackendEnum | None" = None,
     ) -> "AttentionBackendEnum":
         if backend is not None:
             assert backend in cls.get_supported_vit_attn_backends(), (
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 8d36d6d52..8c4a97874 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -7,7 +7,7 @@
 import platform
 import random
 import sys
 from datetime import timedelta
-from typing import TYPE_CHECKING, Any, NamedTuple, Optional
+from typing import TYPE_CHECKING, Any, NamedTuple
 
 import numpy as np
 import torch
@@ -243,7 +243,7 @@ class Platform:
         cls,
         head_size: int,
         dtype: torch.dtype,
-        backend: Optional["AttentionBackendEnum"] = None,
+        backend: "AttentionBackendEnum | None" = None,
     ) -> "AttentionBackendEnum":
         """
         Get the vision attention backend class of a device.
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 49225fc2e..6f4c235bb 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -3,7 +3,7 @@
 
 import os
 from functools import cache, lru_cache, wraps
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -356,7 +356,7 @@ class RocmPlatform(Platform):
         cls,
         head_size: int,
         dtype: torch.dtype,
-        backend: Optional["AttentionBackendEnum"] = None,
+        backend: "AttentionBackendEnum | None" = None,
     ) -> "AttentionBackendEnum":
         if backend is not None:
             assert backend in cls.get_supported_vit_attn_backends(), (
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index b2d7bf38d..439d21cb8 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -3,7 +3,7 @@
 
 import contextlib
 import os
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -88,7 +88,7 @@ class XPUPlatform(Platform):
         cls,
         head_size: int,
         dtype: torch.dtype,
-        backend: Optional["AttentionBackendEnum"] = None,
+        backend: "AttentionBackendEnum | None" = None,
     ) -> "AttentionBackendEnum":
         if backend is not None:
             assert backend in cls.get_supported_vit_attn_backends(), (
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index cc750afd8..02e36eda4 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import deepcopy
-from typing import Annotated, Any, Optional
+from typing import Annotated, Any
 
 import msgspec
 
@@ -80,7 +80,7 @@ class PoolingParams(
         return deepcopy(self)
 
     def verify(
-        self, task: PoolingTask, model_config: Optional["ModelConfig"] = None
+        self, task: PoolingTask, model_config: "ModelConfig | None" = None
     ) -> None:
         if self.task is None:
             self.task = task
@@ -106,7 +106,7 @@ class PoolingParams(
         self._verify_valid_parameters()
 
     def _merge_default_parameters(
-        self, model_config: Optional["ModelConfig"] = None
+        self, model_config: "ModelConfig | None" = None
     ) -> None:
         if model_config is None:
             return
@@ -160,7 +160,7 @@ class PoolingParams(
             if getattr(self, k, None) is None:
                 setattr(self, k, getattr(pooler_config, k))
 
-    def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
+    def _set_default_parameters(self, model_config: "ModelConfig | None"):
         if self.task in ["embed", "token_embed"]:
             if self.use_activation is None:
                 self.use_activation = True
diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py
index 829b63d8a..6b4348b96 100644
--- a/vllm/profiler/layerwise_profile.py
+++ b/vllm/profiler/layerwise_profile.py
@@ -5,7 +5,7 @@
 import copy
 from collections import defaultdict
 from collections.abc import Callable
 from dataclasses import asdict, dataclass, field
-from typing import Any, Optional, TypeAlias
+from typing import Any, TypeAlias
 
 from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult
 from torch._C._profiler import _EventType, _ExperimentalConfig, _ProfilerEvent
@@ -31,7 +31,7 @@ except ImportError:
 @dataclass
 class _ModuleTreeNode:
     event: _ProfilerEvent
-    parent: Optional["_ModuleTreeNode"] = None
+    parent: "_ModuleTreeNode | None" = None
     children: list["_ModuleTreeNode"] = field(default_factory=list)
     trace: str = ""
 
diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py
index a5a3af653..7dfee9fab 100644
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import fnmatch
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from vllm.utils.import_utils import PlaceholderModule
 
@@ -32,7 +32,7 @@ def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
 
 def glob(
-    s3: Optional["BaseClient"] = None,
+    s3: "BaseClient | None" = None,
     path: str = "",
     allow_pattern: list[str] | None = None,
 ) -> list[str]:
diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py
index 2f77e3c03..9a7eb82c0 100644
--- a/vllm/v1/attention/backends/mla/flashmla_sparse.py
+++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, ClassVar, Optional
+from typing import TYPE_CHECKING, ClassVar
 
 import numpy as np
 import torch
@@ -707,7 +707,7 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
         kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
         topk_indice_buffer: torch.Tensor | None = None,
-        indexer: Optional["Indexer"] = None,
+        indexer: "Indexer | None" = None,
         **mla_args,
     ) -> None:
         super().__init__(
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
index 08a7336b5..263d9cdec 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, ClassVar, Optional
+from typing import TYPE_CHECKING, ClassVar
 
 import numpy as np
 import torch
@@ -284,7 +284,7 @@ class ROCMAiterMLASparseImpl(MLACommonBaseImpl[ROCMAiterMLASparseMetadata]):
         kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
         topk_indice_buffer: torch.Tensor | None = None,
-        indexer: Optional["Indexer"] = None,
+        indexer: "Indexer | None" = None,
         **mla_args,
     ) -> None:
         super().__init__(
diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py
index c9c85ddc7..48082b3a9 100644
--- a/vllm/v1/attention/backends/tree_attn.py
+++ b/vllm/v1/attention/backends/tree_attn.py
@@ -4,7 +4,7 @@
 
 import ast
 from dataclasses import dataclass
-from typing import ClassVar, Optional
+from typing import ClassVar
 
 import torch
 
@@ -87,11 +87,11 @@ class TreeAttentionMetadata:
     tree_attn_bias: torch.Tensor | None = None
 
     # Cached Prefill/decode metadata.
-    _cached_prefill_metadata: Optional["TreeAttentionMetadata"] = None
-    _cached_decode_metadata: Optional["TreeAttentionMetadata"] = None
+    _cached_prefill_metadata: "TreeAttentionMetadata | None" = None
+    _cached_decode_metadata: "TreeAttentionMetadata | None" = None
 
     @property
-    def prefill_metadata(self) -> Optional["TreeAttentionMetadata"]:
+    def prefill_metadata(self) -> "TreeAttentionMetadata | None":
         if self.num_prefills == 0:
             return None
 
@@ -116,7 +116,7 @@ class TreeAttentionMetadata:
         return self._cached_prefill_metadata
 
     @property
-    def decode_metadata(self) -> Optional["TreeAttentionMetadata"]:
+    def decode_metadata(self) -> "TreeAttentionMetadata | None":
         if self.num_decode_tokens == 0:
             return None
 
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
index 92d8d9292..8c8563d13 100644
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 
@@ -189,7 +189,7 @@ class SchedulerInterface(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def make_stats(self) -> Optional["SchedulerStats"]:
+    def make_stats(self) -> "SchedulerStats | None":
         """Make a SchedulerStats object for logging.
 
         The SchedulerStats object is created for every scheduling step.
@@ -201,5 +201,5 @@ class SchedulerInterface(ABC):
         """Shutdown the scheduler."""
         raise NotImplementedError
 
-    def get_kv_connector(self) -> Optional["KVConnectorBase_V1"]:
+    def get_kv_connector(self) -> "KVConnectorBase_V1 | None":
         return None
diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py
index b7761970b..8eb6fa057 100644
--- a/vllm/v1/engine/parallel_sampling.py
+++ b/vllm/v1/engine/parallel_sampling.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import copy
-from typing import Optional, cast
+from typing import cast
 
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import RequestOutputKind, SamplingParams
@@ -133,7 +133,7 @@ class ParentRequest:
 
     @staticmethod
     def observe_finished_request(
-        parent_req: Optional["ParentRequest"],
+        parent_req: "ParentRequest | None",
         iteration_stats: IterationStats,
         num_generation_tokens: int,
     ):
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index b963fea43..117478a92 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -7,7 +7,7 @@
 from collections import deque
 from collections.abc import Callable, Mapping
 from dataclasses import dataclass
 from functools import partial
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 
@@ -68,7 +68,7 @@ class Request:
         arrival_time: float | None = None,
         prompt_embeds: torch.Tensor | None = None,
         mm_features: list[MultiModalFeatureSpec] | None = None,
-        lora_request: Optional["LoRARequest"] = None,
+        lora_request: "LoRARequest | None" = None,
         cache_salt: str | None = None,
         priority: int = 0,
         trace_headers: Mapping[str, str] | None = None,
diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py
index 0cbfb1878..3e426e321 100644
--- a/vllm/v1/sample/logits_processor/interface.py
+++ b/vllm/v1/sample/logits_processor/interface.py
@@ -4,7 +4,7 @@
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -94,7 +94,7 @@ class LogitsProcessor(ABC):
     @abstractmethod
     def update_state(
         self,
-        batch_update: Optional["BatchUpdate"],
+        batch_update: "BatchUpdate | None",
    ) -> None:
         """Called when there are new output tokens, prior to each forward pass.
 
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 75ad304dd..3d065927e 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -14,7 +14,6 @@ from typing import (
     TYPE_CHECKING,
     Any,
     Generic,
-    Optional,
     TypeVar,
     Union,
     overload,
@@ -229,7 +228,7 @@ def wait_for_completion_or_failure(
     api_server_manager: APIServerProcessManager,
     engine_manager: Union["CoreEngineProcManager", "CoreEngineActorManager"] | None = None,
-    coordinator: Optional["DPCoordinator"] = None,
+    coordinator: "DPCoordinator | None" = None,
 ) -> None:
     """Wait for all processes to complete or detect if any fail.
 
diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py
index e7a947f2e..6d06ba213 100644
--- a/vllm/v1/worker/ubatching.py
+++ b/vllm/v1/worker/ubatching.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
-from typing import Optional
 
 import torch
 
@@ -15,7 +14,7 @@ logger = init_logger(__name__)
 _THREAD_ID_TO_CONTEXT: dict = {}
 # Here we hardcode the number of microbatches to 2 for default.
 _NUM_UBATCHES: int = 2
-_CURRENT_CONTEXTS: list[Optional["UBatchContext"]] = []
+_CURRENT_CONTEXTS: list["UBatchContext | None"] = []
 
 
 class UBatchContext:
diff --git a/vllm/v1/worker/workspace.py b/vllm/v1/worker/workspace.py
index bbbd7705d..ef32a32f6 100644
--- a/vllm/v1/worker/workspace.py
+++ b/vllm/v1/worker/workspace.py
@@ -5,7 +5,6 @@
 import inspect
 import os
 from itertools import accumulate
 from math import prod
-from typing import Optional
 
 import torch
 
@@ -26,7 +25,7 @@ _MB = 1024**2
 _GiB = 1024**3
 
 # Global workspace manager instance
-_manager: Optional["WorkspaceManager"] = None
+_manager: "WorkspaceManager | None" = None
 
 
 class WorkspaceManager:
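
Reviewer note (not part of the diff): every hunk above makes the same mechanical change, replacing `typing.Optional["X"]` with the PEP 604 spelling `"X | None"` and dropping the now-unused `Optional` import. The quotes remain because the referenced names are typically imported only under `TYPE_CHECKING`, so the annotation must stay a forward reference. The subtlety the new spelling requires is that the *entire* union is quoted: `Optional["X"]` worked because subscripting merely stores the string, but `"X" | None` would be evaluated when the `def` or class body executes (on Pythons without deferred annotations) and raise a `TypeError`, since a `str` instance has no `|` operator. A minimal sketch of the pattern; `mylib.widgets` and `Widget` are hypothetical stand-ins, not names from this diff:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Seen by type checkers only; this import never runs at runtime.
        from mylib.widgets import Widget  # hypothetical module and class

    # Old spelling (required `from typing import Optional`):
    #     def clone(w: Optional["Widget"] = None) -> Optional["Widget"]: ...

    # New spelling: quote the whole union so it stays a lazily evaluated
    # string. Writing "Widget" | None instead would fail at import time.
    def clone(w: "Widget | None" = None) -> "Widget | None":
        ...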