Update deprecated type hinting in model_executor/layers (#18056)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-13 12:17:23 +01:00
parent 906f0598fc
commit 6223dd8114
87 changed files with 523 additions and 523 deletions
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
@@ -2,7 +2,7 @@

 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Callable, Optional, Tuple
+from typing import Callable, Optional

 import torch

@@ -12,8 +12,8 @@ from vllm.scalar_type import ScalarType

@dataclass
 class MPLinearLayerConfig:
-    full_weight_shape: Tuple[int, int]  # [in, out]
-    partition_weight_shape: Tuple[int, int]
+    full_weight_shape: tuple[int, int]  # [in, out]
+    partition_weight_shape: tuple[int, int]
    weight_type: ScalarType
    act_type: torch.dtype
    group_size: int
@@ -31,7 +31,7 @@ class MPLinearKernel(ABC):
    @classmethod
    @abstractmethod
    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
        raise NotImplementedError

    def __init__(self,
@@ -75,7 +75,7 @@ class MPLinearKernel(ABC):
                torch.nn.Parameter(new_param.data, requires_grad=False))

    def _get_weight_params(
-            self, layer: torch.nn.Module) -> Tuple[
+            self, layer: torch.nn.Module) -> tuple[
                torch.Tensor,  # w_q
                torch.Tensor,  # w_s
                Optional[torch.Tensor],  # w_zp, 
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/init.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/init.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

-from typing import List, Optional, Type
+from typing import Optional

 import vllm.envs as envs
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.allspark import (  # noqa: E501
@@ -18,7 +18,7 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision.MPLinearKer
 from vllm.platforms import current_platform

 # in priority/performance order (when available)
-_POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [
+_POSSIBLE_KERNELS: list[type[MPLinearKernel]] = [
    MacheteLinearKernel,
    AllSparkLinearKernel,
    MarlinLinearKernel,
@@ -29,7 +29,7 @@ _POSSIBLE_KERNELS: List[Type[MPLinearKernel]] = [

 def choose_mp_linear_kernel(
        config: MPLinearLayerConfig,
-        compute_capability: Optional[int] = None) -> Type[MPLinearKernel]:
+        compute_capability: Optional[int] = None) -> type[MPLinearKernel]:
    """
    Choose an MPLinearKernel that can implement the given config for the given
     compute capability. Attempts to choose the best kernel in terms of 
@@ -46,7 +46,7 @@ def choose_mp_linear_kernel(
        ValueError: If no kernel can implement the given config.

    Returns:
-        Type[MPLinearKernel]: Chosen kernel.
+        type[MPLinearKernel]: Chosen kernel.
    """
    if compute_capability is None:
        if current_platform is None:
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -22,7 +22,7 @@ class AllSparkLinearKernel(MPLinearKernel):

    @classmethod
    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
        if c.has_g_idx:
            return False, "Act reordering currently not supported by AllSpark"

--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

-from typing import Dict, List, Optional, Tuple
+from typing import Optional

 import torch

@@ -21,10 +21,10 @@ logger = init_logger(__name__)

 class BitBLASLinearKernel(MPLinearKernel):

-    OPT_FEATURES: List[int] = BITBLAS_OPTIMIZE_FEATURES
+    OPT_FEATURES: list[int] = BITBLAS_OPTIMIZE_FEATURES
    ENABLE_TUNING: bool = True
    MATMUL_LAYOUT: str = "nt"
-    BITBLAS_DTYPES: Dict[torch.dtype, str] = {
+    BITBLAS_DTYPES: dict[torch.dtype, str] = {
        torch.float32: "float32",
        torch.float16: "float16",
        torch.bfloat16: "bfloat16",
@@ -103,7 +103,7 @@ class BitBLASLinearKernel(MPLinearKernel):

    @classmethod
    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:

        is_bitblas_installed = True

--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -25,7 +25,7 @@ class ExllamaLinearKernel(MPLinearKernel):

    @classmethod
    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
        if c.has_g_idx and\
            c.partition_weight_shape[0] != c.full_weight_shape[0]:
            return False, "Act reordering currently not supported by Exllama, "\
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0

 from functools import partial
-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -25,7 +25,7 @@ class MacheteLinearKernel(MPLinearKernel):

    @classmethod
    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:

        if c.has_g_idx and\
            c.partition_weight_shape[0] != c.full_weight_shape[0]:
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -24,7 +24,7 @@ class MarlinLinearKernel(MPLinearKernel):

    @classmethod
    def can_implement(cls,
-                      c: MPLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:

        quant_types = query_marlin_supported_quant_types(c.zero_points)
        if c.weight_type not in quant_types:
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@@ -2,7 +2,7 @@

 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -24,7 +24,7 @@ class ScaledMMLinearKernel(ABC):
    @classmethod
    @abstractmethod
    def can_implement(
-            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
        raise NotImplementedError

    def __init__(self, c: ScaledMMLinearLayerConfig, w_q_param_name: str,
@@ -50,7 +50,7 @@ class ScaledMMLinearKernel(ABC):
        raise NotImplementedError

    def _get_weight_params(
-            self, layer: torch.nn.Module) -> Tuple[
+            self, layer: torch.nn.Module) -> tuple[
                torch.Tensor,  # weight
                torch.Tensor,  # weight_scale
                Optional[torch.Tensor],  # input_scale, 
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/init.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/init.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import os
-from typing import Dict, List, Optional, Type
+from typing import Optional

 from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import (
    AiterScaledMMLinearKernel)
@@ -16,7 +16,7 @@ from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import (
 from vllm.platforms import PlatformEnum, current_platform

 # in priority/performance order (when available)
-_POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = {
+_POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = {
    PlatformEnum.CPU: [CutlassScaledMMLinearKernel],
    PlatformEnum.CUDA: [CutlassScaledMMLinearKernel],
    PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel],
@@ -27,7 +27,7 @@ _POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = {
 def choose_scaled_mm_linear_kernel(
        config: ScaledMMLinearLayerConfig,
        compute_capability: Optional[int] = None
-) -> Type[ScaledMMLinearKernel]:
+) -> type[ScaledMMLinearKernel]:
    """
    Choose an ScaledMMLinearKernel that can implement the given config for the 
    given compute capability. Attempts to choose the best kernel in terms of 
@@ -44,7 +44,7 @@ def choose_scaled_mm_linear_kernel(
        ValueError: If no kernel can implement the given config.

    Returns:
-        Type[ScaledMMLinearKernel]: Chosen kernel.
+        type[ScaledMMLinearKernel]: Chosen kernel.
    """

    if compute_capability is None:
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -20,7 +20,7 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):

    @classmethod
    def can_implement(
-            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
        if not current_platform.is_rocm():
            return (
                False,
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -22,7 +22,7 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):

    @classmethod
    def can_implement(
-            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:

        if (not current_platform.is_cuda() and not current_platform.is_cpu()):
            return False, "CutlassScaledMM requires running on CUDA or CPU."
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

-from typing import Optional, Tuple
+from typing import Optional

 import torch

@@ -18,7 +18,7 @@ class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):

    @classmethod
    def can_implement(
-            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
        if current_platform.is_cpu():
            return (
                False,
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import warnings
-from typing import Optional, Tuple
+from typing import Optional

 import torch
 from functorch.experimental.control_flow import cond  # noqa: F401
@@ -25,7 +25,7 @@ class XLAScaledMMLinearKernel(ScaledMMLinearKernel):

    @classmethod
    def can_implement(
-            cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]:
+            cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:

        if not current_platform.is_tpu():
            return False, "ScaledMMXLA requires running on TPU."