Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
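The reformat is purely mechanical, so behaviour is unchanged; for example, the repeated rewrite of "(x.shape[:-1] + (d, ))" to "x.shape[:-1] + (d,)" in the hunks below only changes spelling, not the resulting tuple. A minimal sanity check (the tensor shape here is an arbitrary illustration, not taken from the diff):

import torch

x = torch.randn(4, 16)  # any (num_tokens, 2 * d) activation input
d = x.shape[-1] // 2

# yapf spelling (before) and ruff spelling (after) build the same shape tuple.
assert (x.shape[:-1] + (d, )) == x.shape[:-1] + (d,) == (4, 8)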
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom activation functions."""

import math
from typing import Optional

@@ -8,8 +9,11 @@ import torch
import torch.nn as nn
import torch.nn.functional as F

from vllm.distributed import (divide, get_tensor_model_parallel_rank,
                              get_tensor_model_parallel_world_size)
from vllm.distributed import (
    divide,
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
)
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.utils import set_weight_attrs
@@ -32,7 +36,7 @@ class FatreluAndMul(CustomOp):
        return: (num_tokens, d) or (batch_size, seq_len, d)
    """

    def __init__(self, threshold: float = 0.):
    def __init__(self, threshold: float = 0.0):
        super().__init__()
        self.threshold = threshold
        if current_platform.is_cuda_alike():
@@ -49,7 +53,7 @@ class FatreluAndMul(CustomOp):

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x, self.threshold)
        return out
@@ -72,6 +76,7 @@ class SiluAndMul(CustomOp):
            self.op = torch.ops._C.silu_and_mul
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            self.op = ipex_ops.silu_and_mul
        elif current_platform.is_cpu():
            self._forward_method = self.forward_native
@@ -83,14 +88,14 @@ class SiluAndMul(CustomOp):

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out
@@ -113,6 +118,7 @@ class MulAndSilu(CustomOp):
            self.op = torch.ops._C.mul_and_silu
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            self.op = ipex_ops.silu_and_mul
        elif current_platform.is_cpu():
            self._forward_method = self.forward_native
@@ -124,7 +130,7 @@ class MulAndSilu(CustomOp):

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out
@@ -156,10 +162,8 @@ class GeluAndMulSparse(CustomOp):

        # Sparsity.
        if activation_sparsity == 0.0:
            raise ValueError(
                "activation_sparsity is 0.0. Please use GeluAndMul.")
        target_sparsity_tensor = torch.tensor(activation_sparsity,
                                              dtype=torch.float32)
            raise ValueError("activation_sparsity is 0.0. Please use GeluAndMul.")
        target_sparsity_tensor = torch.tensor(activation_sparsity, dtype=torch.float32)
        normal_dist = torch.distributions.normal.Normal(0, 1)
        self.std_multiplier = normal_dist.icdf(target_sparsity_tensor)

@@ -207,6 +211,7 @@ class GeluAndMul(CustomOp):
                self.op = torch.ops._C.gelu_tanh_and_mul
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            if approximate == "none":
                self.op = ipex_ops.gelu_and_mul
            else:
@@ -219,20 +224,20 @@ class GeluAndMul(CustomOp):

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        self.op(out, x)
        return out

    def extra_repr(self) -> str:
        return f'approximate={repr(self.approximate)}'
        return f"approximate={repr(self.approximate)}"


@CustomOp.register("swigluoai_and_mul")
@@ -255,7 +260,7 @@ class SwigluOAIAndMul(CustomOp):

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        output_shape = x.shape[:-1] + (d,)
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
        torch.ops._C.swigluoai_and_mul(out, x, self.alpha, self.limit)
        return out
@@ -266,20 +271,19 @@ class SwigluOAIAndMul(CustomOp):

@CustomOp.register("gelu_new")
class NewGELU(CustomOp):

    def __init__(self):
        super().__init__()
        if current_platform.is_cuda_alike() or current_platform.is_cpu():
            self.op = torch.ops._C.gelu_new
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            self.op = ipex_ops.gelu_new

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        c = math.sqrt(2.0 / math.pi)
        return 0.5 * x * (1.0 + torch.tanh(c *
                                           (x + 0.044715 * torch.pow(x, 3.0))))
        return 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * torch.pow(x, 3.0))))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
@@ -292,19 +296,18 @@ class NewGELU(CustomOp):

@CustomOp.register("gelu_fast")
class FastGELU(CustomOp):

    def __init__(self):
        super().__init__()
        if current_platform.is_cuda_alike() or current_platform.is_cpu():
            self.op = torch.ops._C.gelu_fast
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            self.op = ipex_ops.gelu_fast

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        """PyTorch-native implementation equivalent to forward()."""
        return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
                                           (1.0 + 0.044715 * x * x)))
        return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
@@ -324,6 +327,7 @@ class QuickGELU(CustomOp):
            self.op = torch.ops._C.gelu_quick
        elif current_platform.is_xpu():
            from vllm._ipex_ops import ipex_ops

            self.op = ipex_ops.gelu_quick

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -355,7 +359,7 @@ class ReLUSquaredActivation(CustomOp):
        return torch.square(F.relu(x))

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        #TODO : implement cuda kernels
        # TODO : implement cuda kernels
        return self.forward_native(x)


@@ -378,12 +382,15 @@ class XIELU(CustomOp):
    ):
        super().__init__()
        self.alpha_p = nn.Parameter(
            torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) -
                      1).unsqueeze(0))
            torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) - 1).unsqueeze(
                0
            )
        )
        self.alpha_n = nn.Parameter(
            torch.log(
                torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) -
                1).unsqueeze(0))
                torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) - 1
            ).unsqueeze(0)
        )
        self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
        self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
        self.with_vector_loads = with_vector_loads
@@ -403,8 +410,10 @@ class XIELU(CustomOp):
                    self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda)
                    msg += " Enabled torch._dynamo for xIELU CUDA."
                except Exception as err:
                    msg += (f" Could not enable torch._dynamo for xIELU ({err}) - "
                            "this may result in slower performance.")
                    msg += (
                        f" Could not enable torch._dynamo for xIELU ({err}) - "
                        "this may result in slower performance."
                    )
                    self._xielu_cuda_fn = self._xielu_cuda
                logger.warning_once(msg)
            except Exception as err:
@@ -421,14 +430,12 @@ class XIELU(CustomOp):
        return torch.where(
            x > 0,
            alpha_p * x * x + self.beta * x,
            (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n +
            self.beta * x,
            (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + self.beta * x,
        )

    def _xielu_cuda(self, x: torch.Tensor) -> torch.Tensor:
        """Firewall function to prevent torch.compile from seeing .item()"""
        assert self._xielu_cuda_obj is not None, (
            "XIELU CUDA object must not be None")
        assert self._xielu_cuda_obj is not None, "XIELU CUDA object must not be None"
        original_shape = x.shape
        # CUDA kernel expects 3D tensors, reshape if needed
        while x.dim() < 3:
@@ -486,14 +493,14 @@ class ScaledActivation(nn.Module):
        self.input_is_parallel = input_is_parallel
        if input_is_parallel:
            tp_size = get_tensor_model_parallel_world_size()
            intermediate_size_per_partition = divide(intermediate_size,
                                                     tp_size)
            intermediate_size_per_partition = divide(intermediate_size, tp_size)
        else:
            intermediate_size_per_partition = intermediate_size
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.scales = nn.Parameter(
            torch.empty(intermediate_size_per_partition, dtype=params_dtype))
            torch.empty(intermediate_size_per_partition, dtype=params_dtype)
        )
        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})

    def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -510,30 +517,21 @@ class ScaledActivation(nn.Module):
        param_data.copy_(loaded_weight)


_ACTIVATION_REGISTRY = LazyDict({
    "gelu":
    lambda: nn.GELU(),
    "gelu_fast":
    lambda: FastGELU(),
    "gelu_new":
    lambda: NewGELU(),
    "gelu_pytorch_tanh":
    lambda: nn.GELU(approximate="tanh"),
    "relu":
    lambda: nn.ReLU(),
    "relu2":
    lambda: ReLUSquaredActivation(),
    "silu":
    lambda: nn.SiLU(),
    "quick_gelu":
    lambda: QuickGELU(),
    "tanh":
    lambda: nn.Tanh(),
    "sigmoid":
    lambda: nn.Sigmoid(),
    "xielu":
    lambda: XIELU(),
})
_ACTIVATION_REGISTRY = LazyDict(
    {
        "gelu": lambda: nn.GELU(),
        "gelu_fast": lambda: FastGELU(),
        "gelu_new": lambda: NewGELU(),
        "gelu_pytorch_tanh": lambda: nn.GELU(approximate="tanh"),
        "relu": lambda: nn.ReLU(),
        "relu2": lambda: ReLUSquaredActivation(),
        "silu": lambda: nn.SiLU(),
        "quick_gelu": lambda: QuickGELU(),
        "tanh": lambda: nn.Tanh(),
        "sigmoid": lambda: nn.Sigmoid(),
        "xielu": lambda: XIELU(),
    }
)


def get_act_fn(act_fn_name: str) -> nn.Module:
@@ -547,29 +545,25 @@ def get_act_fn(act_fn_name: str) -> nn.Module:
        act_fn_name = activation_name

    if act_fn_name not in _ACTIVATION_REGISTRY:
        raise ValueError(
            f"Activation function {act_fn_name!r} is not supported.")
        raise ValueError(f"Activation function {act_fn_name!r} is not supported.")

    return _ACTIVATION_REGISTRY[act_fn_name]


_ACTIVATION_AND_MUL_REGISTRY = LazyDict({
    "gelu":
    lambda: GeluAndMul(),
    "silu":
    lambda: SiluAndMul(),
    "geglu":
    lambda: GeluAndMul(),
    "swigluoai":
    lambda *args, **kwargs: SwigluOAIAndMul(*args, **kwargs),
})
_ACTIVATION_AND_MUL_REGISTRY = LazyDict(
    {
        "gelu": lambda: GeluAndMul(),
        "silu": lambda: SiluAndMul(),
        "geglu": lambda: GeluAndMul(),
        "swigluoai": lambda *args, **kwargs: SwigluOAIAndMul(*args, **kwargs),
    }
)


def get_act_and_mul_fn(act_fn_name: str) -> nn.Module:
    """Get an activation-and-mul (i.e. SiluAndMul) function by name."""
    act_fn_name = act_fn_name.lower()
    if act_fn_name not in _ACTIVATION_AND_MUL_REGISTRY:
        raise ValueError(
            f"Activation function {act_fn_name!r} is not supported.")
        raise ValueError(f"Activation function {act_fn_name!r} is not supported.")

    return _ACTIVATION_AND_MUL_REGISTRY[act_fn_name]
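For context, the reformatted registries keep the same lookup API. A minimal usage sketch, assuming vLLM is installed and that the file shown here is vllm/model_executor/layers/activation.py (the module path is an assumption, not stated in the diff):

import torch

from vllm.model_executor.layers.activation import get_act_and_mul_fn, get_act_fn

act = get_act_fn("gelu_new")        # plain activation module (NewGELU)
gated = get_act_and_mul_fn("silu")  # fused gate-and-multiply module (SiluAndMul)

x = torch.randn(4, 2 * 128)  # the *AndMul ops expect 2 * d features in the last dim
print(act(torch.randn(4, 128)).shape)  # torch.Size([4, 128])
print(gated(x).shape)                  # torch.Size([4, 128])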