Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
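The hunks below are formatting-only: ruff's formatter (black-style layout, 88-character lines, parenthesized assert messages) replaces yapf's parenthesis-aligned continuation lines, and ruff's import-sorting rules take over from isort. As a rough illustration of the pattern — a minimal, self-contained sketch, not code taken from this commit:

import torch

# yapf-era style: continuation arguments aligned under the opening parenthesis.
attention_mask = torch.zeros([1, 128, 128],
                             device="cpu",
                             dtype=torch.bool)

# ruff-format style: keep the call on one line while it fits in 88 characters,
# otherwise break after the opening parenthesis with 4-space indented arguments.
attention_mask = torch.zeros([1, 128, 128], device="cpu", dtype=torch.bool)

# Long assert messages move into trailing parentheses instead of being wrapped
# inside the asserted expression.
assert attention_mask.dtype == torch.bool, (
    "attention_mask must be boolean before it is passed to the attention kernels"
)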


@@ -91,10 +91,10 @@ def multihead_attention(
"""
# Unified format legal check
assert q.dim() == k.dim() == v.dim() == 3, "q, k, v must have 3 dims"
- assert q_cu_seqlens[-1] == q.shape[
- 0], "q_cu_seqlens must sum to q.shape[0]"
- assert (k_cu_seqlens[-1] == k.shape[0] ==
- v.shape[0]), "k_cu_seqlens must sum to k.shape[0]"
+ assert q_cu_seqlens[-1] == q.shape[0], "q_cu_seqlens must sum to q.shape[0]"
+ assert k_cu_seqlens[-1] == k.shape[0] == v.shape[0], (
+ "k_cu_seqlens must sum to k.shape[0]"
+ )
assert q.dtype in [
torch.bfloat16,
torch.float16,
@@ -137,23 +137,19 @@ def sdpa_attention(
k_cu_seqlens: Optional cumulative sequence lengths of k.
"""
seq_length = q.shape[0]
- attention_mask = torch.zeros([1, seq_length, seq_length],
- device=q.device,
- dtype=torch.bool)
+ attention_mask = torch.zeros(
+ [1, seq_length, seq_length], device=q.device, dtype=torch.bool
+ )
for i in range(1, len(q_cu_seqlens)):
attention_mask[
...,
- q_cu_seqlens[i - 1]:q_cu_seqlens[i],
- q_cu_seqlens[i - 1]:q_cu_seqlens[i],
+ q_cu_seqlens[i - 1] : q_cu_seqlens[i],
+ q_cu_seqlens[i - 1] : q_cu_seqlens[i],
] = True
q = q.transpose(0, 1)
k = k.transpose(0, 1)
v = v.transpose(0, 1)
- attn_output = F.scaled_dot_product_attention(q,
- k,
- v,
- attention_mask,
- dropout_p=0.0)
+ attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
attn_output = attn_output.transpose(0, 1)
attn_output = attn_output.reshape(seq_length, -1)
return attn_output
@@ -172,8 +168,9 @@ def _apply_rope_input_validation(x, freqs_cis):
assert freqs_cis.dtype == torch.complex64, freqs_cis.dtype
- def apply_rope(xq: torch.Tensor, xk: torch.Tensor,
- freqs_cis: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ def apply_rope(
+ xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor
+ ) -> tuple[torch.Tensor, torch.Tensor]:
"""
Args: (The leading dimensions of all inputs should be the same)
xq: query, tensor of shape (..., num_heads, head_dim)
@@ -189,20 +186,15 @@ def apply_rope(xq: torch.Tensor, xk: torch.Tensor,
# ..., num_heads, head_dim/2
xq_ = torch.view_as_complex(xq.float().view(*xq.shape[:-1], -1, 2))
xk_ = torch.view_as_complex(xk.float().view(*xq.shape[:-1], -1, 2))
- xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(
- -2) # ..., num_heads, head_dim
- xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(
- -2) # ..., num_heads, head_dim
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2) # ..., num_heads, head_dim
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2) # ..., num_heads, head_dim
return xq_out.type_as(xq), xk_out.type_as(xk)
class Learnable2DInterpPosEmb(nn.Module):
- def __init__(self,
- height: int,
- width: int,
- dim: int,
- interpolation_mode: str = "bicubic") -> None:
+ def __init__(
+ self, height: int, width: int, dim: int, interpolation_mode: str = "bicubic"
+ ) -> None:
super().__init__()
self.height = height
self.width = width
@@ -224,13 +216,16 @@ class Learnable2DInterpPosEmb(nn.Module):
self.weight.permute((2, 0, 1)).unsqueeze(0),
size=shape,
mode=self.interpolation_mode,
- ).squeeze(0).permute((1, 2, 0)).flatten(end_dim=1))
+ )
+ .squeeze(0)
+ .permute((1, 2, 0))
+ .flatten(end_dim=1)
+ )
out = x + torch.cat(pos_embs)
return out
class MoonVisionPatchEmbed(nn.Module):
def __init__(
self,
out_dim: int,
@@ -240,23 +235,23 @@ class MoonVisionPatchEmbed(nn.Module):
pos_emb_width: int = 14,
):
super().__init__()
- assert isinstance(
- patch_size,
- (int, Sequence)), f"Invalid patch_size type: {type(patch_size)}"
+ assert isinstance(patch_size, (int, Sequence)), (
+ f"Invalid patch_size type: {type(patch_size)}"
+ )
if isinstance(patch_size, int):
patch_size = (patch_size, patch_size)
- assert (len(patch_size) == 2
- ), f"Expected patch_size to be a tuple of 2, got {patch_size}"
+ assert len(patch_size) == 2, (
+ f"Expected patch_size to be a tuple of 2, got {patch_size}"
+ )
self.patch_size = patch_size
- self.proj = nn.Conv2d(in_dim,
- out_dim,
- kernel_size=patch_size,
- stride=patch_size)
+ self.proj = nn.Conv2d(
+ in_dim, out_dim, kernel_size=patch_size, stride=patch_size
+ )
- self.pos_emb = Learnable2DInterpPosEmb(height=pos_emb_height,
- width=pos_emb_width,
- dim=out_dim)
+ self.pos_emb = Learnable2DInterpPosEmb(
+ height=pos_emb_height, width=pos_emb_width, dim=out_dim
+ )
def forward(self, x: torch.Tensor, grid_hw: torch.Tensor) -> torch.Tensor:
"""
@@ -295,12 +290,9 @@ class Rope2DPosEmb(nn.Module):
device (str): the device to store the precomputed cis
"""
- def __init__(self,
- dim: int,
- max_height: int,
- max_width: int,
- theta_base=10000,
- device="cuda"):
+ def __init__(
+ self, dim: int, max_height: int, max_width: int, theta_base=10000, device="cuda"
+ ):
super().__init__()
self.dim = dim
assert self.dim % 4 == 0, "dim must be divisible by 4"
@@ -325,18 +317,18 @@ class Rope2DPosEmb(nn.Module):
flat_pos = torch.arange(0, N).float().to(self.device)
x_pos = flat_pos % self.max_width
y_pos = flat_pos // self.max_width
- dim_range = (torch.arange(0, self.dim,
- 4)[:(self.dim // 4)].float().to(self.device)
- ) # C/4
- freqs = 1.0 / (self.theta_base**(dim_range / self.dim))
+ dim_range = (
+ torch.arange(0, self.dim, 4)[: (self.dim // 4)].float().to(self.device)
+ ) # C/4
+ freqs = 1.0 / (self.theta_base ** (dim_range / self.dim))
x_freqs = torch.outer(x_pos, freqs).float() # N, C/4
y_freqs = torch.outer(y_pos, freqs).float() # N, C/4
x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs) # N, C/4
y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs) # N, C/4
# N, C/4, 2
freqs_cis = torch.cat(
- [x_cis.unsqueeze(dim=-1),
- y_cis.unsqueeze(dim=-1)], dim=-1)
+ [x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1
+ )
# max_height, max_width, C/2
freqs_cis = freqs_cis.reshape(self.max_height, self.max_width, -1)
return freqs_cis
@@ -349,12 +341,13 @@ class Rope2DPosEmb(nn.Module):
freqs_cis: tensor of shape (sum(t * height * width), dim//2)
"""
shapes = grid_hws.tolist()
- assert all(1 <= h <= self.max_height and 1 <= w <= self.max_width
- for h, w in shapes), (
- shapes,
- self.max_height,
- self.max_width,
- )
+ assert all(
+ 1 <= h <= self.max_height and 1 <= w <= self.max_width for h, w in shapes
+ ), (
+ shapes,
+ self.max_height,
+ self.max_width,
+ )
freqs_cis = torch.cat(
[
self.precomputed_freqs_cis[:h, :w].reshape(-1, self.dim // 2)
@@ -364,8 +357,9 @@ class Rope2DPosEmb(nn.Module):
)
return freqs_cis
- def get_freqs_cis_by_idx(self, pos_idx: torch.Tensor,
- pos_idx_mask: torch.Tensor) -> torch.Tensor:
+ def get_freqs_cis_by_idx(
+ self, pos_idx: torch.Tensor, pos_idx_mask: torch.Tensor
+ ) -> torch.Tensor:
"""
Args:
pos_idx: tensor of shape (..., 2), It contains the (h, w) position indices of each 2D token.
@@ -374,16 +368,20 @@ class Rope2DPosEmb(nn.Module):
Return:
freqs_cis: tensor of shape (..., dim//2)
"""
- assert (pos_idx.shape[:-1] == pos_idx_mask.shape
- and pos_idx.shape[-1] == 2 and pos_idx.ndim
- == pos_idx_mask.ndim + 1), (pos_idx.shape, pos_idx_mask.shape)
+ assert (
+ pos_idx.shape[:-1] == pos_idx_mask.shape
+ and pos_idx.shape[-1] == 2
+ and pos_idx.ndim == pos_idx_mask.ndim + 1
+ ), (pos_idx.shape, pos_idx_mask.shape)
assert pos_idx_mask.dtype == torch.bool, pos_idx_mask.dtype
- shp = pos_idx_mask.shape + (self.dim // 2, ) # ..., head_dim/2
- freqs_cis = torch.ones(shp, dtype=torch.complex64,
- device=self.device) # ..., head_dim/2
- freqs_cis[pos_idx_mask] = self.precomputed_freqs_cis[pos_idx[
- ..., 0][pos_idx_mask], pos_idx[..., 1][pos_idx_mask]]
+ shp = pos_idx_mask.shape + (self.dim // 2,) # ..., head_dim/2
+ freqs_cis = torch.ones(
+ shp, dtype=torch.complex64, device=self.device
+ ) # ..., head_dim/2
+ freqs_cis[pos_idx_mask] = self.precomputed_freqs_cis[
+ pos_idx[..., 0][pos_idx_mask], pos_idx[..., 1][pos_idx_mask]
+ ]
return freqs_cis
@@ -394,23 +392,23 @@ class MLP2(nn.Module):
bias: whether to use bias in linear layer.
"""
- def __init__(self,
- dims: list[int],
- activation,
- bias: bool = True,
- prefix: str = "",
- use_data_parallel: bool = False):
+ def __init__(
+ self,
+ dims: list[int],
+ activation,
+ bias: bool = True,
+ prefix: str = "",
+ use_data_parallel: bool = False,
+ ):
super().__init__()
assert len(dims) == 3
self.use_data_parallel = use_data_parallel
- self.fc0 = ReplicatedLinear(dims[0],
- dims[1],
- bias=bias,
- prefix=maybe_prefix(prefix, "fc0"))
- self.fc1 = ReplicatedLinear(dims[1],
- dims[2],
- bias=bias,
- prefix=maybe_prefix(prefix, "fc1"))
+ self.fc0 = ReplicatedLinear(
+ dims[0], dims[1], bias=bias, prefix=maybe_prefix(prefix, "fc0")
+ )
+ self.fc1 = ReplicatedLinear(
+ dims[1], dims[2], bias=bias, prefix=maybe_prefix(prefix, "fc1")
+ )
self.activation = activation
def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -421,7 +419,6 @@ class MLP2(nn.Module):
class MoonVitEncoderLayer(nn.Module):
def __init__(
self,
num_heads: int,
@@ -446,18 +443,18 @@ class MoonVitEncoderLayer(nn.Module):
self.norm0 = nn.LayerNorm(hidden_dim)
self.norm1 = nn.LayerNorm(hidden_dim)
self.use_data_parallel = use_data_parallel
- self.mlp = MLP2([hidden_dim, mlp_dim, hidden_dim],
- activation,
- prefix=f"{prefix}.mlp",
- use_data_parallel=use_data_parallel)
- self.wqkv = ReplicatedLinear(hidden_dim,
- hidden_dim * 3,
- bias=attn_bias,
- prefix=f"{prefix}.wqkv")
- self.wo = ReplicatedLinear(hidden_dim,
- hidden_dim,
- bias=attn_bias,
- prefix=f"{prefix}.wo")
+ self.mlp = MLP2(
+ [hidden_dim, mlp_dim, hidden_dim],
+ activation,
+ prefix=f"{prefix}.mlp",
+ use_data_parallel=use_data_parallel,
+ )
+ self.wqkv = ReplicatedLinear(
+ hidden_dim, hidden_dim * 3, bias=attn_bias, prefix=f"{prefix}.wqkv"
+ )
+ self.wo = ReplicatedLinear(
+ hidden_dim, hidden_dim, bias=attn_bias, prefix=f"{prefix}.wo"
+ )
def attention_qkvpacked(
self,
@@ -484,11 +481,9 @@ class MoonVitEncoderLayer(nn.Module):
xq, xk = apply_rope(xq, xk, rope_freqs_cis)
attn_func = VL_VISION_ATTENTION_FUNCTIONS[self.attn_implementation]
- attn_out = attn_func(xq,
- xk,
- xv,
- q_cu_seqlens=cu_seqlens,
- k_cu_seqlens=cu_seqlens)
+ attn_out = attn_func(
+ xq, xk, xv, q_cu_seqlens=cu_seqlens, k_cu_seqlens=cu_seqlens
+ )
attn_out, _ = self.wo(attn_out)
return attn_out
@@ -507,9 +502,9 @@ class MoonVitEncoderLayer(nn.Module):
"""
residual = hidden_states
hidden_states = self.norm0(hidden_states)
- attn_out = self.attention_qkvpacked(hidden_states,
- cu_seqlens,
- rope_freqs_cis=rope_freqs_cis)
+ attn_out = self.attention_qkvpacked(
+ hidden_states, cu_seqlens, rope_freqs_cis=rope_freqs_cis
+ )
hidden_states = residual + attn_out
residual = hidden_states
@@ -519,7 +514,6 @@ class MoonVitEncoderLayer(nn.Module):
class MoonVitEncoder(nn.Module):
def __init__(
self,
hidden_dim: int,
@@ -531,27 +525,37 @@ class MoonVitEncoder(nn.Module):
super().__init__()
self.rope_2d = Rope2DPosEmb(
block_cfg["hidden_dim"] // block_cfg["num_heads"], 512, 512)
block_cfg["hidden_dim"] // block_cfg["num_heads"], 512, 512
)
self.blocks = nn.ModuleList(
- [MoonVitEncoderLayer(use_data_parallel=use_data_parallel, \
- prefix=f"{prefix}.blocks.{layer_idx}", \
- **block_cfg) for layer_idx in range(num_layers)])
+ [
+ MoonVitEncoderLayer(
+ use_data_parallel=use_data_parallel,
+ prefix=f"{prefix}.blocks.{layer_idx}",
+ **block_cfg,
+ )
+ for layer_idx in range(num_layers)
+ ]
+ )
self.final_layernorm = nn.LayerNorm(hidden_dim)
- def forward(self, hidden_states: torch.Tensor,
- grid_hw: torch.Tensor) -> torch.Tensor:
- rope_freqs_cis = self.rope_2d.get_freqs_cis_by_seqlens(
- grid_hws=grid_hw)
+ def forward(
+ self, hidden_states: torch.Tensor, grid_hw: torch.Tensor
+ ) -> torch.Tensor:
+ rope_freqs_cis = self.rope_2d.get_freqs_cis_by_seqlens(grid_hws=grid_hw)
lengths = torch.cat(
- (torch.zeros(1, device=hidden_states.device, dtype=grid_hw.dtype),
- (grid_hw[:, 0] * grid_hw[:, 1]).to(hidden_states.device)))
+ (
+ torch.zeros(1, device=hidden_states.device, dtype=grid_hw.dtype),
+ (grid_hw[:, 0] * grid_hw[:, 1]).to(hidden_states.device),
+ )
+ )
cu_seqlens = lengths.cumsum(dim=0, dtype=torch.int32)
for _, block in enumerate(self.blocks):
- hidden_states = block(hidden_states,
- cu_seqlens,
- rope_freqs_cis=rope_freqs_cis)
+ hidden_states = block(
+ hidden_states, cu_seqlens, rope_freqs_cis=rope_freqs_cis
+ )
hidden_states = self.final_layernorm(hidden_states)
@@ -559,9 +563,9 @@ class MoonVitEncoder(nn.Module):
def patch_merger(
- x: torch.Tensor,
- grid_hw: torch.Tensor,
- merge_kernel_size: list[int, int] = (2, 2),
+ x: torch.Tensor,
+ grid_hw: torch.Tensor,
+ merge_kernel_size: list[int, int] = (2, 2),
) -> list[torch.Tensor]:
d_model = x.size(-1)
@@ -570,15 +574,17 @@ def patch_merger(
for x_shape in grid_hw.tolist():
height, width = x_shape[0], x_shape[1]
# Get the current sequence
- seq = x[pre_sum:pre_sum + height * width]
+ seq = x[pre_sum : pre_sum + height * width]
# Reshape along self.merge_kernel_size and concat to the last dimension
kernel_height, kernel_width = merge_kernel_size
new_height, new_width = height // kernel_height, width // kernel_width
- reshaped_seq = seq.view(new_height, kernel_height, new_width,
- kernel_width, d_model)
+ reshaped_seq = seq.view(
+ new_height, kernel_height, new_width, kernel_width, d_model
+ )
reshaped_seq = reshaped_seq.permute(0, 2, 1, 3, 4).contiguous()
- padded_seq = reshaped_seq.view(new_height * new_width,
- kernel_height * kernel_width, -1)
+ padded_seq = reshaped_seq.view(
+ new_height * new_width, kernel_height * kernel_width, -1
+ )
outputs.append(padded_seq)
pre_sum += height * width
@@ -586,7 +592,6 @@ def patch_merger(
class MoonVitVLProjector(nn.Module):
def __init__(
self,
in_channels: int,
@@ -596,13 +601,10 @@ class MoonVitVLProjector(nn.Module):
out_dim: int = 4096,
):
super().__init__()
- self.hidden_size = in_channels * merge_kernel_size[
- 0] * merge_kernel_size[1]
+ self.hidden_size = in_channels * merge_kernel_size[0] * merge_kernel_size[1]
self.pre_norm = nn.nn.LayerNorm(in_channels, eps=ln_eps)
- self.linear_1 = nn.Linear(self.hidden_size,
- self.hidden_size,
- bias=True)
+ self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
self.act = ACT2FN[hidden_act]
self.linear_2 = nn.Linear(self.hidden_size, out_dim, bias=True)
@@ -621,12 +623,14 @@ class MoonVitPretrainedModel(PreTrainedModel):
_supports_flash_attn_2 = True
_supports_sdpa = True
- def __init__(self,
- config: MoonViTConfig,
- use_data_parallel: bool = False,
- prefix: str = "",
- *inputs,
- **kwargs):
+ def __init__(
+ self,
+ config: MoonViTConfig,
+ use_data_parallel: bool = False,
+ prefix: str = "",
+ *inputs,
+ **kwargs,
+ ):
super().__init__(config, *inputs, **kwargs)
config = deepcopy(config)
self.use_data_parallel = use_data_parallel
@@ -655,8 +659,9 @@ class MoonVitPretrainedModel(PreTrainedModel):
prefix=f"{prefix}.encoder",
)
- def forward(self, pixel_values: torch.Tensor,
- grid_hw: torch.Tensor) -> torch.Tensor:
+ def forward(
+ self, pixel_values: torch.Tensor, grid_hw: torch.Tensor
+ ) -> torch.Tensor:
"""
Args:
pixel_values (torch.Tensor): The input pixel values.
@@ -667,7 +672,7 @@ class MoonVitPretrainedModel(PreTrainedModel):
"""
hidden_states = self.patch_embed(pixel_values, grid_hw)
hidden_states = self.encoder(hidden_states, grid_hw)
- hidden_states = patch_merger(hidden_states,
- grid_hw,
- merge_kernel_size=self.merge_kernel_size)
+ hidden_states = patch_merger(
+ hidden_states, grid_hw, merge_kernel_size=self.merge_kernel_size
+ )
return hidden_states