Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -18,33 +18,45 @@ import torch
 from torch import nn
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
-from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer,
-                          TensorType)
+from transformers import BatchFeature, PretrainedConfig, PreTrainedTokenizer, TensorType
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
-                                    MultiModalKwargsItems)
+from vllm.multimodal.inputs import (
+    MultiModalDataDict,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
 from vllm.multimodal.parse import MultiModalDataItems
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        BaseProcessingInfo, PromptReplacement,
-                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.processing import (
+    BaseMultiModalProcessor,
+    BaseProcessingInfo,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsLoRA,
+    SupportsMultiModal,
+    SupportsPP,
+)
 from .qwen import QWenBaseModel, QWenModel
 from .utils import flatten_bn

@@ -56,11 +68,12 @@ class QwenImagePixelInputs(TensorSchema):
        - c: Number of channels (3)
        - h: Height
        - w: Width
-    
+
    Note that image_size is the value in the vision config to which we resize
    the image to in the normalization transform. Currently multi-image support
    can only be leveraged by passing image embeddings directly.
    """
+
    type: Literal["pixel_values"] = "pixel_values"
    data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]

@@ -71,10 +84,11 @@ class QwenImageEmbeddingInputs(TensorSchema):
        - bn: Batch size * number of images
        - ifs: Image feature size (256)
        - hs: Hidden size
-    
+
    `hidden_size` must match the hidden size of the language model backbone
    and is stored in the visual config of the model if we have one.
    """
+
    type: Literal["image_embeds"] = "image_embeds"
    data: Annotated[torch.Tensor, TensorShape("bn", 256, "hs")]

@@ -100,8 +114,7 @@ class VisualAttention(nn.Module):
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
-        self._qkv_same_embed_dim = self.kdim == embed_dim \
-            and self.vdim == embed_dim
+        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads

@@ -112,8 +125,9 @@ class VisualAttention(nn.Module):
        self.hidden_size_per_partition = embed_dim

        # Strided linear layer.
-        assert self._qkv_same_embed_dim, \
-                'Visual Attention implementation only supports self-attention'
+        assert self._qkv_same_embed_dim, (
+            "Visual Attention implementation only supports self-attention"
+        )
        self.in_proj = ReplicatedLinear(embed_dim, 3 * embed_dim)
        self.out_proj = ReplicatedLinear(embed_dim, embed_dim)
        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
@@ -128,50 +142,63 @@ class VisualAttention(nn.Module):
        mixed_x_layer, _ = self.in_proj(x)

        # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
-        new_tensor_shape = mixed_x_layer.size()[:-1] + \
-            (self.num_attention_heads_per_partition,
-             3 * self.hidden_size_per_attention_head)
+        new_tensor_shape = mixed_x_layer.size()[:-1] + (
+            self.num_attention_heads_per_partition,
+            3 * self.hidden_size_per_attention_head,
+        )
        mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

        # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
        query_layer, key_layer, value_layer = mixed_x_layer.split(
-            self.hidden_size_per_attention_head, dim=-1)
+            self.hidden_size_per_attention_head, dim=-1
+        )

        # [sq, b, np, hn] -> [sq, b * np, hn]
        query_layer = query_layer.view(
-            sq, b * self.num_attention_heads_per_partition,
-            self.hidden_size_per_attention_head).transpose(0, 1)
+            sq,
+            b * self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        ).transpose(0, 1)
        # [sk, b, np, hn] -> [sk, b * np, hn]
        key_layer = key_layer.view(
-            sq, b * self.num_attention_heads_per_partition,
-            self.hidden_size_per_attention_head).transpose(0, 1)
+            sq,
+            b * self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        ).transpose(0, 1)

        q_scaled = query_layer / self.norm_factor
        if attn_mask is not None:
-            attention_probs = torch.baddbmm(attn_mask, q_scaled,
-                                            key_layer.transpose(-2, -1))
+            attention_probs = torch.baddbmm(
+                attn_mask, q_scaled, key_layer.transpose(-2, -1)
+            )
        else:
            attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1))
        attention_probs = attention_probs.softmax(dim=-1)

        value_layer = value_layer.view(
-            sq, b * self.num_attention_heads_per_partition,
-            self.hidden_size_per_attention_head).transpose(0, 1)
+            sq,
+            b * self.num_attention_heads_per_partition,
+            self.hidden_size_per_attention_head,
+        ).transpose(0, 1)

        # matmul: [b * np, sq, hn]
        context_layer = torch.bmm(attention_probs, value_layer)

        # change view [b, np, sq, hn]
        context_layer = context_layer.view(
-            b, self.num_attention_heads_per_partition, sq,
-            self.hidden_size_per_attention_head)
+            b,
+            self.num_attention_heads_per_partition,
+            sq,
+            self.hidden_size_per_attention_head,
+        )

        # [b, np, sq, hn] --> [sq, b, np, hn]
        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()

        # [sq, b, np, hn] --> [sq, b, hp]
-        new_context_layer_shape = context_layer.size()[:-2] + \
-            (self.hidden_size_per_partition,)
+        new_context_layer_shape = context_layer.size()[:-2] + (
+            self.hidden_size_per_partition,
+        )
        context_layer = context_layer.view(*new_context_layer_shape)

        output, _ = self.out_proj(context_layer)
@@ -189,10 +216,9 @@ class QwenVLMLP(nn.Module):
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__()
-        self.c_fc = ColumnParallelLinear(hidden_size,
-                                         intermediate_size,
-                                         bias=True,
-                                         quant_config=quant_config)
+        self.c_fc = ColumnParallelLinear(
+            hidden_size, intermediate_size, bias=True, quant_config=quant_config
+        )
        self.act_fn = get_act_fn("gelu")
        self.c_proj = RowParallelLinear(
            intermediate_size,
@@ -209,7 +235,6 @@ class QwenVLMLP(nn.Module):


 class VisualAttentionBlock(nn.Module):
-
    def __init__(
        self,
        d_model: int,
@@ -249,7 +274,6 @@ class VisualAttentionBlock(nn.Module):


 class TransformerBlock(nn.Module):
-
    def __init__(
        self,
        width: int,
@@ -263,14 +287,18 @@ class TransformerBlock(nn.Module):
        self.width = width
        self.layers = layers

-        self.resblocks = nn.ModuleList([
-            VisualAttentionBlock(width,
-                                 heads,
-                                 mlp_ratio,
-                                 norm_layer=norm_layer,
-                                 quant_config=quant_config)
-            for _ in range(layers)
-        ])
+        self.resblocks = nn.ModuleList(
+            [
+                VisualAttentionBlock(
+                    width,
+                    heads,
+                    mlp_ratio,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                )
+                for _ in range(layers)
+            ]
+        )

    def get_cast_dtype(self) -> torch.dtype:
        return self.resblocks[0].mlp.c_fc.weight.dtype
@@ -278,54 +306,57 @@ class TransformerBlock(nn.Module):
    def get_cast_device(self) -> torch.device:
        return self.resblocks[0].mlp.c_fc.weight.device

-    def forward(self,
-                x: torch.Tensor,
-                attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def forward(
+        self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
        for r in self.resblocks:
            x = r(x, attn_mask=attn_mask)
        return x


 class VisionTransformer(nn.Module):
-
-    def __init__(self,
-                 image_size: int,
-                 patch_size: int,
-                 width: int,
-                 layers: int,
-                 heads: int,
-                 mlp_ratio: float,
-                 n_queries: int = 256,
-                 output_dim: int = 512,
-                 image_start_id: int = 151857,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 **kwargs):
+    def __init__(
+        self,
+        image_size: int,
+        patch_size: int,
+        width: int,
+        layers: int,
+        heads: int,
+        mlp_ratio: float,
+        n_queries: int = 256,
+        output_dim: int = 512,
+        image_start_id: int = 151857,
+        quant_config: Optional[QuantizationConfig] = None,
+        **kwargs,
+    ):
        super().__init__()
        image_height, image_width = self.image_size = (image_size, image_size)
        patch_height, patch_width = self.patch_size = (patch_size, patch_size)
-        self.grid_size = (image_height // patch_height,
-                          image_width // patch_width)
+        self.grid_size = (image_height // patch_height, image_width // patch_width)
        self.output_dim = output_dim
-        self.conv1 = nn.Conv2d(in_channels=3,
-                               out_channels=width,
-                               kernel_size=patch_size,
-                               stride=patch_size,
-                               bias=False)
+        self.conv1 = nn.Conv2d(
+            in_channels=3,
+            out_channels=width,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=False,
+        )

        # class embeddings and positional embeddings
        scale = width**-0.5
-        self.positional_embedding = nn.Parameter(scale *
-                                                 torch.randn(256, width))
+        self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))

        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        self.ln_pre = norm_layer(width)
-        self.transformer = TransformerBlock(width,
-                                            layers,
-                                            heads,
-                                            mlp_ratio,
-                                            norm_layer=norm_layer,
-                                            quant_config=quant_config)
+        self.transformer = TransformerBlock(
+            width,
+            layers,
+            heads,
+            mlp_ratio,
+            norm_layer=norm_layer,
+            quant_config=quant_config,
+        )

        self.attn_pool = Resampler2(
            grid_size=int(math.sqrt(n_queries)),
@@ -342,7 +373,8 @@ class VisionTransformer(nn.Module):

        self.ln_post = norm_layer(output_dim)
        self.proj = nn.Parameter(
-            (output_dim**-0.5) * torch.randn(output_dim, output_dim))
+            (output_dim**-0.5) * torch.randn(output_dim, output_dim)
+        )

        self.image_start_id = image_start_id
        self.image_end_id = image_start_id + 1
@@ -356,12 +388,10 @@ class VisionTransformer(nn.Module):

        # to patches
        x = self.conv1(x)  # shape = [*, width, grid, grid]
-        x = x.reshape(x.shape[0], x.shape[1],
-                      -1)  # shape = [*, width, grid ** 2]
+        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

-        x = x + get_abs_pos(self.positional_embedding, int(math.sqrt(
-            x.size(1))))
+        x = x + get_abs_pos(self.positional_embedding, int(math.sqrt(x.size(1))))

        x = self.ln_pre(x)

@@ -377,20 +407,19 @@ class VisionTransformer(nn.Module):


 class QwenVLModel(QWenModel):
-
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config

-        self.visual = VisionTransformer(**config.visual,
-                                        quant_config=quant_config)
+        self.visual = VisionTransformer(**config.visual, quant_config=quant_config)


@lru_cache(maxsize=1)
 def _get_tokenizer_without_image_pad(
-        tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
+    tokenizer: PreTrainedTokenizer,
+) -> PreTrainedTokenizer:
    """
    The logic of adding image pad tokens should only be applied in
    [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
@@ -402,7 +431,6 @@ def _get_tokenizer_without_image_pad(
    new_tokenizer = copy.deepcopy(tokenizer)

    class TokenizerWithoutImagePad(tokenizer.__class__):  # type: ignore
-
        def tokenize(
            self,
            text: str,
@@ -413,7 +441,8 @@ def _get_tokenizer_without_image_pad(
            text = unicodedata.normalize("NFC", text)

            return [
-                self.decoder[t] for t in self.tokenizer.encode(
+                self.decoder[t]
+                for t in self.tokenizer.encode(
                    text,
                    allowed_special=allowed_special,
                    disallowed_special=disallowed_special,
@@ -435,8 +464,7 @@ def _get_tokenizer_without_image_pad(
                errors=errors or self.errors,
            )

-    TokenizerWithoutImagePad.__name__ = \
-        f"{tokenizer.__class__.__name__}WithoutImagePad"
+    TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"

    new_tokenizer.__class__ = TokenizerWithoutImagePad
    return new_tokenizer
@@ -467,17 +495,19 @@ class QwenVLProcessor:
        vision_config = config.visual
        image_size = vision_config["image_size"]

-        self.image_transform = transforms.Compose([
-            transforms.Resize(
-                (image_size, image_size),
-                interpolation=InterpolationMode.BICUBIC,
-            ),
-            transforms.ToTensor(),
-            transforms.Normalize(
-                mean=(0.48145466, 0.4578275, 0.40821073),
-                std=(0.26862954, 0.26130258, 0.27577711),
-            ),
-        ])
+        self.image_transform = transforms.Compose(
+            [
+                transforms.Resize(
+                    (image_size, image_size),
+                    interpolation=InterpolationMode.BICUBIC,
+                ),
+                transforms.ToTensor(),
+                transforms.Normalize(
+                    mean=(0.48145466, 0.4578275, 0.40821073),
+                    std=(0.26862954, 0.26130258, 0.27577711),
+                ),
+            ]
+        )

    @property
    def image_start_tag(self) -> str:
@@ -524,7 +554,6 @@ class QwenVLProcessor:


 class QwenVLProcessingInfo(BaseProcessingInfo):
-
    def get_tokenizer(self) -> PreTrainedTokenizer:
        tokenizer = self.ctx.tokenizer
        assert isinstance(tokenizer, PreTrainedTokenizer)
@@ -553,7 +582,6 @@ class QwenVLProcessingInfo(BaseProcessingInfo):


 class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
-
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_images = mm_counts.get("image", 0)

@@ -561,8 +589,9 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
        img_start = hf_processor.image_start_tag
        img_end = hf_processor.image_end_tag

-        return "".join(f"Picture {i}: {img_start}{img_end}\n"
-                       for i in range(1, num_images + 1))
+        return "".join(
+            f"Picture {i}: {img_start}{img_end}\n" for i in range(1, num_images + 1)
+        )

    def get_dummy_mm_data(
        self,
@@ -579,16 +608,16 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
        image_overrides = mm_options.get("image") if mm_options else None

        return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images,
-                                   overrides=image_overrides)
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            )
        }


 class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
-
    def _call_hf_processor(
        self,
        prompt: str,
@@ -644,8 +673,7 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
        out_mm_kwargs: MultiModalKwargsItems,
    ) -> Sequence[PromptUpdate]:
        tokenizer = self.info.get_tokenizer()
-        special_tokens: dict[str,
-                             int] = tokenizer.special_tokens  # type: ignore
+        special_tokens: dict[str, int] = tokenizer.special_tokens  # type: ignore

        processor = self.info.get_hf_processor()
        img_start_id = special_tokens[processor.image_start_tag]
@@ -667,11 +695,14 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
        ]


-@MULTIMODAL_REGISTRY.register_processor(QwenVLMultiModalProcessor,
-                                        info=QwenVLProcessingInfo,
-                                        dummy_inputs=QwenVLDummyInputsBuilder)
-class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
-                                     SupportsMultiModal):
+@MULTIMODAL_REGISTRY.register_processor(
+    QwenVLMultiModalProcessor,
+    info=QwenVLProcessingInfo,
+    dummy_inputs=QwenVLDummyInputsBuilder,
+)
+class QwenVLForConditionalGeneration(
+    QWenBaseModel, SupportsPP, SupportsLoRA, SupportsMultiModal
+):
    packed_modules_mapping = {
        "c_attn": ["c_attn"],
        "gate_up_proj": [
@@ -687,7 +718,8 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
        return MultiModelKeys.from_string_field(
            language_model="transformer.h",
            connector="transformer.visual.attn_pool",
-            tower_model="transformer.visual.transformer")
+            tower_model="transformer.visual.transformer",
+        )

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
@@ -712,14 +744,16 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
        self.transformer: QwenVLModel

    def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[QwenImageInputs]:
+        self, **kwargs: object
+    ) -> Optional[QwenImageInputs]:
        pixel_values = kwargs.pop("pixel_values", None)
        image_embeds = kwargs.pop("image_embeds", None)

        if pixel_values is not None:
            if not isinstance(pixel_values, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
+                raise ValueError(
+                    f"Incorrect type of pixel values. Got type: {type(pixel_values)}"
+                )

            expected_h = expected_w = self.config.visual["image_size"]
            resolve_bindings = {"h": expected_h, "w": expected_w}
@@ -732,8 +766,10 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,

        if image_embeds is not None:
            if not isinstance(image_embeds, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image embeddings. "
-                                 f"Got type: {type(image_embeds)}")
+                raise ValueError(
+                    "Incorrect type of image embeddings. "
+                    f"Got type: {type(image_embeds)}"
+                )

            return QwenImageEmbeddingInputs(
                type="image_embeds",
@@ -742,8 +778,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,

        return None

-    def _process_image_input(self,
-                             image_input: QwenImageInputs) -> torch.Tensor:
+    def _process_image_input(self, image_input: QwenImageInputs) -> torch.Tensor:
        if image_input["type"] == "image_embeds":
            return image_input["data"]

@@ -752,8 +787,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
    def get_language_model(self) -> torch.nn.Module:
        return self.transformer

-    def get_multimodal_embeddings(self,
-                                  **kwargs: object) -> MultiModalEmbeddings:
+    def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings:
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return []
@@ -772,6 +806,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
        if intermediate_tensors is not None:
            inputs_embeds = None

-        hidden_states = self.transformer(input_ids, positions,
-                                         intermediate_tensors, inputs_embeds)
+        hidden_states = self.transformer(
+            input_ids, positions, intermediate_tensors, inputs_embeds
+        )
        return hidden_states