Update transformers to v4.55 (#21931)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-06 06:56:14 +01:00
parent 6e20924350
commit 796bae07c5
13 changed files with 235 additions and 39 deletions
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
+from typing import (TYPE_CHECKING, Any, ClassVar, Literal, Optional, Protocol,
                    Union, overload, runtime_checkable)

 import torch
@@ -14,6 +14,10 @@ if TYPE_CHECKING:
    from vllm.config import VllmConfig
    from vllm.model_executor.layers.pooler import Pooler
    from vllm.model_executor.sampling_metadata import SamplingMetadata
+else:
+    VllmConfig = Any
+    Pooler = Any
+    SamplingMetadata = Any

 logger = init_logger(__name__)

@@ -34,7 +38,7 @@ class VllmModel(Protocol[T_co]):

    def __init__(
        self,
-        vllm_config: "VllmConfig",
+        vllm_config: VllmConfig,
        prefix: str = "",
    ) -> None:
        ...
@@ -96,7 +100,7 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]):
    def compute_logits(
        self,
        hidden_states: T,
-        sampling_metadata: "SamplingMetadata",
+        sampling_metadata: SamplingMetadata,
    ) -> Optional[T]:
        """Return `None` if TP rank > 0."""
        ...
@@ -140,7 +144,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]):
        MRO of your model class.
    """

-    pooler: "Pooler"
+    pooler: Pooler
    """The pooler is only called on TP rank 0."""


--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1395,11 +1395,12 @@ class Tarsier2Processor(Qwen2VLProcessor):
        **kwargs,
    ):
        self.image_processor = Tarsier2ImageProcessor(**vision_config)
-        super().__init__(image_processor=self.image_processor,
-                         tokenizer=tokenizer,
-                         video_processor=Qwen2VLVideoProcessor(),
-                         chat_template=None,
-                         **kwargs)
+        super().__init__(
+            image_processor=self.image_processor,
+            tokenizer=tokenizer,
+            video_processor=Qwen2VLVideoProcessor(**vision_config),
+            chat_template=None,
+            **kwargs)


 class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -90,7 +90,7 @@ def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
 def replace_linear_class(
    linear: nn.Linear, style: Literal["colwise", "rowwise"],
    quant_config: QuantizationConfig
-) -> Union[ColumnParallelLinear, RowParallelLinear]:
+) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]:
    """
    Replace nn.Linear with one of vLLM's tensor parallel linear classes.

@@ -445,7 +445,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):

        # Set correct attn and init on "meta" to delay allocating GPU tensors
        # TODO: @raushan, use the public `model.set_attn_implementation()`
-        # method after v4.54.0 is released
+        # method once its checks are fixed in Transformers.
        self.text_config._attn_implementation = "vllm"
        with init_on_device_without_buffers("meta"), config_override:
            self.model: PreTrainedModel = AutoModel.from_config(
@@ -520,7 +520,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
        for i in range(len(layers)):
            if start_layer <= i and i < end_layer:
                continue
-            layers[i] = PPMissingLayer(return_tuple=True)
+            layers[i] = PPMissingLayer()

        # Layers after module list
        for name in pp_plan[module_list_idx + 1:]:
@@ -533,14 +533,16 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
        Apply the model's tensor parallelization plan.
        Currently only supports linear layers.
        """
-        if not self.model.supports_tp_plan:
-            if self.tp_size <= 1:
-                return
+        tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {}

+        if not tp_plan and self.tp_size > 1:
            raise ValueError(
                f"{type(self.model)} does not support tensor parallel yet!")

-        tp_plan = self.model._tp_plan
+        # Some weight loaders expect linear layers to inherit from vLLM's
+        # LinearBase class, so we set a default style which causes any
+        # unspecified linear layers to be replaced with ReplicatedLinear
+        tp_plan[".*"] = "replicated"

        def _tensor_parallel(module: nn.Module, prefix: str = ""):
            for child_name, child_module in module.named_children():
@@ -552,6 +554,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
                            child_module, style, self.quant_config)
                        setattr(module, child_name, new_module)
                        log_replacement(qual_name, child_module, new_module)
+                        break
                else:
                    _tensor_parallel(child_module, prefix=qual_name)

--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -534,16 +534,10 @@ class PPMissingLayer(torch.nn.Identity):

    def __init__(self, *args, **kwargs):
        super().__init__()
-        self.return_tuple = kwargs.get("return_tuple", False)

    def forward(self, *args, **kwargs):
-        """
-        Return the first arg from args or the first value from kwargs.
-
-        Wraps the input in a tuple if `self.return_tuple` is True.
-        """
-        input = args[0] if args else next(iter(kwargs.values()))
-        return (input, ) if self.return_tuple else input
+        """Return the first arg from args or the first value from kwargs."""
+        return args[0] if args else next(iter(kwargs.values()))


 _CPU_OFFLOAD_BYTES = 0