Update transformers to v4.55 (#21931)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
|
||||
from typing import (TYPE_CHECKING, Any, ClassVar, Literal, Optional, Protocol,
|
||||
Union, overload, runtime_checkable)
|
||||
|
||||
import torch
|
||||
@@ -14,6 +14,10 @@ if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.pooler import Pooler
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
else:
|
||||
VllmConfig = Any
|
||||
Pooler = Any
|
||||
SamplingMetadata = Any
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -34,7 +38,7 @@ class VllmModel(Protocol[T_co]):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vllm_config: "VllmConfig",
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
) -> None:
|
||||
...
|
||||
@@ -96,7 +100,7 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]):
|
||||
def compute_logits(
|
||||
self,
|
||||
hidden_states: T,
|
||||
sampling_metadata: "SamplingMetadata",
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> Optional[T]:
|
||||
"""Return `None` if TP rank > 0."""
|
||||
...
|
||||
@@ -140,7 +144,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]):
|
||||
MRO of your model class.
|
||||
"""
|
||||
|
||||
pooler: "Pooler"
|
||||
pooler: Pooler
|
||||
"""The pooler is only called on TP rank 0."""
|
||||
|
||||
|
||||
|
||||
@@ -1395,11 +1395,12 @@ class Tarsier2Processor(Qwen2VLProcessor):
|
||||
**kwargs,
|
||||
):
|
||||
self.image_processor = Tarsier2ImageProcessor(**vision_config)
|
||||
super().__init__(image_processor=self.image_processor,
|
||||
tokenizer=tokenizer,
|
||||
video_processor=Qwen2VLVideoProcessor(),
|
||||
chat_template=None,
|
||||
**kwargs)
|
||||
super().__init__(
|
||||
image_processor=self.image_processor,
|
||||
tokenizer=tokenizer,
|
||||
video_processor=Qwen2VLVideoProcessor(**vision_config),
|
||||
chat_template=None,
|
||||
**kwargs)
|
||||
|
||||
|
||||
class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo):
|
||||
|
||||
@@ -90,7 +90,7 @@ def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module):
|
||||
def replace_linear_class(
|
||||
linear: nn.Linear, style: Literal["colwise", "rowwise"],
|
||||
quant_config: QuantizationConfig
|
||||
) -> Union[ColumnParallelLinear, RowParallelLinear]:
|
||||
) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]:
|
||||
"""
|
||||
Replace nn.Linear with one of vLLM's tensor parallel linear classes.
|
||||
|
||||
@@ -445,7 +445,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
|
||||
|
||||
# Set correct attn and init on "meta" to delay allocating GPU tensors
|
||||
# TODO: @raushan, use the public `model.set_attn_implementation()`
|
||||
# method after v4.54.0 is released
|
||||
# method once its checks are fixed in Transformers.
|
||||
self.text_config._attn_implementation = "vllm"
|
||||
with init_on_device_without_buffers("meta"), config_override:
|
||||
self.model: PreTrainedModel = AutoModel.from_config(
|
||||
@@ -520,7 +520,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
|
||||
for i in range(len(layers)):
|
||||
if start_layer <= i and i < end_layer:
|
||||
continue
|
||||
layers[i] = PPMissingLayer(return_tuple=True)
|
||||
layers[i] = PPMissingLayer()
|
||||
|
||||
# Layers after module list
|
||||
for name in pp_plan[module_list_idx + 1:]:
|
||||
@@ -533,14 +533,16 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
|
||||
Apply the model's tensor parallelization plan.
|
||||
Currently only supports linear layers.
|
||||
"""
|
||||
if not self.model.supports_tp_plan:
|
||||
if self.tp_size <= 1:
|
||||
return
|
||||
tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {}
|
||||
|
||||
if not tp_plan and self.tp_size > 1:
|
||||
raise ValueError(
|
||||
f"{type(self.model)} does not support tensor parallel yet!")
|
||||
|
||||
tp_plan = self.model._tp_plan
|
||||
# Some weight loaders expect linear layers to inherit from vLLM's
|
||||
# LinearBase class, so we set a default style which causes any
|
||||
# unspecified linear layers to be replaced with ReplicatedLinear
|
||||
tp_plan[".*"] = "replicated"
|
||||
|
||||
def _tensor_parallel(module: nn.Module, prefix: str = ""):
|
||||
for child_name, child_module in module.named_children():
|
||||
@@ -552,6 +554,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
|
||||
child_module, style, self.quant_config)
|
||||
setattr(module, child_name, new_module)
|
||||
log_replacement(qual_name, child_module, new_module)
|
||||
break
|
||||
else:
|
||||
_tensor_parallel(child_module, prefix=qual_name)
|
||||
|
||||
|
||||
@@ -534,16 +534,10 @@ class PPMissingLayer(torch.nn.Identity):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__()
|
||||
self.return_tuple = kwargs.get("return_tuple", False)
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
"""
|
||||
Return the first arg from args or the first value from kwargs.
|
||||
|
||||
Wraps the input in a tuple if `self.return_tuple` is True.
|
||||
"""
|
||||
input = args[0] if args else next(iter(kwargs.values()))
|
||||
return (input, ) if self.return_tuple else input
|
||||
"""Return the first arg from args or the first value from kwargs."""
|
||||
return args[0] if args else next(iter(kwargs.values()))
|
||||
|
||||
|
||||
_CPU_OFFLOAD_BYTES = 0
|
||||
|
||||
Reference in New Issue
Block a user