Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
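Purely as an illustration of what the repo-wide switch means in practice (this sketch is not part of the diff below; the names are copied from imports changed in this file), yapf aligned continuation lines with the opening parenthesis, whereas ruff format expands a parenthesised list to one item per line and keeps the trailing comma:

# Before (yapf + isort): continuation aligned to the opening parenthesis
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               ReplicatedLinear,
                                               RowParallelLinear)

# After (ruff format): one item per line, with a magic trailing comma
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
    ReplicatedLinear,
    RowParallelLinear,
)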


@@ -18,33 +18,45 @@ import torch
from torch import nn
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer,
TensorType)
from transformers import BatchFeature, PretrainedConfig, PreTrainedTokenizer, TensorType
from transformers.image_utils import ImageInput
from transformers.tokenization_utils_base import TextInput
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
ReplicatedLinear,
RowParallelLinear)
from vllm.model_executor.layers.linear import (
ColumnParallelLinear,
ReplicatedLinear,
RowParallelLinear,
)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargsItems)
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,
PromptUpdate, PromptUpdateDetails)
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
BaseProcessingInfo,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
)
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
SupportsMultiModal, SupportsPP)
from .interfaces import (
MultiModalEmbeddings,
SupportsLoRA,
SupportsMultiModal,
SupportsPP,
)
from .qwen import QWenBaseModel, QWenModel
from .utils import flatten_bn
@@ -56,11 +68,12 @@ class QwenImagePixelInputs(TensorSchema):
- c: Number of channels (3)
- h: Height
- w: Width
Note that image_size is the value in the vision config to which the image
is resized in the normalization transform. Currently multi-image support
can only be leveraged by passing image embeddings directly.
"""
type: Literal["pixel_values"] = "pixel_values"
data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
@@ -71,10 +84,11 @@ class QwenImageEmbeddingInputs(TensorSchema):
- bn: Batch size * number of images
- ifs: Image feature size (256)
- hs: Hidden size
`hidden_size` must match the hidden size of the language model backbone
and is stored in the visual config of the model if we have one.
"""
type: Literal["image_embeds"] = "image_embeds"
data: Annotated[torch.Tensor, TensorShape("bn", 256, "hs")]
@@ -100,8 +114,7 @@ class VisualAttention(nn.Module):
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
self.vdim = vdim if vdim is not None else embed_dim
self._qkv_same_embed_dim = self.kdim == embed_dim \
and self.vdim == embed_dim
self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
self.num_heads = num_heads
@@ -112,8 +125,9 @@ class VisualAttention(nn.Module):
self.hidden_size_per_partition = embed_dim
# Strided linear layer.
assert self._qkv_same_embed_dim, \
'Visual Attention implementation only supports self-attention'
assert self._qkv_same_embed_dim, (
"Visual Attention implementation only supports self-attention"
)
self.in_proj = ReplicatedLinear(embed_dim, 3 * embed_dim)
self.out_proj = ReplicatedLinear(embed_dim, embed_dim)
self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
@@ -128,50 +142,63 @@ class VisualAttention(nn.Module):
mixed_x_layer, _ = self.in_proj(x)
# [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
new_tensor_shape = mixed_x_layer.size()[:-1] + \
(self.num_attention_heads_per_partition,
3 * self.hidden_size_per_attention_head)
new_tensor_shape = mixed_x_layer.size()[:-1] + (
self.num_attention_heads_per_partition,
3 * self.hidden_size_per_attention_head,
)
mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
# [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
query_layer, key_layer, value_layer = mixed_x_layer.split(
self.hidden_size_per_attention_head, dim=-1)
self.hidden_size_per_attention_head, dim=-1
)
# [sq, b, np, hn] -> [sq, b * np, hn]
query_layer = query_layer.view(
sq, b * self.num_attention_heads_per_partition,
self.hidden_size_per_attention_head).transpose(0, 1)
sq,
b * self.num_attention_heads_per_partition,
self.hidden_size_per_attention_head,
).transpose(0, 1)
# [sk, b, np, hn] -> [sk, b * np, hn]
key_layer = key_layer.view(
sq, b * self.num_attention_heads_per_partition,
self.hidden_size_per_attention_head).transpose(0, 1)
sq,
b * self.num_attention_heads_per_partition,
self.hidden_size_per_attention_head,
).transpose(0, 1)
q_scaled = query_layer / self.norm_factor
if attn_mask is not None:
attention_probs = torch.baddbmm(attn_mask, q_scaled,
key_layer.transpose(-2, -1))
attention_probs = torch.baddbmm(
attn_mask, q_scaled, key_layer.transpose(-2, -1)
)
else:
attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1))
attention_probs = attention_probs.softmax(dim=-1)
value_layer = value_layer.view(
sq, b * self.num_attention_heads_per_partition,
self.hidden_size_per_attention_head).transpose(0, 1)
sq,
b * self.num_attention_heads_per_partition,
self.hidden_size_per_attention_head,
).transpose(0, 1)
# matmul: [b * np, sq, hn]
context_layer = torch.bmm(attention_probs, value_layer)
# change view [b, np, sq, hn]
context_layer = context_layer.view(
b, self.num_attention_heads_per_partition, sq,
self.hidden_size_per_attention_head)
b,
self.num_attention_heads_per_partition,
sq,
self.hidden_size_per_attention_head,
)
# [b, np, sq, hn] --> [sq, b, np, hn]
context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
# [sq, b, np, hn] --> [sq, b, hp]
new_context_layer_shape = context_layer.size()[:-2] + \
(self.hidden_size_per_partition,)
new_context_layer_shape = context_layer.size()[:-2] + (
self.hidden_size_per_partition,
)
context_layer = context_layer.view(*new_context_layer_shape)
output, _ = self.out_proj(context_layer)
@@ -189,10 +216,9 @@ class QwenVLMLP(nn.Module):
quant_config: Optional[QuantizationConfig] = None,
):
super().__init__()
self.c_fc = ColumnParallelLinear(hidden_size,
intermediate_size,
bias=True,
quant_config=quant_config)
self.c_fc = ColumnParallelLinear(
hidden_size, intermediate_size, bias=True, quant_config=quant_config
)
self.act_fn = get_act_fn("gelu")
self.c_proj = RowParallelLinear(
intermediate_size,
@@ -209,7 +235,6 @@ class QwenVLMLP(nn.Module):
class VisualAttentionBlock(nn.Module):
def __init__(
self,
d_model: int,
@@ -249,7 +274,6 @@ class VisualAttentionBlock(nn.Module):
class TransformerBlock(nn.Module):
def __init__(
self,
width: int,
@@ -263,14 +287,18 @@ class TransformerBlock(nn.Module):
self.width = width
self.layers = layers
self.resblocks = nn.ModuleList([
VisualAttentionBlock(width,
heads,
mlp_ratio,
norm_layer=norm_layer,
quant_config=quant_config)
for _ in range(layers)
])
self.resblocks = nn.ModuleList(
[
VisualAttentionBlock(
width,
heads,
mlp_ratio,
norm_layer=norm_layer,
quant_config=quant_config,
)
for _ in range(layers)
]
)
def get_cast_dtype(self) -> torch.dtype:
return self.resblocks[0].mlp.c_fc.weight.dtype
@@ -278,54 +306,57 @@ class TransformerBlock(nn.Module):
def get_cast_device(self) -> torch.device:
return self.resblocks[0].mlp.c_fc.weight.device
def forward(self,
x: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
def forward(
self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
for r in self.resblocks:
x = r(x, attn_mask=attn_mask)
return x
class VisionTransformer(nn.Module):
def __init__(self,
image_size: int,
patch_size: int,
width: int,
layers: int,
heads: int,
mlp_ratio: float,
n_queries: int = 256,
output_dim: int = 512,
image_start_id: int = 151857,
quant_config: Optional[QuantizationConfig] = None,
**kwargs):
def __init__(
self,
image_size: int,
patch_size: int,
width: int,
layers: int,
heads: int,
mlp_ratio: float,
n_queries: int = 256,
output_dim: int = 512,
image_start_id: int = 151857,
quant_config: Optional[QuantizationConfig] = None,
**kwargs,
):
super().__init__()
image_height, image_width = self.image_size = (image_size, image_size)
patch_height, patch_width = self.patch_size = (patch_size, patch_size)
self.grid_size = (image_height // patch_height,
image_width // patch_width)
self.grid_size = (image_height // patch_height, image_width // patch_width)
self.output_dim = output_dim
self.conv1 = nn.Conv2d(in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False)
self.conv1 = nn.Conv2d(
in_channels=3,
out_channels=width,
kernel_size=patch_size,
stride=patch_size,
bias=False,
)
# class embeddings and positional embeddings
scale = width**-0.5
self.positional_embedding = nn.Parameter(scale *
torch.randn(256, width))
self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
norm_layer = partial(nn.LayerNorm, eps=1e-6)
self.ln_pre = norm_layer(width)
self.transformer = TransformerBlock(width,
layers,
heads,
mlp_ratio,
norm_layer=norm_layer,
quant_config=quant_config)
self.transformer = TransformerBlock(
width,
layers,
heads,
mlp_ratio,
norm_layer=norm_layer,
quant_config=quant_config,
)
self.attn_pool = Resampler2(
grid_size=int(math.sqrt(n_queries)),
@@ -342,7 +373,8 @@ class VisionTransformer(nn.Module):
self.ln_post = norm_layer(output_dim)
self.proj = nn.Parameter(
(output_dim**-0.5) * torch.randn(output_dim, output_dim))
(output_dim**-0.5) * torch.randn(output_dim, output_dim)
)
self.image_start_id = image_start_id
self.image_end_id = image_start_id + 1
@@ -356,12 +388,10 @@ class VisionTransformer(nn.Module):
# to patches
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1],
-1) # shape = [*, width, grid ** 2]
x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
x = x + get_abs_pos(self.positional_embedding, int(math.sqrt(
x.size(1))))
x = x + get_abs_pos(self.positional_embedding, int(math.sqrt(x.size(1))))
x = self.ln_pre(x)
@@ -377,20 +407,19 @@ class VisionTransformer(nn.Module):
class QwenVLModel(QWenModel):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__(vllm_config=vllm_config, prefix=prefix)
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
self.visual = VisionTransformer(**config.visual,
quant_config=quant_config)
self.visual = VisionTransformer(**config.visual, quant_config=quant_config)
@lru_cache(maxsize=1)
def _get_tokenizer_without_image_pad(
tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
tokenizer: PreTrainedTokenizer,
) -> PreTrainedTokenizer:
"""
The logic of adding image pad tokens should only be applied in
[`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
@@ -402,7 +431,6 @@ def _get_tokenizer_without_image_pad(
new_tokenizer = copy.deepcopy(tokenizer)
class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore
def tokenize(
self,
text: str,
@@ -413,7 +441,8 @@ def _get_tokenizer_without_image_pad(
text = unicodedata.normalize("NFC", text)
return [
self.decoder[t] for t in self.tokenizer.encode(
self.decoder[t]
for t in self.tokenizer.encode(
text,
allowed_special=allowed_special,
disallowed_special=disallowed_special,
@@ -435,8 +464,7 @@ def _get_tokenizer_without_image_pad(
errors=errors or self.errors,
)
TokenizerWithoutImagePad.__name__ = \
f"{tokenizer.__class__.__name__}WithoutImagePad"
TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"
new_tokenizer.__class__ = TokenizerWithoutImagePad
return new_tokenizer
@@ -467,17 +495,19 @@ class QwenVLProcessor:
vision_config = config.visual
image_size = vision_config["image_size"]
self.image_transform = transforms.Compose([
transforms.Resize(
(image_size, image_size),
interpolation=InterpolationMode.BICUBIC,
),
transforms.ToTensor(),
transforms.Normalize(
mean=(0.48145466, 0.4578275, 0.40821073),
std=(0.26862954, 0.26130258, 0.27577711),
),
])
self.image_transform = transforms.Compose(
[
transforms.Resize(
(image_size, image_size),
interpolation=InterpolationMode.BICUBIC,
),
transforms.ToTensor(),
transforms.Normalize(
mean=(0.48145466, 0.4578275, 0.40821073),
std=(0.26862954, 0.26130258, 0.27577711),
),
]
)
@property
def image_start_tag(self) -> str:
@@ -524,7 +554,6 @@ class QwenVLProcessor:
class QwenVLProcessingInfo(BaseProcessingInfo):
def get_tokenizer(self) -> PreTrainedTokenizer:
tokenizer = self.ctx.tokenizer
assert isinstance(tokenizer, PreTrainedTokenizer)
@@ -553,7 +582,6 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_images = mm_counts.get("image", 0)
@@ -561,8 +589,9 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
img_start = hf_processor.image_start_tag
img_end = hf_processor.image_end_tag
return "".join(f"Picture {i}: {img_start}{img_end}\n"
for i in range(1, num_images + 1))
return "".join(
f"Picture {i}: {img_start}{img_end}\n" for i in range(1, num_images + 1)
)
def get_dummy_mm_data(
self,
@@ -579,16 +608,16 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
image_overrides = mm_options.get("image") if mm_options else None
return {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides)
"image": self._get_dummy_images(
width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides,
)
}
class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
def _call_hf_processor(
self,
prompt: str,
@@ -644,8 +673,7 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
tokenizer = self.info.get_tokenizer()
special_tokens: dict[str,
int] = tokenizer.special_tokens # type: ignore
special_tokens: dict[str, int] = tokenizer.special_tokens # type: ignore
processor = self.info.get_hf_processor()
img_start_id = special_tokens[processor.image_start_tag]
@@ -667,11 +695,14 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
]
@MULTIMODAL_REGISTRY.register_processor(QwenVLMultiModalProcessor,
info=QwenVLProcessingInfo,
dummy_inputs=QwenVLDummyInputsBuilder)
class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
SupportsMultiModal):
@MULTIMODAL_REGISTRY.register_processor(
QwenVLMultiModalProcessor,
info=QwenVLProcessingInfo,
dummy_inputs=QwenVLDummyInputsBuilder,
)
class QwenVLForConditionalGeneration(
QWenBaseModel, SupportsPP, SupportsLoRA, SupportsMultiModal
):
packed_modules_mapping = {
"c_attn": ["c_attn"],
"gate_up_proj": [
@@ -687,7 +718,8 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
return MultiModelKeys.from_string_field(
language_model="transformer.h",
connector="transformer.visual.attn_pool",
tower_model="transformer.visual.transformer")
tower_model="transformer.visual.transformer",
)
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
@@ -712,14 +744,16 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
self.transformer: QwenVLModel
def _parse_and_validate_image_input(
self, **kwargs: object) -> Optional[QwenImageInputs]:
self, **kwargs: object
) -> Optional[QwenImageInputs]:
pixel_values = kwargs.pop("pixel_values", None)
image_embeds = kwargs.pop("image_embeds", None)
if pixel_values is not None:
if not isinstance(pixel_values, (torch.Tensor, list)):
raise ValueError("Incorrect type of pixel values. "
f"Got type: {type(pixel_values)}")
raise ValueError(
f"Incorrect type of pixel values. Got type: {type(pixel_values)}"
)
expected_h = expected_w = self.config.visual["image_size"]
resolve_bindings = {"h": expected_h, "w": expected_w}
@@ -732,8 +766,10 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
if image_embeds is not None:
if not isinstance(image_embeds, (torch.Tensor, list)):
raise ValueError("Incorrect type of image embeddings. "
f"Got type: {type(image_embeds)}")
raise ValueError(
"Incorrect type of image embeddings. "
f"Got type: {type(image_embeds)}"
)
return QwenImageEmbeddingInputs(
type="image_embeds",
@@ -742,8 +778,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
return None
def _process_image_input(self,
image_input: QwenImageInputs) -> torch.Tensor:
def _process_image_input(self, image_input: QwenImageInputs) -> torch.Tensor:
if image_input["type"] == "image_embeds":
return image_input["data"]
@@ -752,8 +787,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
def get_language_model(self) -> torch.nn.Module:
return self.transformer
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return []
@@ -772,6 +806,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
if intermediate_tensors is not None:
inputs_embeds = None
hidden_states = self.transformer(input_ids, positions,
intermediate_tensors, inputs_embeds)
hidden_states = self.transformer(
input_ids, positions, intermediate_tensors, inputs_embeds
)
return hidden_states