Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -18,33 +18,45 @@ import torch
|
||||
from torch import nn
|
||||
from torchvision import transforms
|
||||
from torchvision.transforms import InterpolationMode
|
||||
from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer,
|
||||
TensorType)
|
||||
from transformers import BatchFeature, PretrainedConfig, PreTrainedTokenizer, TensorType
|
||||
from transformers.image_utils import ImageInput
|
||||
from transformers.tokenization_utils_base import TextInput
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
ReplicatedLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.linear import (
|
||||
ColumnParallelLinear,
|
||||
ReplicatedLinear,
|
||||
RowParallelLinear,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos
|
||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
||||
MultiModalKwargsItems)
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFieldConfig,
|
||||
MultiModalKwargsItems,
|
||||
)
|
||||
from vllm.multimodal.parse import MultiModalDataItems
|
||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
BaseProcessingInfo, PromptReplacement,
|
||||
PromptUpdate, PromptUpdateDetails)
|
||||
from vllm.multimodal.processing import (
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
)
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||
SupportsMultiModal, SupportsPP)
|
||||
from .interfaces import (
|
||||
MultiModalEmbeddings,
|
||||
SupportsLoRA,
|
||||
SupportsMultiModal,
|
||||
SupportsPP,
|
||||
)
|
||||
from .qwen import QWenBaseModel, QWenModel
|
||||
from .utils import flatten_bn
|
||||
|
||||
@@ -56,11 +68,12 @@ class QwenImagePixelInputs(TensorSchema):
|
||||
- c: Number of channels (3)
|
||||
- h: Height
|
||||
- w: Width
|
||||
|
||||
|
||||
Note that image_size is the value in the vision config to which we resize
|
||||
the image to in the normalization transform. Currently multi-image support
|
||||
can only be leveraged by passing image embeddings directly.
|
||||
"""
|
||||
|
||||
type: Literal["pixel_values"] = "pixel_values"
|
||||
data: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
|
||||
|
||||
@@ -71,10 +84,11 @@ class QwenImageEmbeddingInputs(TensorSchema):
|
||||
- bn: Batch size * number of images
|
||||
- ifs: Image feature size (256)
|
||||
- hs: Hidden size
|
||||
|
||||
|
||||
`hidden_size` must match the hidden size of the language model backbone
|
||||
and is stored in the visual config of the model if we have one.
|
||||
"""
|
||||
|
||||
type: Literal["image_embeds"] = "image_embeds"
|
||||
data: Annotated[torch.Tensor, TensorShape("bn", 256, "hs")]
|
||||
|
||||
@@ -100,8 +114,7 @@ class VisualAttention(nn.Module):
|
||||
self.embed_dim = embed_dim
|
||||
self.kdim = kdim if kdim is not None else embed_dim
|
||||
self.vdim = vdim if vdim is not None else embed_dim
|
||||
self._qkv_same_embed_dim = self.kdim == embed_dim \
|
||||
and self.vdim == embed_dim
|
||||
self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
|
||||
|
||||
self.num_heads = num_heads
|
||||
|
||||
@@ -112,8 +125,9 @@ class VisualAttention(nn.Module):
|
||||
self.hidden_size_per_partition = embed_dim
|
||||
|
||||
# Strided linear layer.
|
||||
assert self._qkv_same_embed_dim, \
|
||||
'Visual Attention implementation only supports self-attention'
|
||||
assert self._qkv_same_embed_dim, (
|
||||
"Visual Attention implementation only supports self-attention"
|
||||
)
|
||||
self.in_proj = ReplicatedLinear(embed_dim, 3 * embed_dim)
|
||||
self.out_proj = ReplicatedLinear(embed_dim, embed_dim)
|
||||
self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
|
||||
@@ -128,50 +142,63 @@ class VisualAttention(nn.Module):
|
||||
mixed_x_layer, _ = self.in_proj(x)
|
||||
|
||||
# [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
|
||||
new_tensor_shape = mixed_x_layer.size()[:-1] + \
|
||||
(self.num_attention_heads_per_partition,
|
||||
3 * self.hidden_size_per_attention_head)
|
||||
new_tensor_shape = mixed_x_layer.size()[:-1] + (
|
||||
self.num_attention_heads_per_partition,
|
||||
3 * self.hidden_size_per_attention_head,
|
||||
)
|
||||
mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
|
||||
|
||||
# [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
|
||||
query_layer, key_layer, value_layer = mixed_x_layer.split(
|
||||
self.hidden_size_per_attention_head, dim=-1)
|
||||
self.hidden_size_per_attention_head, dim=-1
|
||||
)
|
||||
|
||||
# [sq, b, np, hn] -> [sq, b * np, hn]
|
||||
query_layer = query_layer.view(
|
||||
sq, b * self.num_attention_heads_per_partition,
|
||||
self.hidden_size_per_attention_head).transpose(0, 1)
|
||||
sq,
|
||||
b * self.num_attention_heads_per_partition,
|
||||
self.hidden_size_per_attention_head,
|
||||
).transpose(0, 1)
|
||||
# [sk, b, np, hn] -> [sk, b * np, hn]
|
||||
key_layer = key_layer.view(
|
||||
sq, b * self.num_attention_heads_per_partition,
|
||||
self.hidden_size_per_attention_head).transpose(0, 1)
|
||||
sq,
|
||||
b * self.num_attention_heads_per_partition,
|
||||
self.hidden_size_per_attention_head,
|
||||
).transpose(0, 1)
|
||||
|
||||
q_scaled = query_layer / self.norm_factor
|
||||
if attn_mask is not None:
|
||||
attention_probs = torch.baddbmm(attn_mask, q_scaled,
|
||||
key_layer.transpose(-2, -1))
|
||||
attention_probs = torch.baddbmm(
|
||||
attn_mask, q_scaled, key_layer.transpose(-2, -1)
|
||||
)
|
||||
else:
|
||||
attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1))
|
||||
attention_probs = attention_probs.softmax(dim=-1)
|
||||
|
||||
value_layer = value_layer.view(
|
||||
sq, b * self.num_attention_heads_per_partition,
|
||||
self.hidden_size_per_attention_head).transpose(0, 1)
|
||||
sq,
|
||||
b * self.num_attention_heads_per_partition,
|
||||
self.hidden_size_per_attention_head,
|
||||
).transpose(0, 1)
|
||||
|
||||
# matmul: [b * np, sq, hn]
|
||||
context_layer = torch.bmm(attention_probs, value_layer)
|
||||
|
||||
# change view [b, np, sq, hn]
|
||||
context_layer = context_layer.view(
|
||||
b, self.num_attention_heads_per_partition, sq,
|
||||
self.hidden_size_per_attention_head)
|
||||
b,
|
||||
self.num_attention_heads_per_partition,
|
||||
sq,
|
||||
self.hidden_size_per_attention_head,
|
||||
)
|
||||
|
||||
# [b, np, sq, hn] --> [sq, b, np, hn]
|
||||
context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
|
||||
|
||||
# [sq, b, np, hn] --> [sq, b, hp]
|
||||
new_context_layer_shape = context_layer.size()[:-2] + \
|
||||
(self.hidden_size_per_partition,)
|
||||
new_context_layer_shape = context_layer.size()[:-2] + (
|
||||
self.hidden_size_per_partition,
|
||||
)
|
||||
context_layer = context_layer.view(*new_context_layer_shape)
|
||||
|
||||
output, _ = self.out_proj(context_layer)
|
||||
@@ -189,10 +216,9 @@ class QwenVLMLP(nn.Module):
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.c_fc = ColumnParallelLinear(hidden_size,
|
||||
intermediate_size,
|
||||
bias=True,
|
||||
quant_config=quant_config)
|
||||
self.c_fc = ColumnParallelLinear(
|
||||
hidden_size, intermediate_size, bias=True, quant_config=quant_config
|
||||
)
|
||||
self.act_fn = get_act_fn("gelu")
|
||||
self.c_proj = RowParallelLinear(
|
||||
intermediate_size,
|
||||
@@ -209,7 +235,6 @@ class QwenVLMLP(nn.Module):
|
||||
|
||||
|
||||
class VisualAttentionBlock(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
d_model: int,
|
||||
@@ -249,7 +274,6 @@ class VisualAttentionBlock(nn.Module):
|
||||
|
||||
|
||||
class TransformerBlock(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
width: int,
|
||||
@@ -263,14 +287,18 @@ class TransformerBlock(nn.Module):
|
||||
self.width = width
|
||||
self.layers = layers
|
||||
|
||||
self.resblocks = nn.ModuleList([
|
||||
VisualAttentionBlock(width,
|
||||
heads,
|
||||
mlp_ratio,
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config)
|
||||
for _ in range(layers)
|
||||
])
|
||||
self.resblocks = nn.ModuleList(
|
||||
[
|
||||
VisualAttentionBlock(
|
||||
width,
|
||||
heads,
|
||||
mlp_ratio,
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
for _ in range(layers)
|
||||
]
|
||||
)
|
||||
|
||||
def get_cast_dtype(self) -> torch.dtype:
|
||||
return self.resblocks[0].mlp.c_fc.weight.dtype
|
||||
@@ -278,54 +306,57 @@ class TransformerBlock(nn.Module):
|
||||
def get_cast_device(self) -> torch.device:
|
||||
return self.resblocks[0].mlp.c_fc.weight.device
|
||||
|
||||
def forward(self,
|
||||
x: torch.Tensor,
|
||||
attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||
def forward(
|
||||
self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None
|
||||
) -> torch.Tensor:
|
||||
for r in self.resblocks:
|
||||
x = r(x, attn_mask=attn_mask)
|
||||
return x
|
||||
|
||||
|
||||
class VisionTransformer(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
image_size: int,
|
||||
patch_size: int,
|
||||
width: int,
|
||||
layers: int,
|
||||
heads: int,
|
||||
mlp_ratio: float,
|
||||
n_queries: int = 256,
|
||||
output_dim: int = 512,
|
||||
image_start_id: int = 151857,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
**kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
image_size: int,
|
||||
patch_size: int,
|
||||
width: int,
|
||||
layers: int,
|
||||
heads: int,
|
||||
mlp_ratio: float,
|
||||
n_queries: int = 256,
|
||||
output_dim: int = 512,
|
||||
image_start_id: int = 151857,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
image_height, image_width = self.image_size = (image_size, image_size)
|
||||
patch_height, patch_width = self.patch_size = (patch_size, patch_size)
|
||||
self.grid_size = (image_height // patch_height,
|
||||
image_width // patch_width)
|
||||
self.grid_size = (image_height // patch_height, image_width // patch_width)
|
||||
self.output_dim = output_dim
|
||||
self.conv1 = nn.Conv2d(in_channels=3,
|
||||
out_channels=width,
|
||||
kernel_size=patch_size,
|
||||
stride=patch_size,
|
||||
bias=False)
|
||||
self.conv1 = nn.Conv2d(
|
||||
in_channels=3,
|
||||
out_channels=width,
|
||||
kernel_size=patch_size,
|
||||
stride=patch_size,
|
||||
bias=False,
|
||||
)
|
||||
|
||||
# class embeddings and positional embeddings
|
||||
scale = width**-0.5
|
||||
self.positional_embedding = nn.Parameter(scale *
|
||||
torch.randn(256, width))
|
||||
self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
|
||||
|
||||
norm_layer = partial(nn.LayerNorm, eps=1e-6)
|
||||
|
||||
self.ln_pre = norm_layer(width)
|
||||
self.transformer = TransformerBlock(width,
|
||||
layers,
|
||||
heads,
|
||||
mlp_ratio,
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config)
|
||||
self.transformer = TransformerBlock(
|
||||
width,
|
||||
layers,
|
||||
heads,
|
||||
mlp_ratio,
|
||||
norm_layer=norm_layer,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
self.attn_pool = Resampler2(
|
||||
grid_size=int(math.sqrt(n_queries)),
|
||||
@@ -342,7 +373,8 @@ class VisionTransformer(nn.Module):
|
||||
|
||||
self.ln_post = norm_layer(output_dim)
|
||||
self.proj = nn.Parameter(
|
||||
(output_dim**-0.5) * torch.randn(output_dim, output_dim))
|
||||
(output_dim**-0.5) * torch.randn(output_dim, output_dim)
|
||||
)
|
||||
|
||||
self.image_start_id = image_start_id
|
||||
self.image_end_id = image_start_id + 1
|
||||
@@ -356,12 +388,10 @@ class VisionTransformer(nn.Module):
|
||||
|
||||
# to patches
|
||||
x = self.conv1(x) # shape = [*, width, grid, grid]
|
||||
x = x.reshape(x.shape[0], x.shape[1],
|
||||
-1) # shape = [*, width, grid ** 2]
|
||||
x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
|
||||
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
|
||||
|
||||
x = x + get_abs_pos(self.positional_embedding, int(math.sqrt(
|
||||
x.size(1))))
|
||||
x = x + get_abs_pos(self.positional_embedding, int(math.sqrt(x.size(1))))
|
||||
|
||||
x = self.ln_pre(x)
|
||||
|
||||
@@ -377,20 +407,19 @@ class VisionTransformer(nn.Module):
|
||||
|
||||
|
||||
class QwenVLModel(QWenModel):
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
|
||||
config = vllm_config.model_config.hf_config
|
||||
quant_config = vllm_config.quant_config
|
||||
|
||||
self.visual = VisionTransformer(**config.visual,
|
||||
quant_config=quant_config)
|
||||
self.visual = VisionTransformer(**config.visual, quant_config=quant_config)
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def _get_tokenizer_without_image_pad(
|
||||
tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
) -> PreTrainedTokenizer:
|
||||
"""
|
||||
The logic of adding image pad tokens should only be applied in
|
||||
[`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
|
||||
@@ -402,7 +431,6 @@ def _get_tokenizer_without_image_pad(
|
||||
new_tokenizer = copy.deepcopy(tokenizer)
|
||||
|
||||
class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore
|
||||
|
||||
def tokenize(
|
||||
self,
|
||||
text: str,
|
||||
@@ -413,7 +441,8 @@ def _get_tokenizer_without_image_pad(
|
||||
text = unicodedata.normalize("NFC", text)
|
||||
|
||||
return [
|
||||
self.decoder[t] for t in self.tokenizer.encode(
|
||||
self.decoder[t]
|
||||
for t in self.tokenizer.encode(
|
||||
text,
|
||||
allowed_special=allowed_special,
|
||||
disallowed_special=disallowed_special,
|
||||
@@ -435,8 +464,7 @@ def _get_tokenizer_without_image_pad(
|
||||
errors=errors or self.errors,
|
||||
)
|
||||
|
||||
TokenizerWithoutImagePad.__name__ = \
|
||||
f"{tokenizer.__class__.__name__}WithoutImagePad"
|
||||
TokenizerWithoutImagePad.__name__ = f"{tokenizer.__class__.__name__}WithoutImagePad"
|
||||
|
||||
new_tokenizer.__class__ = TokenizerWithoutImagePad
|
||||
return new_tokenizer
|
||||
@@ -467,17 +495,19 @@ class QwenVLProcessor:
|
||||
vision_config = config.visual
|
||||
image_size = vision_config["image_size"]
|
||||
|
||||
self.image_transform = transforms.Compose([
|
||||
transforms.Resize(
|
||||
(image_size, image_size),
|
||||
interpolation=InterpolationMode.BICUBIC,
|
||||
),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize(
|
||||
mean=(0.48145466, 0.4578275, 0.40821073),
|
||||
std=(0.26862954, 0.26130258, 0.27577711),
|
||||
),
|
||||
])
|
||||
self.image_transform = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(
|
||||
(image_size, image_size),
|
||||
interpolation=InterpolationMode.BICUBIC,
|
||||
),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize(
|
||||
mean=(0.48145466, 0.4578275, 0.40821073),
|
||||
std=(0.26862954, 0.26130258, 0.27577711),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
@property
|
||||
def image_start_tag(self) -> str:
|
||||
@@ -524,7 +554,6 @@ class QwenVLProcessor:
|
||||
|
||||
|
||||
class QwenVLProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_tokenizer(self) -> PreTrainedTokenizer:
|
||||
tokenizer = self.ctx.tokenizer
|
||||
assert isinstance(tokenizer, PreTrainedTokenizer)
|
||||
@@ -553,7 +582,6 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
|
||||
class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
|
||||
|
||||
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
@@ -561,8 +589,9 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
|
||||
img_start = hf_processor.image_start_tag
|
||||
img_end = hf_processor.image_end_tag
|
||||
|
||||
return "".join(f"Picture {i}: {img_start}{img_end}\n"
|
||||
for i in range(1, num_images + 1))
|
||||
return "".join(
|
||||
f"Picture {i}: {img_start}{img_end}\n" for i in range(1, num_images + 1)
|
||||
)
|
||||
|
||||
def get_dummy_mm_data(
|
||||
self,
|
||||
@@ -579,16 +608,16 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
|
||||
image_overrides = mm_options.get("image") if mm_options else None
|
||||
|
||||
return {
|
||||
"image":
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images,
|
||||
overrides=image_overrides)
|
||||
"image": self._get_dummy_images(
|
||||
width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images,
|
||||
overrides=image_overrides,
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
|
||||
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
prompt: str,
|
||||
@@ -644,8 +673,7 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
|
||||
out_mm_kwargs: MultiModalKwargsItems,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
special_tokens: dict[str,
|
||||
int] = tokenizer.special_tokens # type: ignore
|
||||
special_tokens: dict[str, int] = tokenizer.special_tokens # type: ignore
|
||||
|
||||
processor = self.info.get_hf_processor()
|
||||
img_start_id = special_tokens[processor.image_start_tag]
|
||||
@@ -667,11 +695,14 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
|
||||
]
|
||||
|
||||
|
||||
@MULTIMODAL_REGISTRY.register_processor(QwenVLMultiModalProcessor,
|
||||
info=QwenVLProcessingInfo,
|
||||
dummy_inputs=QwenVLDummyInputsBuilder)
|
||||
class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
|
||||
SupportsMultiModal):
|
||||
@MULTIMODAL_REGISTRY.register_processor(
|
||||
QwenVLMultiModalProcessor,
|
||||
info=QwenVLProcessingInfo,
|
||||
dummy_inputs=QwenVLDummyInputsBuilder,
|
||||
)
|
||||
class QwenVLForConditionalGeneration(
|
||||
QWenBaseModel, SupportsPP, SupportsLoRA, SupportsMultiModal
|
||||
):
|
||||
packed_modules_mapping = {
|
||||
"c_attn": ["c_attn"],
|
||||
"gate_up_proj": [
|
||||
@@ -687,7 +718,8 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
|
||||
return MultiModelKeys.from_string_field(
|
||||
language_model="transformer.h",
|
||||
connector="transformer.visual.attn_pool",
|
||||
tower_model="transformer.visual.transformer")
|
||||
tower_model="transformer.visual.transformer",
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
|
||||
@@ -712,14 +744,16 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
|
||||
self.transformer: QwenVLModel
|
||||
|
||||
def _parse_and_validate_image_input(
|
||||
self, **kwargs: object) -> Optional[QwenImageInputs]:
|
||||
self, **kwargs: object
|
||||
) -> Optional[QwenImageInputs]:
|
||||
pixel_values = kwargs.pop("pixel_values", None)
|
||||
image_embeds = kwargs.pop("image_embeds", None)
|
||||
|
||||
if pixel_values is not None:
|
||||
if not isinstance(pixel_values, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of pixel values. "
|
||||
f"Got type: {type(pixel_values)}")
|
||||
raise ValueError(
|
||||
f"Incorrect type of pixel values. Got type: {type(pixel_values)}"
|
||||
)
|
||||
|
||||
expected_h = expected_w = self.config.visual["image_size"]
|
||||
resolve_bindings = {"h": expected_h, "w": expected_w}
|
||||
@@ -732,8 +766,10 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
|
||||
|
||||
if image_embeds is not None:
|
||||
if not isinstance(image_embeds, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of image embeddings. "
|
||||
f"Got type: {type(image_embeds)}")
|
||||
raise ValueError(
|
||||
"Incorrect type of image embeddings. "
|
||||
f"Got type: {type(image_embeds)}"
|
||||
)
|
||||
|
||||
return QwenImageEmbeddingInputs(
|
||||
type="image_embeds",
|
||||
@@ -742,8 +778,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
|
||||
|
||||
return None
|
||||
|
||||
def _process_image_input(self,
|
||||
image_input: QwenImageInputs) -> torch.Tensor:
|
||||
def _process_image_input(self, image_input: QwenImageInputs) -> torch.Tensor:
|
||||
if image_input["type"] == "image_embeds":
|
||||
return image_input["data"]
|
||||
|
||||
@@ -752,8 +787,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
|
||||
def get_language_model(self) -> torch.nn.Module:
|
||||
return self.transformer
|
||||
|
||||
def get_multimodal_embeddings(self,
|
||||
**kwargs: object) -> MultiModalEmbeddings:
|
||||
def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings:
|
||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||
if image_input is None:
|
||||
return []
|
||||
@@ -772,6 +806,7 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
|
||||
if intermediate_tensors is not None:
|
||||
inputs_embeds = None
|
||||
|
||||
hidden_states = self.transformer(input_ids, positions,
|
||||
intermediate_tensors, inputs_embeds)
|
||||
hidden_states = self.transformer(
|
||||
input_ids, positions, intermediate_tensors, inputs_embeds
|
||||
)
|
||||
return hidden_states
|
||||
|
||||
Reference in New Issue
Block a user