[Bugfix] Check dimensions of multimodal embeddings in V1 (#15816)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -875,7 +875,8 @@ class Florence2MultiModalProcessor(
|
||||
Florence2MultiModalProcessor,
|
||||
info=Florence2ProcessingInfo,
|
||||
dummy_inputs=Florence2DummyInputsBuilder)
|
||||
class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal):
|
||||
class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
SupportsV0Only):
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
|
||||
@@ -39,7 +39,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
PromptUpdate, PromptUpdateDetails)
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils import flatten_2d_lists
|
||||
|
||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||
from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix,
|
||||
@@ -66,10 +65,13 @@ class FuyuImagePatchInputs(TypedDict):
|
||||
This is used to split the embeddings, which have the first two dimensions
|
||||
flattened just like `flat_data`.
|
||||
"""
|
||||
|
||||
embed_is_patch: Union[torch.Tensor, list[torch.Tensor]]
|
||||
"""
|
||||
A boolean mask indicating which image embeddings correspond
|
||||
to patch tokens.
|
||||
|
||||
Shape: `(batch_size * num_images, num_embeds)`
|
||||
"""
|
||||
|
||||
|
||||
@@ -322,16 +324,18 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
def _parse_and_validate_image_input(
|
||||
self, **kwargs: object) -> Optional[FuyuImagePatchInputs]:
|
||||
image_patches = kwargs.pop("image_patches", None)
|
||||
embed_is_patch = kwargs.pop("embed_is_patch", None)
|
||||
if image_patches is not None:
|
||||
if not isinstance(image_patches, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of image patches. "
|
||||
f"Got type: {type(image_patches)}")
|
||||
|
||||
embed_is_patch = kwargs.pop("embed_is_patch")
|
||||
if not isinstance(embed_is_patch, (torch.Tensor, list)):
|
||||
raise ValueError("Incorrect type of embed_is_patch. "
|
||||
f"Got type: {type(embed_is_patch)}")
|
||||
|
||||
image_patches_flat = flatten_bn(image_patches)
|
||||
embed_is_patch = flatten_bn(embed_is_patch)
|
||||
|
||||
return FuyuImagePatchInputs(
|
||||
type="image_patches",
|
||||
@@ -351,6 +355,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
assert self.vision_embed_tokens is not None
|
||||
vision_embeddings_flat, _ = self.vision_embed_tokens(
|
||||
image_patches_flat)
|
||||
|
||||
return vision_embeddings_flat.split(patches_per_image, dim=0)
|
||||
|
||||
def get_multimodal_embeddings(
|
||||
@@ -358,13 +363,13 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||
if image_input is None:
|
||||
return None
|
||||
vision_embeddings = self._process_image_input(image_input)
|
||||
#return vision_embeddings
|
||||
return flatten_2d_lists(
|
||||
scatter_patch_features(*args) for args in zip(
|
||||
vision_embeddings,
|
||||
image_input["embed_is_patch"],
|
||||
))
|
||||
|
||||
image_features = self._process_image_input(image_input)
|
||||
|
||||
return scatter_patch_features(
|
||||
image_features,
|
||||
image_input["embed_is_patch"],
|
||||
)
|
||||
|
||||
def get_input_embeddings(
|
||||
self,
|
||||
|
||||
@@ -613,7 +613,7 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
|
||||
def _process_image_input(
|
||||
self,
|
||||
image_input: Gemma3ImageInputs,
|
||||
) -> tuple[torch.Tensor, ...]:
|
||||
) -> list[torch.Tensor]:
|
||||
assert self.vision_tower is not None
|
||||
|
||||
pixel_values = image_input["pixel_values"]
|
||||
@@ -625,7 +625,9 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
|
||||
)
|
||||
image_embeds = self.multi_modal_projector(image_features)
|
||||
|
||||
return image_embeds.split(num_patches.tolist())
|
||||
return [
|
||||
e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist())
|
||||
]
|
||||
|
||||
def get_multimodal_embeddings(
|
||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||
|
||||
@@ -733,7 +733,10 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
pixel_attention_mask=pixel_attention_mask,
|
||||
)
|
||||
|
||||
def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor:
|
||||
def _process_image_input(
|
||||
self,
|
||||
image_input: ImageInputs,
|
||||
) -> Union[torch.Tensor, list[torch.Tensor]]:
|
||||
if image_input["type"] == "image_embeds":
|
||||
return image_input["data"]
|
||||
|
||||
@@ -741,7 +744,9 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
image_features = self.model.connector(image_features)
|
||||
|
||||
num_patches = image_input["num_patches"]
|
||||
return image_features.split(num_patches.tolist())
|
||||
return [
|
||||
e.flatten(0, 1) for e in image_features.split(num_patches.tolist())
|
||||
]
|
||||
|
||||
def get_multimodal_embeddings(
|
||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||
|
||||
@@ -406,20 +406,21 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
h, w)
|
||||
stacked_embeddings = self._video_pixels_to_features(
|
||||
self.vision_tower, stacked_pixels)
|
||||
return stacked_embeddings.view(b, num_frames,
|
||||
*stacked_embeddings.shape[1:])
|
||||
embeds = stacked_embeddings.view(b, num_frames,
|
||||
*stacked_embeddings.shape[1:])
|
||||
|
||||
elif is_list_of(video_pixels, torch.Tensor):
|
||||
frames_per_videos = [v.shape[0] for v in video_pixels]
|
||||
stacked_pixels = torch.cat(video_pixels, dim=0)
|
||||
stacked_embeddings = self._video_pixels_to_features(
|
||||
self.vision_tower, stacked_pixels)
|
||||
return torch.split(stacked_embeddings, frames_per_videos, dim=0)
|
||||
|
||||
embeds = torch.split(stacked_embeddings, frames_per_videos, dim=0)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported type of video input {type(video_pixels)}")
|
||||
|
||||
return [e.flatten(0, 1) for e in embeds]
|
||||
|
||||
def get_multimodal_embeddings(
|
||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||
video_input = self._parse_and_validate_video_input(**kwargs)
|
||||
|
||||
@@ -919,8 +919,11 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
|
||||
image_features_flat = self.get_vision_hidden_states(image_input)
|
||||
|
||||
# Reconstruct the batch dimension
|
||||
return image_features_flat.split(image_input["num_slices"].tolist())
|
||||
num_slices = image_input["num_slices"]
|
||||
return [
|
||||
e.flatten(0, 1)
|
||||
for e in image_features_flat.split(num_slices.tolist())
|
||||
]
|
||||
|
||||
def _process_multimodal_inputs(self, modalities: dict):
|
||||
# The result multimodal_embeddings is a tuple of tensors, with each
|
||||
|
||||
@@ -204,7 +204,7 @@ def scatter_patch_features(
|
||||
(e_is_patch.shape[0], patches_one.shape[-1]),
|
||||
fill_value=torch.nan,
|
||||
)
|
||||
embed_one[e_is_patch] = patches_one.flatten(0, -2)
|
||||
embed_one[e_is_patch] = patches_one
|
||||
return embed_one
|
||||
|
||||
return tuple(
|
||||
|
||||
Reference in New Issue
Block a user