[Bugfix] Standardize merging multimodal embeddings (#26771)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -1586,19 +1586,19 @@ class Qwen2_5_VLForConditionalGeneration(
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
+                image_embeddings = self._process_image_input(multimodal_input)
                 if self.is_multimodal_pruning_enabled:
-                    vision_embeddings = self._postprocess_image_embeds_evs(
-                        vision_embeddings, multimodal_input
+                    image_embeddings = self._postprocess_image_embeds_evs(
+                        image_embeddings, multimodal_input
                     )
-                multimodal_embeddings += vision_embeddings
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(multimodal_input)
                 if self.is_multimodal_pruning_enabled:
                     video_embeddings = self._postprocess_video_embeds_evs(
                         video_embeddings, multimodal_input
                     )
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
         return multimodal_embeddings
 
     def forward(
Reference in New Issue
Block a user