[Bugfix] Standardize merging multimodal embeddings (#26771)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-10-14 17:36:21 +08:00
committed by GitHub
parent 577d498212
commit d2f816d6ff
19 changed files with 57 additions and 57 deletions

View File

@@ -1601,11 +1601,11 @@ class Qwen3VLForConditionalGeneration(
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                vision_embeddings = self._process_image_input(multimodal_input)
-                multimodal_embeddings += vision_embeddings
+                image_embeddings = self._process_image_input(multimodal_input)
+                multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(multimodal_input)
-                multimodal_embeddings += video_embeddings
+                multimodal_embeddings += tuple(video_embeddings)
         return multimodal_embeddings
     def _compute_deepstack_embeds(