[Core][VLM] Stack multimodal tensors to represent multiple images within each prompt (#7902)

2024-08-27 18:53:56 -07:00
parent 9c71c97ae2
commit fab5f53e2d
15 changed files with 214 additions and 60 deletions
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -333,6 +333,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal):
                raise ValueError("Incorrect type of audio features. "
                                 f"Got type: {type(audio_features)}")

+            # Remove the N dimension until multiple audios are supported.
+            if isinstance(audio_features, torch.Tensor):
+                audio_features = audio_features.squeeze(1)
+            else:
+                audio_features = [t.squeeze(0) for t in audio_features]
+
            return UltravoxAudioFeatureInputs(type="audio_features",
                                              data=audio_features)

@@ -341,6 +347,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal):
                raise ValueError("Incorrect type of audio embeds. "
                                 f"Got type: {type(audio_embeds)}")

+            # Remove the N dimension until multiple audios are supported.
+            audio_embeds = audio_embeds.squeeze(1)
+
            return UltravoxAudioEmbeddingInputs(type="audio_embeds",
                                                data=audio_embeds)