[Core][VLM] Stack multimodal tensors to represent multiple images within each prompt (#7902)
This commit is contained in:
@@ -333,6 +333,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal):
|
||||
raise ValueError("Incorrect type of audio features. "
|
||||
f"Got type: {type(audio_features)}")
|
||||
|
||||
# Remove the N dimension until multiple audios are supported.
|
||||
if isinstance(audio_features, torch.Tensor):
|
||||
audio_features = audio_features.squeeze(1)
|
||||
else:
|
||||
audio_features = [t.squeeze(0) for t in audio_features]
|
||||
|
||||
return UltravoxAudioFeatureInputs(type="audio_features",
|
||||
data=audio_features)
|
||||
|
||||
@@ -341,6 +347,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal):
|
||||
raise ValueError("Incorrect type of audio embeds. "
|
||||
f"Got type: {type(audio_embeds)}")
|
||||
|
||||
# Remove the N dimension until multiple audios are supported.
|
||||
audio_embeds = audio_embeds.squeeze(1)
|
||||
|
||||
return UltravoxAudioEmbeddingInputs(type="audio_embeds",
|
||||
data=audio_embeds)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user