[V1] Support audio language models on V1 (#11733)
Signed-off-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
@@ -335,13 +335,16 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
selected_audio_feature = audio_outputs.last_hidden_state
|
||||
audio_features = self.multi_modal_projector(selected_audio_feature)
|
||||
num_audios, max_audio_tokens, embed_dim = audio_features.shape
|
||||
audio_output_lengths = audio_output_lengths.unsqueeze(1)
|
||||
audio_features_mask = torch.arange(max_audio_tokens).expand(
|
||||
num_audios, max_audio_tokens
|
||||
).to(audio_output_lengths.device) < audio_output_lengths.unsqueeze(1)
|
||||
num_audios, max_audio_tokens).to(
|
||||
audio_output_lengths.device) < audio_output_lengths
|
||||
masked_audio_features = audio_features[audio_features_mask].view(
|
||||
-1, embed_dim)
|
||||
|
||||
return masked_audio_features
|
||||
# Split into a tuple of embeddings, one per individual audio input.
|
||||
return torch.split(masked_audio_features,
|
||||
audio_output_lengths.flatten().tolist())
|
||||
|
||||
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
|
||||
audio_input = self._parse_and_validate_audio_input(**kwargs)
|
||||
|
||||
Reference in New Issue
Block a user