Fix AudioFlamingo3/MusicFlamingo HF parity and RoTE handling (#37643)

Signed-off-by: Lasha <26011196+lashahub@users.noreply.github.com>
Lasha Koroshinadze
2026-03-22 22:29:07 -04:00
committed by GitHub
parent 43877a620b
commit e7767eccae
12 changed files with 1157 additions and 243 deletions

View File

@@ -26,6 +26,54 @@ from tests.models.registry import HF_EXAMPLE_MODELS
from vllm import LLM, SamplingParams

MODEL_NAME = "nvidia/audio-flamingo-3-hf"

SINGLE_CONVERSATION = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What is surprising about the relationship between "
                "the barking and the music?",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": "https://huggingface.co/datasets/nvidia/AudioSkills/"
                    "resolve/main/assets/"
                    "dogs_barking_in_sync_with_the_music.wav",
                },
            },
        ],
    }
]

BATCHED_CONVERSATIONS = [
    SINGLE_CONVERSATION,
    [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Why is the philosopher's name mentioned in the "
                    "lyrics? (A) To express a sense of nostalgia "
                    "(B) To indicate that language cannot express clearly, "
                    "satirizing the inversion of black and white in the world "
                    "(C) To add depth and complexity to the lyrics "
                    "(D) To showcase the wisdom and influence of the "
                    "philosopher",
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": "https://huggingface.co/datasets/nvidia/"
                        "AudioSkills/resolve/main/assets/"
                        "Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
                    },
                },
            ],
        }
    ],
]


def get_fixture_path(filename):
@@ -34,21 +82,29 @@ def get_fixture_path(filename):
    )


def assert_output_matches(output, expected_text, expected_token_ids):
    generated = output.outputs[0]
    assert generated.text.strip() == expected_text
    actual_token_ids = list(generated.token_ids)
    assert (
        actual_token_ids == expected_token_ids
        or actual_token_ids == expected_token_ids[:-1]
        or actual_token_ids[:-1] == expected_token_ids
    )


@pytest.fixture(scope="module")
def llm():
    # Check if the model is supported by the current transformers version
    model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")
    try:
-        llm = LLM(
+        return LLM(
            model=MODEL_NAME,
            trust_remote_code=True,
            dtype="bfloat16",
            enforce_eager=True,
            limit_mm_per_prompt={"audio": 1},
        )
-        return llm
    except Exception as e:
        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
@@ -61,29 +117,17 @@ def test_single_generation(llm):
    with open(fixture_path) as f:
        expected = json.load(f)
-    audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "audio_url", "audio_url": {"url": audio_url}},
-                {"type": "text", "text": "Transcribe the input speech."},
-            ],
-        }
-    ]
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
    outputs = llm.chat(
-        messages=messages,
+        messages=SINGLE_CONVERSATION,
        sampling_params=sampling_params,
    )
-    generated_text = outputs[0].outputs[0].text.strip()
-    expected_text = expected["transcriptions"][0]
-    assert expected_text in generated_text or generated_text in expected_text
+    assert_output_matches(
+        outputs[0],
+        expected["transcriptions"][0],
+        expected["token_ids"][0],
+    )


def test_batched_generation(llm):
@@ -94,49 +138,34 @@ def test_batched_generation(llm):
    with open(fixture_path) as f:
        expected = json.load(f)
-    items = [
-        {
-            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
-            "question": "What is surprising about the relationship "
-            "between the barking and the music?",
-            "expected_idx": 0,
-        },
-        {
-            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
-            "question": (
-                "Why is the philosopher's name mentioned in the lyrics? "
-                "(A) To express a sense of nostalgia "
-                "(B) To indicate that language cannot express clearly, "
-                "satirizing the inversion of black and white in the world "
-                "(C) To add depth and complexity to the lyrics "
-                "(D) To showcase the wisdom and influence of the philosopher"
-            ),
-            "expected_idx": 1,
-        },
-    ]
-    conversations = []
-    for item in items:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
-                    {"type": "text", "text": item["question"]},
-                ],
-            }
-        ]
-        conversations.append(messages)
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
    outputs = llm.chat(
-        messages=conversations,
+        messages=BATCHED_CONVERSATIONS,
        sampling_params=sampling_params,
    )
    for i, output in enumerate(outputs):
-        generated_text = output.outputs[0].text.strip()
-        expected_text = expected["transcriptions"][i]
+        assert_output_matches(
+            output,
+            expected["transcriptions"][i],
+            expected["token_ids"][i],
+        )
-        assert expected_text in generated_text or generated_text in expected_text


def test_single_and_batched_generation_match(llm):
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
    single_output = llm.chat(
        messages=SINGLE_CONVERSATION,
        sampling_params=sampling_params,
    )[0]
    batched_output = llm.chat(
        messages=BATCHED_CONVERSATIONS,
        sampling_params=sampling_params,
    )[0]
    assert single_output.outputs[0].text == batched_output.outputs[0].text
    assert list(single_output.outputs[0].token_ids) == list(
        batched_output.outputs[0].token_ids
    )

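The off-by-one tolerance in assert_output_matches above appears to exist to
absorb an optional trailing token (typically EOS) that greedy decoding may or
may not emit; a standalone sketch of that comparison (the helper name here is
hypothetical, not part of the diff):

def ids_match_with_optional_eos(actual: list[int], expected: list[int]) -> bool:
    # Exact match, or one side carries a single extra trailing token.
    return (
        actual == expected
        or actual == expected[:-1]
        or actual[:-1] == expected
    )

assert ids_match_with_optional_eos([5, 6, 7], [5, 6, 7, 2])
assert not ids_match_with_optional_eos([5, 6], [5, 6, 7, 2])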
View File

@@ -0,0 +1,146 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import os

import pytest

from tests.models.registry import HF_EXAMPLE_MODELS
from vllm import LLM, SamplingParams

MODEL_NAME = "nvidia/music-flamingo-2601-hf"

SINGLE_CONVERSATION = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Describe this track in full detail - tell me the "
                "genre, tempo, and key, then dive into the instruments, "
                "production style, and overall mood it creates.",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": "https://huggingface.co/datasets/nvidia/AudioSkills/"
                    "resolve/main/assets/song_1.mp3",
                },
            },
        ],
    }
]

BATCHED_CONVERSATIONS = [
    SINGLE_CONVERSATION,
    [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Generate a structured lyric sheet from the input music.",
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": "https://huggingface.co/datasets/nvidia/"
                        "AudioSkills/resolve/main/assets/song_2.mp3",
                    },
                },
            ],
        }
    ],
]


def get_fixture_path(filename):
    return os.path.join(
        os.path.dirname(__file__), "../../fixtures/musicflamingo", filename
    )


def assert_output_matches(output, expected_text, expected_token_ids):
    generated = output.outputs[0]
    assert generated.text == expected_text
    actual_token_ids = list(generated.token_ids)
    assert (
        actual_token_ids == expected_token_ids
        or actual_token_ids == expected_token_ids[:-1]
        or actual_token_ids[:-1] == expected_token_ids
    )


@pytest.fixture(scope="module")
def llm():
    model_info = HF_EXAMPLE_MODELS.get_hf_info("MusicFlamingoForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")
    try:
        return LLM(
            model=MODEL_NAME,
            dtype="bfloat16",
            enforce_eager=True,
            max_model_len=8192,
            limit_mm_per_prompt={"audio": 1},
        )
    except Exception as e:
        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")


def test_single_generation(llm):
    fixture_path = get_fixture_path("expected_results_single.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")
    with open(fixture_path) as f:
        expected = json.load(f)
    outputs = llm.chat(
        messages=SINGLE_CONVERSATION,
        sampling_params=SamplingParams(temperature=0.0, max_tokens=50),
    )
    assert_output_matches(
        outputs[0],
        expected["transcriptions"][0],
        expected["token_ids"][0],
    )


def test_batched_generation(llm):
    fixture_path = get_fixture_path("expected_results_batched.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")
    with open(fixture_path) as f:
        expected = json.load(f)
    outputs = llm.chat(
        messages=BATCHED_CONVERSATIONS,
        sampling_params=SamplingParams(temperature=0.0, max_tokens=50),
    )
    for i, output in enumerate(outputs):
        assert_output_matches(
            output,
            expected["transcriptions"][i],
            expected["token_ids"][i],
        )


def test_single_and_batched_generation_match(llm):
    sampling_params = SamplingParams(temperature=0.0, max_tokens=50)
    single_output = llm.chat(
        messages=SINGLE_CONVERSATION,
        sampling_params=sampling_params,
    )[0]
    batched_output = llm.chat(
        messages=BATCHED_CONVERSATIONS,
        sampling_params=sampling_params,
    )[0]
    assert single_output.outputs[0].text == batched_output.outputs[0].text
    assert list(single_output.outputs[0].token_ids) == list(
        batched_output.outputs[0].token_ids
    )

View File

@@ -40,6 +40,7 @@ class MockAudioFlamingo3Processor:
    def __init__(self):
        self.audio_token = "<sound>"
        self.audio_token_id = 12345
+        self.max_audio_len = 60
        self.feature_extractor = MockFeatureExtractor()

    def __call__(self, text=None, audios=None, **kwargs):
@@ -65,7 +66,6 @@ def mock_ctx():
@pytest.fixture(autouse=True)
def check_transformers_version():
-    # Check if the model is supported by the current transformers version
    model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")
@@ -84,7 +84,7 @@ def test_audio_chunk_counting(mock_ctx):
    sr = 16000
    audio_1 = np.zeros(30 * sr)
-    audio_2 = np.zeros(45 * sr)
+    audio_2 = np.zeros(75 * sr)
    mm_data = {"audio": [audio_1, audio_2]}
    prompt = "<|user|>Listen.<|end|>"
@@ -121,5 +121,107 @@ def test_dummy_data_generation(mock_ctx):
    assert "audio" in dummy_data
    assert len(dummy_data["audio"]) == 2
-    expected_len = 600 * 16000
+    expected_len = 60 * 16000
    assert len(dummy_data["audio"][0]) == expected_len


def test_audio_token_count_matches_hf_processor_math():
    from vllm.model_executor.models.audioflamingo3 import (
        _count_audio_tokens_from_mask,
    )

    feature_attention_mask = torch.zeros((3, 3000), dtype=torch.long)
    feature_attention_mask[0, :2999] = 1
    feature_attention_mask[1, :2999] = 1
    feature_attention_mask[2, :1500] = 1
    chunk_counts = torch.tensor([2, 1], dtype=torch.long)
    assert (
        _count_audio_tokens_from_mask(feature_attention_mask, chunk_counts, 0) == 1499
    )
    assert _count_audio_tokens_from_mask(feature_attention_mask, chunk_counts, 1) == 375


def test_audio_feature_pipeline_matches_hf_small_config():
    from transformers.models.audioflamingo3 import (
        modeling_audioflamingo3 as hf_audioflamingo3_modeling,
    )
    from transformers.models.audioflamingo3.configuration_audioflamingo3 import (
        AudioFlamingo3Config,
    )

    from vllm.model_executor.models.audioflamingo3 import (
        AudioFlamingo3Encoder,
        AudioFlamingo3MultiModalProjector,
        _build_audio_encoder_attention_mask,
        _flatten_valid_audio_embeddings,
    )

    text_config = {
        "model_type": "qwen2",
        "intermediate_size": 64,
        "initializer_range": 0.02,
        "hidden_size": 32,
        "max_position_embeddings": 1024,
        "num_hidden_layers": 2,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "vocab_size": 128,
        "pad_token_id": 1,
        "use_mrope": False,
    }
    audio_config = {
        "hidden_size": 16,
        "num_attention_heads": 4,
        "intermediate_size": 32,
        "num_hidden_layers": 2,
        "num_mel_bins": 80,
        "max_source_positions": 1500,
        "dropout": 0.0,
        "attention_dropout": 0.0,
        "activation_dropout": 0.0,
        "encoder_layerdrop": 0.0,
    }
    torch.manual_seed(0)
    config = AudioFlamingo3Config(
        text_config=text_config,
        audio_config=audio_config,
        audio_token_id=0,
    )
    hf_model = hf_audioflamingo3_modeling.AudioFlamingo3ForConditionalGeneration(
        config
    ).eval()
    vllm_encoder = AudioFlamingo3Encoder(config.audio_config).eval()
    vllm_encoder.load_state_dict(hf_model.audio_tower.state_dict())
    vllm_projector = AudioFlamingo3MultiModalProjector(config).eval()
    vllm_projector.load_state_dict(hf_model.multi_modal_projector.state_dict())
    input_features = torch.randn(3, 80, 3000)
    feature_attention_mask = torch.zeros(3, 3000, dtype=torch.bool)
    feature_attention_mask[0, :3000] = True
    feature_attention_mask[1, :2500] = True
    feature_attention_mask[2, :1500] = True
    hf_output = hf_model.get_audio_features(
        input_features,
        feature_attention_mask,
        return_dict=True,
    ).pooler_output
    vllm_attention_mask = _build_audio_encoder_attention_mask(
        feature_attention_mask,
        dtype=vllm_encoder.conv1.weight.dtype,
        device=vllm_encoder.conv1.weight.device,
    )
    vllm_hidden_states = vllm_encoder(
        input_features,
        attention_mask=vllm_attention_mask,
    )
    vllm_output, _ = _flatten_valid_audio_embeddings(
        vllm_projector(vllm_hidden_states),
        feature_attention_mask,
    )
    torch.testing.assert_close(vllm_output, hf_output)
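One consistent reading of the expected counts in
test_audio_token_count_matches_hf_processor_math, assuming a Whisper-style
stride-2 conv followed by 2x pooling applied to the concatenated valid frames
of each audio's chunks; this is a sketch of the arithmetic, not the actual
_count_audio_tokens_from_mask implementation:

def expected_audio_tokens(valid_frames_per_chunk: list[int]) -> int:
    total = sum(valid_frames_per_chunk)  # concatenate one audio's chunks
    after_conv = (total - 1) // 2 + 1    # stride-2 conv output length
    return after_conv // 2               # 2x pooling output length

assert expected_audio_tokens([2999, 2999]) == 1499  # chunk_counts[0] == 2
assert expected_audio_tokens([1500]) == 375         # chunk_counts[1] == 1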

View File

@@ -0,0 +1,222 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2026 The vLLM team.
# Copyright 2026 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest.mock import MagicMock

import numpy as np
import pytest
import torch
from transformers import PretrainedConfig

from tests.models.registry import HF_EXAMPLE_MODELS


class MockMusicFlamingoConfig(PretrainedConfig):
    model_type = "musicflamingo"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.audio_config = PretrainedConfig()
        self.text_config = PretrainedConfig()


class MockMusicFlamingoProcessor:
    def __init__(self):
        self.audio_token = "<sound>"
        self.audio_token_id = 12345
        self.audio_bos_token = "<|sound_bos|>"
        self.audio_bos_token_id = 12346
        self.audio_eos_token = "<|sound_eos|>"
        self.audio_eos_token_id = 12347
        self.max_audio_len = 1200
        self.feature_extractor = MockFeatureExtractor()


class MockFeatureExtractor:
    def __init__(self):
        self.sampling_rate = 16000
        self.chunk_length = 30


@pytest.fixture
def mock_ctx():
    config = MockMusicFlamingoConfig()
    ctx = MagicMock()
    ctx.get_hf_config.return_value = config
    ctx.get_hf_processor.return_value = MockMusicFlamingoProcessor()
    ctx.model_config.hf_config = config
    return ctx


@pytest.fixture(autouse=True)
def check_transformers_version():
    model_info = HF_EXAMPLE_MODELS.get_hf_info("MusicFlamingoForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")


def test_musicflamingo_chunk_counting_uses_rote_timestamps(mock_ctx, monkeypatch):
    from vllm.model_executor.models.musicflamingo import (
        MusicFlamingoDummyInputsBuilder,
        MusicFlamingoMultiModalProcessor,
        MusicFlamingoProcessingInfo,
    )

    info = MusicFlamingoProcessingInfo(mock_ctx)
    processor = MusicFlamingoMultiModalProcessor(
        info, MusicFlamingoDummyInputsBuilder(info)
    )
    sr = 16000
    audio_1 = np.zeros(30 * sr)
    audio_2 = np.zeros(45 * sr)
    mm_data = {"audio": [audio_1, audio_2]}
    prompt = "<|user|>Listen.<|end|>"

    from vllm.multimodal.processing import BaseMultiModalProcessor

    def mock_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
        del self, prompt, mm_data, mm_kwargs, tok_kwargs
        return {
            "input_ids": [1, 2, 3],
            "input_features": torch.randn(3, 80, 3000),
            "rote_timestamps": torch.randn(3, 750),
        }

    monkeypatch.setattr(BaseMultiModalProcessor, "_call_hf_processor", mock_base_call)
    processed = processor._call_hf_processor(prompt, mm_data, {}, {})
    chunk_counts = processed["chunk_counts"]
    assert chunk_counts.tolist() == [1, 2]
    assert "rote_timestamps" in processed
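
# Note (an inferred reading, not present in the commit): the mock feature
# extractor advertises chunk_length == 30 s, so a 30 s clip maps to
# ceil(30 / 30) == 1 chunk and a 45 s clip to ceil(45 / 30) == 2 chunks,
# matching the expected chunk_counts of [1, 2]; the mocked batch of 3
# input_features corresponds to those 1 + 2 encoder chunks.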

def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
    from vllm.model_executor.models.musicflamingo import (
        MusicFlamingoDummyInputsBuilder,
        MusicFlamingoProcessingInfo,
    )

    info = MusicFlamingoProcessingInfo(mock_ctx)
    builder = MusicFlamingoDummyInputsBuilder(info)
    assert builder.get_dummy_text({"audio": 2}) == "<sound><sound>"


def test_musicflamingo_audio_feature_pipeline_matches_hf_small_config():
    from transformers.models.musicflamingo import (
        modeling_musicflamingo as hf_musicflamingo_modeling,
    )
    from transformers.models.musicflamingo.configuration_musicflamingo import (
        MusicFlamingoConfig,
    )

    from vllm.model_executor.models.audioflamingo3 import (
        _build_audio_encoder_attention_mask,
        _flatten_valid_audio_embeddings,
    )
    from vllm.model_executor.models.musicflamingo import (
        MusicFlamingoEncoder,
        MusicFlamingoMultiModalProjector,
        MusicFlamingoRotaryEmbedding,
        apply_rotary_time_emb,
    )

    text_config = {
        "model_type": "qwen2",
        "intermediate_size": 64,
        "initializer_range": 0.02,
        "hidden_size": 32,
        "max_position_embeddings": 1024,
        "num_hidden_layers": 2,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "vocab_size": 128,
        "pad_token_id": 1,
        "use_mrope": False,
    }
    audio_config = {
        "hidden_size": 16,
        "num_attention_heads": 4,
        "intermediate_size": 32,
        "num_hidden_layers": 2,
        "num_mel_bins": 80,
        "max_source_positions": 1500,
        "dropout": 0.0,
        "attention_dropout": 0.0,
        "activation_dropout": 0.0,
        "encoder_layerdrop": 0.0,
    }
    torch.manual_seed(0)
    config = MusicFlamingoConfig(
        text_config=text_config,
        audio_config=audio_config,
        audio_token_id=0,
        head_dim=8,
        rope_parameters={"rope_type": "default", "rope_theta": 2048},
    )
    hf_model = hf_musicflamingo_modeling.MusicFlamingoForConditionalGeneration(
        config
    ).eval()
    vllm_encoder = MusicFlamingoEncoder(config.audio_config).eval()
    vllm_encoder.load_state_dict(hf_model.audio_tower.state_dict())
    vllm_projector = MusicFlamingoMultiModalProjector(config).eval()
    vllm_projector.load_state_dict(hf_model.multi_modal_projector.state_dict())
    vllm_rope = MusicFlamingoRotaryEmbedding(config).eval()
    vllm_rope.load_state_dict(hf_model.pos_emb.state_dict(), strict=False)
    input_features = torch.randn(3, 80, 3000)
    feature_attention_mask = torch.zeros(3, 3000, dtype=torch.bool)
    feature_attention_mask[0, :3000] = True
    feature_attention_mask[1, :2500] = True
    feature_attention_mask[2, :1500] = True
    rote_timestamps = (
        torch.arange(750, dtype=torch.float32).unsqueeze(0).repeat(3, 1) * 0.04
    )
    hf_output = hf_model.get_audio_features(
        input_features,
        feature_attention_mask,
        rote_timestamps=rote_timestamps,
        return_dict=True,
    ).pooler_output
    vllm_attention_mask = _build_audio_encoder_attention_mask(
        feature_attention_mask,
        dtype=vllm_encoder.conv1.weight.dtype,
        device=vllm_encoder.conv1.weight.device,
    )
    vllm_hidden_states = vllm_encoder(
        input_features,
        attention_mask=vllm_attention_mask,
    )
    cos, sin = vllm_rope(rote_timestamps, seq_len=vllm_hidden_states.shape[-2])
    vllm_hidden_states = apply_rotary_time_emb(vllm_hidden_states, cos, sin)
    vllm_output, _ = _flatten_valid_audio_embeddings(
        vllm_projector(vllm_hidden_states),
        feature_attention_mask,
    )
    torch.testing.assert_close(vllm_output, hf_output)
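For reference, a minimal sketch of the rotary-over-time step exercised above,
assuming the standard rotate-half formulation; the actual
apply_rotary_time_emb in vllm.model_executor.models.musicflamingo may differ
in layout details. The 0.04 factor in rote_timestamps is consistent with
750 timestamps per 30 s chunk, i.e. one every 40 ms.

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Split the last dim in half and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_time_emb_sketch(
    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
) -> torch.Tensor:
    # cos/sin tables are built from continuous timestamps (seconds) rather
    # than integer token indices, so positions encode absolute time (RoTE).
    return x * cos + rotate_half(x) * sin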