Fix AudioFlamingo3/MusicFlamingo HF parity and RoTE handling (#37643)

Signed-off-by: Lasha <26011196+lashahub@users.noreply.github.com>
This commit is contained in:
Lasha Koroshinadze
2026-03-22 22:29:07 -04:00
committed by GitHub
parent 43877a620b
commit e7767eccae
12 changed files with 1157 additions and 243 deletions

View File

@@ -26,6 +26,54 @@ from tests.models.registry import HF_EXAMPLE_MODELS
from vllm import LLM, SamplingParams
MODEL_NAME = "nvidia/audio-flamingo-3-hf"
# One single-turn conversation in the OpenAI-style chat schema accepted by
# `LLM.chat`: a text question paired with a remote audio attachment.
SINGLE_CONVERSATION = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": (
                    "What is surprising about the relationship between "
                    "the barking and the music?"
                ),
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": (
                        "https://huggingface.co/datasets/nvidia/AudioSkills"
                        "/resolve/main/assets"
                        "/dogs_barking_in_sync_with_the_music.wav"
                    ),
                },
            },
        ],
    },
]
# Two-request batch: the dog-barking conversation above plus a multiple-choice
# question about song lyrics; used to exercise batched audio inference.
BATCHED_CONVERSATIONS = [
    SINGLE_CONVERSATION,
    [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": (
                        "Why is the philosopher's name mentioned in the lyrics? "
                        "(A) To express a sense of nostalgia "
                        "(B) To indicate that language cannot express clearly, "
                        "satirizing the inversion of black and white in the "
                        "world "
                        "(C) To add depth and complexity to the lyrics "
                        "(D) To showcase the wisdom and influence of the "
                        "philosopher"
                    ),
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": (
                            "https://huggingface.co/datasets/nvidia/AudioSkills"
                            "/resolve/main/assets"
                            "/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav"
                        ),
                    },
                },
            ],
        },
    ],
]
def get_fixture_path(filename):
@@ -34,21 +82,29 @@ def get_fixture_path(filename):
)
def assert_output_matches(output, expected_text, expected_token_ids):
    """Check one request's completion against its fixture expectations.

    The generated text must equal *expected_text* exactly after stripping
    surrounding whitespace.  Token ids may differ by at most one trailing
    token on either side (e.g. an optional terminator), so a sequence that
    is one token longer or shorter than expected also passes.
    """
    completion = output.outputs[0]
    assert completion.text.strip() == expected_text
    actual_token_ids = list(completion.token_ids)
    exact = actual_token_ids == expected_token_ids
    missing_last = actual_token_ids == expected_token_ids[:-1]
    extra_last = actual_token_ids[:-1] == expected_token_ids
    assert exact or missing_last or extra_last
@pytest.fixture(scope="module")
def llm():
    """Module-scoped vLLM engine for the AudioFlamingo3 HF checkpoint.

    Skips the module when the installed transformers version does not
    support the architecture, or when the model cannot be loaded in the
    current environment (e.g. weights unavailable).

    NOTE(review): the markerless diff rendering interleaved the pre-image
    lines (`llm = LLM(` / `return llm`) with the post-image `return LLM(`,
    leaving invalid Python; this body keeps only the post-commit version.
    """
    # Check if the model is supported by the current transformers version.
    model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")
    try:
        return LLM(
            model=MODEL_NAME,
            trust_remote_code=True,
            dtype="bfloat16",
            enforce_eager=True,
            limit_mm_per_prompt={"audio": 1},
        )
    except Exception as e:
        # Engine construction can fail for environment reasons; treat that
        # as a skip rather than a test error.
        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
@@ -61,29 +117,17 @@ def test_single_generation(llm):
with open(fixture_path) as f:
expected = json.load(f)
audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
messages = [
{
"role": "user",
"content": [
{"type": "audio_url", "audio_url": {"url": audio_url}},
{"type": "text", "text": "Transcribe the input speech."},
],
}
]
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
outputs = llm.chat(
messages=messages,
messages=SINGLE_CONVERSATION,
sampling_params=sampling_params,
)
generated_text = outputs[0].outputs[0].text.strip()
expected_text = expected["transcriptions"][0]
assert expected_text in generated_text or generated_text in expected_text
assert_output_matches(
outputs[0],
expected["transcriptions"][0],
expected["token_ids"][0],
)
def test_batched_generation(llm):
@@ -94,49 +138,34 @@ def test_batched_generation(llm):
with open(fixture_path) as f:
expected = json.load(f)
items = [
{
"audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
"question": "What is surprising about the relationship "
"between the barking and the music?",
"expected_idx": 0,
},
{
"audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
"question": (
"Why is the philosopher's name mentioned in the lyrics? "
"(A) To express a sense of nostalgia "
"(B) To indicate that language cannot express clearly, "
"satirizing the inversion of black and white in the world "
"(C) To add depth and complexity to the lyrics "
"(D) To showcase the wisdom and influence of the philosopher"
),
"expected_idx": 1,
},
]
conversations = []
for item in items:
messages = [
{
"role": "user",
"content": [
{"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
{"type": "text", "text": item["question"]},
],
}
]
conversations.append(messages)
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
outputs = llm.chat(
messages=conversations,
messages=BATCHED_CONVERSATIONS,
sampling_params=sampling_params,
)
for i, output in enumerate(outputs):
generated_text = output.outputs[0].text.strip()
expected_text = expected["transcriptions"][i]
assert_output_matches(
output,
expected["transcriptions"][i],
expected["token_ids"][i],
)
assert expected_text in generated_text or generated_text in expected_text
def test_single_and_batched_generation_match(llm):
    """Greedy decoding must produce identical output whether the request is
    sent alone or as the first element of a batch."""
    params = SamplingParams(temperature=0.0, max_tokens=128)
    solo = llm.chat(
        messages=SINGLE_CONVERSATION,
        sampling_params=params,
    )[0]
    first_of_batch = llm.chat(
        messages=BATCHED_CONVERSATIONS,
        sampling_params=params,
    )[0]
    solo_gen, batch_gen = solo.outputs[0], first_of_batch.outputs[0]
    assert solo_gen.text == batch_gen.text
    assert list(solo_gen.token_ids) == list(batch_gen.token_ids)