Fix AudioFlamingo3/MusicFlamingo HF parity and RoTE handling (#37643)

Signed-off-by: Lasha <26011196+lashahub@users.noreply.github.com>
This commit is contained in:
Lasha Koroshinadze
2026-03-22 22:29:07 -04:00
committed by GitHub
parent 43877a620b
commit e7767eccae
12 changed files with 1157 additions and 243 deletions

View File

@@ -104,12 +104,22 @@ def run_musicflamingo(question: str, audio_count: int) -> ModelRequestData:
enforce_eager=True,
)
# MusicFlamingo uses <sound> token for audio
# MusicFlamingo prompt placeholders use <sound>; vLLM's MusicFlamingo
# multimodal processor expands each one into <|sound_bos|> + audio tokens +
# <|sound_eos|> based on extracted audio feature lengths.
audio_placeholder = "<sound>" * audio_count
system_prompt = (
"You are Music Flamingo, a multimodal assistant for language and music. "
"On each turn you receive an audio clip which contains music and optional "
"text, you will receive at least one or both; use your world knowledge and "
"reasoning to help the user with any task. Interpret the entirety of the "
"content any input music--regardlenss of whether the user calls it audio, "
"music, or sound."
)
prompt = (
"<|im_start|>system\n"
"You are a helpful assistant.<|im_end|>\n"
f"{system_prompt}<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_placeholder}{question}<|im_end|>\n"
"<|im_start|>assistant\n"