Fix AudioFlamingo3/MusicFlamingo HF parity and RoTE handling (#37643)

Signed-off-by: Lasha <26011196+lashahub@users.noreply.github.com>
2026-03-22 22:29:07 -04:00
parent 43877a620b
commit e7767eccae
12 changed files with 1157 additions and 243 deletions
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -104,12 +104,22 @@ def run_musicflamingo(question: str, audio_count: int) -> ModelRequestData:
        enforce_eager=True,
    )

-    # MusicFlamingo uses <sound> token for audio
+    # MusicFlamingo prompt placeholders use <sound>; vLLM's MusicFlamingo
+    # multimodal processor expands each one into <|sound_bos|> + audio tokens +
+    # <|sound_eos|> based on extracted audio feature lengths.
    audio_placeholder = "<sound>" * audio_count
+    system_prompt = (
+        "You are Music Flamingo, a multimodal assistant for language and music. "
+        "On each turn you receive an audio clip which contains music and optional "
+        "text, you will receive at least one or both; use your world knowledge and "
+        "reasoning to help the user with any task. Interpret the entirety of the "
+        "content any input music--regardlenss of whether the user calls it audio, "
+        "music, or sound."
+    )

    prompt = (
        "<|im_start|>system\n"
-        "You are a helpful assistant.<|im_end|>\n"
+        f"{system_prompt}<|im_end|>\n"
        "<|im_start|>user\n"
        f"{audio_placeholder}{question}<|im_end|>\n"
        "<|im_start|>assistant\n"