Fix AudioFlamingo3/MusicFlamingo HF parity and RoTE handling (#37643)

Signed-off-by: Lasha <26011196+lashahub@users.noreply.github.com>
Lasha Koroshinadze
2026-03-22 22:29:07 -04:00
committed by GitHub
parent 43877a620b
commit e7767eccae
12 changed files with 1157 additions and 243 deletions

View File

@@ -26,6 +26,54 @@ from tests.models.registry import HF_EXAMPLE_MODELS
from vllm import LLM, SamplingParams

MODEL_NAME = "nvidia/audio-flamingo-3-hf"

SINGLE_CONVERSATION = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What is surprising about the relationship between "
                "the barking and the music?",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": "https://huggingface.co/datasets/nvidia/AudioSkills/"
                    "resolve/main/assets/"
                    "dogs_barking_in_sync_with_the_music.wav",
                },
            },
        ],
    }
]

BATCHED_CONVERSATIONS = [
    SINGLE_CONVERSATION,
    [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Why is the philosopher's name mentioned in the "
                    "lyrics? (A) To express a sense of nostalgia "
                    "(B) To indicate that language cannot express clearly, "
                    "satirizing the inversion of black and white in the world "
                    "(C) To add depth and complexity to the lyrics "
                    "(D) To showcase the wisdom and influence of the "
                    "philosopher",
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": "https://huggingface.co/datasets/nvidia/"
                        "AudioSkills/resolve/main/assets/"
                        "Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
                    },
                },
            ],
        }
    ],
]


def get_fixture_path(filename):
@@ -34,21 +82,29 @@ def get_fixture_path(filename):
    )


def assert_output_matches(output, expected_text, expected_token_ids):
    generated = output.outputs[0]
    assert generated.text.strip() == expected_text
    actual_token_ids = list(generated.token_ids)
    assert (
        actual_token_ids == expected_token_ids
        or actual_token_ids == expected_token_ids[:-1]
        or actual_token_ids[:-1] == expected_token_ids
    )


@pytest.fixture(scope="module")
def llm():
    # Check if the model is supported by the current transformers version
    model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")
    try:
-        llm = LLM(
+        return LLM(
            model=MODEL_NAME,
            trust_remote_code=True,
            dtype="bfloat16",
            enforce_eager=True,
            limit_mm_per_prompt={"audio": 1},
        )
-        return llm
    except Exception as e:
        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
@@ -61,29 +117,17 @@ def test_single_generation(llm):
    with open(fixture_path) as f:
        expected = json.load(f)
-    audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "audio_url", "audio_url": {"url": audio_url}},
-                {"type": "text", "text": "Transcribe the input speech."},
-            ],
-        }
-    ]
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
    outputs = llm.chat(
-        messages=messages,
+        messages=SINGLE_CONVERSATION,
        sampling_params=sampling_params,
    )
-    generated_text = outputs[0].outputs[0].text.strip()
-    expected_text = expected["transcriptions"][0]
-    assert expected_text in generated_text or generated_text in expected_text
+    assert_output_matches(
+        outputs[0],
+        expected["transcriptions"][0],
+        expected["token_ids"][0],
+    )


def test_batched_generation(llm):
@@ -94,49 +138,34 @@ def test_batched_generation(llm):
    with open(fixture_path) as f:
        expected = json.load(f)
-    items = [
-        {
-            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
-            "question": "What is surprising about the relationship "
-            "between the barking and the music?",
-            "expected_idx": 0,
-        },
-        {
-            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
-            "question": (
-                "Why is the philosopher's name mentioned in the lyrics? "
-                "(A) To express a sense of nostalgia "
-                "(B) To indicate that language cannot express clearly, "
-                "satirizing the inversion of black and white in the world "
-                "(C) To add depth and complexity to the lyrics "
-                "(D) To showcase the wisdom and influence of the philosopher"
-            ),
-            "expected_idx": 1,
-        },
-    ]
-    conversations = []
-    for item in items:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
-                    {"type": "text", "text": item["question"]},
-                ],
-            }
-        ]
-        conversations.append(messages)
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
    outputs = llm.chat(
-        messages=conversations,
+        messages=BATCHED_CONVERSATIONS,
        sampling_params=sampling_params,
    )
    for i, output in enumerate(outputs):
-        generated_text = output.outputs[0].text.strip()
-        expected_text = expected["transcriptions"][i]
+        assert_output_matches(
+            output,
+            expected["transcriptions"][i],
+            expected["token_ids"][i],
+        )
-        assert expected_text in generated_text or generated_text in expected_text


def test_single_and_batched_generation_match(llm):
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
    single_output = llm.chat(
        messages=SINGLE_CONVERSATION,
        sampling_params=sampling_params,
    )[0]
    batched_output = llm.chat(
        messages=BATCHED_CONVERSATIONS,
        sampling_params=sampling_params,
    )[0]
    assert single_output.outputs[0].text == batched_output.outputs[0].text
    assert list(single_output.outputs[0].token_ids) == list(
        batched_output.outputs[0].token_ids
    )

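The off-by-one tolerance in assert_output_matches above appears to exist to
absorb an optional trailing token (typically EOS) that greedy decoding may or
may not emit; a standalone sketch of that comparison (the helper name here is
hypothetical, not part of the diff):

def ids_match_with_optional_eos(actual: list[int], expected: list[int]) -> bool:
    # Exact match, or one side carries a single extra trailing token.
    return (
        actual == expected
        or actual == expected[:-1]
        or actual[:-1] == expected
    )

assert ids_match_with_optional_eos([5, 6, 7], [5, 6, 7, 2])
assert not ids_match_with_optional_eos([5, 6], [5, 6, 7, 2])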
View File

@@ -0,0 +1,146 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import os

import pytest

from tests.models.registry import HF_EXAMPLE_MODELS
from vllm import LLM, SamplingParams

MODEL_NAME = "nvidia/music-flamingo-2601-hf"

SINGLE_CONVERSATION = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Describe this track in full detail - tell me the "
                "genre, tempo, and key, then dive into the instruments, "
                "production style, and overall mood it creates.",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": "https://huggingface.co/datasets/nvidia/AudioSkills/"
                    "resolve/main/assets/song_1.mp3",
                },
            },
        ],
    }
]

BATCHED_CONVERSATIONS = [
    SINGLE_CONVERSATION,
    [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Generate a structured lyric sheet from the input music.",
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": "https://huggingface.co/datasets/nvidia/"
                        "AudioSkills/resolve/main/assets/song_2.mp3",
                    },
                },
            ],
        }
    ],
]


def get_fixture_path(filename):
    return os.path.join(
        os.path.dirname(__file__), "../../fixtures/musicflamingo", filename
    )


def assert_output_matches(output, expected_text, expected_token_ids):
    generated = output.outputs[0]
    assert generated.text == expected_text
    actual_token_ids = list(generated.token_ids)
    assert (
        actual_token_ids == expected_token_ids
        or actual_token_ids == expected_token_ids[:-1]
        or actual_token_ids[:-1] == expected_token_ids
    )


@pytest.fixture(scope="module")
def llm():
    model_info = HF_EXAMPLE_MODELS.get_hf_info("MusicFlamingoForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")
    try:
        return LLM(
            model=MODEL_NAME,
            dtype="bfloat16",
            enforce_eager=True,
            max_model_len=8192,
            limit_mm_per_prompt={"audio": 1},
        )
    except Exception as e:
        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")


def test_single_generation(llm):
    fixture_path = get_fixture_path("expected_results_single.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")
    with open(fixture_path) as f:
        expected = json.load(f)
    outputs = llm.chat(
        messages=SINGLE_CONVERSATION,
        sampling_params=SamplingParams(temperature=0.0, max_tokens=50),
    )
    assert_output_matches(
        outputs[0],
        expected["transcriptions"][0],
        expected["token_ids"][0],
    )


def test_batched_generation(llm):
    fixture_path = get_fixture_path("expected_results_batched.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")
    with open(fixture_path) as f:
        expected = json.load(f)
    outputs = llm.chat(
        messages=BATCHED_CONVERSATIONS,
        sampling_params=SamplingParams(temperature=0.0, max_tokens=50),
    )
    for i, output in enumerate(outputs):
        assert_output_matches(
            output,
            expected["transcriptions"][i],
            expected["token_ids"][i],
        )


def test_single_and_batched_generation_match(llm):
    sampling_params = SamplingParams(temperature=0.0, max_tokens=50)
    single_output = llm.chat(
        messages=SINGLE_CONVERSATION,
        sampling_params=sampling_params,
    )[0]
    batched_output = llm.chat(
        messages=BATCHED_CONVERSATIONS,
        sampling_params=sampling_params,
    )[0]
    assert single_output.outputs[0].text == batched_output.outputs[0].text
    assert list(single_output.outputs[0].token_ids) == list(
        batched_output.outputs[0].token_ids
    )

View File

@@ -40,6 +40,7 @@ class MockAudioFlamingo3Processor:
    def __init__(self):
        self.audio_token = "<sound>"
        self.audio_token_id = 12345
+        self.max_audio_len = 60
        self.feature_extractor = MockFeatureExtractor()

    def __call__(self, text=None, audios=None, **kwargs):
@@ -65,7 +66,6 @@ def mock_ctx():
@pytest.fixture(autouse=True)
def check_transformers_version():
-    # Check if the model is supported by the current transformers version
    model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")
@@ -84,7 +84,7 @@ def test_audio_chunk_counting(mock_ctx):
    sr = 16000
    audio_1 = np.zeros(30 * sr)
-    audio_2 = np.zeros(45 * sr)
+    audio_2 = np.zeros(75 * sr)
    mm_data = {"audio": [audio_1, audio_2]}
    prompt = "<|user|>Listen.<|end|>"
@@ -121,5 +121,107 @@ def test_dummy_data_generation(mock_ctx):
    assert "audio" in dummy_data
    assert len(dummy_data["audio"]) == 2
-    expected_len = 600 * 16000
+    expected_len = 60 * 16000
    assert len(dummy_data["audio"][0]) == expected_len


def test_audio_token_count_matches_hf_processor_math():
    from vllm.model_executor.models.audioflamingo3 import (
        _count_audio_tokens_from_mask,
    )

    feature_attention_mask = torch.zeros((3, 3000), dtype=torch.long)
    feature_attention_mask[0, :2999] = 1
    feature_attention_mask[1, :2999] = 1
    feature_attention_mask[2, :1500] = 1
    chunk_counts = torch.tensor([2, 1], dtype=torch.long)
    assert (
        _count_audio_tokens_from_mask(feature_attention_mask, chunk_counts, 0) == 1499
    )
    assert _count_audio_tokens_from_mask(feature_attention_mask, chunk_counts, 1) == 375


def test_audio_feature_pipeline_matches_hf_small_config():
    from transformers.models.audioflamingo3 import (
        modeling_audioflamingo3 as hf_audioflamingo3_modeling,
    )
    from transformers.models.audioflamingo3.configuration_audioflamingo3 import (
        AudioFlamingo3Config,
    )

    from vllm.model_executor.models.audioflamingo3 import (
        AudioFlamingo3Encoder,
        AudioFlamingo3MultiModalProjector,
        _build_audio_encoder_attention_mask,
        _flatten_valid_audio_embeddings,
    )

    text_config = {
        "model_type": "qwen2",
        "intermediate_size": 64,
        "initializer_range": 0.02,
        "hidden_size": 32,
        "max_position_embeddings": 1024,
        "num_hidden_layers": 2,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "vocab_size": 128,
        "pad_token_id": 1,
        "use_mrope": False,
    }
    audio_config = {
        "hidden_size": 16,
        "num_attention_heads": 4,
        "intermediate_size": 32,
        "num_hidden_layers": 2,
        "num_mel_bins": 80,
        "max_source_positions": 1500,
        "dropout": 0.0,
        "attention_dropout": 0.0,
        "activation_dropout": 0.0,
        "encoder_layerdrop": 0.0,
    }
    torch.manual_seed(0)
    config = AudioFlamingo3Config(
        text_config=text_config,
        audio_config=audio_config,
        audio_token_id=0,
    )
    hf_model = hf_audioflamingo3_modeling.AudioFlamingo3ForConditionalGeneration(
        config
    ).eval()
    vllm_encoder = AudioFlamingo3Encoder(config.audio_config).eval()
    vllm_encoder.load_state_dict(hf_model.audio_tower.state_dict())
    vllm_projector = AudioFlamingo3MultiModalProjector(config).eval()
    vllm_projector.load_state_dict(hf_model.multi_modal_projector.state_dict())
    input_features = torch.randn(3, 80, 3000)
    feature_attention_mask = torch.zeros(3, 3000, dtype=torch.bool)
    feature_attention_mask[0, :3000] = True
    feature_attention_mask[1, :2500] = True
    feature_attention_mask[2, :1500] = True
    hf_output = hf_model.get_audio_features(
        input_features,
        feature_attention_mask,
        return_dict=True,
    ).pooler_output
    vllm_attention_mask = _build_audio_encoder_attention_mask(
        feature_attention_mask,
        dtype=vllm_encoder.conv1.weight.dtype,
        device=vllm_encoder.conv1.weight.device,
    )
    vllm_hidden_states = vllm_encoder(
        input_features,
        attention_mask=vllm_attention_mask,
    )
    vllm_output, _ = _flatten_valid_audio_embeddings(
        vllm_projector(vllm_hidden_states),
        feature_attention_mask,
    )
    torch.testing.assert_close(vllm_output, hf_output)
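One consistent reading of the expected counts in
test_audio_token_count_matches_hf_processor_math, assuming a Whisper-style
stride-2 conv followed by 2x pooling applied to the concatenated valid frames
of each audio's chunks; this is a sketch of the arithmetic, not the actual
_count_audio_tokens_from_mask implementation:

def expected_audio_tokens(valid_frames_per_chunk: list[int]) -> int:
    total = sum(valid_frames_per_chunk)  # concatenate one audio's chunks
    after_conv = (total - 1) // 2 + 1    # stride-2 conv output length
    return after_conv // 2               # 2x pooling output length

assert expected_audio_tokens([2999, 2999]) == 1499  # chunk_counts[0] == 2
assert expected_audio_tokens([1500]) == 375         # chunk_counts[1] == 1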

View File

@@ -0,0 +1,222 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2026 The vLLM team.
# Copyright 2026 NVIDIA CORPORATION and the HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest.mock import MagicMock

import numpy as np
import pytest
import torch
from transformers import PretrainedConfig

from tests.models.registry import HF_EXAMPLE_MODELS


class MockMusicFlamingoConfig(PretrainedConfig):
    model_type = "musicflamingo"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.audio_config = PretrainedConfig()
        self.text_config = PretrainedConfig()


class MockMusicFlamingoProcessor:
    def __init__(self):
        self.audio_token = "<sound>"
        self.audio_token_id = 12345
        self.audio_bos_token = "<|sound_bos|>"
        self.audio_bos_token_id = 12346
        self.audio_eos_token = "<|sound_eos|>"
        self.audio_eos_token_id = 12347
        self.max_audio_len = 1200
        self.feature_extractor = MockFeatureExtractor()


class MockFeatureExtractor:
    def __init__(self):
        self.sampling_rate = 16000
        self.chunk_length = 30


@pytest.fixture
def mock_ctx():
    config = MockMusicFlamingoConfig()
    ctx = MagicMock()
    ctx.get_hf_config.return_value = config
    ctx.get_hf_processor.return_value = MockMusicFlamingoProcessor()
    ctx.model_config.hf_config = config
    return ctx


@pytest.fixture(autouse=True)
def check_transformers_version():
    model_info = HF_EXAMPLE_MODELS.get_hf_info("MusicFlamingoForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")


def test_musicflamingo_chunk_counting_uses_rote_timestamps(mock_ctx, monkeypatch):
    from vllm.model_executor.models.musicflamingo import (
        MusicFlamingoDummyInputsBuilder,
        MusicFlamingoMultiModalProcessor,
        MusicFlamingoProcessingInfo,
    )

    info = MusicFlamingoProcessingInfo(mock_ctx)
    processor = MusicFlamingoMultiModalProcessor(
        info, MusicFlamingoDummyInputsBuilder(info)
    )
    sr = 16000
    audio_1 = np.zeros(30 * sr)
    audio_2 = np.zeros(45 * sr)
    mm_data = {"audio": [audio_1, audio_2]}
    prompt = "<|user|>Listen.<|end|>"

    from vllm.multimodal.processing import BaseMultiModalProcessor

    def mock_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
        del self, prompt, mm_data, mm_kwargs, tok_kwargs
        return {
            "input_ids": [1, 2, 3],
            "input_features": torch.randn(3, 80, 3000),
            "rote_timestamps": torch.randn(3, 750),
        }

    monkeypatch.setattr(BaseMultiModalProcessor, "_call_hf_processor", mock_base_call)
    processed = processor._call_hf_processor(prompt, mm_data, {}, {})
    chunk_counts = processed["chunk_counts"]
    assert chunk_counts.tolist() == [1, 2]
    assert "rote_timestamps" in processed
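
# Note (an inferred reading, not present in the commit): the mock feature
# extractor advertises chunk_length == 30 s, so a 30 s clip maps to
# ceil(30 / 30) == 1 chunk and a 45 s clip to ceil(45 / 30) == 2 chunks,
# matching the expected chunk_counts of [1, 2]; the mocked batch of 3
# input_features corresponds to those 1 + 2 encoder chunks.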

def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
    from vllm.model_executor.models.musicflamingo import (
        MusicFlamingoDummyInputsBuilder,
        MusicFlamingoProcessingInfo,
    )

    info = MusicFlamingoProcessingInfo(mock_ctx)
    builder = MusicFlamingoDummyInputsBuilder(info)
    assert builder.get_dummy_text({"audio": 2}) == "<sound><sound>"


def test_musicflamingo_audio_feature_pipeline_matches_hf_small_config():
    from transformers.models.musicflamingo import (
        modeling_musicflamingo as hf_musicflamingo_modeling,
    )
    from transformers.models.musicflamingo.configuration_musicflamingo import (
        MusicFlamingoConfig,
    )

    from vllm.model_executor.models.audioflamingo3 import (
        _build_audio_encoder_attention_mask,
        _flatten_valid_audio_embeddings,
    )
    from vllm.model_executor.models.musicflamingo import (
        MusicFlamingoEncoder,
        MusicFlamingoMultiModalProjector,
        MusicFlamingoRotaryEmbedding,
        apply_rotary_time_emb,
    )

    text_config = {
        "model_type": "qwen2",
        "intermediate_size": 64,
        "initializer_range": 0.02,
        "hidden_size": 32,
        "max_position_embeddings": 1024,
        "num_hidden_layers": 2,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "vocab_size": 128,
        "pad_token_id": 1,
        "use_mrope": False,
    }
    audio_config = {
        "hidden_size": 16,
        "num_attention_heads": 4,
        "intermediate_size": 32,
        "num_hidden_layers": 2,
        "num_mel_bins": 80,
        "max_source_positions": 1500,
        "dropout": 0.0,
        "attention_dropout": 0.0,
        "activation_dropout": 0.0,
        "encoder_layerdrop": 0.0,
    }
    torch.manual_seed(0)
    config = MusicFlamingoConfig(
        text_config=text_config,
        audio_config=audio_config,
        audio_token_id=0,
        head_dim=8,
        rope_parameters={"rope_type": "default", "rope_theta": 2048},
    )
    hf_model = hf_musicflamingo_modeling.MusicFlamingoForConditionalGeneration(
        config
    ).eval()
    vllm_encoder = MusicFlamingoEncoder(config.audio_config).eval()
    vllm_encoder.load_state_dict(hf_model.audio_tower.state_dict())
    vllm_projector = MusicFlamingoMultiModalProjector(config).eval()
    vllm_projector.load_state_dict(hf_model.multi_modal_projector.state_dict())
    vllm_rope = MusicFlamingoRotaryEmbedding(config).eval()
    vllm_rope.load_state_dict(hf_model.pos_emb.state_dict(), strict=False)
    input_features = torch.randn(3, 80, 3000)
    feature_attention_mask = torch.zeros(3, 3000, dtype=torch.bool)
    feature_attention_mask[0, :3000] = True
    feature_attention_mask[1, :2500] = True
    feature_attention_mask[2, :1500] = True
    rote_timestamps = (
        torch.arange(750, dtype=torch.float32).unsqueeze(0).repeat(3, 1) * 0.04
    )
    hf_output = hf_model.get_audio_features(
        input_features,
        feature_attention_mask,
        rote_timestamps=rote_timestamps,
        return_dict=True,
    ).pooler_output
    vllm_attention_mask = _build_audio_encoder_attention_mask(
        feature_attention_mask,
        dtype=vllm_encoder.conv1.weight.dtype,
        device=vllm_encoder.conv1.weight.device,
    )
    vllm_hidden_states = vllm_encoder(
        input_features,
        attention_mask=vllm_attention_mask,
    )
    cos, sin = vllm_rope(rote_timestamps, seq_len=vllm_hidden_states.shape[-2])
    vllm_hidden_states = apply_rotary_time_emb(vllm_hidden_states, cos, sin)
    vllm_output, _ = _flatten_valid_audio_embeddings(
        vllm_projector(vllm_hidden_states),
        feature_attention_mask,
    )
    torch.testing.assert_close(vllm_output, hf_output)
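For reference, a minimal sketch of the rotary-over-time step exercised above,
assuming the standard rotate-half formulation; the actual
apply_rotary_time_emb in vllm.model_executor.models.musicflamingo may differ
in layout details. The 0.04 factor in rote_timestamps is consistent with
750 timestamps per 30 s chunk, i.e. one every 40 ms.

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Split the last dim in half and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_time_emb_sketch(
    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
) -> torch.Tensor:
    # cos/sin tables are built from continuous timestamps (seconds) rather
    # than integer token indices, so positions encode absolute time (RoTE).
    return x * cos + rotate_half(x) * sin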