[Fix] Introduce audio channels spec (#31595)

Signed-off-by: Jeremy Teboul <jeremyte@meta.com>
2026-01-09 11:34:51 -08:00
parent 308feab33f
commit 657e9c0e18
9 changed files with 717 additions and 189 deletions
--- a/tests/multimodal/test_audio.py
+++ b/tests/multimodal/test_audio.py
@@ -7,10 +7,16 @@ from unittest.mock import patch

 import numpy as np
 import pytest
+import torch

 from vllm.multimodal.audio import (
+    MONO_AUDIO_SPEC,
+    PASSTHROUGH_AUDIO_SPEC,
    AudioMediaIO,
    AudioResampler,
+    AudioSpec,
+    ChannelReduction,
+    normalize_audio,
    resample_audio_librosa,
    resample_audio_scipy,
 )
@@ -137,3 +143,500 @@ def test_audio_media_io_encode_base64(dummy_audio):
        decoded = base64.b64decode(out)
        assert decoded == b"dummy_wav_data"
        mock_write.assert_called_once()
+
+
+# ============================================================
+# Tests for normalize_audio function
+# ============================================================
+
+
+class TestNormalizeAudio:
+    """Tests for normalize_audio function with different specs."""
+
+    def test_passthrough_preserves_audio(self):
+        """Passthrough spec should not modify audio."""
+        stereo = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
+        result = normalize_audio(stereo, PASSTHROUGH_AUDIO_SPEC)
+        np.testing.assert_array_equal(result, stereo)
+
+    def test_mono_spec_with_numpy_stereo(self):
+        """Mono spec should reduce stereo numpy array to 1D."""
+        stereo = np.array([[1.0, 2.0], [-1.0, 0.0]], dtype=np.float32)
+        result = normalize_audio(stereo, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        np.testing.assert_array_almost_equal(result, [0.0, 1.0])
+
+    def test_mono_spec_with_torch_stereo(self):
+        """Mono spec should reduce stereo torch tensor to 1D."""
+        stereo = torch.tensor([[1.0, 2.0], [-1.0, 0.0]])
+        result = normalize_audio(stereo, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        torch.testing.assert_close(result, torch.tensor([0.0, 1.0]))
+
+    def test_mono_passthrough_for_1d_numpy(self):
+        """1D numpy array should pass through unchanged with mono spec."""
+        mono = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        result = normalize_audio(mono, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        np.testing.assert_array_equal(result, mono)
+
+    def test_mono_passthrough_for_1d_torch(self):
+        """1D torch tensor should pass through unchanged with mono spec."""
+        mono = torch.tensor([1.0, 2.0, 3.0])
+        result = normalize_audio(mono, MONO_AUDIO_SPEC)
+        assert result.ndim == 1
+        torch.testing.assert_close(result, mono)
+
+    def test_first_channel_reduction(self):
+        """FIRST reduction should take only the first channel."""
+        spec = AudioSpec(target_channels=1, channel_reduction=ChannelReduction.FIRST)
+        stereo = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+        result = normalize_audio(stereo, spec)
+        np.testing.assert_array_equal(result, [1.0, 2.0])
+
+    def test_max_channel_reduction(self):
+        """MAX reduction should take max across channels."""
+        spec = AudioSpec(target_channels=1, channel_reduction=ChannelReduction.MAX)
+        stereo = np.array([[1.0, 4.0], [3.0, 2.0]], dtype=np.float32)
+        result = normalize_audio(stereo, spec)
+        np.testing.assert_array_equal(result, [3.0, 4.0])
+
+    def test_sum_channel_reduction(self):
+        """SUM reduction should sum across channels."""
+        spec = AudioSpec(target_channels=1, channel_reduction=ChannelReduction.SUM)
+        stereo = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+        result = normalize_audio(stereo, spec)
+        np.testing.assert_array_equal(result, [4.0, 6.0])
+
+    def test_invalid_3d_array_raises(self):
+        """3D arrays should raise ValueError."""
+        audio_3d = np.random.randn(2, 3, 4).astype(np.float32)
+        with pytest.raises(ValueError, match="Unsupported audio"):
+            normalize_audio(audio_3d, MONO_AUDIO_SPEC)
+
+    def test_channel_expansion_raises(self):
+        """Expanding from mono to stereo should raise ValueError."""
+        mono = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        spec = AudioSpec(target_channels=2)
+        with pytest.raises(ValueError, match="Cannot expand"):
+            normalize_audio(mono, spec)
+
+    def test_time_channels_format_numpy(self):
+        """Audio in (time, channels) format should be transposed to (channels, time).
+
+        This handles the case where audio loaders like soundfile return
+        (time, channels) format instead of (channels, time) like torchaudio.
+        """
+        # Create audio in (time, channels) format: 1000 samples, 2 channels
+        audio_time_channels = np.array(
+            [[1.0, -1.0]] * 1000,  # 1000 time steps, 2 channels
+            dtype=np.float32,
+        )
+        assert audio_time_channels.shape == (1000, 2)  # (time, channels)
+
+        result = normalize_audio(audio_time_channels, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D
+        assert result.ndim == 1
+        assert result.shape == (1000,)
+        # Mean of [1.0, -1.0] at each time step should be 0.0
+        np.testing.assert_array_almost_equal(result, np.zeros(1000))
+
+    def test_time_channels_format_torch(self):
+        """Torch tensor in (time, channels) format should be transposed."""
+        # Create audio in (time, channels) format: 1000 samples, 2 channels
+        audio_time_channels = torch.tensor(
+            [[1.0, -1.0]] * 1000,  # 1000 time steps, 2 channels
+        )
+        assert audio_time_channels.shape == (1000, 2)  # (time, channels)
+
+        result = normalize_audio(audio_time_channels, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D
+        assert result.ndim == 1
+        assert result.shape == (1000,)
+        # Mean of [1.0, -1.0] at each time step should be 0.0
+        torch.testing.assert_close(result, torch.zeros(1000))
+
+    def test_channels_time_format_preserved(self):
+        """Audio already in (channels, time) format should work correctly."""
+        # Create audio in standard (channels, time) format: 2 channels, 1000 samples
+        audio_channels_time = np.array(
+            [[1.0] * 1000, [-1.0] * 1000],  # 2 channels, 1000 time steps
+            dtype=np.float32,
+        )
+        assert audio_channels_time.shape == (2, 1000)  # (channels, time)
+
+        result = normalize_audio(audio_channels_time, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D
+        assert result.ndim == 1
+        assert result.shape == (1000,)
+        # Mean of [1.0, -1.0] at each time step should be 0.0
+        np.testing.assert_array_almost_equal(result, np.zeros(1000))
+
+    def test_ambiguous_square_audio_numpy(self):
+        """Square audio arrays (N, N) should use shape[0] > shape[1] heuristic.
+
+        For a square array, shape[0] == shape[1], so no transpose happens
+        and we assume (channels, time) format.
+        """
+        # Create square audio: 4 channels, 4 samples
+        audio_square = np.array(
+            [
+                [1.0, 2.0, 3.0, 4.0],
+                [5.0, 6.0, 7.0, 8.0],
+                [9.0, 10.0, 11.0, 12.0],
+                [13.0, 14.0, 15.0, 16.0],
+            ],
+            dtype=np.float32,
+        )
+        assert audio_square.shape == (4, 4)
+
+        result = normalize_audio(audio_square, MONO_AUDIO_SPEC)
+
+        # Should be reduced to mono 1D with mean across channels (axis 0)
+        assert result.ndim == 1
+        assert result.shape == (4,)
+        # Mean across 4 channels: [1+5+9+13, 2+6+10+14, ...] / 4
+        expected = np.array([7.0, 8.0, 9.0, 10.0])
+        np.testing.assert_array_almost_equal(result, expected)
+
+
+# ============================================================
+# Tests for MultiModalDataParser integration with target_channels
+# ============================================================
+
+
+class TestMultiModalDataParserChannelNormalization:
+    """Tests for MultiModalDataParser.target_channels integration.
+
+    These tests verify that the target_channels parameter is properly used
+    in the _parse_audio_data method to normalize audio channels.
+    """
+
+    def test_parser_normalizes_stereo_to_mono(self):
+        """Parser should normalize stereo to mono when target_channels=1."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser with mono normalization enabled
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Create stereo audio (simulating torchaudio output)
+        stereo_audio = np.array(
+            [[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0]],  # 2 channels, 3 samples
+            dtype=np.float32,
+        )
+
+        # Parse audio data
+        result = parser._parse_audio_data((stereo_audio, 16000))
+
+        # Check that result is mono (1D)
+        audio_item = result.get(0)
+        assert audio_item.ndim == 1, f"Expected 1D mono audio, got {audio_item.ndim}D"
+        assert audio_item.shape == (3,), f"Expected shape (3,), got {audio_item.shape}"
+        # Channel average of [1, 1, 1] and [-1, -1, -1] should be [0, 0, 0]
+        np.testing.assert_array_almost_equal(audio_item, np.zeros(3))
+
+    def test_parser_preserves_stereo_when_target_channels_none(self):
+        """Parser should preserve stereo when target_channels=None."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser without channel normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=None,
+        )
+
+        # Create stereo audio
+        stereo_audio = np.array(
+            [[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0]],
+            dtype=np.float32,
+        )
+
+        # Parse audio data
+        result = parser._parse_audio_data((stereo_audio, 16000))
+
+        # Check that result preserves original shape (after resampling)
+        audio_item = result.get(0)
+        # When target_channels=None, stereo audio should be preserved
+        assert audio_item.ndim == 2, f"Expected 2D stereo audio, got {audio_item.ndim}D"
+
+    def test_parser_mono_passthrough_when_target_channels_1(self):
+        """Parser should pass through mono audio unchanged when target_channels=1."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser with mono normalization enabled
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Create mono audio (already 1D)
+        mono_audio = np.random.randn(16000).astype(np.float32)
+
+        # Parse audio data
+        result = parser._parse_audio_data((mono_audio, 16000))
+
+        # Check that result is still mono (1D)
+        audio_item = result.get(0)
+        assert audio_item.ndim == 1
+        assert audio_item.shape == (16000,)
+
+    def test_parser_with_target_channels_2(self):
+        """Parser should reduce 6-channel to 2-channel when target_channels=2."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Create parser with stereo target
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=2,
+        )
+
+        # Create 6-channel audio (5.1 surround)
+        surround_audio = np.random.randn(6, 1000).astype(np.float32)
+
+        # Parse audio data
+        result = parser._parse_audio_data((surround_audio, 16000))
+
+        # Check that result is stereo (2 channels)
+        audio_item = result.get(0)
+        assert audio_item.ndim == 2
+        assert audio_item.shape[0] == 2  # 2 channels
+
+
+# ============================================================
+# End-to-End Audio Pipeline Tests
+# ============================================================
+
+
+class TestAudioPipelineE2E:
+    """End-to-end tests for audio normalization in the full pipeline.
+
+    These tests verify the complete flow from raw audio input through
+    the MultiModalDataParser, simulating different audio loader formats.
+    """
+
+    def test_stereo_audio_normalized_to_mono_e2e(self):
+        """Full pipeline: stereo audio (torchaudio format) → mono output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate torchaudio output: (channels, time) format
+        # Stereo audio with left channel = 1.0, right channel = -1.0
+        stereo_torchaudio = np.array(
+            [[1.0] * 16000, [-1.0] * 16000],  # 2 channels, 1 second at 16kHz
+            dtype=np.float32,
+        )
+        assert stereo_torchaudio.shape == (2, 16000)
+
+        # Create parser with mono normalization (like Whisper models)
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_torchaudio, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D
+        assert audio_output.ndim == 1, f"Expected 1D, got {audio_output.ndim}D"
+        assert audio_output.shape == (16000,)
+
+        # Verify channel averaging: mean of [1.0, -1.0] = 0.0
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
+
+    def test_soundfile_format_normalized_to_mono_e2e(self):
+        """Full pipeline: soundfile format (time, channels) → mono output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate soundfile output: (time, channels) format
+        # 16000 samples, 2 channels
+        stereo_soundfile = np.array(
+            [[0.5, -0.5]] * 16000,  # Each row is [left, right]
+            dtype=np.float32,
+        )
+        assert stereo_soundfile.shape == (16000, 2)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_soundfile, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D
+        assert audio_output.ndim == 1, f"Expected 1D, got {audio_output.ndim}D"
+        assert audio_output.shape == (16000,)
+
+        # Verify channel averaging: mean of [0.5, -0.5] = 0.0
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
+
+    def test_librosa_mono_passthrough_e2e(self):
+        """Full pipeline: librosa mono format → preserved as mono."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate librosa output: already mono (time,) format
+        mono_librosa = np.random.randn(16000).astype(np.float32)
+        assert mono_librosa.shape == (16000,)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((mono_librosa, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is still mono 1D
+        assert audio_output.ndim == 1
+        assert audio_output.shape == (16000,)
+
+        # Verify audio content is preserved
+        np.testing.assert_array_almost_equal(audio_output, mono_librosa)
+
+    def test_multichannel_5_1_surround_to_mono_e2e(self):
+        """Full pipeline: 5.1 surround (6 channels) → mono output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate 5.1 surround audio: 6 channels
+        surround_audio = np.array(
+            [
+                [1.0] * 8000,  # Front Left
+                [2.0] * 8000,  # Front Right
+                [3.0] * 8000,  # Center
+                [4.0] * 8000,  # LFE (subwoofer)
+                [5.0] * 8000,  # Rear Left
+                [6.0] * 8000,  # Rear Right
+            ],
+            dtype=np.float32,
+        )
+        assert surround_audio.shape == (6, 8000)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((surround_audio, 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D
+        assert audio_output.ndim == 1
+
+        # Verify channel averaging: mean of [1,2,3,4,5,6] = 3.5
+        expected_value = (1.0 + 2.0 + 3.0 + 4.0 + 5.0 + 6.0) / 6
+        np.testing.assert_array_almost_equal(
+            audio_output, np.full(8000, expected_value), decimal=5
+        )
+
+    def test_torch_tensor_input_e2e(self):
+        """Full pipeline: torch.Tensor stereo input → mono numpy output."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Simulate torch tensor input (from torchaudio)
+        stereo_torch = torch.tensor(
+            [[1.0] * 8000, [-1.0] * 8000],  # 2 channels
+            dtype=torch.float32,
+        )
+        assert stereo_torch.shape == (2, 8000)
+
+        # Create parser with mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        # Process audio through the parser
+        # Note: Parser expects numpy, so we convert first (simulating real usage)
+        result = parser._parse_audio_data((stereo_torch.numpy(), 16000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D numpy array
+        assert audio_output.ndim == 1
+        assert isinstance(audio_output, np.ndarray)
+
+        # Verify channel averaging
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(8000), decimal=5)
+
+    def test_passthrough_preserves_stereo_e2e(self):
+        """Full pipeline: stereo with target_channels=None → stereo preserved."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Stereo audio
+        stereo_audio = np.array(
+            [[1.0] * 8000, [-1.0] * 8000],
+            dtype=np.float32,
+        )
+
+        # Create parser WITHOUT mono normalization (passthrough)
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=None,  # Passthrough - no normalization
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_audio, 16000))
+        audio_output = result.get(0)
+
+        # Verify output preserves stereo (2D)
+        assert audio_output.ndim == 2
+        assert audio_output.shape == (2, 8000)
+
+    def test_resampling_with_channel_normalization_e2e(self):
+        """Full pipeline: resample + channel normalize in single pass."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Stereo audio at 48kHz (common recording rate)
+        stereo_48k = np.array(
+            [[1.0] * 48000, [-1.0] * 48000],  # 1 second at 48kHz
+            dtype=np.float32,
+        )
+
+        # Create parser with both resampling and mono normalization
+        parser = MultiModalDataParser(
+            target_sr=16000,  # Resample to 16kHz
+            target_channels=1,  # Normalize to mono
+        )
+
+        # Process audio through the parser
+        result = parser._parse_audio_data((stereo_48k, 48000))
+        audio_output = result.get(0)
+
+        # Verify output is mono 1D at target sample rate
+        assert audio_output.ndim == 1
+        # After resampling from 48kHz to 16kHz, length should be ~16000
+        assert audio_output.shape[0] == 16000
+
+    def test_very_short_audio_e2e(self):
+        """Full pipeline: very short audio (< 1 frame) handled correctly."""
+        from vllm.multimodal.parse import MultiModalDataParser
+
+        # Very short stereo audio (10 samples)
+        short_stereo = np.array(
+            [[1.0] * 10, [-1.0] * 10],
+            dtype=np.float32,
+        )
+
+        parser = MultiModalDataParser(
+            target_sr=16000,
+            target_channels=1,
+        )
+
+        result = parser._parse_audio_data((short_stereo, 16000))
+        audio_output = result.get(0)
+
+        # Should still produce mono output
+        assert audio_output.ndim == 1
+        assert audio_output.shape == (10,)
+        np.testing.assert_array_almost_equal(audio_output, np.zeros(10))