[Frontend] Remove librosa from audio dependency (#37058)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2026-03-21 11:36:15 +08:00
committed by GitHub
parent 1c472f8fe1
commit c7f98b4d0a
18 changed files with 247 additions and 188 deletions

View File

@@ -10,6 +10,8 @@ import pytest
from vllm.multimodal.media import AudioMediaIO
from ...conftest import AudioTestAssets
pytestmark = pytest.mark.cpu_test
ASSETS_DIR = Path(__file__).parent.parent / "assets"
@@ -22,40 +24,32 @@ def dummy_audio():
@pytest.fixture
def dummy_audio_bytes():
return b"FAKEAUDIOBYTES"
def dummy_audio_bytes(audio_assets: AudioTestAssets):
with open(audio_assets[0].get_local_path(), "rb") as f:
return f.read()
def test_audio_media_io_load_bytes(dummy_audio_bytes):
audio_io = AudioMediaIO()
with patch("librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_bytes(dummy_audio_bytes)
mock_load.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
out = audio_io.load_bytes(dummy_audio_bytes)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_base64(dummy_audio_bytes):
audio_io = AudioMediaIO()
encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes:
mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_base64("audio/wav", encoded)
mock_load_bytes.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
out = audio_io.load_base64("audio/wav", encoded)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_file():
def test_audio_media_io_load_file(audio_assets: AudioTestAssets):
audio_io = AudioMediaIO()
path = Path("/fake/path.wav")
with patch("librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_file(path)
mock_load.assert_called_once_with(path, sr=None)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
path = audio_assets[0].get_local_path()
out = audio_io.load_file(path)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_encode_base64(dummy_audio):

View File

@@ -14,7 +14,7 @@ from vllm.multimodal.audio import (
AudioSpec,
ChannelReduction,
normalize_audio,
resample_audio_librosa,
resample_audio_pyav,
resample_audio_scipy,
split_audio,
)
@@ -25,14 +25,14 @@ def dummy_audio():
return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float)
def test_resample_audio_librosa(dummy_audio):
with patch("vllm.multimodal.audio.librosa.resample") as mock_resample:
mock_resample.return_value = dummy_audio * 2
out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050)
mock_resample.assert_called_once_with(
dummy_audio, orig_sr=44100, target_sr=22050
)
assert np.all(out == dummy_audio * 2)
def test_resample_audio_pyav(dummy_audio):
out_down = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=2)
out_up = resample_audio_pyav(dummy_audio, orig_sr=2, target_sr=4)
out_same = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=4)
assert len(out_down) == 3
assert len(out_up) == 10
assert np.all(out_same == dummy_audio)
def test_resample_audio_scipy(dummy_audio):
@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
assert np.isfinite(out).all()
def test_audio_resampler_librosa_calls_resample(dummy_audio):
resampler = AudioResampler(target_sr=22050, method="librosa")
with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample:
def test_audio_resampler_pyav_calls_resample(dummy_audio):
resampler = AudioResampler(target_sr=22050, method="pyav")
with patch("vllm.multimodal.audio.resample_audio_pyav") as mock_resample:
mock_resample.return_value = dummy_audio
out = resampler.resample(dummy_audio, orig_sr=44100)
mock_resample.assert_called_once_with(
@@ -423,13 +423,13 @@ class TestAudioPipelineE2E:
# Verify channel averaging: mean of [0.5, -0.5] = 0.0
np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
def test_librosa_mono_passthrough_e2e(self):
"""Full pipeline: librosa mono format → preserved as mono."""
def test_pyav_mono_passthrough_e2e(self):
"""Full pipeline: pyav mono format → preserved as mono."""
from vllm.multimodal.parse import MultiModalDataParser
# Simulate librosa output: already mono (time,) format
mono_librosa = np.random.randn(16000).astype(np.float32)
assert mono_librosa.shape == (16000,)
# Simulate pyav output: already mono (time,) format
mono_pyav = np.random.randn(16000).astype(np.float32)
assert mono_pyav.shape == (16000,)
# Create parser with mono normalization
parser = MultiModalDataParser(
@@ -438,7 +438,7 @@ class TestAudioPipelineE2E:
)
# Process audio through the parser
result = parser._parse_audio_data((mono_librosa, 16000))
result = parser._parse_audio_data((mono_pyav, 16000))
audio_output = result.get(0)
# Verify output is still mono 1D
@@ -446,7 +446,7 @@ class TestAudioPipelineE2E:
assert audio_output.shape == (16000,)
# Verify audio content is preserved
np.testing.assert_array_almost_equal(audio_output, mono_librosa)
np.testing.assert_array_almost_equal(audio_output, mono_pyav)
def test_multichannel_5_1_surround_to_mono_e2e(self):
"""Full pipeline: 5.1 surround (6 channels) → mono output."""