From c7f98b4d0a63b32ed939e2b6dfaa8a626e9b46c4 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 21 Mar 2026 11:36:15 +0800 Subject: [PATCH] [Frontend] Remove librosa from audio dependency (#37058) Signed-off-by: Isotr0py --- requirements/test.in | 1 + requirements/test.txt | 4 + setup.py | 4 +- .../test_transcription_validation.py | 2 +- tests/entrypoints/openai/test_run_batch.py | 2 +- .../generation/vlm_utils/builders.py | 5 +- tests/multimodal/media/test_audio.py | 38 ++--- tests/multimodal/test_audio.py | 38 ++--- vllm/assets/audio.py | 9 +- vllm/assets/video.py | 9 +- vllm/benchmarks/datasets.py | 7 +- .../openai/speech_to_text/speech_to_text.py | 54 +----- .../model_executor/models/nano_nemotron_vl.py | 5 +- vllm/multimodal/audio.py | 96 ++++++++++- vllm/multimodal/media/audio.py | 154 +++++++++++------- vllm/multimodal/parse.py | 2 +- vllm/renderers/base.py | 3 - .../processors/fireredasr2.py | 2 +- 18 files changed, 247 insertions(+), 188 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 8bd005144..be4c2e579 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test peft>=0.15.0 # required for phi-4-mm test pqdm ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests +resampy # required for audio tests sentence-transformers>=5.2.0 # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests diff --git a/requirements/test.txt b/requirements/test.txt index e2f9040be..7d3a988a7 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -544,6 +544,7 @@ numba==0.61.2 # via # -r requirements/test.in # librosa + # resampy numpy==2.2.6 # via # -r requirements/test.in @@ -584,6 +585,7 @@ numpy==2.2.6 # pyogrio # pywavelets # rasterio + # resampy # rioxarray # rouge-score # runai-model-streamer @@ -995,6 +997,8 @@ requests==2.32.3 # tiktoken # transformers # wandb +resampy==0.4.3 + # via -r 
requirements/test.in responses==0.25.3 # via genai-perf rfc3339-validator==0.1.4 diff --git a/setup.py b/setup.py index 7b5c49e98..2f251a6a2 100644 --- a/setup.py +++ b/setup.py @@ -987,11 +987,11 @@ setup( "instanttensor": ["instanttensor >= 0.1.5"], "runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"], "audio": [ - "librosa", + "av", + "resampy", "scipy", "soundfile", "mistral_common[audio]", - "av", ], # Required for audio processing "video": [], # Kept for backwards compatibility "flashinfer": [], # Kept for backwards compatibility diff --git a/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py b/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py index e9bde638d..4ac48699a 100644 --- a/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py +++ b/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py @@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name) model_name, foscolo, language="it", - expected_text="ove il mio corpo fanciulletto giacque", + expected_text="ove il mio corpo fanciulletto", ) diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index cf7e2a7b0..bf670105b 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -275,7 +275,7 @@ INPUT_REASONING_BATCH = "\n".join( ] ) -MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=" +MINIMAL_WAV_BASE64 = "UklGRigAAABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YQQAAAAAAP9/" INPUT_TRANSCRIPTION_BATCH = ( json.dumps( { diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index 47852453c..1b7e2347b 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -323,10 +323,7 @@ def 
build_audio_inputs_from_test_info( test_info.audio_idx_to_prompt, test_info.prompt_formatter, ) - resampler = AudioResampler( - target_sr=16000, - method="librosa", - ) + resampler = AudioResampler(target_sr=16000) audios = [asset.audio_and_sample_rate for asset in audio_assets] resampled_audios = [ ( diff --git a/tests/multimodal/media/test_audio.py b/tests/multimodal/media/test_audio.py index 18f142008..4361066ab 100644 --- a/tests/multimodal/media/test_audio.py +++ b/tests/multimodal/media/test_audio.py @@ -10,6 +10,8 @@ import pytest from vllm.multimodal.media import AudioMediaIO +from ...conftest import AudioTestAssets + pytestmark = pytest.mark.cpu_test ASSETS_DIR = Path(__file__).parent.parent / "assets" @@ -22,40 +24,32 @@ def dummy_audio(): @pytest.fixture -def dummy_audio_bytes(): - return b"FAKEAUDIOBYTES" +def dummy_audio_bytes(audio_assets: AudioTestAssets): + with open(audio_assets[0].get_local_path(), "rb") as f: + return f.read() def test_audio_media_io_load_bytes(dummy_audio_bytes): audio_io = AudioMediaIO() - with patch("librosa.load") as mock_load: - mock_load.return_value = (np.array([0.1, 0.2]), 16000) - out = audio_io.load_bytes(dummy_audio_bytes) - mock_load.assert_called_once() - assert isinstance(out[0], np.ndarray) - assert out[1] == 16000 + out = audio_io.load_bytes(dummy_audio_bytes) + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 def test_audio_media_io_load_base64(dummy_audio_bytes): audio_io = AudioMediaIO() encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8") - with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes: - mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000) - out = audio_io.load_base64("audio/wav", encoded) - mock_load_bytes.assert_called_once() - assert isinstance(out[0], np.ndarray) - assert out[1] == 16000 + out = audio_io.load_base64("audio/wav", encoded) + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 -def test_audio_media_io_load_file(): +def 
test_audio_media_io_load_file(audio_assets: AudioTestAssets): audio_io = AudioMediaIO() - path = Path("/fake/path.wav") - with patch("librosa.load") as mock_load: - mock_load.return_value = (np.array([0.1, 0.2]), 16000) - out = audio_io.load_file(path) - mock_load.assert_called_once_with(path, sr=None) - assert isinstance(out[0], np.ndarray) - assert out[1] == 16000 + path = audio_assets[0].get_local_path() + out = audio_io.load_file(path) + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 def test_audio_media_io_encode_base64(dummy_audio): diff --git a/tests/multimodal/test_audio.py b/tests/multimodal/test_audio.py index 3cc6bcadb..0bc898845 100644 --- a/tests/multimodal/test_audio.py +++ b/tests/multimodal/test_audio.py @@ -14,7 +14,7 @@ from vllm.multimodal.audio import ( AudioSpec, ChannelReduction, normalize_audio, - resample_audio_librosa, + resample_audio_pyav, resample_audio_scipy, split_audio, ) @@ -25,14 +25,14 @@ def dummy_audio(): return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float) -def test_resample_audio_librosa(dummy_audio): - with patch("vllm.multimodal.audio.librosa.resample") as mock_resample: - mock_resample.return_value = dummy_audio * 2 - out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050) - mock_resample.assert_called_once_with( - dummy_audio, orig_sr=44100, target_sr=22050 - ) - assert np.all(out == dummy_audio * 2) +def test_resample_audio_pyav(dummy_audio): + out_down = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=2) + out_up = resample_audio_pyav(dummy_audio, orig_sr=2, target_sr=4) + out_same = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=4) + + assert len(out_down) == 3 + assert len(out_up) == 10 + assert np.all(out_same == dummy_audio) def test_resample_audio_scipy(dummy_audio): @@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio): assert np.isfinite(out).all() -def test_audio_resampler_librosa_calls_resample(dummy_audio): - resampler = 
AudioResampler(target_sr=22050, method="librosa") - with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample: +def test_audio_resampler_pyav_calls_resample(dummy_audio): + resampler = AudioResampler(target_sr=22050, method="pyav") + with patch("vllm.multimodal.audio.resample_audio_pyav") as mock_resample: mock_resample.return_value = dummy_audio out = resampler.resample(dummy_audio, orig_sr=44100) mock_resample.assert_called_once_with( @@ -423,13 +423,13 @@ class TestAudioPipelineE2E: # Verify channel averaging: mean of [0.5, -0.5] = 0.0 np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5) - def test_librosa_mono_passthrough_e2e(self): - """Full pipeline: librosa mono format → preserved as mono.""" + def test_pyav_mono_passthrough_e2e(self): + """Full pipeline: pyav mono format → preserved as mono.""" from vllm.multimodal.parse import MultiModalDataParser - # Simulate librosa output: already mono (time,) format - mono_librosa = np.random.randn(16000).astype(np.float32) - assert mono_librosa.shape == (16000,) + # Simulate pyav output: already mono (time,) format + mono_pyav = np.random.randn(16000).astype(np.float32) + assert mono_pyav.shape == (16000,) # Create parser with mono normalization parser = MultiModalDataParser( @@ -438,7 +438,7 @@ class TestAudioPipelineE2E: ) # Process audio through the parser - result = parser._parse_audio_data((mono_librosa, 16000)) + result = parser._parse_audio_data((mono_pyav, 16000)) audio_output = result.get(0) # Verify output is still mono 1D @@ -446,7 +446,7 @@ class TestAudioPipelineE2E: assert audio_output.shape == (16000,) # Verify audio content is preserved - np.testing.assert_array_almost_equal(audio_output, mono_librosa) + np.testing.assert_array_almost_equal(audio_output, mono_pyav) def test_multichannel_5_1_surround_to_mono_e2e(self): """Full pipeline: 5.1 surround (6 channels) → mono output.""" diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index 
b527ffcf9..24a5b9bee 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -8,15 +8,10 @@ from urllib.parse import urljoin import numpy.typing as npt -from vllm.utils.import_utils import PlaceholderModule +from vllm.multimodal.media.audio import load_audio from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets -try: - import librosa -except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] - ASSET_DIR = "multimodal_asset" AudioAssetName = Literal["winning_call", "mary_had_lamb"] @@ -33,7 +28,7 @@ class AudioAsset: @property def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]: audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR) - return librosa.load(audio_path, sr=None) + return load_audio(audio_path, sr=None) def get_local_path(self) -> Path: return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR) diff --git a/vllm/assets/video.py b/vllm/assets/video.py index d025368cb..f5e443db9 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -10,15 +10,10 @@ import numpy.typing as npt from huggingface_hub import hf_hub_download from PIL import Image -from vllm.utils.import_utils import PlaceholderModule +from vllm.multimodal.media.audio import load_audio_pyav from .base import get_cache_dir -try: - import librosa -except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] - @lru_cache def download_video_asset(filename: str) -> str: @@ -146,4 +141,4 @@ class VideoAsset: See also: examples/offline_inference/qwen2_5_omni/only_thinker.py """ - return librosa.load(self.video_path, sr=sampling_rate)[0] + return load_audio_pyav(self.video_path, sr=sampling_rate)[0] diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 1e0a63dd6..8304e8703 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -38,6 +38,7 @@ from typing_extensions import deprecated from vllm.lora.request import LoRARequest from 
vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.audio import get_audio_duration from vllm.multimodal.image import convert_image_mode from vllm.tokenizers import TokenizerLike from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -54,10 +55,6 @@ try: except ImportError: pd = PlaceholderModule("pandas") -try: - import librosa -except ImportError: - librosa = PlaceholderModule("librosa") logger = logging.getLogger(__name__) @@ -3253,7 +3250,7 @@ class ASRDataset(HuggingFaceDataset): break audio = item["audio"] y, sr = audio["array"], audio["sampling_rate"] - duration_s = librosa.get_duration(y=y, sr=sr) + duration_s = get_audio_duration(y=y, sr=sr) if duration_s < asr_min_audio_len_sec or duration_s > asr_max_audio_len_sec: skipped += 1 continue diff --git a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py index 4a6030d71..bf58273f7 100644 --- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py @@ -42,32 +42,13 @@ from vllm.inputs import EncoderDecoderInputs, ProcessorInputs from vllm.logger import init_logger from vllm.logprobs import FlatLogprobs, Logprob from vllm.model_executor.models import SupportsTranscription -from vllm.multimodal.audio import split_audio -from vllm.multimodal.media.audio import extract_audio_from_video_bytes +from vllm.multimodal.audio import get_audio_duration, split_audio +from vllm.multimodal.media.audio import load_audio from vllm.outputs import RequestOutput from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.tokenizers import get_tokenizer -from vllm.utils.import_utils import PlaceholderModule - -try: - import librosa -except ImportError: - 
librosa = PlaceholderModule("librosa") # type: ignore[assignment] - -try: - import soundfile as sf -except ImportError: - sf = PlaceholderModule("soundfile") # type: ignore[assignment] - -# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile -# being librosa's main backend. Used to validate if an audio loading error is due to a -# server error vs a client error (invalid audio file). -# 1 = unrecognised format (file is not a supported audio container) -# 3 = malformed file (corrupt or structurally invalid audio) -# 4 = unsupported encoding (codec not supported by this libsndfile build) -_BAD_SF_CODES = {1, 3, 4} SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse SpeechToTextResponseVerbose: TypeAlias = ( @@ -214,32 +195,13 @@ class OpenAISpeechToText(OpenAIServing): # pre-requisite for chunking, as it assumes Whisper SR. try: with io.BytesIO(audio_data) as buf: - y, sr = librosa.load(buf, sr=self.asr_config.sample_rate) # type: ignore[return-value] - except sf.LibsndfileError as exc: - # Only fall back for known format-detection failures. - # Re-raise anything else (e.g. corrupt but recognised format). 
- if exc.code not in _BAD_SF_CODES: - raise - logger.debug( - "librosa/soundfile could not decode audio from BytesIO " - "(code=%s: %s); falling back to pyav in-process decode", - exc.code, - exc, - ) - try: - native_y, native_sr = extract_audio_from_video_bytes(audio_data) - sr = self.asr_config.sample_rate - y = librosa.resample(native_y, orig_sr=native_sr, target_sr=sr) - except Exception as pyav_exc: - logger.debug( - "pyAV fallback also failed: %s", - pyav_exc, - ) - raise ValueError("Invalid or unsupported audio file.") from pyav_exc + y, sr = load_audio(buf, sr=self.asr_config.sample_rate) + except Exception as exc: + raise ValueError("Invalid or unsupported audio file.") from exc - duration = librosa.get_duration(y=y, sr=sr) - do_split_audio = ( - self.asr_config.allow_audio_chunking + duration = get_audio_duration(y=y, sr=sr) + do_split_audio = self.asr_config.allow_audio_chunking and ( + self.asr_config.max_audio_clip_s is not None and duration > self.asr_config.max_audio_clip_s ) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 5ff9c5f04..1741e18fd 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -12,6 +12,7 @@ import math import warnings from collections.abc import Iterable, Mapping, Sequence from functools import cached_property +from io import BytesIO from typing import Annotated, Literal, TypeAlias import torch @@ -53,7 +54,7 @@ from vllm.multimodal.inputs import ( MultiModalKwargsItems, VideoItem, ) -from vllm.multimodal.media.audio import extract_audio_from_video_bytes +from vllm.multimodal.media.audio import load_audio_pyav from vllm.multimodal.parse import ( AudioProcessorItems, ImageEmbeddingItems, @@ -553,7 +554,7 @@ class NanoNemotronVLMultiModalProcessor( "video must be loaded with keep_video_bytes=True (e.g. via " "the chat API with a model that sets use_audio_in_video)." 
) - audio_items.append(extract_audio_from_video_bytes(video_bytes)) + audio_items.append(load_audio_pyav(BytesIO(video_bytes))) # Create a new VideoProcessorItems with metadata that does not contain # the large video bytes, to avoid modifying the input `mm_items`. diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 28f066d11..0a748a6d1 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -12,17 +12,35 @@ import torch from vllm.utils.import_utils import PlaceholderModule try: - import librosa + import av as av except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] + av = PlaceholderModule("av") # type: ignore[assignment] +try: + import resampy +except ImportError: + resampy = PlaceholderModule("resampy") # type: ignore[assignment] try: import scipy.signal as scipy_signal except ImportError: scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal") # type: ignore[assignment] + # ============================================================ +# Aligned with `librosa.get_duration` function +def get_audio_duration(*, y: npt.NDArray[np.floating], sr: float = 22050) -> float: + """Get the duration of an audio array in seconds. + + Args: + y: Audio time series. Can be 1D (samples,) or 2D (channels, samples). + sr: Sample rate of the audio in Hz. + + Returns: + Duration of the audio in seconds. + """ + n_samples = y.shape[-1] + return float(n_samples) / sr class ChannelReduction(str, Enum): @@ -153,13 +171,71 @@ def normalize_audio( # ============================================================ -def resample_audio_librosa( +def resample_audio_pyav( audio: npt.NDArray[np.floating], *, orig_sr: float, target_sr: float, ) -> npt.NDArray[np.floating]: - return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) + """Resample audio using PyAV (libswresample via FFmpeg). + + Args: + audio: Input audio. 
Can be: + - 1D array ``(samples,)``: mono audio + - 2D array ``(channels, samples)``: stereo audio + orig_sr: Original sample rate in Hz. + target_sr: Target sample rate in Hz. + + Returns: + Resampled audio with the same shape as the input (1D → 1D, 2D → 2D). + """ + orig_sr_int = int(round(orig_sr)) + target_sr_int = int(round(target_sr)) + + if orig_sr_int == target_sr_int: + return audio + + if audio.ndim == 2: + # Resample each channel independently and re-stack. + return np.stack( + [ + resample_audio_pyav(ch, orig_sr=orig_sr, target_sr=target_sr) + for ch in audio + ], + axis=0, + ) + + expected_len = int(math.ceil(audio.shape[-1] * target_sr_int / orig_sr_int)) + + # from_ndarray expects shape (channels, samples) for planar formats. + # libswresample requires a minimum number of input samples to produce + # output frames; pad short inputs with zeros so we always get output, + # then trim to the expected output length. + _MIN_SAMPLES = 1024 + audio_f32 = np.asarray(audio, dtype=np.float32) + if len(audio_f32) < _MIN_SAMPLES: + audio_f32 = np.pad(audio_f32, (0, _MIN_SAMPLES - len(audio_f32))) + audio_f32 = audio_f32.reshape(1, -1) + + resampler = av.AudioResampler(format="fltp", layout="mono", rate=target_sr_int) + + frame = av.AudioFrame.from_ndarray(audio_f32, format="fltp", layout="mono") + frame.sample_rate = orig_sr_int + + out_frames = resampler.resample(frame) + out_frames.extend(resampler.resample(None)) # flush buffered samples + + result = np.concatenate([f.to_ndarray() for f in out_frames], axis=1).squeeze(0) + return result[:expected_len] + + +def resample_audio_resampy( + audio: npt.NDArray[np.floating], + *, + orig_sr: float, + target_sr: float, +) -> npt.NDArray[np.floating]: + return resampy.resample(audio, sr_orig=orig_sr, sr_new=target_sr) def resample_audio_scipy( @@ -167,7 +243,7 @@ def resample_audio_scipy( *, orig_sr: float, target_sr: float, -): +) -> npt.NDArray[np.floating]: if orig_sr > target_sr: return 
scipy_signal.resample_poly(audio, 1, orig_sr // target_sr) elif orig_sr < target_sr: @@ -181,7 +257,7 @@ class AudioResampler: def __init__( self, target_sr: float | None = None, - method: Literal["librosa", "scipy"] = "librosa", + method: Literal["pyav", "resampy", "scipy"] = "resampy", ): self.target_sr = target_sr self.method = method @@ -203,8 +279,10 @@ class AudioResampler: abs_tol=1e-6, ): return audio - if self.method == "librosa": - return resample_audio_librosa( + if self.method == "pyav": + return resample_audio_pyav(audio, orig_sr=orig_sr, target_sr=self.target_sr) + if self.method == "resampy": + return resample_audio_resampy( audio, orig_sr=orig_sr, target_sr=self.target_sr ) elif self.method == "scipy": @@ -214,7 +292,7 @@ class AudioResampler: else: raise ValueError( f"Invalid resampling method: {self.method}. " - "Supported methods are 'librosa' and 'scipy'." + "Supported methods are 'pyav' and 'scipy'." ) diff --git a/vllm/multimodal/media/audio.py b/vllm/multimodal/media/audio.py index 88dcb0b01..ae0a9f55b 100644 --- a/vllm/multimodal/media/audio.py +++ b/vllm/multimodal/media/audio.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math from io import BytesIO from pathlib import Path @@ -14,58 +15,80 @@ from vllm.utils.serial_utils import tensor2base64 from .base import MediaIO try: - import librosa + import av except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] + av = PlaceholderModule("av") # type: ignore[assignment] try: import soundfile except ImportError: soundfile = PlaceholderModule("soundfile") # type: ignore[assignment] + try: - import av + import resampy except ImportError: - av = PlaceholderModule("av") # type: ignore[assignment] + resampy = PlaceholderModule("resampy") # type: ignore[assignment] -def extract_audio_from_video_bytes( - data: bytes, +# Public libsndfile error codes exposed via 
`soundfile.LibsndfileError.code`; soundfile +# is tried first by load_audio. Used to validate if an audio loading error is due to a +# server error vs a client error (invalid audio file). +# 1 = unrecognised format (file is not a supported audio container) +# 3 = malformed file (corrupt or structurally invalid audio) +# 4 = unsupported encoding (codec not supported by this libsndfile build) +_BAD_SF_CODES = {1, 3, 4} + + +def load_audio_pyav( + path: BytesIO | Path | str, + *, + sr: float | None = 22050, + mono: bool = True, ) -> tuple[npt.NDArray, float]: - """Extract the audio track from raw video bytes using PyAV. + """Load an audio file using PyAV (FFmpeg), returning a float32 waveform. - PyAV wraps FFmpeg's C libraries in-process — no subprocess is - spawned, which is critical to avoid crashing CUDA-active vLLM - worker processes. - - The returned waveform is at the native sample rate of the video's - audio stream. Resampling to a model-specific rate is left to the - downstream :class:`AudioResampler` in the parsing pipeline. + Decodes the first audio stream, resampling to *sr* when it differs from the + stream's native rate (``sr=None`` keeps the native rate). When *mono* is true, + multi-channel output is averaged into a single channel. Args: - data: Raw video file bytes (e.g. from an mp4 file). + path: A :class:`~io.BytesIO` buffer, a filesystem + :class:`~pathlib.Path`, or a string path. Returns: - A tuple of ``(waveform, sample_rate)`` suitable for use as an - :class:`AudioItem`. + ``(waveform, sample_rate)`` where *waveform* is a float32 NumPy array + and *sample_rate* is *sr*, or the native rate in Hz when *sr* is None. """ - if data is None or len(data) == 0: - raise ValueError( - "Cannot extract audio: video bytes are missing or empty. " - "Ensure video was loaded with keep_video_bytes=True for " - "audio-in-video extraction." 
- ) + native_sr = None try: - with av.open(BytesIO(data)) as container: + with av.open(path) as container: if not container.streams.audio: - raise ValueError("No audio stream found in the video.") + raise ValueError("No audio stream found.") stream = container.streams.audio[0] + stream.thread_type = "AUTO" native_sr = stream.rate + sr = sr or native_sr chunks: list[npt.NDArray] = [] - for frame in container.decode(audio=0): - arr = frame.to_ndarray() - chunks.append(arr.mean(axis=0) if arr.ndim > 1 else arr) + needs_resampling = not math.isclose( + float(sr), + float(native_sr), + rel_tol=0.0, + abs_tol=1e-6, + ) + resampler = ( + av.AudioResampler(format="fltp", layout="mono", rate=sr) + if needs_resampling + else None + ) + for frame in container.decode(stream): + if needs_resampling: + assert resampler is not None + for out_frame in resampler.resample(frame): + chunks.append(out_frame.to_ndarray()) + else: + chunks.append(frame.to_ndarray()) except ValueError: raise except Exception as e: @@ -77,37 +100,54 @@ def extract_audio_from_video_bytes( if not chunks: raise ValueError("No audio found in the video.") - audio = np.concatenate(chunks).astype(np.float32) - return audio, float(native_sr) + audio = np.concatenate(chunks, axis=-1).astype(np.float32) + if mono and audio.ndim > 1: + audio = np.mean(audio, axis=0) + + return audio, sr -def is_video(data: bytes) -> bool: - """Check if the fetched bytes are video""" - if len(data) < 12: - return False +def load_audio_soundfile( + path: BytesIO | Path | str, + *, + sr: float | None = 22050, + mono: bool = True, +) -> tuple[np.ndarray, int]: + """Load audio via soundfile""" + with soundfile.SoundFile(path) as f: + native_sr = f.samplerate + y = f.read(dtype="float32", always_2d=False).T - box_type = data[4:8] - major_brand = data[8:12] + if mono and y.ndim > 1: + y = np.mean(y, axis=tuple(range(y.ndim - 1))) - MP4_BRANDS = { - b"mp41", - b"mp42", # MP4 - b"isom", # ISO Base Media - b"iso2", - b"iso4", - b"iso5", - 
b"iso6", - b"M4V ", - b"M4A ", # Apple - b"avc1", # H.264 - b"dash", # DASH - b"mmp4", - b"MSNV", - } + if sr is not None and sr != native_sr: + y = resampy.resample(y, sr_orig=native_sr, sr_new=sr) + return y, int(sr) + return y, native_sr - is_avi = data[:4] == b"RIFF" and major_brand == b"AVI " - is_mp4 = box_type == b"ftyp" and major_brand in MP4_BRANDS - return is_mp4 or is_avi + +def load_audio( + path: BytesIO | Path | str, + *, + sr: float | None = 22050, + mono: bool = True, +): + try: + return load_audio_soundfile(path, sr=sr, mono=mono) + except soundfile.LibsndfileError as exc: + # Only fall back for known format-detection failures. + # Re-raise anything else (e.g. corrupt but recognised format). + if exc.code not in _BAD_SF_CODES: + raise + # soundfile may have advanced the BytesIO seek position before failing; + # reset it so PyAV can read from the beginning. + if isinstance(path, BytesIO): + path.seek(0) + try: + return load_audio_pyav(path, sr=sr, mono=mono) + except Exception as pyav_exc: + raise ValueError("Invalid or unsupported audio file.") from pyav_exc class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): @@ -128,9 +168,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): self.kwargs = kwargs def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]: - if is_video(data): - return extract_audio_from_video_bytes(data) - return librosa.load(BytesIO(data), sr=None) + return load_audio(BytesIO(data), sr=None) def load_base64( self, @@ -140,7 +178,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): return self.load_bytes(pybase64.b64decode(data)) def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]: - return librosa.load(filepath, sr=None) + return load_audio(filepath, sr=None) def encode_base64( self, diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 6a588dad0..9e1774e39 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -497,7 +497,7 @@ class MultiModalDataParser: *, 
target_sr: float | None = None, target_channels: int | None = None, - audio_resample_method: Literal["librosa", "scipy"] = "librosa", + audio_resample_method: Literal["pyav", "scipy"] = "pyav", video_needs_metadata: bool = False, expected_hidden_size: int | None = None, ) -> None: diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py index b468712ad..63946e8fd 100644 --- a/vllm/renderers/base.py +++ b/vllm/renderers/base.py @@ -172,9 +172,6 @@ class BaseRenderer(ABC, Generic[_T]): For chat requests: - Jinja2 template compilation - - For multi-modal requests: - - Importing libraries such as librosa triggers JIT compilation. """ from vllm.entrypoints.chat_utils import ChatTemplateResolutionError diff --git a/vllm/transformers_utils/processors/fireredasr2.py b/vllm/transformers_utils/processors/fireredasr2.py index 4bde53015..bba7e7ee0 100644 --- a/vllm/transformers_utils/processors/fireredasr2.py +++ b/vllm/transformers_utils/processors/fireredasr2.py @@ -188,7 +188,7 @@ class FireRedASR2FeatureExtractor(SequenceFeatureExtractor): for speech in raw_speech: """ We must multiply by 32768 here because FireRedASR2 loads audio data - using kaldiio.load_mat, while vLLM loads audio data using librosa. + using kaldiio.load_mat, while vLLM loads float32 audio via soundfile (PyAV fallback). """ speech = speech * 32768 fbank = self.fbank(sampling_rate, speech)