From c7f98b4d0a63b32ed939e2b6dfaa8a626e9b46c4 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sat, 21 Mar 2026 11:36:15 +0800
Subject: [PATCH] [Frontend] Remove librosa from audio dependency (#37058)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 requirements/test.in                          |   1 +
 requirements/test.txt                         |   4 +
 setup.py                                      |   4 +-
 .../test_transcription_validation.py          |   2 +-
 tests/entrypoints/openai/test_run_batch.py    |   2 +-
 .../generation/vlm_utils/builders.py          |   5 +-
 tests/multimodal/media/test_audio.py          |  38 ++---
 tests/multimodal/test_audio.py                |  38 ++---
 vllm/assets/audio.py                          |   9 +-
 vllm/assets/video.py                          |   9 +-
 vllm/benchmarks/datasets.py                   |   7 +-
 .../openai/speech_to_text/speech_to_text.py   |  54 +-----
 .../model_executor/models/nano_nemotron_vl.py |   5 +-
 vllm/multimodal/audio.py                      |  96 ++++++++++-
 vllm/multimodal/media/audio.py                | 154 +++++++++++-------
 vllm/multimodal/parse.py                      |   2 +-
 vllm/renderers/base.py                        |   3 -
 .../processors/fireredasr2.py                 |   2 +-
 18 files changed, 247 insertions(+), 188 deletions(-)

diff --git a/requirements/test.in b/requirements/test.in
index 8bd005144..be4c2e579 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test
 peft>=0.15.0 # required for phi-4-mm test
 pqdm
 ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
+resampy # required for audio tests
 sentence-transformers>=5.2.0 # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
diff --git a/requirements/test.txt b/requirements/test.txt
index e2f9040be..7d3a988a7 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -544,6 +544,7 @@ numba==0.61.2
     # via
     #   -r requirements/test.in
     #   librosa
+    #   resampy
 numpy==2.2.6
     # via
     #   -r requirements/test.in
@@ -584,6 +585,7 @@ numpy==2.2.6
     #   pyogrio
     #   pywavelets
     #   rasterio
+    #   resampy
     #   rioxarray
     #   rouge-score
     #   runai-model-streamer
@@ -995,6 +997,8 @@ requests==2.32.3
     #   tiktoken
     #   transformers
     #   wandb
+resampy==0.4.3
+    # via -r requirements/test.in
 responses==0.25.3
     # via genai-perf
 rfc3339-validator==0.1.4
diff --git a/setup.py b/setup.py
index 7b5c49e98..2f251a6a2 100644
--- a/setup.py
+++ b/setup.py
@@ -987,11 +987,11 @@ setup(
         "instanttensor": ["instanttensor >= 0.1.5"],
         "runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"],
         "audio": [
-            "librosa",
+            "av",
+            "resampy",
             "scipy",
             "soundfile",
             "mistral_common[audio]",
-            "av",
         ],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         "flashinfer": [],  # Kept for backwards compatibility
diff --git a/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py b/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py
index e9bde638d..4ac48699a 100644
--- a/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py
+++ b/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py
@@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name)
             model_name,
             foscolo,
             language="it",
-            expected_text="ove il mio corpo fanciulletto giacque",
+            expected_text="ove il mio corpo fanciulletto",
         )
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index cf7e2a7b0..bf670105b 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -275,7 +275,7 @@ INPUT_REASONING_BATCH = "\n".join(
     ]
 )
 
-MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA="
+MINIMAL_WAV_BASE64 = "UklGRigAAABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YQQAAAAAAP9/"
 INPUT_TRANSCRIPTION_BATCH = (
     json.dumps(
         {
diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py
index 47852453c..1b7e2347b 100644
--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
         test_info.audio_idx_to_prompt,
         test_info.prompt_formatter,
     )
-    resampler = AudioResampler(
-        target_sr=16000,
-        method="librosa",
-    )
+    resampler = AudioResampler(target_sr=16000)
     audios = [asset.audio_and_sample_rate for asset in audio_assets]
     resampled_audios = [
         (
diff --git a/tests/multimodal/media/test_audio.py b/tests/multimodal/media/test_audio.py
index 18f142008..4361066ab 100644
--- a/tests/multimodal/media/test_audio.py
+++ b/tests/multimodal/media/test_audio.py
@@ -10,6 +10,8 @@ import pytest
 
 from vllm.multimodal.media import AudioMediaIO
 
+from ...conftest import AudioTestAssets
+
 pytestmark = pytest.mark.cpu_test
 
 ASSETS_DIR = Path(__file__).parent.parent / "assets"
@@ -22,40 +24,32 @@ def dummy_audio():
 
 
 @pytest.fixture
-def dummy_audio_bytes():
-    return b"FAKEAUDIOBYTES"
+def dummy_audio_bytes(audio_assets: AudioTestAssets):
+    with open(audio_assets[0].get_local_path(), "rb") as f:
+        return f.read()
 
 
 def test_audio_media_io_load_bytes(dummy_audio_bytes):
     audio_io = AudioMediaIO()
-    with patch("librosa.load") as mock_load:
-        mock_load.return_value = (np.array([0.1, 0.2]), 16000)
-        out = audio_io.load_bytes(dummy_audio_bytes)
-        mock_load.assert_called_once()
-        assert isinstance(out[0], np.ndarray)
-        assert out[1] == 16000
+    out = audio_io.load_bytes(dummy_audio_bytes)
+    assert isinstance(out[0], np.ndarray)
+    assert out[1] == 16000
 
 
 def test_audio_media_io_load_base64(dummy_audio_bytes):
     audio_io = AudioMediaIO()
     encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
-    with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes:
-        mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000)
-        out = audio_io.load_base64("audio/wav", encoded)
-        mock_load_bytes.assert_called_once()
-        assert isinstance(out[0], np.ndarray)
-        assert out[1] == 16000
+    out = audio_io.load_base64("audio/wav", encoded)
+    assert isinstance(out[0], np.ndarray)
+    assert out[1] == 16000
 
 
-def test_audio_media_io_load_file():
+def test_audio_media_io_load_file(audio_assets: AudioTestAssets):
     audio_io = AudioMediaIO()
-    path = Path("/fake/path.wav")
-    with patch("librosa.load") as mock_load:
-        mock_load.return_value = (np.array([0.1, 0.2]), 16000)
-        out = audio_io.load_file(path)
-        mock_load.assert_called_once_with(path, sr=None)
-        assert isinstance(out[0], np.ndarray)
-        assert out[1] == 16000
+    path = audio_assets[0].get_local_path()
+    out = audio_io.load_file(path)
+    assert isinstance(out[0], np.ndarray)
+    assert out[1] == 16000
 
 
 def test_audio_media_io_encode_base64(dummy_audio):
diff --git a/tests/multimodal/test_audio.py b/tests/multimodal/test_audio.py
index 3cc6bcadb..0bc898845 100644
--- a/tests/multimodal/test_audio.py
+++ b/tests/multimodal/test_audio.py
@@ -14,7 +14,7 @@ from vllm.multimodal.audio import (
     AudioSpec,
     ChannelReduction,
     normalize_audio,
-    resample_audio_librosa,
+    resample_audio_pyav,
     resample_audio_scipy,
     split_audio,
 )
@@ -25,14 +25,14 @@ def dummy_audio():
     return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float)
 
 
-def test_resample_audio_librosa(dummy_audio):
-    with patch("vllm.multimodal.audio.librosa.resample") as mock_resample:
-        mock_resample.return_value = dummy_audio * 2
-        out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050)
-        mock_resample.assert_called_once_with(
-            dummy_audio, orig_sr=44100, target_sr=22050
-        )
-        assert np.all(out == dummy_audio * 2)
+def test_resample_audio_pyav(dummy_audio):
+    out_down = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=2)
+    out_up = resample_audio_pyav(dummy_audio, orig_sr=2, target_sr=4)
+    out_same = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=4)
+
+    assert len(out_down) == 3
+    assert len(out_up) == 10
+    assert np.all(out_same == dummy_audio)
 
 
 def test_resample_audio_scipy(dummy_audio):
@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
     assert np.isfinite(out).all()
 
 
-def test_audio_resampler_librosa_calls_resample(dummy_audio):
-    resampler = AudioResampler(target_sr=22050, method="librosa")
-    with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample:
+def test_audio_resampler_pyav_calls_resample(dummy_audio):
+    resampler = AudioResampler(target_sr=22050, method="pyav")
+    with patch("vllm.multimodal.audio.resample_audio_pyav") as mock_resample:
         mock_resample.return_value = dummy_audio
         out = resampler.resample(dummy_audio, orig_sr=44100)
         mock_resample.assert_called_once_with(
@@ -423,13 +423,13 @@ class TestAudioPipelineE2E:
         # Verify channel averaging: mean of [0.5, -0.5] = 0.0
         np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
 
-    def test_librosa_mono_passthrough_e2e(self):
-        """Full pipeline: librosa mono format → preserved as mono."""
+    def test_pyav_mono_passthrough_e2e(self):
+        """Full pipeline: pyav mono format → preserved as mono."""
         from vllm.multimodal.parse import MultiModalDataParser
 
-        # Simulate librosa output: already mono (time,) format
-        mono_librosa = np.random.randn(16000).astype(np.float32)
-        assert mono_librosa.shape == (16000,)
+        # Simulate pyav output: already mono (time,) format
+        mono_pyav = np.random.randn(16000).astype(np.float32)
+        assert mono_pyav.shape == (16000,)
 
         # Create parser with mono normalization
         parser = MultiModalDataParser(
@@ -438,7 +438,7 @@ class TestAudioPipelineE2E:
         )
 
         # Process audio through the parser
-        result = parser._parse_audio_data((mono_librosa, 16000))
+        result = parser._parse_audio_data((mono_pyav, 16000))
         audio_output = result.get(0)
 
         # Verify output is still mono 1D
@@ -446,7 +446,7 @@ class TestAudioPipelineE2E:
         assert audio_output.shape == (16000,)
 
         # Verify audio content is preserved
-        np.testing.assert_array_almost_equal(audio_output, mono_librosa)
+        np.testing.assert_array_almost_equal(audio_output, mono_pyav)
 
     def test_multichannel_5_1_surround_to_mono_e2e(self):
         """Full pipeline: 5.1 surround (6 channels) → mono output."""
diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py
index b527ffcf9..24a5b9bee 100644
--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
@@ -8,15 +8,10 @@ from urllib.parse import urljoin
 
 import numpy.typing as npt
 
-from vllm.utils.import_utils import PlaceholderModule
+from vllm.multimodal.media.audio import load_audio
 
 from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
 
-try:
-    import librosa
-except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
-
 ASSET_DIR = "multimodal_asset"
 
 AudioAssetName = Literal["winning_call", "mary_had_lamb"]
@@ -33,7 +28,7 @@ class AudioAsset:
     @property
     def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
         audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
-        return librosa.load(audio_path, sr=None)
+        return load_audio(audio_path, sr=None)
 
     def get_local_path(self) -> Path:
         return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index d025368cb..f5e443db9 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -10,15 +10,10 @@ import numpy.typing as npt
 from huggingface_hub import hf_hub_download
 from PIL import Image
 
-from vllm.utils.import_utils import PlaceholderModule
+from vllm.multimodal.media.audio import load_audio_pyav
 
 from .base import get_cache_dir
 
-try:
-    import librosa
-except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
-
 
 @lru_cache
 def download_video_asset(filename: str) -> str:
@@ -146,4 +141,4 @@ class VideoAsset:
 
         See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
         """
-        return librosa.load(self.video_path, sr=sampling_rate)[0]
+        return load_audio_pyav(self.video_path, sr=sampling_rate)[0]
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 1e0a63dd6..8304e8703 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -38,6 +38,7 @@ from typing_extensions import deprecated
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal.audio import get_audio_duration
 from vllm.multimodal.image import convert_image_mode
 from vllm.tokenizers import TokenizerLike
 from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -54,10 +55,6 @@ try:
 except ImportError:
     pd = PlaceholderModule("pandas")
 
-try:
-    import librosa
-except ImportError:
-    librosa = PlaceholderModule("librosa")
 
 logger = logging.getLogger(__name__)
 
@@ -3253,7 +3250,7 @@ class ASRDataset(HuggingFaceDataset):
                 break
             audio = item["audio"]
             y, sr = audio["array"], audio["sampling_rate"]
-            duration_s = librosa.get_duration(y=y, sr=sr)
+            duration_s = get_audio_duration(y=y, sr=sr)
             if duration_s < asr_min_audio_len_sec or duration_s > asr_max_audio_len_sec:
                 skipped += 1
                 continue
diff --git a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
index 4a6030d71..bf58273f7 100644
--- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
@@ -42,32 +42,13 @@ from vllm.inputs import EncoderDecoderInputs, ProcessorInputs
 from vllm.logger import init_logger
 from vllm.logprobs import FlatLogprobs, Logprob
 from vllm.model_executor.models import SupportsTranscription
-from vllm.multimodal.audio import split_audio
-from vllm.multimodal.media.audio import extract_audio_from_video_bytes
+from vllm.multimodal.audio import get_audio_duration, split_audio
+from vllm.multimodal.media.audio import load_audio
 from vllm.outputs import RequestOutput
 from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt
 from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.tokenizers import get_tokenizer
-from vllm.utils.import_utils import PlaceholderModule
-
-try:
-    import librosa
-except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
-
-try:
-    import soundfile as sf
-except ImportError:
-    sf = PlaceholderModule("soundfile")  # type: ignore[assignment]
-
-# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
-# being librosa's main backend. Used to validate if an audio loading error is due to a
-# server error vs a client error (invalid audio file).
-# 1 = unrecognised format      (file is not a supported audio container)
-# 3 = malformed file           (corrupt or structurally invalid audio)
-# 4 = unsupported encoding     (codec not supported by this libsndfile build)
-_BAD_SF_CODES = {1, 3, 4}
 
 SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
 SpeechToTextResponseVerbose: TypeAlias = (
@@ -214,32 +195,13 @@ class OpenAISpeechToText(OpenAIServing):
         # pre-requisite for chunking, as it assumes Whisper SR.
         try:
             with io.BytesIO(audio_data) as buf:
-                y, sr = librosa.load(buf, sr=self.asr_config.sample_rate)  # type: ignore[return-value]
-        except sf.LibsndfileError as exc:
-            # Only fall back for known format-detection failures.
-            # Re-raise anything else (e.g. corrupt but recognised format).
-            if exc.code not in _BAD_SF_CODES:
-                raise
-            logger.debug(
-                "librosa/soundfile could not decode audio from BytesIO "
-                "(code=%s: %s); falling back to pyav in-process decode",
-                exc.code,
-                exc,
-            )
-            try:
-                native_y, native_sr = extract_audio_from_video_bytes(audio_data)
-                sr = self.asr_config.sample_rate
-                y = librosa.resample(native_y, orig_sr=native_sr, target_sr=sr)
-            except Exception as pyav_exc:
-                logger.debug(
-                    "pyAV fallback also failed: %s",
-                    pyav_exc,
-                )
-                raise ValueError("Invalid or unsupported audio file.") from pyav_exc
+                y, sr = load_audio(buf, sr=self.asr_config.sample_rate)
+        except Exception as exc:
+            raise ValueError("Invalid or unsupported audio file.") from exc
 
-        duration = librosa.get_duration(y=y, sr=sr)
-        do_split_audio = (
-            self.asr_config.allow_audio_chunking
+        duration = get_audio_duration(y=y, sr=sr)
+        do_split_audio = self.asr_config.allow_audio_chunking and (
+            self.asr_config.max_audio_clip_s is not None
             and duration > self.asr_config.max_audio_clip_s
         )
 
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 5ff9c5f04..1741e18fd 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -12,6 +12,7 @@ import math
 import warnings
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
+from io import BytesIO
 from typing import Annotated, Literal, TypeAlias
 
 import torch
@@ -53,7 +54,7 @@ from vllm.multimodal.inputs import (
     MultiModalKwargsItems,
     VideoItem,
 )
-from vllm.multimodal.media.audio import extract_audio_from_video_bytes
+from vllm.multimodal.media.audio import load_audio_pyav
 from vllm.multimodal.parse import (
     AudioProcessorItems,
     ImageEmbeddingItems,
@@ -553,7 +554,7 @@ class NanoNemotronVLMultiModalProcessor(
                     "video must be loaded with keep_video_bytes=True (e.g. via "
                     "the chat API with a model that sets use_audio_in_video)."
                 )
-            audio_items.append(extract_audio_from_video_bytes(video_bytes))
+            audio_items.append(load_audio_pyav(BytesIO(video_bytes)))
 
         # Create a new VideoProcessorItems with metadata that does not contain
         # the large video bytes, to avoid modifying the input `mm_items`.
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index 28f066d11..0a748a6d1 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -12,17 +12,35 @@ import torch
 from vllm.utils.import_utils import PlaceholderModule
 
 try:
-    import librosa
+    import av as av
 except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+    av = PlaceholderModule("av")  # type: ignore[assignment]
 
+try:
+    import resampy
+except ImportError:
+    resampy = PlaceholderModule("resampy")  # type: ignore[assignment]
 
 try:
     import scipy.signal as scipy_signal
 except ImportError:
     scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal")  # type: ignore[assignment]
 
+
 # ============================================================
+# Aligned with `librosa.get_duration` function
+def get_audio_duration(*, y: npt.NDArray[np.floating], sr: float = 22050) -> float:
+    """Get the duration of an audio array in seconds.
+
+    Args:
+        y: Audio time series. Can be 1D (samples,) or 2D (channels, samples).
+        sr: Sample rate of the audio in Hz.
+
+    Returns:
+        Duration of the audio in seconds.
+    """
+    n_samples = y.shape[-1]
+    return float(n_samples) / sr
 
 
 class ChannelReduction(str, Enum):
@@ -153,13 +171,71 @@ def normalize_audio(
 # ============================================================
 
 
-def resample_audio_librosa(
+def resample_audio_pyav(
     audio: npt.NDArray[np.floating],
     *,
     orig_sr: float,
     target_sr: float,
 ) -> npt.NDArray[np.floating]:
-    return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
+    """Resample audio using PyAV (libswresample via FFmpeg).
+
+    Args:
+        audio: Input audio. Can be:
+            - 1D array ``(samples,)``: mono audio
+            - 2D array ``(channels, samples)``: stereo audio
+        orig_sr: Original sample rate in Hz.
+        target_sr: Target sample rate in Hz.
+
+    Returns:
+        Resampled audio with the same shape as the input (1D → 1D, 2D → 2D).
+    """
+    orig_sr_int = int(round(orig_sr))
+    target_sr_int = int(round(target_sr))
+
+    if orig_sr_int == target_sr_int:
+        return audio
+
+    if audio.ndim == 2:
+        # Resample each channel independently and re-stack.
+        return np.stack(
+            [
+                resample_audio_pyav(ch, orig_sr=orig_sr, target_sr=target_sr)
+                for ch in audio
+            ],
+            axis=0,
+        )
+
+    expected_len = int(math.ceil(audio.shape[-1] * target_sr_int / orig_sr_int))
+
+    # from_ndarray expects shape (channels, samples) for planar formats.
+    # libswresample requires a minimum number of input samples to produce
+    # output frames; pad short inputs with zeros so we always get output,
+    # then trim to the expected output length.
+    _MIN_SAMPLES = 1024
+    audio_f32 = np.asarray(audio, dtype=np.float32)
+    if len(audio_f32) < _MIN_SAMPLES:
+        audio_f32 = np.pad(audio_f32, (0, _MIN_SAMPLES - len(audio_f32)))
+    audio_f32 = audio_f32.reshape(1, -1)
+
+    resampler = av.AudioResampler(format="fltp", layout="mono", rate=target_sr_int)
+
+    frame = av.AudioFrame.from_ndarray(audio_f32, format="fltp", layout="mono")
+    frame.sample_rate = orig_sr_int
+
+    out_frames = resampler.resample(frame)
+    out_frames.extend(resampler.resample(None))  # flush buffered samples
+
+    result = np.concatenate([f.to_ndarray() for f in out_frames], axis=1).squeeze(0)
+    return result[:expected_len]
+
+
+def resample_audio_resampy(
+    audio: npt.NDArray[np.floating],
+    *,
+    orig_sr: float,
+    target_sr: float,
+) -> npt.NDArray[np.floating]:
+    return resampy.resample(audio, sr_orig=orig_sr, sr_new=target_sr)
 
 
 def resample_audio_scipy(
@@ -167,7 +243,7 @@ def resample_audio_scipy(
     *,
     orig_sr: float,
     target_sr: float,
-):
+) -> npt.NDArray[np.floating]:
     if orig_sr > target_sr:
         return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr)
     elif orig_sr < target_sr:
@@ -181,7 +257,7 @@ class AudioResampler:
     def __init__(
         self,
         target_sr: float | None = None,
-        method: Literal["librosa", "scipy"] = "librosa",
+        method: Literal["pyav", "resampy", "scipy"] = "resampy",
     ):
         self.target_sr = target_sr
         self.method = method
@@ -203,8 +279,10 @@ class AudioResampler:
             abs_tol=1e-6,
         ):
             return audio
-        if self.method == "librosa":
-            return resample_audio_librosa(
+        if self.method == "pyav":
+            return resample_audio_pyav(audio, orig_sr=orig_sr, target_sr=self.target_sr)
+        if self.method == "resampy":
+            return resample_audio_resampy(
                 audio, orig_sr=orig_sr, target_sr=self.target_sr
             )
         elif self.method == "scipy":
@@ -214,7 +292,7 @@ class AudioResampler:
         else:
             raise ValueError(
                 f"Invalid resampling method: {self.method}. "
-                "Supported methods are 'librosa' and 'scipy'."
+                "Supported methods are 'pyav' and 'scipy'."
             )
 
 
diff --git a/vllm/multimodal/media/audio.py b/vllm/multimodal/media/audio.py
index 88dcb0b01..ae0a9f55b 100644
--- a/vllm/multimodal/media/audio.py
+++ b/vllm/multimodal/media/audio.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
 from io import BytesIO
 from pathlib import Path
 
@@ -14,58 +15,80 @@ from vllm.utils.serial_utils import tensor2base64
 from .base import MediaIO
 
 try:
-    import librosa
+    import av
 except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+    av = PlaceholderModule("av")  # type: ignore[assignment]
 
 try:
     import soundfile
 except ImportError:
     soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
 
+
 try:
-    import av
+    import resampy
 except ImportError:
-    av = PlaceholderModule("av")  # type: ignore[assignment]
+    resampy = PlaceholderModule("resampy")  # type: ignore[assignment]
 
 
-def extract_audio_from_video_bytes(
-    data: bytes,
+# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`.
+# `load_audio` falls back to PyAV decoding only for these codes (the input is likely
+# an unsupported or invalid audio file); any other libsndfile error is re-raised.
+# 1 = unrecognised format      (file is not a supported audio container)
+# 3 = malformed file           (corrupt or structurally invalid audio)
+# 4 = unsupported encoding     (codec not supported by this libsndfile build)
+_BAD_SF_CODES = {1, 3, 4}
+
+
+def load_audio_pyav(
+    path: BytesIO | Path | str,
+    *,
+    sr: float | None = 22050,
+    mono: bool = True,
 ) -> tuple[npt.NDArray, float]:
-    """Extract the audio track from raw video bytes using PyAV.
+    """Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.
 
-    PyAV wraps FFmpeg's C libraries in-process — no subprocess is
-    spawned, which is critical to avoid crashing CUDA-active vLLM
-    worker processes.
-
-    The returned waveform is at the native sample rate of the video's
-    audio stream.  Resampling to a model-specific rate is left to the
-    downstream :class:`AudioResampler` in the parsing pipeline.
+    Decodes the first audio stream. When *sr* differs from the stream's native
+    rate, frames are resampled to *sr* (as mono) while decoding; with ``sr=None``
+    the native rate is kept. If *mono* is set, channels are averaged to mono.
 
     Args:
-        data: Raw video file bytes (e.g. from an mp4 file).
+        path: A :class:`~io.BytesIO` buffer, a filesystem
+            :class:`~pathlib.Path`, or a string path.
 
     Returns:
-        A tuple of ``(waveform, sample_rate)`` suitable for use as an
-        :class:`AudioItem`.
+        ``(waveform, sample_rate)`` where *waveform* is a float32 NumPy array
+        and *sample_rate* is *sr*, or the native rate when *sr* is ``None``.
     """
-    if data is None or len(data) == 0:
-        raise ValueError(
-            "Cannot extract audio: video bytes are missing or empty. "
-            "Ensure video was loaded with keep_video_bytes=True for "
-            "audio-in-video extraction."
-        )
+    native_sr = None
     try:
-        with av.open(BytesIO(data)) as container:
+        with av.open(path) as container:
             if not container.streams.audio:
-                raise ValueError("No audio stream found in the video.")
+                raise ValueError("No audio stream found.")
             stream = container.streams.audio[0]
+            stream.thread_type = "AUTO"
             native_sr = stream.rate
+            sr = sr or native_sr
 
             chunks: list[npt.NDArray] = []
-            for frame in container.decode(audio=0):
-                arr = frame.to_ndarray()
-                chunks.append(arr.mean(axis=0) if arr.ndim > 1 else arr)
+            needs_resampling = not math.isclose(
+                float(sr),
+                float(native_sr),
+                rel_tol=0.0,
+                abs_tol=1e-6,
+            )
+            resampler = (
+                av.AudioResampler(format="fltp", layout="mono", rate=sr)
+                if needs_resampling
+                else None
+            )
+            for frame in container.decode(stream):
+                if needs_resampling:
+                    assert resampler is not None
+                    for out_frame in resampler.resample(frame):
+                        chunks.append(out_frame.to_ndarray())
+                else:
+                    chunks.append(frame.to_ndarray())
     except ValueError:
         raise
     except Exception as e:
@@ -77,37 +100,54 @@ def extract_audio_from_video_bytes(
     if not chunks:
         raise ValueError("No audio found in the video.")
 
-    audio = np.concatenate(chunks).astype(np.float32)
-    return audio, float(native_sr)
+    audio = np.concatenate(chunks, axis=-1).astype(np.float32)
+    if mono and audio.ndim > 1:
+        audio = np.mean(audio, axis=0)
+
+    return audio, sr
 
 
-def is_video(data: bytes) -> bool:
-    """Check if the fetched bytes are video"""
-    if len(data) < 12:
-        return False
+def load_audio_soundfile(
+    path: BytesIO | Path | str,
+    *,
+    sr: float | None = 22050,
+    mono: bool = True,
+) -> tuple[np.ndarray, int]:
+    """Load audio via soundfile"""
+    with soundfile.SoundFile(path) as f:
+        native_sr = f.samplerate
+        y = f.read(dtype="float32", always_2d=False).T
 
-    box_type = data[4:8]
-    major_brand = data[8:12]
+    if mono and y.ndim > 1:
+        y = np.mean(y, axis=tuple(range(y.ndim - 1)))
 
-    MP4_BRANDS = {
-        b"mp41",
-        b"mp42",  # MP4
-        b"isom",  # ISO Base Media
-        b"iso2",
-        b"iso4",
-        b"iso5",
-        b"iso6",
-        b"M4V ",
-        b"M4A ",  # Apple
-        b"avc1",  # H.264
-        b"dash",  # DASH
-        b"mmp4",
-        b"MSNV",
-    }
+    if sr is not None and sr != native_sr:
+        y = resampy.resample(y, sr_orig=native_sr, sr_new=sr)
+        return y, int(sr)
+    return y, native_sr
 
-    is_avi = data[:4] == b"RIFF" and major_brand == b"AVI "
-    is_mp4 = box_type == b"ftyp" and major_brand in MP4_BRANDS
-    return is_mp4 or is_avi
+
+def load_audio(
+    path: BytesIO | Path | str,
+    *,
+    sr: float | None = 22050,
+    mono: bool = True,
+):
+    try:
+        return load_audio_soundfile(path, sr=sr, mono=mono)
+    except soundfile.LibsndfileError as exc:
+        # Only fall back for known format-detection failures.
+        # Re-raise anything else (e.g. corrupt but recognised format).
+        if exc.code not in _BAD_SF_CODES:
+            raise
+        # soundfile may have advanced the BytesIO seek position before failing;
+        # reset it so PyAV can read from the beginning.
+        if isinstance(path, BytesIO):
+            path.seek(0)
+        try:
+            return load_audio_pyav(path, sr=sr, mono=mono)
+        except Exception as pyav_exc:
+            raise ValueError("Invalid or unsupported audio file.") from pyav_exc
 
 
 class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
@@ -128,9 +168,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
         self.kwargs = kwargs
 
     def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
-        if is_video(data):
-            return extract_audio_from_video_bytes(data)
-        return librosa.load(BytesIO(data), sr=None)
+        return load_audio(BytesIO(data), sr=None)
 
     def load_base64(
         self,
@@ -140,7 +178,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
         return self.load_bytes(pybase64.b64decode(data))
 
     def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
-        return librosa.load(filepath, sr=None)
+        return load_audio(filepath, sr=None)
 
     def encode_base64(
         self,
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 6a588dad0..9e1774e39 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -497,7 +497,7 @@ class MultiModalDataParser:
         *,
         target_sr: float | None = None,
         target_channels: int | None = None,
-        audio_resample_method: Literal["librosa", "scipy"] = "librosa",
+        audio_resample_method: Literal["pyav", "scipy"] = "pyav",
         video_needs_metadata: bool = False,
         expected_hidden_size: int | None = None,
     ) -> None:
diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py
index b468712ad..63946e8fd 100644
--- a/vllm/renderers/base.py
+++ b/vllm/renderers/base.py
@@ -172,9 +172,6 @@ class BaseRenderer(ABC, Generic[_T]):
 
         For chat requests:
         - Jinja2 template compilation
-
-        For multi-modal requests:
-        - Importing libraries such as librosa triggers JIT compilation.
         """
         from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
 
diff --git a/vllm/transformers_utils/processors/fireredasr2.py b/vllm/transformers_utils/processors/fireredasr2.py
index 4bde53015..bba7e7ee0 100644
--- a/vllm/transformers_utils/processors/fireredasr2.py
+++ b/vllm/transformers_utils/processors/fireredasr2.py
@@ -188,7 +188,7 @@ class FireRedASR2FeatureExtractor(SequenceFeatureExtractor):
         for speech in raw_speech:
             """
             We must multiply by 32768 here because FireRedASR2 loads audio data
-            using kaldiio.load_mat, while vLLM loads audio data using librosa.
+            using kaldiio.load_mat, while vLLM loads audio data using pyav.
             """
             speech = speech * 32768
             fbank = self.fbank(sampling_rate, speech)