[Frontend] Remove librosa from audio dependency (#37058)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2026-03-21 11:36:15 +08:00
committed by GitHub
parent 1c472f8fe1
commit c7f98b4d0a
18 changed files with 247 additions and 188 deletions

View File

@@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test
pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
resampy # required for audio tests
sentence-transformers>=5.2.0 # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests

View File

@@ -544,6 +544,7 @@ numba==0.61.2
# via
# -r requirements/test.in
# librosa
# resampy
numpy==2.2.6
# via
# -r requirements/test.in
@@ -584,6 +585,7 @@ numpy==2.2.6
# pyogrio
# pywavelets
# rasterio
# resampy
# rioxarray
# rouge-score
# runai-model-streamer
@@ -995,6 +997,8 @@ requests==2.32.3
# tiktoken
# transformers
# wandb
resampy==0.4.3
# via -r requirements/test.in
responses==0.25.3
# via genai-perf
rfc3339-validator==0.1.4

View File

@@ -987,11 +987,11 @@ setup(
"instanttensor": ["instanttensor >= 0.1.5"],
"runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"],
"audio": [
"librosa",
"av",
"resampy",
"scipy",
"soundfile",
"mistral_common[audio]",
"av",
], # Required for audio processing
"video": [], # Kept for backwards compatibility
"flashinfer": [], # Kept for backwards compatibility

View File

@@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name)
model_name,
foscolo,
language="it",
expected_text="ove il mio corpo fanciulletto giacque",
expected_text="ove il mio corpo fanciulletto",
)

View File

@@ -275,7 +275,7 @@ INPUT_REASONING_BATCH = "\n".join(
]
)
MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA="
MINIMAL_WAV_BASE64 = "UklGRigAAABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YQQAAAAAAP9/"
INPUT_TRANSCRIPTION_BATCH = (
json.dumps(
{

View File

@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
resampler = AudioResampler(
target_sr=16000,
method="librosa",
)
resampler = AudioResampler(target_sr=16000)
audios = [asset.audio_and_sample_rate for asset in audio_assets]
resampled_audios = [
(

View File

@@ -10,6 +10,8 @@ import pytest
from vllm.multimodal.media import AudioMediaIO
from ...conftest import AudioTestAssets
pytestmark = pytest.mark.cpu_test
ASSETS_DIR = Path(__file__).parent.parent / "assets"
@@ -22,40 +24,32 @@ def dummy_audio():
@pytest.fixture
def dummy_audio_bytes():
return b"FAKEAUDIOBYTES"
def dummy_audio_bytes(audio_assets: AudioTestAssets):
with open(audio_assets[0].get_local_path(), "rb") as f:
return f.read()
def test_audio_media_io_load_bytes(dummy_audio_bytes):
audio_io = AudioMediaIO()
with patch("librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_bytes(dummy_audio_bytes)
mock_load.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
out = audio_io.load_bytes(dummy_audio_bytes)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_base64(dummy_audio_bytes):
audio_io = AudioMediaIO()
encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes:
mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_base64("audio/wav", encoded)
mock_load_bytes.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
out = audio_io.load_base64("audio/wav", encoded)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_file():
def test_audio_media_io_load_file(audio_assets: AudioTestAssets):
audio_io = AudioMediaIO()
path = Path("/fake/path.wav")
with patch("librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_file(path)
mock_load.assert_called_once_with(path, sr=None)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
path = audio_assets[0].get_local_path()
out = audio_io.load_file(path)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_encode_base64(dummy_audio):

View File

@@ -14,7 +14,7 @@ from vllm.multimodal.audio import (
AudioSpec,
ChannelReduction,
normalize_audio,
resample_audio_librosa,
resample_audio_pyav,
resample_audio_scipy,
split_audio,
)
@@ -25,14 +25,14 @@ def dummy_audio():
return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float)
def test_resample_audio_librosa(dummy_audio):
with patch("vllm.multimodal.audio.librosa.resample") as mock_resample:
mock_resample.return_value = dummy_audio * 2
out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050)
mock_resample.assert_called_once_with(
dummy_audio, orig_sr=44100, target_sr=22050
)
assert np.all(out == dummy_audio * 2)
def test_resample_audio_pyav(dummy_audio):
    """PyAV resampling scales the sample count by the rate ratio and is a
    no-op when the source and target rates already match."""
    downsampled = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=2)
    upsampled = resample_audio_pyav(dummy_audio, orig_sr=2, target_sr=4)
    unchanged = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=4)

    assert len(downsampled) == 3
    assert len(upsampled) == 10
    assert np.all(unchanged == dummy_audio)
def test_resample_audio_scipy(dummy_audio):
@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
assert np.isfinite(out).all()
def test_audio_resampler_librosa_calls_resample(dummy_audio):
resampler = AudioResampler(target_sr=22050, method="librosa")
with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample:
def test_audio_resampler_pyav_calls_resample(dummy_audio):
resampler = AudioResampler(target_sr=22050, method="pyav")
with patch("vllm.multimodal.audio.resample_audio_pyav") as mock_resample:
mock_resample.return_value = dummy_audio
out = resampler.resample(dummy_audio, orig_sr=44100)
mock_resample.assert_called_once_with(
@@ -423,13 +423,13 @@ class TestAudioPipelineE2E:
# Verify channel averaging: mean of [0.5, -0.5] = 0.0
np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
def test_librosa_mono_passthrough_e2e(self):
"""Full pipeline: librosa mono format → preserved as mono."""
def test_pyav_mono_passthrough_e2e(self):
"""Full pipeline: pyav mono format → preserved as mono."""
from vllm.multimodal.parse import MultiModalDataParser
# Simulate librosa output: already mono (time,) format
mono_librosa = np.random.randn(16000).astype(np.float32)
assert mono_librosa.shape == (16000,)
# Simulate pyav output: already mono (time,) format
mono_pyav = np.random.randn(16000).astype(np.float32)
assert mono_pyav.shape == (16000,)
# Create parser with mono normalization
parser = MultiModalDataParser(
@@ -438,7 +438,7 @@ class TestAudioPipelineE2E:
)
# Process audio through the parser
result = parser._parse_audio_data((mono_librosa, 16000))
result = parser._parse_audio_data((mono_pyav, 16000))
audio_output = result.get(0)
# Verify output is still mono 1D
@@ -446,7 +446,7 @@ class TestAudioPipelineE2E:
assert audio_output.shape == (16000,)
# Verify audio content is preserved
np.testing.assert_array_almost_equal(audio_output, mono_librosa)
np.testing.assert_array_almost_equal(audio_output, mono_pyav)
def test_multichannel_5_1_surround_to_mono_e2e(self):
"""Full pipeline: 5.1 surround (6 channels) → mono output."""

View File

@@ -8,15 +8,10 @@ from urllib.parse import urljoin
import numpy.typing as npt
from vllm.utils.import_utils import PlaceholderModule
from vllm.multimodal.media.audio import load_audio
from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
ASSET_DIR = "multimodal_asset"
AudioAssetName = Literal["winning_call", "mary_had_lamb"]
@@ -33,7 +28,7 @@ class AudioAsset:
@property
def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
return librosa.load(audio_path, sr=None)
return load_audio(audio_path, sr=None)
def get_local_path(self) -> Path:
return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)

View File

@@ -10,15 +10,10 @@ import numpy.typing as npt
from huggingface_hub import hf_hub_download
from PIL import Image
from vllm.utils.import_utils import PlaceholderModule
from vllm.multimodal.media.audio import load_audio_pyav
from .base import get_cache_dir
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
@lru_cache
def download_video_asset(filename: str) -> str:
@@ -146,4 +141,4 @@ class VideoAsset:
See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
"""
return librosa.load(self.video_path, sr=sampling_rate)[0]
return load_audio_pyav(self.video_path, sr=sampling_rate)[0]

View File

@@ -38,6 +38,7 @@ from typing_extensions import deprecated
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.audio import get_audio_duration
from vllm.multimodal.image import convert_image_mode
from vllm.tokenizers import TokenizerLike
from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -54,10 +55,6 @@ try:
except ImportError:
pd = PlaceholderModule("pandas")
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa")
logger = logging.getLogger(__name__)
@@ -3253,7 +3250,7 @@ class ASRDataset(HuggingFaceDataset):
break
audio = item["audio"]
y, sr = audio["array"], audio["sampling_rate"]
duration_s = librosa.get_duration(y=y, sr=sr)
duration_s = get_audio_duration(y=y, sr=sr)
if duration_s < asr_min_audio_len_sec or duration_s > asr_max_audio_len_sec:
skipped += 1
continue

View File

@@ -42,32 +42,13 @@ from vllm.inputs import EncoderDecoderInputs, ProcessorInputs
from vllm.logger import init_logger
from vllm.logprobs import FlatLogprobs, Logprob
from vllm.model_executor.models import SupportsTranscription
from vllm.multimodal.audio import split_audio
from vllm.multimodal.media.audio import extract_audio_from_video_bytes
from vllm.multimodal.audio import get_audio_duration, split_audio
from vllm.multimodal.media.audio import load_audio
from vllm.outputs import RequestOutput
from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt
from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tokenizers import get_tokenizer
from vllm.utils.import_utils import PlaceholderModule
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
try:
import soundfile as sf
except ImportError:
sf = PlaceholderModule("soundfile") # type: ignore[assignment]
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}
SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
SpeechToTextResponseVerbose: TypeAlias = (
@@ -214,32 +195,13 @@ class OpenAISpeechToText(OpenAIServing):
# pre-requisite for chunking, as it assumes Whisper SR.
try:
with io.BytesIO(audio_data) as buf:
y, sr = librosa.load(buf, sr=self.asr_config.sample_rate) # type: ignore[return-value]
except sf.LibsndfileError as exc:
# Only fall back for known format-detection failures.
# Re-raise anything else (e.g. corrupt but recognised format).
if exc.code not in _BAD_SF_CODES:
raise
logger.debug(
"librosa/soundfile could not decode audio from BytesIO "
"(code=%s: %s); falling back to pyav in-process decode",
exc.code,
exc,
)
try:
native_y, native_sr = extract_audio_from_video_bytes(audio_data)
sr = self.asr_config.sample_rate
y = librosa.resample(native_y, orig_sr=native_sr, target_sr=sr)
except Exception as pyav_exc:
logger.debug(
"pyAV fallback also failed: %s",
pyav_exc,
)
raise ValueError("Invalid or unsupported audio file.") from pyav_exc
y, sr = load_audio(buf, sr=self.asr_config.sample_rate)
except Exception as exc:
raise ValueError("Invalid or unsupported audio file.") from exc
duration = librosa.get_duration(y=y, sr=sr)
do_split_audio = (
self.asr_config.allow_audio_chunking
duration = get_audio_duration(y=y, sr=sr)
do_split_audio = self.asr_config.allow_audio_chunking and (
self.asr_config.max_audio_clip_s is not None
and duration > self.asr_config.max_audio_clip_s
)

View File

@@ -12,6 +12,7 @@ import math
import warnings
from collections.abc import Iterable, Mapping, Sequence
from functools import cached_property
from io import BytesIO
from typing import Annotated, Literal, TypeAlias
import torch
@@ -53,7 +54,7 @@ from vllm.multimodal.inputs import (
MultiModalKwargsItems,
VideoItem,
)
from vllm.multimodal.media.audio import extract_audio_from_video_bytes
from vllm.multimodal.media.audio import load_audio_pyav
from vllm.multimodal.parse import (
AudioProcessorItems,
ImageEmbeddingItems,
@@ -553,7 +554,7 @@ class NanoNemotronVLMultiModalProcessor(
"video must be loaded with keep_video_bytes=True (e.g. via "
"the chat API with a model that sets use_audio_in_video)."
)
audio_items.append(extract_audio_from_video_bytes(video_bytes))
audio_items.append(load_audio_pyav(BytesIO(video_bytes)))
# Create a new VideoProcessorItems with metadata that does not contain
# the large video bytes, to avoid modifying the input `mm_items`.

View File

@@ -12,17 +12,35 @@ import torch
from vllm.utils.import_utils import PlaceholderModule
try:
import librosa
import av as av
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
av = PlaceholderModule("av") # type: ignore[assignment]
try:
import resampy
except ImportError:
resampy = PlaceholderModule("resampy") # type: ignore[assignment]
try:
import scipy.signal as scipy_signal
except ImportError:
scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal") # type: ignore[assignment]
# ============================================================
# Aligned with `librosa.get_duration` function
def get_audio_duration(*, y: npt.NDArray[np.floating], sr: float = 22050) -> float:
    """Compute how long an audio signal lasts, in seconds.

    Args:
        y: Audio time series; 1D ``(samples,)`` or 2D ``(channels, samples)``.
        sr: Sample rate of the audio in Hz.

    Returns:
        The duration in seconds (``samples / sr``).
    """
    # The trailing axis holds the samples regardless of channel layout.
    return y.shape[-1] / float(sr)
class ChannelReduction(str, Enum):
@@ -153,13 +171,71 @@ def normalize_audio(
# ============================================================
def resample_audio_pyav(
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
    target_sr: float,
) -> npt.NDArray[np.floating]:
    """Resample audio using PyAV (libswresample via FFmpeg).

    Args:
        audio: Input audio. Either:
            - 1D array ``(samples,)``: mono audio
            - 2D array ``(channels, samples)``: multi-channel audio
        orig_sr: Original sample rate in Hz.
        target_sr: Target sample rate in Hz.

    Returns:
        Resampled audio with the same dimensionality as the input
        (1D in → 1D out, 2D in → 2D out).
    """
    src_rate = int(round(orig_sr))
    dst_rate = int(round(target_sr))
    if src_rate == dst_rate:
        return audio

    if audio.ndim == 2:
        # Multi-channel input: resample each channel on its own, then re-stack.
        channels = [
            resample_audio_pyav(ch, orig_sr=orig_sr, target_sr=target_sr)
            for ch in audio
        ]
        return np.stack(channels, axis=0)

    expected_len = int(math.ceil(audio.shape[-1] * dst_rate / src_rate))

    # libswresample needs a minimum number of input samples before it will
    # emit output frames; zero-pad short signals so output is always
    # produced, then trim back to the expected length below.
    _MIN_SAMPLES = 1024
    samples = np.asarray(audio, dtype=np.float32)
    if len(samples) < _MIN_SAMPLES:
        samples = np.pad(samples, (0, _MIN_SAMPLES - len(samples)))

    # from_ndarray expects shape (channels, samples) for planar formats.
    samples = samples.reshape(1, -1)
    resampler = av.AudioResampler(format="fltp", layout="mono", rate=dst_rate)
    frame = av.AudioFrame.from_ndarray(samples, format="fltp", layout="mono")
    frame.sample_rate = src_rate

    out_frames = resampler.resample(frame)
    out_frames.extend(resampler.resample(None))  # flush buffered samples
    result = np.concatenate([f.to_ndarray() for f in out_frames], axis=1).squeeze(0)
    return result[:expected_len]
def resample_audio_resampy(
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
    target_sr: float,
) -> npt.NDArray[np.floating]:
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` via resampy."""
    resampled = resampy.resample(audio, sr_orig=orig_sr, sr_new=target_sr)
    return resampled
def resample_audio_scipy(
@@ -167,7 +243,7 @@ def resample_audio_scipy(
*,
orig_sr: float,
target_sr: float,
):
) -> npt.NDArray[np.floating]:
if orig_sr > target_sr:
return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr)
elif orig_sr < target_sr:
@@ -181,7 +257,7 @@ class AudioResampler:
def __init__(
self,
target_sr: float | None = None,
method: Literal["librosa", "scipy"] = "librosa",
method: Literal["pyav", "resampy", "scipy"] = "resampy",
):
self.target_sr = target_sr
self.method = method
@@ -203,8 +279,10 @@ class AudioResampler:
abs_tol=1e-6,
):
return audio
if self.method == "librosa":
return resample_audio_librosa(
if self.method == "pyav":
return resample_audio_pyav(audio, orig_sr=orig_sr, target_sr=self.target_sr)
if self.method == "resampy":
return resample_audio_resampy(
audio, orig_sr=orig_sr, target_sr=self.target_sr
)
elif self.method == "scipy":
@@ -214,7 +292,7 @@ class AudioResampler:
else:
raise ValueError(
f"Invalid resampling method: {self.method}. "
"Supported methods are 'librosa' and 'scipy'."
"Supported methods are 'pyav' and 'scipy'."
)

View File

@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from io import BytesIO
from pathlib import Path
@@ -14,58 +15,80 @@ from vllm.utils.serial_utils import tensor2base64
from .base import MediaIO
try:
import librosa
import av
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
av = PlaceholderModule("av") # type: ignore[assignment]
try:
import soundfile
except ImportError:
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
try:
import av
import resampy
except ImportError:
av = PlaceholderModule("av") # type: ignore[assignment]
resampy = PlaceholderModule("resampy") # type: ignore[assignment]
def extract_audio_from_video_bytes(
data: bytes,
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}
def load_audio_pyav(
path: BytesIO | Path | str,
*,
sr: float | None = 22050,
mono: bool = True,
) -> tuple[npt.NDArray, float]:
"""Extract the audio track from raw video bytes using PyAV.
"""Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.
PyAV wraps FFmpeg's C libraries in-process — no subprocess is
spawned, which is critical to avoid crashing CUDA-active vLLM
worker processes.
The returned waveform is at the native sample rate of the video's
audio stream. Resampling to a model-specific rate is left to the
downstream :class:`AudioResampler` in the parsing pipeline.
Decodes the audio stream at its native sample rate. Channel reduction to
mono is performed by averaging across channels. Resampling to a
model-specific rate is left to the downstream :class:`AudioResampler`.
Args:
data: Raw video file bytes (e.g. from an mp4 file).
path: A :class:`~io.BytesIO` buffer, a filesystem
:class:`~pathlib.Path`, or a string path.
Returns:
A tuple of ``(waveform, sample_rate)`` suitable for use as an
:class:`AudioItem`.
``(waveform, sample_rate)`` where *waveform* is a 1-D float32
NumPy array and *sample_rate* is the native sample rate in Hz.
"""
if data is None or len(data) == 0:
raise ValueError(
"Cannot extract audio: video bytes are missing or empty. "
"Ensure video was loaded with keep_video_bytes=True for "
"audio-in-video extraction."
)
native_sr = None
try:
with av.open(BytesIO(data)) as container:
with av.open(path) as container:
if not container.streams.audio:
raise ValueError("No audio stream found in the video.")
raise ValueError("No audio stream found.")
stream = container.streams.audio[0]
stream.thread_type = "AUTO"
native_sr = stream.rate
sr = sr or native_sr
chunks: list[npt.NDArray] = []
for frame in container.decode(audio=0):
arr = frame.to_ndarray()
chunks.append(arr.mean(axis=0) if arr.ndim > 1 else arr)
needs_resampling = not math.isclose(
float(sr),
float(native_sr),
rel_tol=0.0,
abs_tol=1e-6,
)
resampler = (
av.AudioResampler(format="fltp", layout="mono", rate=sr)
if needs_resampling
else None
)
for frame in container.decode(stream):
if needs_resampling:
assert resampler is not None
for out_frame in resampler.resample(frame):
chunks.append(out_frame.to_ndarray())
else:
chunks.append(frame.to_ndarray())
except ValueError:
raise
except Exception as e:
@@ -77,37 +100,54 @@ def extract_audio_from_video_bytes(
if not chunks:
raise ValueError("No audio found in the video.")
audio = np.concatenate(chunks).astype(np.float32)
return audio, float(native_sr)
audio = np.concatenate(chunks, axis=-1).astype(np.float32)
if mono and audio.ndim > 1:
audio = np.mean(audio, axis=0)
return audio, sr
def is_video(data: bytes) -> bool:
"""Check if the fetched bytes are video"""
if len(data) < 12:
return False
def load_audio_soundfile(
path: BytesIO | Path | str,
*,
sr: float | None = 22050,
mono: bool = True,
) -> tuple[np.ndarray, int]:
"""Load audio via soundfile"""
with soundfile.SoundFile(path) as f:
native_sr = f.samplerate
y = f.read(dtype="float32", always_2d=False).T
box_type = data[4:8]
major_brand = data[8:12]
if mono and y.ndim > 1:
y = np.mean(y, axis=tuple(range(y.ndim - 1)))
MP4_BRANDS = {
b"mp41",
b"mp42", # MP4
b"isom", # ISO Base Media
b"iso2",
b"iso4",
b"iso5",
b"iso6",
b"M4V ",
b"M4A ", # Apple
b"avc1", # H.264
b"dash", # DASH
b"mmp4",
b"MSNV",
}
if sr is not None and sr != native_sr:
y = resampy.resample(y, sr_orig=native_sr, sr_new=sr)
return y, int(sr)
return y, native_sr
is_avi = data[:4] == b"RIFF" and major_brand == b"AVI "
is_mp4 = box_type == b"ftyp" and major_brand in MP4_BRANDS
return is_mp4 or is_avi
def load_audio(
    path: BytesIO | Path | str,
    *,
    sr: float | None = 22050,
    mono: bool = True,
):
    """Load audio, preferring soundfile and falling back to PyAV.

    soundfile (libsndfile) decodes common audio containers quickly; when it
    reports a known format-detection failure, PyAV (FFmpeg) is tried so
    containers libsndfile does not recognise can still be loaded.

    Args:
        path: A :class:`~io.BytesIO` buffer, :class:`~pathlib.Path`, or
            string path.
        sr: Target sample rate in Hz, or ``None`` to keep the native rate.
        mono: Whether to down-mix multi-channel audio to mono.

    Returns:
        ``(waveform, sample_rate)``.

    Raises:
        ValueError: If the PyAV fallback also fails to decode the input.
    """
    try:
        return load_audio_soundfile(path, sr=sr, mono=mono)
    except soundfile.LibsndfileError as exc:
        # Fall back only on known format-detection failures; anything else
        # (e.g. a corrupt file in a recognised format) is re-raised as-is.
        if exc.code not in _BAD_SF_CODES:
            raise
        # soundfile may have advanced the BytesIO seek position before
        # failing; rewind so PyAV reads from the beginning.
        if isinstance(path, BytesIO):
            path.seek(0)
        try:
            return load_audio_pyav(path, sr=sr, mono=mono)
        except Exception as pyav_exc:
            raise ValueError("Invalid or unsupported audio file.") from pyav_exc
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
@@ -128,9 +168,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
self.kwargs = kwargs
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
if is_video(data):
return extract_audio_from_video_bytes(data)
return librosa.load(BytesIO(data), sr=None)
return load_audio(BytesIO(data), sr=None)
def load_base64(
self,
@@ -140,7 +178,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
return self.load_bytes(pybase64.b64decode(data))
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
return librosa.load(filepath, sr=None)
return load_audio(filepath, sr=None)
def encode_base64(
self,

View File

@@ -497,7 +497,7 @@ class MultiModalDataParser:
*,
target_sr: float | None = None,
target_channels: int | None = None,
audio_resample_method: Literal["librosa", "scipy"] = "librosa",
audio_resample_method: Literal["pyav", "scipy"] = "pyav",
video_needs_metadata: bool = False,
expected_hidden_size: int | None = None,
) -> None:

View File

@@ -172,9 +172,6 @@ class BaseRenderer(ABC, Generic[_T]):
For chat requests:
- Jinja2 template compilation
For multi-modal requests:
- Importing libraries such as librosa triggers JIT compilation.
"""
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError

View File

@@ -188,7 +188,7 @@ class FireRedASR2FeatureExtractor(SequenceFeatureExtractor):
for speech in raw_speech:
"""
We must multiply by 32768 here because FireRedASR2 loads audio data
using kaldiio.load_mat, while vLLM loads audio data using librosa.
using kaldiio.load_mat, while vLLM loads audio data using pyav.
"""
speech = speech * 32768
fbank = self.fbank(sampling_rate, speech)