[Frontend] Remove librosa from audio dependency (#37058)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test
|
||||
peft>=0.15.0 # required for phi-4-mm test
|
||||
pqdm
|
||||
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
|
||||
resampy # required for audio tests
|
||||
sentence-transformers>=5.2.0 # required for embedding tests
|
||||
soundfile # required for audio tests
|
||||
jiwer # required for audio tests
|
||||
|
||||
@@ -544,6 +544,7 @@ numba==0.61.2
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# librosa
|
||||
# resampy
|
||||
numpy==2.2.6
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
@@ -584,6 +585,7 @@ numpy==2.2.6
|
||||
# pyogrio
|
||||
# pywavelets
|
||||
# rasterio
|
||||
# resampy
|
||||
# rioxarray
|
||||
# rouge-score
|
||||
# runai-model-streamer
|
||||
@@ -995,6 +997,8 @@ requests==2.32.3
|
||||
# tiktoken
|
||||
# transformers
|
||||
# wandb
|
||||
resampy==0.4.3
|
||||
# via -r requirements/test.in
|
||||
responses==0.25.3
|
||||
# via genai-perf
|
||||
rfc3339-validator==0.1.4
|
||||
|
||||
4
setup.py
4
setup.py
@@ -987,11 +987,11 @@ setup(
|
||||
"instanttensor": ["instanttensor >= 0.1.5"],
|
||||
"runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"],
|
||||
"audio": [
|
||||
"librosa",
|
||||
"av",
|
||||
"resampy",
|
||||
"scipy",
|
||||
"soundfile",
|
||||
"mistral_common[audio]",
|
||||
"av",
|
||||
], # Required for audio processing
|
||||
"video": [], # Kept for backwards compatibility
|
||||
"flashinfer": [], # Kept for backwards compatibility
|
||||
|
||||
@@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name)
|
||||
model_name,
|
||||
foscolo,
|
||||
language="it",
|
||||
expected_text="ove il mio corpo fanciulletto giacque",
|
||||
expected_text="ove il mio corpo fanciulletto",
|
||||
)
|
||||
|
||||
@@ -275,7 +275,7 @@ INPUT_REASONING_BATCH = "\n".join(
|
||||
]
|
||||
)
|
||||
|
||||
MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA="
|
||||
MINIMAL_WAV_BASE64 = "UklGRigAAABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YQQAAAAAAP9/"
|
||||
INPUT_TRANSCRIPTION_BATCH = (
|
||||
json.dumps(
|
||||
{
|
||||
|
||||
@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
resampler = AudioResampler(
|
||||
target_sr=16000,
|
||||
method="librosa",
|
||||
)
|
||||
resampler = AudioResampler(target_sr=16000)
|
||||
audios = [asset.audio_and_sample_rate for asset in audio_assets]
|
||||
resampled_audios = [
|
||||
(
|
||||
|
||||
@@ -10,6 +10,8 @@ import pytest
|
||||
|
||||
from vllm.multimodal.media import AudioMediaIO
|
||||
|
||||
from ...conftest import AudioTestAssets
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
ASSETS_DIR = Path(__file__).parent.parent / "assets"
|
||||
@@ -22,40 +24,32 @@ def dummy_audio():
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_audio_bytes():
|
||||
return b"FAKEAUDIOBYTES"
|
||||
def dummy_audio_bytes(audio_assets: AudioTestAssets):
|
||||
with open(audio_assets[0].get_local_path(), "rb") as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
def test_audio_media_io_load_bytes(dummy_audio_bytes):
|
||||
audio_io = AudioMediaIO()
|
||||
with patch("librosa.load") as mock_load:
|
||||
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
|
||||
out = audio_io.load_bytes(dummy_audio_bytes)
|
||||
mock_load.assert_called_once()
|
||||
assert isinstance(out[0], np.ndarray)
|
||||
assert out[1] == 16000
|
||||
out = audio_io.load_bytes(dummy_audio_bytes)
|
||||
assert isinstance(out[0], np.ndarray)
|
||||
assert out[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_load_base64(dummy_audio_bytes):
|
||||
audio_io = AudioMediaIO()
|
||||
encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
|
||||
with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes:
|
||||
mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000)
|
||||
out = audio_io.load_base64("audio/wav", encoded)
|
||||
mock_load_bytes.assert_called_once()
|
||||
assert isinstance(out[0], np.ndarray)
|
||||
assert out[1] == 16000
|
||||
out = audio_io.load_base64("audio/wav", encoded)
|
||||
assert isinstance(out[0], np.ndarray)
|
||||
assert out[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_load_file():
|
||||
def test_audio_media_io_load_file(audio_assets: AudioTestAssets):
|
||||
audio_io = AudioMediaIO()
|
||||
path = Path("/fake/path.wav")
|
||||
with patch("librosa.load") as mock_load:
|
||||
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
|
||||
out = audio_io.load_file(path)
|
||||
mock_load.assert_called_once_with(path, sr=None)
|
||||
assert isinstance(out[0], np.ndarray)
|
||||
assert out[1] == 16000
|
||||
path = audio_assets[0].get_local_path()
|
||||
out = audio_io.load_file(path)
|
||||
assert isinstance(out[0], np.ndarray)
|
||||
assert out[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_encode_base64(dummy_audio):
|
||||
|
||||
@@ -14,7 +14,7 @@ from vllm.multimodal.audio import (
|
||||
AudioSpec,
|
||||
ChannelReduction,
|
||||
normalize_audio,
|
||||
resample_audio_librosa,
|
||||
resample_audio_pyav,
|
||||
resample_audio_scipy,
|
||||
split_audio,
|
||||
)
|
||||
@@ -25,14 +25,14 @@ def dummy_audio():
|
||||
return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float)
|
||||
|
||||
|
||||
def test_resample_audio_librosa(dummy_audio):
|
||||
with patch("vllm.multimodal.audio.librosa.resample") as mock_resample:
|
||||
mock_resample.return_value = dummy_audio * 2
|
||||
out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050)
|
||||
mock_resample.assert_called_once_with(
|
||||
dummy_audio, orig_sr=44100, target_sr=22050
|
||||
)
|
||||
assert np.all(out == dummy_audio * 2)
|
||||
def test_resample_audio_pyav(dummy_audio):
|
||||
out_down = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=2)
|
||||
out_up = resample_audio_pyav(dummy_audio, orig_sr=2, target_sr=4)
|
||||
out_same = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=4)
|
||||
|
||||
assert len(out_down) == 3
|
||||
assert len(out_up) == 10
|
||||
assert np.all(out_same == dummy_audio)
|
||||
|
||||
|
||||
def test_resample_audio_scipy(dummy_audio):
|
||||
@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
|
||||
assert np.isfinite(out).all()
|
||||
|
||||
|
||||
def test_audio_resampler_librosa_calls_resample(dummy_audio):
|
||||
resampler = AudioResampler(target_sr=22050, method="librosa")
|
||||
with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample:
|
||||
def test_audio_resampler_pyav_calls_resample(dummy_audio):
|
||||
resampler = AudioResampler(target_sr=22050, method="pyav")
|
||||
with patch("vllm.multimodal.audio.resample_audio_pyav") as mock_resample:
|
||||
mock_resample.return_value = dummy_audio
|
||||
out = resampler.resample(dummy_audio, orig_sr=44100)
|
||||
mock_resample.assert_called_once_with(
|
||||
@@ -423,13 +423,13 @@ class TestAudioPipelineE2E:
|
||||
# Verify channel averaging: mean of [0.5, -0.5] = 0.0
|
||||
np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
|
||||
|
||||
def test_librosa_mono_passthrough_e2e(self):
|
||||
"""Full pipeline: librosa mono format → preserved as mono."""
|
||||
def test_pyav_mono_passthrough_e2e(self):
|
||||
"""Full pipeline: pyav mono format → preserved as mono."""
|
||||
from vllm.multimodal.parse import MultiModalDataParser
|
||||
|
||||
# Simulate librosa output: already mono (time,) format
|
||||
mono_librosa = np.random.randn(16000).astype(np.float32)
|
||||
assert mono_librosa.shape == (16000,)
|
||||
# Simulate pyav output: already mono (time,) format
|
||||
mono_pyav = np.random.randn(16000).astype(np.float32)
|
||||
assert mono_pyav.shape == (16000,)
|
||||
|
||||
# Create parser with mono normalization
|
||||
parser = MultiModalDataParser(
|
||||
@@ -438,7 +438,7 @@ class TestAudioPipelineE2E:
|
||||
)
|
||||
|
||||
# Process audio through the parser
|
||||
result = parser._parse_audio_data((mono_librosa, 16000))
|
||||
result = parser._parse_audio_data((mono_pyav, 16000))
|
||||
audio_output = result.get(0)
|
||||
|
||||
# Verify output is still mono 1D
|
||||
@@ -446,7 +446,7 @@ class TestAudioPipelineE2E:
|
||||
assert audio_output.shape == (16000,)
|
||||
|
||||
# Verify audio content is preserved
|
||||
np.testing.assert_array_almost_equal(audio_output, mono_librosa)
|
||||
np.testing.assert_array_almost_equal(audio_output, mono_pyav)
|
||||
|
||||
def test_multichannel_5_1_surround_to_mono_e2e(self):
|
||||
"""Full pipeline: 5.1 surround (6 channels) → mono output."""
|
||||
|
||||
@@ -8,15 +8,10 @@ from urllib.parse import urljoin
|
||||
|
||||
import numpy.typing as npt
|
||||
|
||||
from vllm.utils.import_utils import PlaceholderModule
|
||||
from vllm.multimodal.media.audio import load_audio
|
||||
|
||||
from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
|
||||
|
||||
try:
|
||||
import librosa
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||
|
||||
ASSET_DIR = "multimodal_asset"
|
||||
|
||||
AudioAssetName = Literal["winning_call", "mary_had_lamb"]
|
||||
@@ -33,7 +28,7 @@ class AudioAsset:
|
||||
@property
|
||||
def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
|
||||
audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
|
||||
return librosa.load(audio_path, sr=None)
|
||||
return load_audio(audio_path, sr=None)
|
||||
|
||||
def get_local_path(self) -> Path:
|
||||
return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
|
||||
|
||||
@@ -10,15 +10,10 @@ import numpy.typing as npt
|
||||
from huggingface_hub import hf_hub_download
|
||||
from PIL import Image
|
||||
|
||||
from vllm.utils.import_utils import PlaceholderModule
|
||||
from vllm.multimodal.media.audio import load_audio_pyav
|
||||
|
||||
from .base import get_cache_dir
|
||||
|
||||
try:
|
||||
import librosa
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||
|
||||
|
||||
@lru_cache
|
||||
def download_video_asset(filename: str) -> str:
|
||||
@@ -146,4 +141,4 @@ class VideoAsset:
|
||||
|
||||
See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
|
||||
"""
|
||||
return librosa.load(self.video_path, sr=sampling_rate)[0]
|
||||
return load_audio_pyav(self.video_path, sr=sampling_rate)[0]
|
||||
|
||||
@@ -38,6 +38,7 @@ from typing_extensions import deprecated
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.utils import get_adapter_absolute_path
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.multimodal.audio import get_audio_duration
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
@@ -54,10 +55,6 @@ try:
|
||||
except ImportError:
|
||||
pd = PlaceholderModule("pandas")
|
||||
|
||||
try:
|
||||
import librosa
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -3253,7 +3250,7 @@ class ASRDataset(HuggingFaceDataset):
|
||||
break
|
||||
audio = item["audio"]
|
||||
y, sr = audio["array"], audio["sampling_rate"]
|
||||
duration_s = librosa.get_duration(y=y, sr=sr)
|
||||
duration_s = get_audio_duration(y=y, sr=sr)
|
||||
if duration_s < asr_min_audio_len_sec or duration_s > asr_max_audio_len_sec:
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
@@ -42,32 +42,13 @@ from vllm.inputs import EncoderDecoderInputs, ProcessorInputs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import FlatLogprobs, Logprob
|
||||
from vllm.model_executor.models import SupportsTranscription
|
||||
from vllm.multimodal.audio import split_audio
|
||||
from vllm.multimodal.media.audio import extract_audio_from_video_bytes
|
||||
from vllm.multimodal.audio import get_audio_duration, split_audio
|
||||
from vllm.multimodal.media.audio import load_audio
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt
|
||||
from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt
|
||||
from vllm.sampling_params import BeamSearchParams, SamplingParams
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.utils.import_utils import PlaceholderModule
|
||||
|
||||
try:
|
||||
import librosa
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import soundfile as sf
|
||||
except ImportError:
|
||||
sf = PlaceholderModule("soundfile") # type: ignore[assignment]
|
||||
|
||||
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
|
||||
# being librosa's main backend. Used to validate if an audio loading error is due to a
|
||||
# server error vs a client error (invalid audio file).
|
||||
# 1 = unrecognised format (file is not a supported audio container)
|
||||
# 3 = malformed file (corrupt or structurally invalid audio)
|
||||
# 4 = unsupported encoding (codec not supported by this libsndfile build)
|
||||
_BAD_SF_CODES = {1, 3, 4}
|
||||
|
||||
SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
|
||||
SpeechToTextResponseVerbose: TypeAlias = (
|
||||
@@ -214,32 +195,13 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
# pre-requisite for chunking, as it assumes Whisper SR.
|
||||
try:
|
||||
with io.BytesIO(audio_data) as buf:
|
||||
y, sr = librosa.load(buf, sr=self.asr_config.sample_rate) # type: ignore[return-value]
|
||||
except sf.LibsndfileError as exc:
|
||||
# Only fall back for known format-detection failures.
|
||||
# Re-raise anything else (e.g. corrupt but recognised format).
|
||||
if exc.code not in _BAD_SF_CODES:
|
||||
raise
|
||||
logger.debug(
|
||||
"librosa/soundfile could not decode audio from BytesIO "
|
||||
"(code=%s: %s); falling back to pyav in-process decode",
|
||||
exc.code,
|
||||
exc,
|
||||
)
|
||||
try:
|
||||
native_y, native_sr = extract_audio_from_video_bytes(audio_data)
|
||||
sr = self.asr_config.sample_rate
|
||||
y = librosa.resample(native_y, orig_sr=native_sr, target_sr=sr)
|
||||
except Exception as pyav_exc:
|
||||
logger.debug(
|
||||
"pyAV fallback also failed: %s",
|
||||
pyav_exc,
|
||||
)
|
||||
raise ValueError("Invalid or unsupported audio file.") from pyav_exc
|
||||
y, sr = load_audio(buf, sr=self.asr_config.sample_rate)
|
||||
except Exception as exc:
|
||||
raise ValueError("Invalid or unsupported audio file.") from exc
|
||||
|
||||
duration = librosa.get_duration(y=y, sr=sr)
|
||||
do_split_audio = (
|
||||
self.asr_config.allow_audio_chunking
|
||||
duration = get_audio_duration(y=y, sr=sr)
|
||||
do_split_audio = self.asr_config.allow_audio_chunking and (
|
||||
self.asr_config.max_audio_clip_s is not None
|
||||
and duration > self.asr_config.max_audio_clip_s
|
||||
)
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ import math
|
||||
import warnings
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from functools import cached_property
|
||||
from io import BytesIO
|
||||
from typing import Annotated, Literal, TypeAlias
|
||||
|
||||
import torch
|
||||
@@ -53,7 +54,7 @@ from vllm.multimodal.inputs import (
|
||||
MultiModalKwargsItems,
|
||||
VideoItem,
|
||||
)
|
||||
from vllm.multimodal.media.audio import extract_audio_from_video_bytes
|
||||
from vllm.multimodal.media.audio import load_audio_pyav
|
||||
from vllm.multimodal.parse import (
|
||||
AudioProcessorItems,
|
||||
ImageEmbeddingItems,
|
||||
@@ -553,7 +554,7 @@ class NanoNemotronVLMultiModalProcessor(
|
||||
"video must be loaded with keep_video_bytes=True (e.g. via "
|
||||
"the chat API with a model that sets use_audio_in_video)."
|
||||
)
|
||||
audio_items.append(extract_audio_from_video_bytes(video_bytes))
|
||||
audio_items.append(load_audio_pyav(BytesIO(video_bytes)))
|
||||
|
||||
# Create a new VideoProcessorItems with metadata that does not contain
|
||||
# the large video bytes, to avoid modifying the input `mm_items`.
|
||||
|
||||
@@ -12,17 +12,35 @@ import torch
|
||||
from vllm.utils.import_utils import PlaceholderModule
|
||||
|
||||
try:
|
||||
import librosa
|
||||
import av as av
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||
av = PlaceholderModule("av") # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import resampy
|
||||
except ImportError:
|
||||
resampy = PlaceholderModule("resampy") # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import scipy.signal as scipy_signal
|
||||
except ImportError:
|
||||
scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal") # type: ignore[assignment]
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Aligned with `librosa.get_duration` function
|
||||
def get_audio_duration(*, y: npt.NDArray[np.floating], sr: float = 22050) -> float:
|
||||
"""Get the duration of an audio array in seconds.
|
||||
|
||||
Args:
|
||||
y: Audio time series. Can be 1D (samples,) or 2D (channels, samples).
|
||||
sr: Sample rate of the audio in Hz.
|
||||
|
||||
Returns:
|
||||
Duration of the audio in seconds.
|
||||
"""
|
||||
n_samples = y.shape[-1]
|
||||
return float(n_samples) / sr
|
||||
|
||||
|
||||
class ChannelReduction(str, Enum):
|
||||
@@ -153,13 +171,71 @@ def normalize_audio(
|
||||
# ============================================================
|
||||
|
||||
|
||||
def resample_audio_librosa(
|
||||
def resample_audio_pyav(
|
||||
audio: npt.NDArray[np.floating],
|
||||
*,
|
||||
orig_sr: float,
|
||||
target_sr: float,
|
||||
) -> npt.NDArray[np.floating]:
|
||||
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
|
||||
"""Resample audio using PyAV (libswresample via FFmpeg).
|
||||
|
||||
Args:
|
||||
audio: Input audio. Can be:
|
||||
- 1D array ``(samples,)``: mono audio
|
||||
- 2D array ``(channels, samples)``: stereo audio
|
||||
orig_sr: Original sample rate in Hz.
|
||||
target_sr: Target sample rate in Hz.
|
||||
|
||||
Returns:
|
||||
Resampled audio with the same shape as the input (1D → 1D, 2D → 2D).
|
||||
"""
|
||||
orig_sr_int = int(round(orig_sr))
|
||||
target_sr_int = int(round(target_sr))
|
||||
|
||||
if orig_sr_int == target_sr_int:
|
||||
return audio
|
||||
|
||||
if audio.ndim == 2:
|
||||
# Resample each channel independently and re-stack.
|
||||
return np.stack(
|
||||
[
|
||||
resample_audio_pyav(ch, orig_sr=orig_sr, target_sr=target_sr)
|
||||
for ch in audio
|
||||
],
|
||||
axis=0,
|
||||
)
|
||||
|
||||
expected_len = int(math.ceil(audio.shape[-1] * target_sr_int / orig_sr_int))
|
||||
|
||||
# from_ndarray expects shape (channels, samples) for planar formats.
|
||||
# libswresample requires a minimum number of input samples to produce
|
||||
# output frames; pad short inputs with zeros so we always get output,
|
||||
# then trim to the expected output length.
|
||||
_MIN_SAMPLES = 1024
|
||||
audio_f32 = np.asarray(audio, dtype=np.float32)
|
||||
if len(audio_f32) < _MIN_SAMPLES:
|
||||
audio_f32 = np.pad(audio_f32, (0, _MIN_SAMPLES - len(audio_f32)))
|
||||
audio_f32 = audio_f32.reshape(1, -1)
|
||||
|
||||
resampler = av.AudioResampler(format="fltp", layout="mono", rate=target_sr_int)
|
||||
|
||||
frame = av.AudioFrame.from_ndarray(audio_f32, format="fltp", layout="mono")
|
||||
frame.sample_rate = orig_sr_int
|
||||
|
||||
out_frames = resampler.resample(frame)
|
||||
out_frames.extend(resampler.resample(None)) # flush buffered samples
|
||||
|
||||
result = np.concatenate([f.to_ndarray() for f in out_frames], axis=1).squeeze(0)
|
||||
return result[:expected_len]
|
||||
|
||||
|
||||
def resample_audio_resampy(
|
||||
audio: npt.NDArray[np.floating],
|
||||
*,
|
||||
orig_sr: float,
|
||||
target_sr: float,
|
||||
) -> npt.NDArray[np.floating]:
|
||||
return resampy.resample(audio, sr_orig=orig_sr, sr_new=target_sr)
|
||||
|
||||
|
||||
def resample_audio_scipy(
|
||||
@@ -167,7 +243,7 @@ def resample_audio_scipy(
|
||||
*,
|
||||
orig_sr: float,
|
||||
target_sr: float,
|
||||
):
|
||||
) -> npt.NDArray[np.floating]:
|
||||
if orig_sr > target_sr:
|
||||
return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr)
|
||||
elif orig_sr < target_sr:
|
||||
@@ -181,7 +257,7 @@ class AudioResampler:
|
||||
def __init__(
|
||||
self,
|
||||
target_sr: float | None = None,
|
||||
method: Literal["librosa", "scipy"] = "librosa",
|
||||
method: Literal["pyav", "resampy", "scipy"] = "resampy",
|
||||
):
|
||||
self.target_sr = target_sr
|
||||
self.method = method
|
||||
@@ -203,8 +279,10 @@ class AudioResampler:
|
||||
abs_tol=1e-6,
|
||||
):
|
||||
return audio
|
||||
if self.method == "librosa":
|
||||
return resample_audio_librosa(
|
||||
if self.method == "pyav":
|
||||
return resample_audio_pyav(audio, orig_sr=orig_sr, target_sr=self.target_sr)
|
||||
if self.method == "resampy":
|
||||
return resample_audio_resampy(
|
||||
audio, orig_sr=orig_sr, target_sr=self.target_sr
|
||||
)
|
||||
elif self.method == "scipy":
|
||||
@@ -214,7 +292,7 @@ class AudioResampler:
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Invalid resampling method: {self.method}. "
|
||||
"Supported methods are 'librosa' and 'scipy'."
|
||||
"Supported methods are 'pyav' and 'scipy'."
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import math
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
@@ -14,58 +15,80 @@ from vllm.utils.serial_utils import tensor2base64
|
||||
from .base import MediaIO
|
||||
|
||||
try:
|
||||
import librosa
|
||||
import av
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||
av = PlaceholderModule("av") # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import soundfile
|
||||
except ImportError:
|
||||
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
|
||||
|
||||
|
||||
try:
|
||||
import av
|
||||
import resampy
|
||||
except ImportError:
|
||||
av = PlaceholderModule("av") # type: ignore[assignment]
|
||||
resampy = PlaceholderModule("resampy") # type: ignore[assignment]
|
||||
|
||||
|
||||
def extract_audio_from_video_bytes(
|
||||
data: bytes,
|
||||
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
|
||||
# being the main audio backend. Used to validate if an audio loading error is due to a
|
||||
# server error vs a client error (invalid audio file).
|
||||
# 1 = unrecognised format (file is not a supported audio container)
|
||||
# 3 = malformed file (corrupt or structurally invalid audio)
|
||||
# 4 = unsupported encoding (codec not supported by this libsndfile build)
|
||||
_BAD_SF_CODES = {1, 3, 4}
|
||||
|
||||
|
||||
def load_audio_pyav(
|
||||
path: BytesIO | Path | str,
|
||||
*,
|
||||
sr: float | None = 22050,
|
||||
mono: bool = True,
|
||||
) -> tuple[npt.NDArray, float]:
|
||||
"""Extract the audio track from raw video bytes using PyAV.
|
||||
"""Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.
|
||||
|
||||
PyAV wraps FFmpeg's C libraries in-process — no subprocess is
|
||||
spawned, which is critical to avoid crashing CUDA-active vLLM
|
||||
worker processes.
|
||||
|
||||
The returned waveform is at the native sample rate of the video's
|
||||
audio stream. Resampling to a model-specific rate is left to the
|
||||
downstream :class:`AudioResampler` in the parsing pipeline.
|
||||
Decodes the first audio stream and, when ``sr`` is given and differs from
|
||||
the stream's native rate, resamples to ``sr`` with PyAV's resampler. When
|
||||
``mono`` is true, multi-channel output is averaged down to one channel.
|
||||
|
||||
Args:
|
||||
data: Raw video file bytes (e.g. from an mp4 file).
|
||||
path: A :class:`~io.BytesIO` buffer, a filesystem
|
||||
:class:`~pathlib.Path`, or a string path.
|
||||
|
||||
Returns:
|
||||
A tuple of ``(waveform, sample_rate)`` suitable for use as an
|
||||
:class:`AudioItem`.
|
||||
``(waveform, sample_rate)`` where *waveform* is a 1-D float32
|
||||
NumPy array and *sample_rate* is ``sr``, or the native rate if ``sr`` is ``None``.
|
||||
"""
|
||||
if data is None or len(data) == 0:
|
||||
raise ValueError(
|
||||
"Cannot extract audio: video bytes are missing or empty. "
|
||||
"Ensure video was loaded with keep_video_bytes=True for "
|
||||
"audio-in-video extraction."
|
||||
)
|
||||
native_sr = None
|
||||
try:
|
||||
with av.open(BytesIO(data)) as container:
|
||||
with av.open(path) as container:
|
||||
if not container.streams.audio:
|
||||
raise ValueError("No audio stream found in the video.")
|
||||
raise ValueError("No audio stream found.")
|
||||
stream = container.streams.audio[0]
|
||||
stream.thread_type = "AUTO"
|
||||
native_sr = stream.rate
|
||||
sr = sr or native_sr
|
||||
|
||||
chunks: list[npt.NDArray] = []
|
||||
for frame in container.decode(audio=0):
|
||||
arr = frame.to_ndarray()
|
||||
chunks.append(arr.mean(axis=0) if arr.ndim > 1 else arr)
|
||||
needs_resampling = not math.isclose(
|
||||
float(sr),
|
||||
float(native_sr),
|
||||
rel_tol=0.0,
|
||||
abs_tol=1e-6,
|
||||
)
|
||||
resampler = (
|
||||
av.AudioResampler(format="fltp", layout="mono", rate=sr)
|
||||
if needs_resampling
|
||||
else None
|
||||
)
|
||||
for frame in container.decode(stream):
|
||||
if needs_resampling:
|
||||
assert resampler is not None
|
||||
for out_frame in resampler.resample(frame):
|
||||
chunks.append(out_frame.to_ndarray())
|
||||
else:
|
||||
chunks.append(frame.to_ndarray())
|
||||
except ValueError:
|
||||
raise
|
||||
except Exception as e:
|
||||
@@ -77,37 +100,54 @@ def extract_audio_from_video_bytes(
|
||||
if not chunks:
|
||||
raise ValueError("No audio found in the video.")
|
||||
|
||||
audio = np.concatenate(chunks).astype(np.float32)
|
||||
return audio, float(native_sr)
|
||||
audio = np.concatenate(chunks, axis=-1).astype(np.float32)
|
||||
if mono and audio.ndim > 1:
|
||||
audio = np.mean(audio, axis=0)
|
||||
|
||||
return audio, sr
|
||||
|
||||
|
||||
def is_video(data: bytes) -> bool:
|
||||
"""Check if the fetched bytes are video"""
|
||||
if len(data) < 12:
|
||||
return False
|
||||
def load_audio_soundfile(
|
||||
path: BytesIO | Path | str,
|
||||
*,
|
||||
sr: float | None = 22050,
|
||||
mono: bool = True,
|
||||
) -> tuple[np.ndarray, int]:
|
||||
"""Load audio via soundfile"""
|
||||
with soundfile.SoundFile(path) as f:
|
||||
native_sr = f.samplerate
|
||||
y = f.read(dtype="float32", always_2d=False).T
|
||||
|
||||
box_type = data[4:8]
|
||||
major_brand = data[8:12]
|
||||
if mono and y.ndim > 1:
|
||||
y = np.mean(y, axis=tuple(range(y.ndim - 1)))
|
||||
|
||||
MP4_BRANDS = {
|
||||
b"mp41",
|
||||
b"mp42", # MP4
|
||||
b"isom", # ISO Base Media
|
||||
b"iso2",
|
||||
b"iso4",
|
||||
b"iso5",
|
||||
b"iso6",
|
||||
b"M4V ",
|
||||
b"M4A ", # Apple
|
||||
b"avc1", # H.264
|
||||
b"dash", # DASH
|
||||
b"mmp4",
|
||||
b"MSNV",
|
||||
}
|
||||
if sr is not None and sr != native_sr:
|
||||
y = resampy.resample(y, sr_orig=native_sr, sr_new=sr)
|
||||
return y, int(sr)
|
||||
return y, native_sr
|
||||
|
||||
is_avi = data[:4] == b"RIFF" and major_brand == b"AVI "
|
||||
is_mp4 = box_type == b"ftyp" and major_brand in MP4_BRANDS
|
||||
return is_mp4 or is_avi
|
||||
|
||||
def load_audio(
|
||||
path: BytesIO | Path | str,
|
||||
*,
|
||||
sr: float | None = 22050,
|
||||
mono: bool = True,
|
||||
):
|
||||
try:
|
||||
return load_audio_soundfile(path, sr=sr, mono=mono)
|
||||
except soundfile.LibsndfileError as exc:
|
||||
# Only fall back for known format-detection failures.
|
||||
# Re-raise anything else (e.g. corrupt but recognised format).
|
||||
if exc.code not in _BAD_SF_CODES:
|
||||
raise
|
||||
# soundfile may have advanced the BytesIO seek position before failing;
|
||||
# reset it so PyAV can read from the beginning.
|
||||
if isinstance(path, BytesIO):
|
||||
path.seek(0)
|
||||
try:
|
||||
return load_audio_pyav(path, sr=sr, mono=mono)
|
||||
except Exception as pyav_exc:
|
||||
raise ValueError("Invalid or unsupported audio file.") from pyav_exc
|
||||
|
||||
|
||||
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
|
||||
@@ -128,9 +168,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
|
||||
self.kwargs = kwargs
|
||||
|
||||
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
|
||||
if is_video(data):
|
||||
return extract_audio_from_video_bytes(data)
|
||||
return librosa.load(BytesIO(data), sr=None)
|
||||
return load_audio(BytesIO(data), sr=None)
|
||||
|
||||
def load_base64(
|
||||
self,
|
||||
@@ -140,7 +178,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
|
||||
return self.load_bytes(pybase64.b64decode(data))
|
||||
|
||||
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
|
||||
return librosa.load(filepath, sr=None)
|
||||
return load_audio(filepath, sr=None)
|
||||
|
||||
def encode_base64(
|
||||
self,
|
||||
|
||||
@@ -497,7 +497,7 @@ class MultiModalDataParser:
|
||||
*,
|
||||
target_sr: float | None = None,
|
||||
target_channels: int | None = None,
|
||||
audio_resample_method: Literal["librosa", "scipy"] = "librosa",
|
||||
audio_resample_method: Literal["pyav", "scipy"] = "pyav",
|
||||
video_needs_metadata: bool = False,
|
||||
expected_hidden_size: int | None = None,
|
||||
) -> None:
|
||||
|
||||
@@ -172,9 +172,6 @@ class BaseRenderer(ABC, Generic[_T]):
|
||||
|
||||
For chat requests:
|
||||
- Jinja2 template compilation
|
||||
|
||||
For multi-modal requests:
|
||||
- Importing libraries such as librosa triggers JIT compilation.
|
||||
"""
|
||||
from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
|
||||
|
||||
|
||||
@@ -188,7 +188,7 @@ class FireRedASR2FeatureExtractor(SequenceFeatureExtractor):
|
||||
for speech in raw_speech:
|
||||
"""
|
||||
We must multiply by 32768 here because FireRedASR2 loads audio data
|
||||
using kaldiio.load_mat, while vLLM loads audio data using librosa.
|
||||
using kaldiio.load_mat, while vLLM loads audio data using pyav.
|
||||
"""
|
||||
speech = speech * 32768
|
||||
fbank = self.fbank(sampling_rate, speech)
|
||||
|
||||
Reference in New Issue
Block a user