[Bugfix][Frontend] Fix audio transcription for MP4, M4A, and WebM formats (#35109)

Signed-off-by: seanmamasde <seanmamasde@gmail.com>
This commit is contained in:
seanmamasde
2026-03-14 23:44:03 +08:00
committed by GitHub
parent a8e8d62dd8
commit 84868e4793
3 changed files with 114 additions and 20 deletions

View File

@@ -976,6 +976,7 @@ setup(
"soundfile",
"mistral_common[audio]",
"av",
"torchcodec",
], # Required for audio processing
"video": [], # Kept for backwards compatibility
"flashinfer": [], # Kept for backwards compatibility

View File

@@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import io
import math
import time
import zlib
@@ -11,7 +10,6 @@ from typing import Final, Literal, TypeAlias, TypeVar, cast
import numpy as np
from fastapi import Request
from soundfile import LibsndfileError
from transformers import PreTrainedTokenizerBase
import vllm.envs as envs
@@ -37,6 +35,7 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
TranslationSegment,
TranslationStreamResponse,
)
from vllm.entrypoints.openai.speech_to_text.utils import load_audio_bytes
from vllm.entrypoints.utils import get_max_tokens
from vllm.exceptions import VLLMValidationError
from vllm.inputs import EncoderDecoderInputs, ProcessorInputs
@@ -56,14 +55,6 @@ try:
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
# being librosa's main backend. Used to validate if an audio loading error is due to a
# server error vs a client error (invalid audio file).
# 1 = unrecognised format (file is not a supported audio container)
# 3 = malformed file (corrupt or structurally invalid audio)
# 4 = unsupported encoding (codec not supported by this libsndfile build)
_BAD_SF_CODES = {1, 3, 4}
SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
SpeechToTextResponseVerbose: TypeAlias = (
TranscriptionResponseVerbose | TranslationResponseVerbose
@@ -202,16 +193,12 @@ class OpenAISpeechToText(OpenAIServing):
value=len(audio_data) / 1024**2,
)
with io.BytesIO(audio_data) as bytes_:
try:
# NOTE resample to model SR here for efficiency. This is also a
# pre-requisite for chunking, as it assumes Whisper SR.
y, sr = librosa.load(bytes_, sr=self.asr_config.sample_rate)
except LibsndfileError as exc:
# Distinguish client errors (invalid audio) from server errors
if exc.code in _BAD_SF_CODES:
raise ValueError("Invalid or unsupported audio file.") from exc
raise
# Decode audio bytes. For container formats (MP4, M4A, WebM) that
# soundfile cannot detect from a BytesIO stream, _load_audio_bytes
# transparently falls back to ffmpeg via an in-memory fd.
# NOTE resample to model SR here for efficiency. This is also a
# pre-requisite for chunking, as it assumes Whisper SR.
y, sr = load_audio_bytes(audio_data, sr=self.asr_config.sample_rate)
duration = librosa.get_duration(y=y, sr=sr)
do_split_audio = (

View File

@@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Audio decoding utilities for the speech-to-text endpoints."""
import io
import numpy as np
import torchaudio
from vllm.logger import init_logger
from vllm.utils.import_utils import PlaceholderModule
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
try:
import soundfile as sf
except ImportError:
sf = PlaceholderModule("soundfile") # type: ignore[assignment]
logger = init_logger(__name__)
# Public libsndfile error codes exposed via ``soundfile.LibsndfileError.code``.
# soundfile is librosa's primary backend. These codes indicate that the audio
# data itself is problematic (unrecognised container, corrupt file, or
# unsupported encoding) rather than a transient server error.
# 1 = unrecognised format, 3 = malformed file, 4 = unsupported encoding
_BAD_SF_CODES = {1, 3, 4}
def _decode_audio_bytes_torchaudio(
audio_data: bytes,
sr: int,
) -> tuple[np.ndarray, int]:
"""Decode audio bytes to mono float32 PCM via torchaudio, in-process.
``torchaudio.load`` (backed by TorchCodec / FFmpeg) can decode
container formats (MP4, M4A, WebM) directly from a ``BytesIO``
buffer without spawning a subprocess. The decoded waveform is
down-mixed to mono and resampled to *sr* Hz, matching the return
convention of ``librosa.load``.
"""
buf = io.BytesIO(audio_data)
waveform, orig_sr = torchaudio.load(buf)
# Down-mix to mono (average across channels).
if waveform.shape[0] > 1:
waveform = waveform.mean(dim=0, keepdim=True)
# Resample to the target sample rate when necessary.
if orig_sr != sr:
waveform = torchaudio.functional.resample(
waveform, orig_freq=orig_sr, new_freq=sr
)
# Squeeze channel dim → 1-D float32 numpy array (same as librosa.load).
y = waveform.squeeze(0).numpy()
if y.size == 0:
raise RuntimeError(
"torchaudio produced no audio samples (file may be empty or corrupt)"
)
return y, sr
def load_audio_bytes(
audio_data: bytes,
sr: int | float,
) -> tuple[np.ndarray, int]:
"""Load audio from raw bytes, with an in-process torchaudio fallback.
First tries ``librosa.load(BytesIO(...))`` which works for formats
that *soundfile* can auto-detect (WAV, FLAC, MP3, OGG, ...). If
that fails with a ``LibsndfileError`` indicating an unrecognised or
unsupported format (typically container formats like MP4/M4A/WebM),
the bytes are decoded in-process via ``torchaudio`` (backed by
TorchCodec / FFmpeg) which handles these containers natively.
"""
sr = int(sr)
# Fast path: librosa + soundfile (works for most formats).
try:
with io.BytesIO(audio_data) as buf:
return librosa.load(buf, sr=sr) # type: ignore[return-value]
except sf.LibsndfileError as exc:
# Only fall back for known format-detection failures.
# Re-raise anything else (e.g. corrupt but recognised format).
if exc.code not in _BAD_SF_CODES:
raise
logger.debug(
"librosa/soundfile could not decode audio from BytesIO "
"(code=%s: %s); falling back to torchaudio in-process decode",
exc.code,
exc,
)
# Fallback: torchaudio in-process decode (no subprocess overhead).
try:
return _decode_audio_bytes_torchaudio(audio_data, sr)
except Exception as ta_exc:
logger.debug(
"torchaudio fallback also failed: %s",
ta_exc,
)
raise ValueError("Invalid or unsupported audio file.") from ta_exc