[Frontend] Remove librosa from audio dependency (#37058)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -42,32 +42,13 @@ from vllm.inputs import EncoderDecoderInputs, ProcessorInputs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import FlatLogprobs, Logprob
|
||||
from vllm.model_executor.models import SupportsTranscription
|
||||
from vllm.multimodal.audio import split_audio
|
||||
from vllm.multimodal.media.audio import extract_audio_from_video_bytes
|
||||
from vllm.multimodal.audio import get_audio_duration, split_audio
|
||||
from vllm.multimodal.media.audio import load_audio
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt
|
||||
from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt
|
||||
from vllm.sampling_params import BeamSearchParams, SamplingParams
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
from vllm.utils.import_utils import PlaceholderModule
|
||||
|
||||
try:
|
||||
import librosa
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import soundfile as sf
|
||||
except ImportError:
|
||||
sf = PlaceholderModule("soundfile") # type: ignore[assignment]
|
||||
|
||||
# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code` (soundfile
|
||||
# is librosa's main backend). Used to decide whether an audio loading failure is a
|
||||
# client error (invalid audio file) rather than a server error.
|
||||
# 1 = unrecognised format (file is not a supported audio container)
|
||||
# 3 = malformed file (corrupt or structurally invalid audio)
|
||||
# 4 = unsupported encoding (codec not supported by this libsndfile build)
|
||||
_BAD_SF_CODES = {1, 3, 4}
|
||||
|
||||
SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
|
||||
SpeechToTextResponseVerbose: TypeAlias = (
|
||||
@@ -214,32 +195,13 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
# pre-requisite for chunking, as it assumes Whisper SR.
|
||||
try:
|
||||
with io.BytesIO(audio_data) as buf:
|
||||
y, sr = librosa.load(buf, sr=self.asr_config.sample_rate) # type: ignore[return-value]
|
||||
except sf.LibsndfileError as exc:
|
||||
# Only fall back for known format-detection failures.
|
||||
# Re-raise anything else (e.g. corrupt but recognised format).
|
||||
if exc.code not in _BAD_SF_CODES:
|
||||
raise
|
||||
logger.debug(
|
||||
"librosa/soundfile could not decode audio from BytesIO "
|
||||
"(code=%s: %s); falling back to pyav in-process decode",
|
||||
exc.code,
|
||||
exc,
|
||||
)
|
||||
try:
|
||||
native_y, native_sr = extract_audio_from_video_bytes(audio_data)
|
||||
sr = self.asr_config.sample_rate
|
||||
y = librosa.resample(native_y, orig_sr=native_sr, target_sr=sr)
|
||||
except Exception as pyav_exc:
|
||||
logger.debug(
|
||||
"pyAV fallback also failed: %s",
|
||||
pyav_exc,
|
||||
)
|
||||
raise ValueError("Invalid or unsupported audio file.") from pyav_exc
|
||||
y, sr = load_audio(buf, sr=self.asr_config.sample_rate)
|
||||
except Exception as exc:
|
||||
raise ValueError("Invalid or unsupported audio file.") from exc
|
||||
|
||||
duration = librosa.get_duration(y=y, sr=sr)
|
||||
do_split_audio = (
|
||||
self.asr_config.allow_audio_chunking
|
||||
duration = get_audio_duration(y=y, sr=sr)
|
||||
do_split_audio = self.asr_config.allow_audio_chunking and (
|
||||
self.asr_config.max_audio_clip_s is not None
|
||||
and duration > self.asr_config.max_audio_clip_s
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user