[Misc] Abstract the logic for reading and writing media content (#11527)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2024-12-27 19:21:23 +08:00
committed by GitHub
parent 2c9b8ea2b0
commit 7af553ea30
10 changed files with 495 additions and 389 deletions

View File

@@ -1,10 +1,14 @@
import base64
from io import BytesIO
from pathlib import Path
import numpy as np
import numpy.typing as npt
from vllm.inputs.registry import InputContext
from vllm.utils import PlaceholderModule
from .base import MultiModalPlugin
from .base import MediaIO, MultiModalPlugin
from .inputs import AudioItem, MultiModalData, MultiModalKwargs
try:
@@ -12,6 +16,11 @@ try:
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
# Optional dependency guard: when `soundfile` cannot be imported, the name is
# bound to a PlaceholderModule so that importing this module does not fail.
# NOTE(review): presumably the placeholder raises on first use -- confirm
# against the PlaceholderModule definition in vllm.utils.
try:
import soundfile
except ImportError:
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
class AudioPlugin(MultiModalPlugin):
"""Plugin for audio data."""
@@ -39,3 +48,28 @@ def resample_audio(
target_sr: float,
) -> npt.NDArray[np.floating]:
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    """Media I/O for audio, represented as ``(samples, sampling_rate)``.

    Loading is delegated to ``librosa.load`` and encoding to
    ``soundfile.write``.
    """

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """Load audio from an in-memory byte string.

        Args:
            data: Raw bytes of an encoded audio file.

        Returns:
            Whatever ``librosa.load`` returns for the wrapped byte stream.
        """
        stream = BytesIO(data)
        # sr=None: no target sampling rate is requested from librosa.
        # NOTE(review): presumably this keeps the source's native rate --
        # confirm against the librosa documentation.
        return librosa.load(stream, sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """Load audio from a base64-encoded string.

        Args:
            media_type: Accepted for interface compatibility; not used here.
            data: Base64 text wrapping the encoded audio bytes.

        Returns:
            The result of :meth:`load_bytes` on the decoded payload.
        """
        raw = base64.b64decode(data)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Load audio from ``filepath`` via ``librosa.load`` with ``sr=None``."""
        loaded = librosa.load(filepath, sr=None)
        return loaded

    def encode_base64(self, media: tuple[npt.NDArray, float]) -> str:
        """Write the audio as WAV into memory and return it as base64 text.

        Args:
            media: Pair of audio samples and their sampling rate.

        Returns:
            The base64 encoding of the WAV bytes, decoded as UTF-8 text.
        """
        samples, sampling_rate = media

        with BytesIO() as wav_buffer:
            soundfile.write(wav_buffer, samples, sampling_rate, format="WAV")
            # Grab the bytes before the buffer is closed on exiting `with`.
            wav_bytes = wav_buffer.getvalue()

        encoded = base64.b64encode(wav_bytes)
        return encoded.decode('utf-8')