[Realtime API] Adds minimal realtime API based on websockets (#33187)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
committed by
GitHub
parent
1a7894dbdf
commit
10152d2194
133
tests/entrypoints/openai/test_realtime_validation.py
Normal file
133
tests/entrypoints/openai/test_realtime_validation.py
Normal file
@@ -0,0 +1,133 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import pytest
|
||||
import websockets
|
||||
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from .conftest import add_attention_backend
|
||||
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode",
|
||||
"mistral",
|
||||
"--config_format",
|
||||
"mistral",
|
||||
"--load_format",
|
||||
"mistral",
|
||||
]
|
||||
|
||||
MODEL_NAME = "mistralai/Voxtral-Mini-3B-Realtime-2602"
|
||||
|
||||
|
||||
def _audio_to_base64_pcm16(path: str, target_sr: int = 16000) -> str:
|
||||
"""Load audio file, convert to PCM16 @ target sample rate, base64 encode."""
|
||||
audio, _ = librosa.load(path, sr=target_sr, mono=True)
|
||||
# Convert float32 [-1, 1] to int16 [-32768, 32767]
|
||||
audio_int16 = (audio * 32767).astype(np.int16)
|
||||
audio_bytes = audio_int16.tobytes()
|
||||
return base64.b64encode(audio_bytes).decode("utf-8")
|
||||
|
||||
|
||||
def _get_websocket_url(server: RemoteOpenAIServer) -> str:
|
||||
"""Convert HTTP URL to WebSocket URL for realtime endpoint."""
|
||||
http_url = server.url_root
|
||||
ws_url = http_url.replace("http://", "ws://")
|
||||
return f"{ws_url}/v1/realtime"
|
||||
|
||||
|
||||
async def receive_event(ws, timeout: float = 60.0) -> dict:
|
||||
"""Receive and parse JSON event from WebSocket."""
|
||||
message = await asyncio.wait_for(ws.recv(), timeout=timeout)
|
||||
return json.loads(message)
|
||||
|
||||
|
||||
async def send_event(ws, event: dict) -> None:
|
||||
"""Send JSON event to WebSocket."""
|
||||
await ws.send(json.dumps(event))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mary_had_lamb_audio_chunks() -> list[str]:
|
||||
"""Audio split into ~1 second chunks for streaming."""
|
||||
path = AudioAsset("mary_had_lamb").get_local_path()
|
||||
audio, _ = librosa.load(str(path), sr=16000, mono=True)
|
||||
|
||||
# Split into ~0.1 second chunks (1600 samples at 16kHz)
|
||||
chunk_size = 1600
|
||||
chunks = []
|
||||
for i in range(0, len(audio), chunk_size):
|
||||
chunk = audio[i : i + chunk_size]
|
||||
chunk_int16 = (chunk * 32767).astype(np.int16)
|
||||
chunk_bytes = chunk_int16.tobytes()
|
||||
chunks.append(base64.b64encode(chunk_bytes).decode("utf-8"))
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.skip(reason="Voxtral streaming is not yet public")
|
||||
async def test_multi_chunk_streaming(
|
||||
model_name, mary_had_lamb_audio_chunks, rocm_aiter_fa_attention
|
||||
):
|
||||
"""Test streaming multiple audio chunks before committing."""
|
||||
server_args = ["--enforce-eager"]
|
||||
|
||||
if model_name.startswith("mistralai"):
|
||||
server_args += MISTRAL_FORMAT_ARGS
|
||||
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
ws_url = _get_websocket_url(remote_server)
|
||||
async with websockets.connect(ws_url) as ws:
|
||||
# Receive session.created
|
||||
event = await receive_event(ws, timeout=30.0)
|
||||
assert event["type"] == "session.created"
|
||||
|
||||
await send_event(ws, {"type": "session.update", "model": model_name})
|
||||
|
||||
# Send commit to start transcription
|
||||
await send_event(ws, {"type": "input_audio_buffer.commit"})
|
||||
|
||||
# Send multiple audio chunks
|
||||
for chunk in mary_had_lamb_audio_chunks:
|
||||
await send_event(
|
||||
ws, {"type": "input_audio_buffer.append", "audio": chunk}
|
||||
)
|
||||
|
||||
# Send commit to end
|
||||
await send_event(ws, {"type": "input_audio_buffer.commit", "final": True})
|
||||
|
||||
# Collect transcription deltas
|
||||
full_text = ""
|
||||
done_received = False
|
||||
|
||||
while not done_received:
|
||||
event = await receive_event(ws, timeout=60.0)
|
||||
|
||||
if event["type"] == "transcription.delta":
|
||||
full_text += event["delta"]
|
||||
elif event["type"] == "transcription.done":
|
||||
done_received = True
|
||||
assert "text" in event
|
||||
elif event["type"] == "error":
|
||||
pytest.fail(f"Received error: {event}")
|
||||
|
||||
# Verify transcription contains expected content
|
||||
assert event["type"] == "transcription.done"
|
||||
assert event["text"] == full_text
|
||||
assert full_text == (
|
||||
" He has first words I spoke in the original phonograph."
|
||||
" A little piece of practical poetry. Mary had a little lamb,"
|
||||
" it squeaked with quite a flow, and everywhere that Mary went,"
|
||||
" the lamb was sure to go"
|
||||
)
|
||||
Reference in New Issue
Block a user