[Realtime API] Adds minimal realtime API based on websockets (#33187)

Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-01-30 11:41:29 +01:00
parent 1a7894dbdf
commit 10152d2194
21 changed files with 1316 additions and 48 deletions
--- a/tests/entrypoints/openai/test_realtime_validation.py
+++ b/tests/entrypoints/openai/test_realtime_validation.py
@@ -0,0 +1,133 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+import base64
+import json
+
+import librosa
+import numpy as np
+import pytest
+import websockets
+
+from vllm.assets.audio import AudioAsset
+
+from ...utils import RemoteOpenAIServer
+from .conftest import add_attention_backend
+
+MISTRAL_FORMAT_ARGS = [
+    "--tokenizer_mode",
+    "mistral",
+    "--config_format",
+    "mistral",
+    "--load_format",
+    "mistral",
+]
+
+MODEL_NAME = "mistralai/Voxtral-Mini-3B-Realtime-2602"
+
+
+def _audio_to_base64_pcm16(path: str, target_sr: int = 16000) -> str:
+    """Load audio file, convert to PCM16 @ target sample rate, base64 encode."""
+    audio, _ = librosa.load(path, sr=target_sr, mono=True)
+    # Convert float32 [-1, 1] to int16 [-32768, 32767]
+    audio_int16 = (audio * 32767).astype(np.int16)
+    audio_bytes = audio_int16.tobytes()
+    return base64.b64encode(audio_bytes).decode("utf-8")
+
+
+def _get_websocket_url(server: RemoteOpenAIServer) -> str:
+    """Convert HTTP URL to WebSocket URL for realtime endpoint."""
+    http_url = server.url_root
+    ws_url = http_url.replace("http://", "ws://")
+    return f"{ws_url}/v1/realtime"
+
+
+async def receive_event(ws, timeout: float = 60.0) -> dict:
+    """Receive and parse JSON event from WebSocket."""
+    message = await asyncio.wait_for(ws.recv(), timeout=timeout)
+    return json.loads(message)
+
+
+async def send_event(ws, event: dict) -> None:
+    """Send JSON event to WebSocket."""
+    await ws.send(json.dumps(event))
+
+
+@pytest.fixture
+def mary_had_lamb_audio_chunks() -> list[str]:
+    """Audio split into ~1 second chunks for streaming."""
+    path = AudioAsset("mary_had_lamb").get_local_path()
+    audio, _ = librosa.load(str(path), sr=16000, mono=True)
+
+    # Split into ~0.1 second chunks (1600 samples at 16kHz)
+    chunk_size = 1600
+    chunks = []
+    for i in range(0, len(audio), chunk_size):
+        chunk = audio[i : i + chunk_size]
+        chunk_int16 = (chunk * 32767).astype(np.int16)
+        chunk_bytes = chunk_int16.tobytes()
+        chunks.append(base64.b64encode(chunk_bytes).decode("utf-8"))
+
+    return chunks
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.skip(reason="Voxtral streaming is not yet public")
+async def test_multi_chunk_streaming(
+    model_name, mary_had_lamb_audio_chunks, rocm_aiter_fa_attention
+):
+    """Test streaming multiple audio chunks before committing."""
+    server_args = ["--enforce-eager"]
+
+    if model_name.startswith("mistralai"):
+        server_args += MISTRAL_FORMAT_ARGS
+
+    add_attention_backend(server_args, rocm_aiter_fa_attention)
+
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        ws_url = _get_websocket_url(remote_server)
+        async with websockets.connect(ws_url) as ws:
+            # Receive session.created
+            event = await receive_event(ws, timeout=30.0)
+            assert event["type"] == "session.created"
+
+            await send_event(ws, {"type": "session.update", "model": model_name})
+
+            # Send commit to start transcription
+            await send_event(ws, {"type": "input_audio_buffer.commit"})
+
+            # Send multiple audio chunks
+            for chunk in mary_had_lamb_audio_chunks:
+                await send_event(
+                    ws, {"type": "input_audio_buffer.append", "audio": chunk}
+                )
+
+            # Send commit to end
+            await send_event(ws, {"type": "input_audio_buffer.commit", "final": True})
+
+            # Collect transcription deltas
+            full_text = ""
+            done_received = False
+
+            while not done_received:
+                event = await receive_event(ws, timeout=60.0)
+
+                if event["type"] == "transcription.delta":
+                    full_text += event["delta"]
+                elif event["type"] == "transcription.done":
+                    done_received = True
+                    assert "text" in event
+                elif event["type"] == "error":
+                    pytest.fail(f"Received error: {event}")
+
+            # Verify transcription contains expected content
+            assert event["type"] == "transcription.done"
+            assert event["text"] == full_text
+            assert full_text == (
+                " He has first words I spoke in the original phonograph."
+                " A little piece of practical poetry. Mary had a little lamb,"
+                " it squeaked with quite a flow, and everywhere that Mary went,"
+                " the lamb was sure to go"
+            )