2026-01-30 11:41:29 +01:00
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
|
|
|
|
|
|
import asyncio
|
|
|
|
|
import json
|
2026-02-22 04:07:18 -06:00
|
|
|
import warnings
|
2026-01-30 11:41:29 +01:00
|
|
|
|
|
|
|
|
import librosa
|
|
|
|
|
import numpy as np
|
2026-03-17 22:44:19 +08:00
|
|
|
import pybase64 as base64
|
2026-01-30 11:41:29 +01:00
|
|
|
import pytest
|
|
|
|
|
import websockets
|
|
|
|
|
|
2026-03-19 03:19:36 -04:00
|
|
|
from tests.entrypoints.openai.conftest import add_attention_backend
|
|
|
|
|
from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
|
2026-01-30 11:41:29 +01:00
|
|
|
from vllm.assets.audio import AudioAsset
|
|
|
|
|
|
2026-03-25 05:24:33 -05:00
|
|
|
# Increase engine iteration timeout for ROCm where first-use JIT compilation
# can exceed the default 60s, causing a silent deadlock in feed_tokens.
REALTIME_ENV_OVERRIDES = {
    **ROCM_ENV_OVERRIDES,
    # 10 minutes: generous enough for first-use aiter kernel compilation.
    "VLLM_ENGINE_ITERATION_TIMEOUT_S": "600",
}

# Extra CLI flags required to load Mistral-format checkpoints: tokenizer,
# config, and weights all use the "mistral" format. ROCm-specific server
# arguments are appended unconditionally (empty on other platforms).
MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode",
    "mistral",
    "--config_format",
    "mistral",
    "--load_format",
    "mistral",
] + ROCM_EXTRA_ARGS

# Model under test for every realtime-endpoint test in this module.
MODEL_NAME = "mistralai/Voxtral-Mini-4B-Realtime-2602"
|
2026-01-30 11:41:29 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_websocket_url(server: RemoteOpenAIServer) -> str:
    """Return the ws:// URL of the server's /v1/realtime endpoint.

    Derived from the server's HTTP root URL by swapping the scheme.
    """
    base = server.url_root.replace("http://", "ws://")
    return base + "/v1/realtime"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def receive_event(ws, timeout: float = 60.0) -> dict:
    """Read the next WebSocket message and decode it as a JSON event.

    Propagates a timeout error if nothing arrives within ``timeout`` seconds.
    """
    raw = await asyncio.wait_for(ws.recv(), timeout=timeout)
    return json.loads(raw)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def send_event(ws, event: dict) -> None:
    """Serialize ``event`` as JSON and transmit it over the WebSocket."""
    payload = json.dumps(event)
    await ws.send(payload)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def mary_had_lamb_audio_chunks() -> list[str]:
    """Return the test audio as base64-encoded PCM16 chunks of ~0.1 s each.

    The asset is resampled to 16 kHz mono, split into 1600-sample chunks,
    converted to 16-bit PCM, and base64-encoded so tests can stream it
    over the realtime WebSocket the way a live client would.
    """
    path = AudioAsset("mary_had_lamb").get_local_path()
    audio, _ = librosa.load(str(path), sr=16000, mono=True)

    # Split into ~0.1 second chunks (1600 samples at 16kHz)
    chunk_size = 1600
    chunks = []
    for i in range(0, len(audio), chunk_size):
        chunk = audio[i : i + chunk_size]
        # Scale float samples (assumed in [-1, 1] after librosa.load) to int16 PCM.
        chunk_int16 = (chunk * 32767).astype(np.int16)
        chunk_bytes = chunk_int16.tobytes()
        chunks.append(base64.b64encode(chunk_bytes).decode("utf-8"))

    return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_multi_chunk_streaming(
    model_name, mary_had_lamb_audio_chunks, rocm_aiter_fa_attention
):
    """Test streaming multiple audio chunks before committing.

    Flow: open a realtime session, run a single-chunk warm-up request
    (for ROCm JIT compilation), then stream every chunk of the test
    audio, commit, and assert the full transcription matches the
    expected text exactly.
    """
    server_args = ["--enforce-eager", "--max-model-len", "2048"]

    # Mistral-format checkpoints need extra tokenizer/config/load flags.
    if model_name.startswith("mistralai"):
        server_args += MISTRAL_FORMAT_ARGS

    add_attention_backend(server_args, rocm_aiter_fa_attention)

    with RemoteOpenAIServer(
        model_name, server_args, env_dict=REALTIME_ENV_OVERRIDES
    ) as remote_server:
        ws_url = _get_websocket_url(remote_server)

        async with websockets.connect(ws_url) as ws:
            # Receive session.created
            event = await receive_event(ws, timeout=30.0)
            assert event["type"] == "session.created"

            # Bind this session to the model under test.
            await send_event(ws, {"type": "session.update", "model": model_name})

            # Wait for the server to acknowledge the session update.
            # NOTE(review): asyncio.wait_for raises asyncio.TimeoutError,
            # which is only the builtin TimeoutError on Python 3.11+ —
            # confirm the minimum supported Python version.
            try:
                while True:
                    event = await receive_event(ws, timeout=5.0)
                    if event["type"] == "session.updated":
                        break
            except TimeoutError:
                warnings.warn(
                    f"session.updated not received within {5.0}s after "
                    "session.update. The server may not implement this event.",
                    stacklevel=2,
                )

            # (ROCm) Warm-up: send a non-final commit (required to start
            # transcription) with a small audio chunk to trigger aiter
            # compilation on first use.
            await send_event(ws, {"type": "input_audio_buffer.commit"})
            await send_event(
                ws,
                {
                    "type": "input_audio_buffer.append",
                    "audio": mary_had_lamb_audio_chunks[0],
                },
            )
            await send_event(ws, {"type": "input_audio_buffer.commit", "final": True})

            # (ROCm) Drain all warm-up responses with generous timeout for
            # JIT compilation
            warmup_done = False
            while not warmup_done:
                event = await receive_event(ws, timeout=600.0)
                if event["type"] in ("transcription.done", "error"):
                    warmup_done = True

            # Now send the real test audio
            await send_event(ws, {"type": "input_audio_buffer.commit"})

            # Send multiple audio chunks
            for chunk in mary_had_lamb_audio_chunks:
                await send_event(
                    ws, {"type": "input_audio_buffer.append", "audio": chunk}
                )

            # Send commit to end
            await send_event(ws, {"type": "input_audio_buffer.commit", "final": True})

            # Collect transcription deltas until the done event arrives.
            full_text = ""
            done_received = False

            while not done_received:
                event = await receive_event(ws, timeout=60.0)

                if event["type"] == "transcription.delta":
                    full_text += event["delta"]
                elif event["type"] == "transcription.done":
                    done_received = True
                    assert "text" in event
                elif event["type"] == "error":
                    pytest.fail(f"Received error: {event}")

            # Verify transcription contains expected content: the final
            # event's text must equal the concatenation of all deltas.
            assert event["type"] == "transcription.done"
            assert event["text"] == full_text
            assert full_text == (
                " First words I spoke in the original phonograph."
                " A little piece of practical poetry. Mary had a little lamb,"
                " it sleeps with quite a flow, and everywhere that Mary went,"
                " the lamb was sure to go."
            )
|
2026-02-19 02:21:47 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_empty_commit_does_not_crash_engine(
    model_name, mary_had_lamb_audio_chunks, rocm_aiter_fa_attention
):
    """Test that committing without audio does not crash the engine.

    Regression test for https://github.com/vllm-project/vllm/issues/34532.
    An empty commit (no prior input_audio_buffer.append) used to trigger
    ``AssertionError: For realtime you must provide a multimodal_embedding
    at every step`` which killed the entire engine process, disconnecting
    every connected client.
    """
    server_args = ["--enforce-eager", "--max-model-len", "2048"]

    # Mistral-format checkpoints need extra tokenizer/config/load flags.
    if model_name.startswith("mistralai"):
        server_args += MISTRAL_FORMAT_ARGS

    add_attention_backend(server_args, rocm_aiter_fa_attention)

    with RemoteOpenAIServer(
        model_name, server_args, env_dict=REALTIME_ENV_OVERRIDES
    ) as remote_server:
        ws_url = _get_websocket_url(remote_server)

        # --- First connection: empty commit (no audio appended) ----------
        async with websockets.connect(ws_url) as ws:
            event = await receive_event(ws, timeout=30.0)
            assert event["type"] == "session.created"

            await send_event(ws, {"type": "session.update", "model": model_name})

            # Wait for (optional) acknowledgement of the session update.
            try:
                while True:
                    event = await receive_event(ws, timeout=5.0)
                    if event["type"] == "session.updated":
                        break
            except TimeoutError:
                warnings.warn(
                    f"session.updated not received within {5.0}s after "
                    "session.update. The server may not implement this event.",
                    stacklevel=2,
                )

            # Start generation without sending any audio
            await send_event(ws, {"type": "input_audio_buffer.commit"})

            # Immediately signal end-of-audio
            await send_event(ws, {"type": "input_audio_buffer.commit", "final": True})

            # We should get *some* response (error or empty transcription),
            # but the engine must NOT crash.
            # (ROCm) Use generous timeout for first request (aiter JIT compilation)
            event = await receive_event(ws, timeout=360.0)
            assert event["type"] in (
                "error",
                "transcription.done",
                "transcription.delta",
            )

        # --- Second connection: normal transcription ---------------------
        # Verifies the engine is still alive after the empty commit above.
        async with websockets.connect(ws_url) as ws:
            event = await receive_event(ws, timeout=30.0)
            assert event["type"] == "session.created"

            await send_event(ws, {"type": "session.update", "model": model_name})

            # Same optional acknowledgement wait as on the first connection.
            try:
                while True:
                    event = await receive_event(ws, timeout=5.0)
                    if event["type"] == "session.updated":
                        break
            except TimeoutError:
                warnings.warn(
                    f"session.updated not received within {5.0}s after "
                    "session.update. The server may not implement this event.",
                    stacklevel=2,
                )

            # Start transcription
            await send_event(ws, {"type": "input_audio_buffer.commit"})

            # Stream the full test audio this time.
            for chunk in mary_had_lamb_audio_chunks:
                await send_event(
                    ws, {"type": "input_audio_buffer.append", "audio": chunk}
                )

            await send_event(ws, {"type": "input_audio_buffer.commit", "final": True})

            # Drain events until completion; any error means the earlier
            # empty commit damaged the engine.
            done_received = False
            while not done_received:
                event = await receive_event(ws, timeout=60.0)
                if event["type"] == "transcription.done":
                    done_received = True
                elif event["type"] == "error":
                    pytest.fail(f"Engine error after empty commit: {event}")
            assert done_received
|
2026-03-25 05:24:33 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_session_update_invalid_model_returns_error(
    model_name, rocm_aiter_fa_attention
):
    """Test that session.update with an invalid model returns an error."""
    launch_args = ["--enforce-eager", "--max-model-len", "2048"]
    # Mistral-format checkpoints need extra tokenizer/config/load flags.
    if model_name.startswith("mistralai"):
        launch_args.extend(MISTRAL_FORMAT_ARGS)
    add_attention_backend(launch_args, rocm_aiter_fa_attention)

    with RemoteOpenAIServer(
        model_name, launch_args, env_dict=REALTIME_ENV_OVERRIDES
    ) as remote_server:
        endpoint = _get_websocket_url(remote_server)
        async with websockets.connect(endpoint) as ws:
            # Handshake: the server announces the session first.
            created = await receive_event(ws, timeout=30.0)
            assert created["type"] == "session.created"

            # Ask for a model the server does not serve.
            await send_event(
                ws,
                {"type": "session.update", "model": "nonexistent-model"},
            )

            # The rejection must arrive as an error event naming the model.
            reply = await receive_event(ws, timeout=10.0)
            assert reply["type"] == "error"
            assert "nonexistent-model" in reply["error"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_commit_without_session_update_returns_error(
    model_name, rocm_aiter_fa_attention
):
    """Test that committing before validating the model returns an error
    and does not fall through to processing."""
    launch_args = ["--enforce-eager", "--max-model-len", "2048"]
    # Mistral-format checkpoints need extra tokenizer/config/load flags.
    if model_name.startswith("mistralai"):
        launch_args.extend(MISTRAL_FORMAT_ARGS)
    add_attention_backend(launch_args, rocm_aiter_fa_attention)

    with RemoteOpenAIServer(
        model_name, launch_args, env_dict=REALTIME_ENV_OVERRIDES
    ) as remote_server:
        endpoint = _get_websocket_url(remote_server)
        async with websockets.connect(endpoint) as ws:
            # Handshake: the server announces the session first.
            created = await receive_event(ws, timeout=30.0)
            assert created["type"] == "session.created"

            # Skip session.update entirely and go straight to a final commit.
            await send_event(
                ws,
                {"type": "input_audio_buffer.commit", "final": True},
            )

            # The server must refuse with a model_not_validated error code.
            reply = await receive_event(ws, timeout=10.0)
            assert reply["type"] == "error"
            assert "model_not_validated" in reply.get("code", "")
|