diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index a7b1c18a6..bdd1784c8 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -610,54 +610,10 @@ Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono chan
 | `transcription.done` | Final transcription with usage stats |
 | `error` | Error notification with message and optional code |
 
-#### Python WebSocket Example
+#### Example Clients
 
-??? code
-
-    ```python
-    import asyncio
-    import base64
-    import json
-    import websockets
-
-    async def realtime_transcribe():
-        uri = "ws://localhost:8000/v1/realtime"
-
-        async with websockets.connect(uri) as ws:
-            # Wait for session.created
-            response = await ws.recv()
-            print(f"Session: {response}")
-
-            # Commit buffer
-            await ws.send(json.dumps({
-                "type": "input_audio_buffer.commit"
-            }))
-
-            # Send audio chunks (example with file)
-            with open("audio.raw", "rb") as f:
-                while chunk := f.read(4096):
-                    await ws.send(json.dumps({
-                        "type": "input_audio_buffer.append",
-                        "audio": base64.b64encode(chunk).decode()
-                    }))
-
-            # Signal all audio is sent
-            await ws.send(json.dumps({
-                "type": "input_audio_buffer.commit",
-                "final": True,
-            }))
-
-            # Receive transcription
-            while True:
-                response = json.loads(await ws.recv())
-                if response["type"] == "transcription.delta":
-                    print(response["delta"], end="", flush=True)
-                elif response["type"] == "transcription.done":
-                    print(f"\nFinal: {response['text']}")
-                    break
-
-    asyncio.run(realtime_transcribe())
-    ```
+- [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_client.py) - Upload and transcribe an audio file
+- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription
 
 ### Tokenizer API
diff --git a/tests/models/multimodal/generation/test_voxtral_streaming.py b/tests/models/multimodal/generation/test_voxtral_realtime.py
similarity index 98%
rename from tests/models/multimodal/generation/test_voxtral_streaming.py
rename to tests/models/multimodal/generation/test_voxtral_realtime.py
index 41b9a6830..d7906003a 100644
--- a/tests/models/multimodal/generation/test_voxtral_streaming.py
+++ b/tests/models/multimodal/generation/test_voxtral_realtime.py
@@ -74,7 +74,7 @@ def async_engine() -> AsyncLLM:
 
 
 @pytest.mark.skip(reason="Voxtral streaming is not yet public")
-def test_voxtral_streaming_forward(audio_assets, tokenizer, engine):
+def test_voxtral_realtime_forward(audio_assets, tokenizer, engine):
     audio_config = tokenizer.instruct_tokenizer.tokenizer.audio
 
     def from_file(file_path: str):
@@ -219,7 +219,7 @@ class RealTimeAudioInput:
 
 @pytest.mark.asyncio
 @pytest.mark.skip(reason="Voxtral streaming is not yet public")
-async def test_voxtral_streaming_generator(audio_assets, tokenizer, async_engine):
+async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine):
     sampling_params = SamplingParams(temperature=0.0, max_tokens=1)
 
     output_tokens_list = []
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 3be300e2c..c2760d37f 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -989,7 +989,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         # disable this temporarily until we support HF format
         is_available_online=False,
     ),
-    "VoxtralStreamingGeneration": _HfExamplesInfo(
+    "VoxtralRealtimeGeneration": _HfExamplesInfo(
"", # disable this temporarily until we support HF format is_available_online=False, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 23d3e0b41..f38914a7c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -462,7 +462,7 @@ _MULTIMODAL_MODELS = { ), "UltravoxModel": ("ultravox", "UltravoxModel"), "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 - "VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501 + "VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"), # noqa: E501 # [Encoder-decoder] "NemotronParseForConditionalGeneration": ( "nemotron_parse", diff --git a/vllm/model_executor/models/voxtral_streaming.py b/vllm/model_executor/models/voxtral_realtime.py similarity index 95% rename from vllm/model_executor/models/voxtral_streaming.py rename to vllm/model_executor/models/voxtral_realtime.py index 5ff561f73..cbd3f73ae 100644 --- a/vllm/model_executor/models/voxtral_streaming.py +++ b/vllm/model_executor/models/voxtral_realtime.py @@ -50,7 +50,7 @@ logger = init_logger(__name__) _PRE_ALLOCATE_BUFFER_SIZE_IN_S = 30 -class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor): +class VoxtralRealtimeMultiModalProcessor(VoxtralMultiModalProcessor): def __init__( self, info: _I, @@ -58,7 +58,7 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor): *, cache: BaseMultiModalProcessorCache | None = None, ) -> None: - # streaming can't make use of a cache yet + # realtime can't make use of a cache yet super().__init__(info, dummy_inputs, cache=None) def _maybe_apply_prompt_updates( @@ -72,10 +72,10 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor): # there are no placeholder audio tokens for streaming # so we need to build the place placeholder positions manually - # in streaming there is always only one audio input + # in realtime there is always only one audio input audios = mm_kwargs.get("audio", []) assert len(audios) == 1, ( - f"Expected only one audio input for streaming, got {mm_kwargs=}" + f"Expected only one audio input for realtime, got {mm_kwargs=}" ) tokenizer = self.info.get_tokenizer() audio_config = tokenizer.instruct.audio_encoder.audio_config @@ -211,12 +211,12 @@ class VoxtralRealtimeBuffer: @MULTIMODAL_REGISTRY.register_processor( - VoxtralStreamingMultiModalProcessor, + VoxtralRealtimeMultiModalProcessor, info=VoxtralProcessingInfo, dummy_inputs=VoxtralDummyInputsBuilder, ) @support_torch_compile -class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealtime): +class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtime): requires_raw_input_tokens = True def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -224,10 +224,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti assert ( not vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs() - ), ( - "Voxtral streaming doesn't support full cudagraphs yet. " - "Please use PIECEWISE." - ) + ), "Voxtral realtime doesn't support full cudagraphs yet. Please use PIECEWISE." 
 
         self.time_embedding: TimeEmbedding = TimeEmbedding(
             dim=self.config.text_config.hidden_size
@@ -302,11 +299,11 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
         handle_oov_mm_token: bool = True,
     ) -> torch.Tensor:
         """Pass post-conv embeddings directly as input"""
-        # for streaming we simply flatten the multimodal embeddings
+        # for realtime we simply flatten the multimodal embeddings
         # to be in tensor format, we treat the input ids later
         assert multimodal_embeddings is not None
         assert len(multimodal_embeddings) > 0, (
-            "For streaming you must provide a multimodal_embedding at every step."
+            "For realtime you must provide a multimodal_embedding at every step."
         )
         mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
         return mm_embeds_flat
@@ -370,7 +367,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
 
         audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)
         assert audio_inputs is not None, (
-            "For streaming you must provide an audio input at every step."
+            "For realtime you must provide an audio input at every step."
         )
 
     def _truncate_left(
diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py
index 0bf282c8e..d81042aa9 100644
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@@ -204,7 +204,7 @@ def _remap_mistral_audio_args(config: dict) -> dict:
         raise NotImplementedError(f"Unsupported: {_maybe_sliding_window=}")
 
     architecture = (
-        "VoxtralStreamingGeneration"
+        "VoxtralRealtimeGeneration"
         if encoder_args.get("causal")
         else "VoxtralForConditionalGeneration"
     )
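For reviewers: the docs hunk above replaces the inline Python WebSocket example with links to the full example clients. The client flow those examples implement can be sketched as below. This is a minimal, untested sketch assembled from the event names documented in the docs table (`input_audio_buffer.append`, `input_audio_buffer.commit`, `transcription.delta`, `transcription.done`, `error`); the endpoint URL, the 4096-byte chunk size, the `"final": True` commit flag, and the `audio.raw` filename are all carried over from the removed inline example rather than re-verified, and the third-party `websockets` package is assumed.

```python
import asyncio
import base64
import json

import websockets  # third-party: pip install websockets


async def realtime_transcribe(path: str = "audio.raw") -> None:
    # Endpoint assumed from the removed docs example.
    uri = "ws://localhost:8000/v1/realtime"

    async with websockets.connect(uri) as ws:
        # The server opens the conversation with a session.created event.
        session = json.loads(await ws.recv())
        print(f"Session: {session}")

        # Stream raw PCM16 (16 kHz, mono) audio as base64-encoded chunks.
        with open(path, "rb") as f:
            while chunk := f.read(4096):
                await ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": base64.b64encode(chunk).decode(),
                }))

        # Commit the buffer and signal that no more audio will follow.
        await ws.send(json.dumps({
            "type": "input_audio_buffer.commit",
            "final": True,
        }))

        # Print incremental text until the final transcription arrives.
        while True:
            event = json.loads(await ws.recv())
            if event["type"] == "transcription.delta":
                print(event["delta"], end="", flush=True)
            elif event["type"] == "transcription.done":
                print(f"\nFinal: {event['text']}")
                break
            elif event["type"] == "error":
                raise RuntimeError(event)


if __name__ == "__main__":
    asyncio.run(realtime_transcribe())
```

Per the docs, the buffer accepts only base64-encoded PCM16 mono audio at a 16 kHz sample rate, so any other source format would need resampling client-side before `input_audio_buffer.append`.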