diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index a7b1c18a6..bdd1784c8 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -610,54 +610,10 @@ Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono chan
 | `transcription.done` | Final transcription with usage stats |
 | `error` | Error notification with message and optional code |
 
-#### Python WebSocket Example
+#### Example Clients
 
-??? code
-
-    ```python
-    import asyncio
-    import base64
-    import json
-    import websockets
-
-    async def realtime_transcribe():
-        uri = "ws://localhost:8000/v1/realtime"
-
-        async with websockets.connect(uri) as ws:
-            # Wait for session.created
-            response = await ws.recv()
-            print(f"Session: {response}")
-
-            # Commit buffer
-            await ws.send(json.dumps({
-                "type": "input_audio_buffer.commit"
-            }))
-
-            # Send audio chunks (example with file)
-            with open("audio.raw", "rb") as f:
-                while chunk := f.read(4096):
-                    await ws.send(json.dumps({
-                        "type": "input_audio_buffer.append",
-                        "audio": base64.b64encode(chunk).decode()
-                    }))
-
-            # Signal all audio is sent
-            await ws.send(json.dumps({
-                "type": "input_audio_buffer.commit",
-                "final": True,
-            }))
-
-            # Receive transcription
-            while True:
-                response = json.loads(await ws.recv())
-                if response["type"] == "transcription.delta":
-                    print(response["delta"], end="", flush=True)
-                elif response["type"] == "transcription.done":
-                    print(f"\nFinal: {response['text']}")
-                    break
-
-    asyncio.run(realtime_transcribe())
-    ```
+- [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_client.py) - Upload and transcribe an audio file
+- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription
 
 ### Tokenizer API
diff --git a/tests/models/multimodal/generation/test_voxtral_streaming.py b/tests/models/multimodal/generation/test_voxtral_realtime.py
similarity index 98%
rename from tests/models/multimodal/generation/test_voxtral_streaming.py
rename to tests/models/multimodal/generation/test_voxtral_realtime.py
index 41b9a6830..d7906003a 100644
--- a/tests/models/multimodal/generation/test_voxtral_streaming.py
+++ b/tests/models/multimodal/generation/test_voxtral_realtime.py
@@ -74,7 +74,7 @@ def async_engine() -> AsyncLLM:
 
 
 @pytest.mark.skip(reason="Voxtral streaming is not yet public")
-def test_voxtral_streaming_forward(audio_assets, tokenizer, engine):
+def test_voxtral_realtime_forward(audio_assets, tokenizer, engine):
     audio_config = tokenizer.instruct_tokenizer.tokenizer.audio
 
     def from_file(file_path: str):
@@ -219,7 +219,7 @@ class RealTimeAudioInput:
 
 @pytest.mark.asyncio
 @pytest.mark.skip(reason="Voxtral streaming is not yet public")
-async def test_voxtral_streaming_generator(audio_assets, tokenizer, async_engine):
+async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine):
     sampling_params = SamplingParams(temperature=0.0, max_tokens=1)
 
     output_tokens_list = []
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 3be300e2c..c2760d37f 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -989,7 +989,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         # disable this temporarily until we support HF format
         is_available_online=False,
     ),
-    "VoxtralStreamingGeneration": _HfExamplesInfo(
+    "VoxtralRealtimeGeneration": _HfExamplesInfo(
"", # disable this temporarily until we support HF format is_available_online=False, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 23d3e0b41..f38914a7c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -462,7 +462,7 @@ _MULTIMODAL_MODELS = { ), "UltravoxModel": ("ultravox", "UltravoxModel"), "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 - "VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501 + "VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"), # noqa: E501 # [Encoder-decoder] "NemotronParseForConditionalGeneration": ( "nemotron_parse", diff --git a/vllm/model_executor/models/voxtral_streaming.py b/vllm/model_executor/models/voxtral_realtime.py similarity index 95% rename from vllm/model_executor/models/voxtral_streaming.py rename to vllm/model_executor/models/voxtral_realtime.py index 5ff561f73..cbd3f73ae 100644 --- a/vllm/model_executor/models/voxtral_streaming.py +++ b/vllm/model_executor/models/voxtral_realtime.py @@ -50,7 +50,7 @@ logger = init_logger(__name__) _PRE_ALLOCATE_BUFFER_SIZE_IN_S = 30 -class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor): +class VoxtralRealtimeMultiModalProcessor(VoxtralMultiModalProcessor): def __init__( self, info: _I, @@ -58,7 +58,7 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor): *, cache: BaseMultiModalProcessorCache | None = None, ) -> None: - # streaming can't make use of a cache yet + # realtime can't make use of a cache yet super().__init__(info, dummy_inputs, cache=None) def _maybe_apply_prompt_updates( @@ -72,10 +72,10 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor): # there are no placeholder audio tokens for streaming # so we need to build the place placeholder positions manually - # in streaming there is always only one audio input + # in realtime there is always only one audio input audios = mm_kwargs.get("audio", []) assert len(audios) == 1, ( - f"Expected only one audio input for streaming, got {mm_kwargs=}" + f"Expected only one audio input for realtime, got {mm_kwargs=}" ) tokenizer = self.info.get_tokenizer() audio_config = tokenizer.instruct.audio_encoder.audio_config @@ -211,12 +211,12 @@ class VoxtralRealtimeBuffer: @MULTIMODAL_REGISTRY.register_processor( - VoxtralStreamingMultiModalProcessor, + VoxtralRealtimeMultiModalProcessor, info=VoxtralProcessingInfo, dummy_inputs=VoxtralDummyInputsBuilder, ) @support_torch_compile -class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealtime): +class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtime): requires_raw_input_tokens = True def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -224,10 +224,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti assert ( not vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs() - ), ( - "Voxtral streaming doesn't support full cudagraphs yet. " - "Please use PIECEWISE." - ) + ), "Voxtral realtime doesn't support full cudagraphs yet. Please use PIECEWISE." 
 
         self.time_embedding: TimeEmbedding = TimeEmbedding(
             dim=self.config.text_config.hidden_size
@@ -302,11 +299,11 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
         handle_oov_mm_token: bool = True,
     ) -> torch.Tensor:
         """Pass post-conv embeddings directly as input"""
-        # for streaming we simply flatten the multimodal embeddings
+        # for realtime we simply flatten the multimodal embeddings
         # to be in tensor format, we treat the input ids later
         assert multimodal_embeddings is not None
         assert len(multimodal_embeddings) > 0, (
-            "For streaming you must provide a multimodal_embedding at every step."
+            "For realtime you must provide a multimodal_embedding at every step."
         )
         mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
         return mm_embeds_flat
@@ -370,7 +367,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
 
         audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)
         assert audio_inputs is not None, (
-            "For streaming you must provide an audio input at every step."
+            "For realtime you must provide an audio input at every step."
         )
 
     def _truncate_left(
diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py
index 0bf282c8e..d81042aa9 100644
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@@ -204,7 +204,7 @@ def _remap_mistral_audio_args(config: dict) -> dict:
         raise NotImplementedError(f"Unsupported: {_maybe_sliding_window=}")
 
     architecture = (
-        "VoxtralStreamingGeneration"
+        "VoxtralRealtimeGeneration"
         if encoder_args.get("causal")
         else "VoxtralForConditionalGeneration"
     )
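For reviewers: the docs hunk above replaces the inline Python WebSocket example with links to the full example clients. The client flow those examples implement can be sketched as below. This is a minimal, untested sketch assembled from the event names documented in the docs table (`input_audio_buffer.append`, `input_audio_buffer.commit`, `transcription.delta`, `transcription.done`, `error`); the endpoint URL, the 4096-byte chunk size, the `"final": True` commit flag, and the `audio.raw` filename are all carried over from the removed inline example rather than re-verified, and the third-party `websockets` package is assumed.

```python
import asyncio
import base64
import json

import websockets  # third-party: pip install websockets


async def realtime_transcribe(path: str = "audio.raw") -> None:
    # Endpoint assumed from the removed docs example.
    uri = "ws://localhost:8000/v1/realtime"

    async with websockets.connect(uri) as ws:
        # The server opens the conversation with a session.created event.
        session = json.loads(await ws.recv())
        print(f"Session: {session}")

        # Stream raw PCM16 (16 kHz, mono) audio as base64-encoded chunks.
        with open(path, "rb") as f:
            while chunk := f.read(4096):
                await ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": base64.b64encode(chunk).decode(),
                }))

        # Commit the buffer and signal that no more audio will follow.
        await ws.send(json.dumps({
            "type": "input_audio_buffer.commit",
            "final": True,
        }))

        # Print incremental text until the final transcription arrives.
        while True:
            event = json.loads(await ws.recv())
            if event["type"] == "transcription.delta":
                print(event["delta"], end="", flush=True)
            elif event["type"] == "transcription.done":
                print(f"\nFinal: {event['text']}")
                break
            elif event["type"] == "error":
                raise RuntimeError(event)


if __name__ == "__main__":
    asyncio.run(realtime_transcribe())
```

Per the docs, the buffer accepts only base64-encoded PCM16 mono audio at a 16 kHz sample rate, so any other source format would need resampling client-side before `input_audio_buffer.append`.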