[Streaming -> Realtime] Rename all voxtral related classes, fn, files (#33415)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
committed by
GitHub
parent
6c64c41b4a
commit
15e0bb9c42
@@ -610,54 +610,10 @@ Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono chan
|
||||
| `transcription.done` | Final transcription with usage stats |
|
||||
| `error` | Error notification with message and optional code |
|
||||
|
||||
#### Python WebSocket Example
|
||||
#### Example Clients
|
||||
|
||||
??? code
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import websockets
|
||||
|
||||
async def realtime_transcribe():
|
||||
uri = "ws://localhost:8000/v1/realtime"
|
||||
|
||||
async with websockets.connect(uri) as ws:
|
||||
# Wait for session.created
|
||||
response = await ws.recv()
|
||||
print(f"Session: {response}")
|
||||
|
||||
# Commit buffer
|
||||
await ws.send(json.dumps({
|
||||
"type": "input_audio_buffer.commit"
|
||||
}))
|
||||
|
||||
# Send audio chunks (example with file)
|
||||
with open("audio.raw", "rb") as f:
|
||||
while chunk := f.read(4096):
|
||||
await ws.send(json.dumps({
|
||||
"type": "input_audio_buffer.append",
|
||||
"audio": base64.b64encode(chunk).decode()
|
||||
}))
|
||||
|
||||
# Signal all audio is sent
|
||||
await ws.send(json.dumps({
|
||||
"type": "input_audio_buffer.commit",
|
||||
"final": True,
|
||||
}))
|
||||
|
||||
# Receive transcription
|
||||
while True:
|
||||
response = json.loads(await ws.recv())
|
||||
if response["type"] == "transcription.delta":
|
||||
print(response["delta"], end="", flush=True)
|
||||
elif response["type"] == "transcription.done":
|
||||
print(f"\nFinal: {response['text']}")
|
||||
break
|
||||
|
||||
asyncio.run(realtime_transcribe())
|
||||
```
|
||||
- [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_client.py) - Upload and transcribe an audio file
|
||||
- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription
|
||||
|
||||
### Tokenizer API
|
||||
|
||||
|
||||
@@ -74,7 +74,7 @@ def async_engine() -> AsyncLLM:
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Voxtral streaming is not yet public")
|
||||
def test_voxtral_streaming_forward(audio_assets, tokenizer, engine):
|
||||
def test_voxtral_realtime_forward(audio_assets, tokenizer, engine):
|
||||
audio_config = tokenizer.instruct_tokenizer.tokenizer.audio
|
||||
|
||||
def from_file(file_path: str):
|
||||
@@ -219,7 +219,7 @@ class RealTimeAudioInput:
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skip(reason="Voxtral streaming is not yet public")
|
||||
async def test_voxtral_streaming_generator(audio_assets, tokenizer, async_engine):
|
||||
async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine):
|
||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=1)
|
||||
|
||||
output_tokens_list = []
|
||||
@@ -989,7 +989,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
# disable this temporarily until we support HF format
|
||||
is_available_online=False,
|
||||
),
|
||||
"VoxtralStreamingGeneration": _HfExamplesInfo(
|
||||
"VoxtralRealtimeGeneration": _HfExamplesInfo(
|
||||
"<place-holder>",
|
||||
# disable this temporarily until we support HF format
|
||||
is_available_online=False,
|
||||
|
||||
@@ -462,7 +462,7 @@ _MULTIMODAL_MODELS = {
|
||||
),
|
||||
"UltravoxModel": ("ultravox", "UltravoxModel"),
|
||||
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
|
||||
"VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501
|
||||
"VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"), # noqa: E501
|
||||
# [Encoder-decoder]
|
||||
"NemotronParseForConditionalGeneration": (
|
||||
"nemotron_parse",
|
||||
|
||||
@@ -50,7 +50,7 @@ logger = init_logger(__name__)
|
||||
_PRE_ALLOCATE_BUFFER_SIZE_IN_S = 30
|
||||
|
||||
|
||||
class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
|
||||
class VoxtralRealtimeMultiModalProcessor(VoxtralMultiModalProcessor):
|
||||
def __init__(
|
||||
self,
|
||||
info: _I,
|
||||
@@ -58,7 +58,7 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
|
||||
*,
|
||||
cache: BaseMultiModalProcessorCache | None = None,
|
||||
) -> None:
|
||||
# streaming can't make use of a cache yet
|
||||
# realtime can't make use of a cache yet
|
||||
super().__init__(info, dummy_inputs, cache=None)
|
||||
|
||||
def _maybe_apply_prompt_updates(
|
||||
@@ -72,10 +72,10 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
|
||||
# there are no placeholder audio tokens for realtime
|
||||
# so we need to build the placeholder positions manually
|
||||
|
||||
# in streaming there is always only one audio input
|
||||
# in realtime there is always only one audio input
|
||||
audios = mm_kwargs.get("audio", [])
|
||||
assert len(audios) == 1, (
|
||||
f"Expected only one audio input for streaming, got {mm_kwargs=}"
|
||||
f"Expected only one audio input for realtime, got {mm_kwargs=}"
|
||||
)
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
audio_config = tokenizer.instruct.audio_encoder.audio_config
|
||||
@@ -211,12 +211,12 @@ class VoxtralRealtimeBuffer:
|
||||
|
||||
|
||||
@MULTIMODAL_REGISTRY.register_processor(
|
||||
VoxtralStreamingMultiModalProcessor,
|
||||
VoxtralRealtimeMultiModalProcessor,
|
||||
info=VoxtralProcessingInfo,
|
||||
dummy_inputs=VoxtralDummyInputsBuilder,
|
||||
)
|
||||
@support_torch_compile
|
||||
class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealtime):
|
||||
class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtime):
|
||||
requires_raw_input_tokens = True
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
@@ -224,10 +224,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
|
||||
|
||||
assert (
|
||||
not vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
||||
), (
|
||||
"Voxtral streaming doesn't support full cudagraphs yet. "
|
||||
"Please use PIECEWISE."
|
||||
)
|
||||
), "Voxtral realtime doesn't support full cudagraphs yet. Please use PIECEWISE."
|
||||
|
||||
self.time_embedding: TimeEmbedding = TimeEmbedding(
|
||||
dim=self.config.text_config.hidden_size
|
||||
@@ -302,11 +299,11 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
|
||||
handle_oov_mm_token: bool = True,
|
||||
) -> torch.Tensor:
|
||||
"""Pass post-conv embeddings directly as input"""
|
||||
# for streaming we simply flatten the multimodal embeddings
|
||||
# for realtime we simply flatten the multimodal embeddings
|
||||
# to be in tensor format, we treat the input ids later
|
||||
assert multimodal_embeddings is not None
|
||||
assert len(multimodal_embeddings) > 0, (
|
||||
"For streaming you must provide a multimodal_embedding at every step."
|
||||
"For realtime you must provide a multimodal_embedding at every step."
|
||||
)
|
||||
mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
|
||||
return mm_embeds_flat
|
||||
@@ -370,7 +367,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
|
||||
audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)
|
||||
|
||||
assert audio_inputs is not None, (
|
||||
"For streaming you must provide an audio input at every step."
|
||||
"For realtime you must provide an audio input at every step."
|
||||
)
|
||||
|
||||
def _truncate_left(
|
||||
@@ -204,7 +204,7 @@ def _remap_mistral_audio_args(config: dict) -> dict:
|
||||
raise NotImplementedError(f"Unsupported: {_maybe_sliding_window=}")
|
||||
|
||||
architecture = (
|
||||
"VoxtralStreamingGeneration"
|
||||
"VoxtralRealtimeGeneration"
|
||||
if encoder_args.get("causal")
|
||||
else "VoxtralForConditionalGeneration"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user