[Streaming -> Realtime] Rename all voxtral related classes, fn, files (#33415)

Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Author: Patrick von Platen
Date: 2026-01-31 05:49:00 +01:00
Committed by: GitHub
parent 6c64c41b4a
commit 15e0bb9c42
6 changed files with 18 additions and 65 deletions
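The rename is mechanical for downstream code: only the class names and the module stub change. A minimal before/after import sketch, assuming the files live under vllm.model_executor.models (the full package path is not shown in this diff):

    # before this commit (assumed path):
    # from vllm.model_executor.models.voxtral_streaming import VoxtralStreamingGeneration

    # after this commit (assumed path):
    from vllm.model_executor.models.voxtral_realtime import VoxtralRealtimeGeneration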


@@ -462,7 +462,7 @@ _MULTIMODAL_MODELS = {
     ),
     "UltravoxModel": ("ultravox", "UltravoxModel"),
     "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),  # noqa: E501
-    "VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"),  # noqa: E501
+    "VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"),  # noqa: E501
     # [Encoder-decoder]
     "NemotronParseForConditionalGeneration": (
         "nemotron_parse",


@@ -50,7 +50,7 @@ logger = init_logger(__name__)
 _PRE_ALLOCATE_BUFFER_SIZE_IN_S = 30


-class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
+class VoxtralRealtimeMultiModalProcessor(VoxtralMultiModalProcessor):
     def __init__(
         self,
         info: _I,
@@ -58,7 +58,7 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
         *,
         cache: BaseMultiModalProcessorCache | None = None,
     ) -> None:
-        # streaming can't make use of a cache yet
+        # realtime can't make use of a cache yet
         super().__init__(info, dummy_inputs, cache=None)

     def _maybe_apply_prompt_updates(
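The __init__ override above exists only to drop the processor cache: realtime audio arrives as a fresh chunk at every step, so cached processing results would never be reused. The pattern reduced to a self-contained sketch (both class names here are stand-ins, not vLLM classes):

    class _BaseProcessor:  # stand-in for VoxtralMultiModalProcessor
        def __init__(self, info, dummy_inputs, *, cache=None) -> None:
            self.info = info
            self.dummy_inputs = dummy_inputs
            self.cache = cache

    class _RealtimeProcessor(_BaseProcessor):
        def __init__(self, info, dummy_inputs, *, cache=None) -> None:
            # Deliberately ignore the caller's cache and pass cache=None upward,
            # mirroring super().__init__(info, dummy_inputs, cache=None) in the diff.
            super().__init__(info, dummy_inputs, cache=None)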
@@ -72,10 +72,10 @@ class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
         # there are no placeholder audio tokens for streaming
         # so we need to build the place placeholder positions manually
-        # in streaming there is always only one audio input
+        # in realtime there is always only one audio input
         audios = mm_kwargs.get("audio", [])
         assert len(audios) == 1, (
-            f"Expected only one audio input for streaming, got {mm_kwargs=}"
+            f"Expected only one audio input for realtime, got {mm_kwargs=}"
         )
         tokenizer = self.info.get_tokenizer()
         audio_config = tokenizer.instruct.audio_encoder.audio_config
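With no audio placeholder tokens in the realtime prompt, the placeholder positions have to be computed from the single audio clip itself. A rough sketch of the idea with made-up names and frame math; the real code derives its parameters from the tokenizer's audio_config shown above:

    def single_audio_placeholder(num_samples: int, samples_per_token: int, start: int = 0) -> range:
        # Hypothetical helper: there is exactly one audio input, so the
        # placeholder span is one contiguous run of positions covering the clip.
        num_audio_tokens = num_samples // samples_per_token
        return range(start, start + num_audio_tokens)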
@@ -211,12 +211,12 @@ class VoxtralRealtimeBuffer:
 @MULTIMODAL_REGISTRY.register_processor(
-    VoxtralStreamingMultiModalProcessor,
+    VoxtralRealtimeMultiModalProcessor,
     info=VoxtralProcessingInfo,
     dummy_inputs=VoxtralDummyInputsBuilder,
 )
 @support_torch_compile
-class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealtime):
+class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtime):
     requires_raw_input_tokens = True

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -224,10 +224,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
         assert (
             not vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs()
-        ), (
-            "Voxtral streaming doesn't support full cudagraphs yet. "
-            "Please use PIECEWISE."
-        )
+        ), "Voxtral realtime doesn't support full cudagraphs yet. Please use PIECEWISE."

         self.time_embedding: TimeEmbedding = TimeEmbedding(
             dim=self.config.text_config.hidden_size
@@ -302,11 +299,11 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
         handle_oov_mm_token: bool = True,
     ) -> torch.Tensor:
         """Pass post-conv embeddings directly as input"""
-        # for streaming we simply flatten the multimodal embeddings
+        # for realtime we simply flatten the multimodal embeddings
         # to be in tensor format, we treat the input ids later
         assert multimodal_embeddings is not None
         assert len(multimodal_embeddings) > 0, (
-            "For streaming you must provide a multimodal_embedding at every step."
+            "For realtime you must provide a multimodal_embedding at every step."
         )
         mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
         return mm_embeds_flat
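_flatten_embeddings merges the per-item embeddings into one contiguous (num_tokens, hidden_size) tensor for the language model. A stand-in with the same shape behavior, assuming each entry is a tensor whose last dimension is hidden_size (the real helper in vLLM is more general):

    import torch

    def flatten_embeddings(multimodal_embeddings: list[torch.Tensor]) -> torch.Tensor:
        # Collapse any leading item/batch dimensions, then concatenate along the
        # token axis so the result is a single (num_tokens, hidden_size) tensor.
        return torch.cat([emb.flatten(0, -2) for emb in multimodal_embeddings], dim=0)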
@@ -370,7 +367,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration, SupportsRealti
         audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)
         assert audio_inputs is not None, (
-            "For streaming you must provide an audio input at every step."
+            "For realtime you must provide an audio input at every step."
         )

     def _truncate_left(