[Realtime API] Adds minimal realtime API based on websockets (#33187)

Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
Patrick von Platen
2026-01-30 11:41:29 +01:00
committed by GitHub
parent 1a7894dbdf
commit 10152d2194
21 changed files with 1316 additions and 48 deletions

View File

@@ -7,7 +7,6 @@ import time
import warnings
from collections.abc import AsyncGenerator, Iterable, Mapping
from copy import copy
from dataclasses import dataclass
from typing import Any
import torch
@@ -19,6 +18,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.utils import _validate_truncation_size
from vllm.inputs import PromptType
from vllm.inputs.data import StreamingInput
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
@@ -53,18 +53,6 @@ from vllm.v1.metrics.stats import IterationStats
logger = init_logger(__name__)
@dataclass
class StreamingInput:
    """One input item of a streaming generation request.

    Instances are meant to be passed to generate() through an async
    generator, which is how multi-turn streaming sessions supply their
    inputs.
    """

    # Prompt carried by this streaming input.
    prompt: PromptType
    # Optional sampling parameters for this input; defaults to None.
    sampling_params: SamplingParams | None = None
class InputStreamError(Exception):
"""Wrapper for errors from the input stream generator.