diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index acaf5f8d4..5ce3ea4de 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,12 +17,12 @@ from argparse import Namespace from collections.abc import AsyncIterator, Awaitable from contextlib import asynccontextmanager from http import HTTPStatus -from typing import Annotated, Any +from typing import Any import model_hosting_container_standards.sagemaker as sagemaker_standards import pydantic import uvloop -from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request +from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse @@ -49,10 +49,6 @@ from vllm.entrypoints.openai.engine.protocol import ( CompletionResponse, ErrorInfo, ErrorResponse, - TranscriptionRequest, - TranscriptionResponseVariant, - TranslationRequest, - TranslationResponseVariant, ) from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.orca_metrics import metrics_header @@ -62,7 +58,7 @@ from vllm.entrypoints.openai.serving_models import ( BaseModelPath, OpenAIServingModels, ) -from vllm.entrypoints.openai.serving_transcription import ( +from vllm.entrypoints.openai.translations.serving import ( OpenAIServingTranscription, OpenAIServingTranslation, ) @@ -239,10 +235,6 @@ def models(request: Request) -> OpenAIServingModels: return request.app.state.openai_serving_models -def responses(request: Request) -> OpenAIServingResponses | None: - return request.app.state.openai_serving_responses - - def messages(request: Request) -> AnthropicServingMessages: return request.app.state.anthropic_serving_messages @@ -259,22 +251,10 @@ def tokenization(request: Request) -> OpenAIServingTokenization: return request.app.state.openai_serving_tokenization -def transcription(request: Request) -> OpenAIServingTranscription: - return request.app.state.openai_serving_transcription - - -def translation(request: Request) -> OpenAIServingTranslation: - return request.app.state.openai_serving_translation - - def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client -def generate_tokens(request: Request) -> ServingTokens | None: - return request.app.state.serving_tokens - - @router.get("/load") async def get_server_load_metrics(request: Request): # This endpoint returns the current server load metrics. @@ -410,84 +390,6 @@ async def create_completion(request: CompletionRequest, raw_request: Request): return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post( - "/v1/audio/transcriptions", - responses={ - HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.UNPROCESSABLE_ENTITY.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - }, -) -@with_cancellation -@load_aware_call -async def create_transcriptions( - raw_request: Request, request: Annotated[TranscriptionRequest, Form()] -): - handler = transcription(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Transcriptions API" - ) - - audio_data = await request.file.read() - try: - generator = await handler.create_transcription(audio_data, request, raw_request) - except Exception as e: - raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) - ) from e - - if isinstance(generator, ErrorResponse): - return JSONResponse( - content=generator.model_dump(), status_code=generator.error.code - ) - - elif isinstance(generator, TranscriptionResponseVariant): - return JSONResponse(content=generator.model_dump()) - - return StreamingResponse(content=generator, media_type="text/event-stream") - - -@router.post( - "/v1/audio/translations", - responses={ - HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.UNPROCESSABLE_ENTITY.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - }, -) -@with_cancellation -@load_aware_call -async def create_translations( - request: Annotated[TranslationRequest, Form()], raw_request: Request -): - handler = translation(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Translations API" - ) - - audio_data = await request.file.read() - try: - generator = await handler.create_translation(audio_data, request, raw_request) - except Exception as e: - raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) - ) from e - - if isinstance(generator, ErrorResponse): - return JSONResponse( - content=generator.model_dump(), status_code=generator.error.code - ) - - elif isinstance(generator, TranslationResponseVariant): - return JSONResponse(content=generator.model_dump()) - - return StreamingResponse(content=generator, media_type="text/event-stream") - - def load_log_config(log_config_file: str | None) -> dict | None: if not log_config_file: return None @@ -741,6 +643,11 @@ def build_app(args: Namespace) -> FastAPI: ) register_responses_api_router(app) + from vllm.entrypoints.openai.translations.api_router import ( + attach_router as register_translations_api_router, + ) + + register_translations_api_router(app) from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes register_sagemaker_routes(router) diff --git a/vllm/entrypoints/openai/engine/protocol.py b/vllm/entrypoints/openai/engine/protocol.py index 7166b54cb..2fe76e8db 100644 --- a/vllm/entrypoints/openai/engine/protocol.py +++ b/vllm/entrypoints/openai/engine/protocol.py @@ -5,12 +5,10 @@ # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py import json import time -from http import HTTPStatus from typing import Annotated, Any, ClassVar, Literal, TypeAlias import regex as re import torch -from fastapi import HTTPException, UploadFile from pydantic import ( BaseModel, ConfigDict, @@ -702,522 +700,6 @@ class DeltaMessage(OpenAIBaseModel): return self -class TranscriptionResponseStreamChoice(OpenAIBaseModel): - delta: DeltaMessage - finish_reason: str | None = None - stop_reason: int | str | None = None - - -class TranscriptionStreamResponse(OpenAIBaseModel): - id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}") - object: Literal["transcription.chunk"] = "transcription.chunk" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: list[TranscriptionResponseStreamChoice] - usage: UsageInfo | None = Field(default=None) - - -## Protocols for Audio -AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"] - - -class TranscriptionRequest(OpenAIBaseModel): - # Ordered by official OpenAI API documentation - # https://platform.openai.com/docs/api-reference/audio/createTranscription - - file: UploadFile - """ - The audio file object (not file name) to transcribe, in one of these - formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. - """ - - model: str | None = None - """ID of the model to use. - """ - - language: str | None = None - """The language of the input audio. - - Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format - will improve accuracy and latency. - """ - - prompt: str = Field(default="") - """An optional text to guide the model's style or continue a previous audio - segment. - - The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. - """ - - response_format: AudioResponseFormat = Field(default="json") - """ - The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. - """ - - ## TODO (varun) : Support if set to 0, certain thresholds are met !! - - timestamp_granularities: list[Literal["word", "segment"]] = Field( - alias="timestamp_granularities[]", default=[] - ) - """The timestamp granularities to populate for this transcription. - - `response_format` must be set `verbose_json` to use timestamp granularities. - Either or both of these options are supported: `word`, or `segment`. Note: - There is no additional latency for segment timestamps, but generating word - timestamps incurs additional latency. - """ - - stream: bool | None = False - """When set, it will enable output to be streamed in a similar fashion - as the Chat Completion endpoint. - """ - # --8<-- [start:transcription-extra-params] - # Flattened stream option to simplify form data. - stream_include_usage: bool | None = False - stream_continuous_usage_stats: bool | None = False - - vllm_xargs: dict[str, str | int | float] | None = Field( - default=None, - description=( - "Additional request parameters with string or " - "numeric values, used by custom extensions." - ), - ) - # --8<-- [end:transcription-extra-params] - - to_language: str | None = None - """The language of the output audio we transcribe to. - - Please note that this is not currently used by supported models at this - time, but it is a placeholder for future use, matching translation api. - """ - - # --8<-- [start:transcription-sampling-params] - temperature: float = Field(default=0.0) - """The sampling temperature, between 0 and 1. - - Higher values like 0.8 will make the output more random, while lower values - like 0.2 will make it more focused / deterministic. If set to 0, the model - will use [log probability](https://en.wikipedia.org/wiki/Log_probability) - to automatically increase the temperature until certain thresholds are hit. - """ - - top_p: float | None = None - """Enables nucleus (top-p) sampling, where tokens are selected from the - smallest possible set whose cumulative probability exceeds `p`. - """ - - top_k: int | None = None - """Limits sampling to the `k` most probable tokens at each step.""" - - min_p: float | None = None - """Filters out tokens with a probability lower than `min_p`, ensuring a - minimum likelihood threshold during sampling. - """ - - seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - """The seed to use for sampling.""" - - frequency_penalty: float | None = 0.0 - """The frequency penalty to use for sampling.""" - - repetition_penalty: float | None = None - """The repetition penalty to use for sampling.""" - - presence_penalty: float | None = 0.0 - """The presence penalty to use for sampling.""" - - max_completion_tokens: int | None = None - """The maximum number of tokens to generate.""" - # --8<-- [end:transcription-sampling-params] - - # Default sampling parameters for transcription requests. - _DEFAULT_SAMPLING_PARAMS: dict = { - "repetition_penalty": 1.0, - "temperature": 1.0, - "top_p": 1.0, - "top_k": 0, - "min_p": 0.0, - } - - def to_sampling_params( - self, default_max_tokens: int, default_sampling_params: dict | None = None - ) -> SamplingParams: - max_tokens = default_max_tokens - - if default_sampling_params is None: - default_sampling_params = {} - - # Default parameters - if (temperature := self.temperature) is None: - temperature = default_sampling_params.get( - "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] - ) - if (top_p := self.top_p) is None: - top_p = default_sampling_params.get( - "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"] - ) - if (top_k := self.top_k) is None: - top_k = default_sampling_params.get( - "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] - ) - if (min_p := self.min_p) is None: - min_p = default_sampling_params.get( - "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"] - ) - - if (repetition_penalty := self.repetition_penalty) is None: - repetition_penalty = default_sampling_params.get( - "repetition_penalty", - self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"], - ) - - return SamplingParams.from_optional( - temperature=temperature, - max_tokens=max_tokens, - seed=self.seed, - top_p=top_p, - top_k=top_k, - min_p=min_p, - frequency_penalty=self.frequency_penalty, - repetition_penalty=repetition_penalty, - presence_penalty=self.presence_penalty, - output_kind=RequestOutputKind.DELTA - if self.stream - else RequestOutputKind.FINAL_ONLY, - extra_args=self.vllm_xargs, - skip_clone=True, # Created fresh per request, safe to skip clone - ) - - @model_validator(mode="before") - @classmethod - def validate_transcription_request(cls, data): - if isinstance(data.get("file"), str): - raise HTTPException( - status_code=HTTPStatus.UNPROCESSABLE_ENTITY, - detail="Expected 'file' to be a file-like object, not 'str'.", - ) - - stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] - stream = data.get("stream", False) - if any(bool(data.get(so, False)) for so in stream_opts) and not stream: - # Find which specific stream option was set - invalid_param = next( - (so for so in stream_opts if data.get(so, False)), - "stream_include_usage", - ) - raise VLLMValidationError( - "Stream options can only be defined when `stream=True`.", - parameter=invalid_param, - ) - - return data - - -# Transcription response objects -class TranscriptionUsageAudio(OpenAIBaseModel): - type: Literal["duration"] = "duration" - seconds: int - - -class TranscriptionResponse(OpenAIBaseModel): - text: str - """The transcribed text.""" - usage: TranscriptionUsageAudio - - -class TranscriptionWord(OpenAIBaseModel): - end: float - """End time of the word in seconds.""" - - start: float - """Start time of the word in seconds.""" - - word: str - """The text content of the word.""" - - -class TranscriptionSegment(OpenAIBaseModel): - id: int - """Unique identifier of the segment.""" - - avg_logprob: float | None = None - """Average logprob of the segment. - - If the value is lower than -1, consider the logprobs failed. - """ - - compression_ratio: float | None = None - """Compression ratio of the segment. - - If the value is greater than 2.4, consider the compression failed. - """ - - end: float - """End time of the segment in seconds.""" - - no_speech_prob: float | None = None - """Probability of no speech in the segment. - - If the value is higher than 1.0 and the `avg_logprob` is below -1, consider - this segment silent. - """ - - seek: int - """Seek offset of the segment.""" - - start: float - """Start time of the segment in seconds.""" - - temperature: float - """Temperature parameter used for generating the segment.""" - - text: str - """Text content of the segment.""" - - tokens: list[int] - """Array of token IDs for the text content.""" - - -class TranscriptionResponseVerbose(OpenAIBaseModel): - duration: str - """The duration of the input audio.""" - - language: str - """The language of the input audio.""" - - text: str - """The transcribed text.""" - - segments: list[TranscriptionSegment] | None = None - """Segments of the transcribed text and their corresponding details.""" - - words: list[TranscriptionWord] | None = None - """Extracted words and their corresponding timestamps.""" - - -TranscriptionResponseVariant: TypeAlias = ( - TranscriptionResponse | TranscriptionResponseVerbose -) - - -class TranslationResponseStreamChoice(OpenAIBaseModel): - delta: DeltaMessage - finish_reason: str | None = None - stop_reason: int | str | None = None - - -class TranslationStreamResponse(OpenAIBaseModel): - id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}") - object: Literal["translation.chunk"] = "translation.chunk" - created: int = Field(default_factory=lambda: int(time.time())) - model: str - choices: list[TranslationResponseStreamChoice] - usage: UsageInfo | None = Field(default=None) - - -class TranslationRequest(OpenAIBaseModel): - # Ordered by official OpenAI API documentation - # https://platform.openai.com/docs/api-reference/audio/createTranslation - - file: UploadFile - """ - The audio file object (not file name) to translate, in one of these - formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. - """ - - model: str | None = None - """ID of the model to use. - """ - - prompt: str = Field(default="") - """An optional text to guide the model's style or continue a previous audio - segment. - - The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. - """ - - response_format: AudioResponseFormat = Field(default="json") - """ - The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. - """ - - # TODO support additional sampling parameters - # --8<-- [start:translation-sampling-params] - seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - """The seed to use for sampling.""" - - temperature: float = Field(default=0.0) - """The sampling temperature, between 0 and 1. - - Higher values like 0.8 will make the output more random, while lower values - like 0.2 will make it more focused / deterministic. If set to 0, the model - will use [log probability](https://en.wikipedia.org/wiki/Log_probability) - to automatically increase the temperature until certain thresholds are hit. - """ - # --8<-- [end:translation-sampling-params] - - # --8<-- [start:translation-extra-params] - language: str | None = None - """The language of the input audio we translate from. - - Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format - will improve accuracy. - """ - - to_language: str | None = None - """The language of the input audio we translate to. - - Please note that this is not supported by all models, refer to the specific - model documentation for more details. - For instance, Whisper only supports `to_language=en`. - """ - - stream: bool | None = False - """Custom field not present in the original OpenAI definition. When set, - it will enable output to be streamed in a similar fashion as the Chat - Completion endpoint. - """ - # Flattened stream option to simplify form data. - stream_include_usage: bool | None = False - stream_continuous_usage_stats: bool | None = False - - max_completion_tokens: int | None = None - """The maximum number of tokens to generate.""" - # --8<-- [end:translation-extra-params] - - # Default sampling parameters for translation requests. - _DEFAULT_SAMPLING_PARAMS: dict = { - "temperature": 0, - } - - def to_sampling_params( - self, default_max_tokens: int, default_sampling_params: dict | None = None - ) -> SamplingParams: - max_tokens = default_max_tokens - - if default_sampling_params is None: - default_sampling_params = {} - # Default parameters - if (temperature := self.temperature) is None: - temperature = default_sampling_params.get( - "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] - ) - - return SamplingParams.from_optional( - temperature=temperature, - max_tokens=max_tokens, - seed=self.seed, - output_kind=RequestOutputKind.DELTA - if self.stream - else RequestOutputKind.FINAL_ONLY, - skip_clone=True, # Created fresh per request, safe to skip clone - ) - - @model_validator(mode="before") - @classmethod - def validate_stream_options(cls, data): - stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] - stream = data.get("stream", False) - if any(bool(data.get(so, False)) for so in stream_opts) and not stream: - # Find which specific stream option was set - invalid_param = next( - (so for so in stream_opts if data.get(so, False)), - "stream_include_usage", - ) - raise VLLMValidationError( - "Stream options can only be defined when `stream=True`.", - parameter=invalid_param, - ) - - return data - - -# Translation response objects -class TranslationResponse(OpenAIBaseModel): - text: str - """The translated text.""" - - -class TranslationWord(OpenAIBaseModel): - end: float - """End time of the word in seconds.""" - - start: float - """Start time of the word in seconds.""" - - word: str - """The text content of the word.""" - - -class TranslationSegment(OpenAIBaseModel): - id: int - """Unique identifier of the segment.""" - - avg_logprob: float | None = None - """Average logprob of the segment. - - If the value is lower than -1, consider the logprobs failed. - """ - - compression_ratio: float | None = None - """Compression ratio of the segment. - - If the value is greater than 2.4, consider the compression failed. - """ - - end: float - """End time of the segment in seconds.""" - - no_speech_prob: float | None = None - """Probability of no speech in the segment. - - If the value is higher than 1.0 and the `avg_logprob` is below -1, consider - this segment silent. - """ - - seek: int - """Seek offset of the segment.""" - - start: float - """Start time of the segment in seconds.""" - - temperature: float - """Temperature parameter used for generating the segment.""" - - text: str - """Text content of the segment.""" - - tokens: list[int] - """Array of token IDs for the text content.""" - - -class TranslationResponseVerbose(OpenAIBaseModel): - duration: str - """The duration of the input audio.""" - - language: str - """The language of the input audio.""" - - text: str - """The translated text.""" - - segments: list[TranslationSegment] | None = None - """Segments of the translated text and their corresponding details.""" - - words: list[TranslationWord] | None = None - """Extracted words and their corresponding timestamps.""" - - -TranslationResponseVariant: TypeAlias = TranslationResponse | TranslationResponseVerbose - - ####### Tokens IN <> Tokens OUT ####### class GenerateRequest(BaseModel): request_id: str = Field( diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 045e2d068..3d711b18b 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -50,9 +50,6 @@ from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, FunctionCall, FunctionDefinition, - TranscriptionRequest, - TranscriptionResponse, - TranslationRequest, VLLMValidationError, ) from vllm.entrypoints.openai.responses.protocol import ( @@ -60,6 +57,11 @@ from vllm.entrypoints.openai.responses.protocol import ( ResponsesRequest, ) from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.openai.translations.protocol import ( + TranscriptionRequest, + TranscriptionResponse, + TranslationRequest, +) from vllm.entrypoints.pooling.classify.protocol import ( ClassificationChatRequest, ClassificationCompletionRequest, diff --git a/vllm/entrypoints/openai/translations/__init__.py b/vllm/entrypoints/openai/translations/__init__.py new file mode 100644 index 000000000..208f01a7c --- /dev/null +++ b/vllm/entrypoints/openai/translations/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/entrypoints/openai/translations/api_router.py b/vllm/entrypoints/openai/translations/api_router.py new file mode 100644 index 000000000..86dc3dc66 --- /dev/null +++ b/vllm/entrypoints/openai/translations/api_router.py @@ -0,0 +1,122 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from http import HTTPStatus +from typing import Annotated + +from fastapi import APIRouter, FastAPI, Form, HTTPException, Request +from fastapi.responses import JSONResponse, StreamingResponse + +from vllm.entrypoints.openai.engine.protocol import ErrorResponse +from vllm.entrypoints.openai.translations.protocol import ( + TranscriptionRequest, + TranscriptionResponseVariant, + TranslationRequest, + TranslationResponseVariant, +) +from vllm.entrypoints.openai.translations.serving import ( + OpenAIServingTranscription, + OpenAIServingTranslation, +) +from vllm.entrypoints.utils import ( + load_aware_call, + with_cancellation, +) +from vllm.logger import init_logger + +logger = init_logger(__name__) + +router = APIRouter() + + +def transcription(request: Request) -> OpenAIServingTranscription: + return request.app.state.openai_serving_transcription + + +def translation(request: Request) -> OpenAIServingTranslation: + return request.app.state.openai_serving_translation + + +@router.post( + "/v1/audio/transcriptions", + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.UNPROCESSABLE_ENTITY.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def create_transcriptions( + raw_request: Request, request: Annotated[TranscriptionRequest, Form()] +): + handler = transcription(raw_request) + if handler is None: + base_server = raw_request.app.state.openai_serving_tokenization + return base_server.create_error_response( + message="The model does not support Transcriptions API" + ) + + audio_data = await request.file.read() + try: + generator = await handler.create_transcription(audio_data, request, raw_request) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + + elif isinstance(generator, TranscriptionResponseVariant): + return JSONResponse(content=generator.model_dump()) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + +@router.post( + "/v1/audio/translations", + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.UNPROCESSABLE_ENTITY.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def create_translations( + request: Annotated[TranslationRequest, Form()], raw_request: Request +): + handler = translation(raw_request) + if handler is None: + base_server = raw_request.app.state.openai_serving_tokenization + return base_server.create_error_response( + message="The model does not support Translations API" + ) + + audio_data = await request.file.read() + try: + generator = await handler.create_translation(audio_data, request, raw_request) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + + elif isinstance(generator, TranslationResponseVariant): + return JSONResponse(content=generator.model_dump()) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + +def attach_router(app: FastAPI): + app.include_router(router) diff --git a/vllm/entrypoints/openai/translations/protocol.py b/vllm/entrypoints/openai/translations/protocol.py new file mode 100644 index 000000000..a3c36c107 --- /dev/null +++ b/vllm/entrypoints/openai/translations/protocol.py @@ -0,0 +1,545 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import time +from http import HTTPStatus +from typing import Literal, TypeAlias + +import torch +from fastapi import HTTPException, UploadFile +from pydantic import ( + Field, + model_validator, +) + +from vllm.entrypoints.openai.engine.protocol import ( + DeltaMessage, + OpenAIBaseModel, + UsageInfo, +) +from vllm.exceptions import VLLMValidationError +from vllm.logger import init_logger +from vllm.sampling_params import ( + RequestOutputKind, + SamplingParams, +) +from vllm.utils import random_uuid + +logger = init_logger(__name__) +_LONG_INFO = torch.iinfo(torch.long) + + +class TranscriptionResponseStreamChoice(OpenAIBaseModel): + delta: DeltaMessage + finish_reason: str | None = None + stop_reason: int | str | None = None + + +class TranscriptionStreamResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"trsc-{random_uuid()}") + object: Literal["transcription.chunk"] = "transcription.chunk" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[TranscriptionResponseStreamChoice] + usage: UsageInfo | None = Field(default=None) + + +## Protocols for Audio +AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"] + + +class TranscriptionRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/audio/createTranscription + + file: UploadFile + """ + The audio file object (not file name) to transcribe, in one of these + formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + """ + + model: str | None = None + """ID of the model to use. + """ + + language: str | None = None + """The language of the input audio. + + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format + will improve accuracy and latency. + """ + + prompt: str = Field(default="") + """An optional text to guide the model's style or continue a previous audio + segment. + + The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + should match the audio language. + """ + + response_format: AudioResponseFormat = Field(default="json") + """ + The format of the output, in one of these options: `json`, `text`, `srt`, + `verbose_json`, or `vtt`. + """ + + ## TODO (varun) : Support if set to 0, certain thresholds are met !! + + timestamp_granularities: list[Literal["word", "segment"]] = Field( + alias="timestamp_granularities[]", default=[] + ) + """The timestamp granularities to populate for this transcription. + + `response_format` must be set `verbose_json` to use timestamp granularities. + Either or both of these options are supported: `word`, or `segment`. Note: + There is no additional latency for segment timestamps, but generating word + timestamps incurs additional latency. + """ + + stream: bool | None = False + """When set, it will enable output to be streamed in a similar fashion + as the Chat Completion endpoint. + """ + # --8<-- [start:transcription-extra-params] + # Flattened stream option to simplify form data. + stream_include_usage: bool | None = False + stream_continuous_usage_stats: bool | None = False + + vllm_xargs: dict[str, str | int | float] | None = Field( + default=None, + description=( + "Additional request parameters with string or " + "numeric values, used by custom extensions." + ), + ) + # --8<-- [end:transcription-extra-params] + + to_language: str | None = None + """The language of the output audio we transcribe to. + + Please note that this is not currently used by supported models at this + time, but it is a placeholder for future use, matching translation api. + """ + + # --8<-- [start:transcription-sampling-params] + temperature: float = Field(default=0.0) + """The sampling temperature, between 0 and 1. + + Higher values like 0.8 will make the output more random, while lower values + like 0.2 will make it more focused / deterministic. If set to 0, the model + will use [log probability](https://en.wikipedia.org/wiki/Log_probability) + to automatically increase the temperature until certain thresholds are hit. + """ + + top_p: float | None = None + """Enables nucleus (top-p) sampling, where tokens are selected from the + smallest possible set whose cumulative probability exceeds `p`. + """ + + top_k: int | None = None + """Limits sampling to the `k` most probable tokens at each step.""" + + min_p: float | None = None + """Filters out tokens with a probability lower than `min_p`, ensuring a + minimum likelihood threshold during sampling. + """ + + seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) + """The seed to use for sampling.""" + + frequency_penalty: float | None = 0.0 + """The frequency penalty to use for sampling.""" + + repetition_penalty: float | None = None + """The repetition penalty to use for sampling.""" + + presence_penalty: float | None = 0.0 + """The presence penalty to use for sampling.""" + + max_completion_tokens: int | None = None + """The maximum number of tokens to generate.""" + # --8<-- [end:transcription-sampling-params] + + # Default sampling parameters for transcription requests. + _DEFAULT_SAMPLING_PARAMS: dict = { + "repetition_penalty": 1.0, + "temperature": 1.0, + "top_p": 1.0, + "top_k": 0, + "min_p": 0.0, + } + + def to_sampling_params( + self, default_max_tokens: int, default_sampling_params: dict | None = None + ) -> SamplingParams: + max_tokens = default_max_tokens + + if default_sampling_params is None: + default_sampling_params = {} + + # Default parameters + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] + ) + if (top_p := self.top_p) is None: + top_p = default_sampling_params.get( + "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"] + ) + if (top_k := self.top_k) is None: + top_k = default_sampling_params.get( + "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] + ) + if (min_p := self.min_p) is None: + min_p = default_sampling_params.get( + "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"] + ) + + if (repetition_penalty := self.repetition_penalty) is None: + repetition_penalty = default_sampling_params.get( + "repetition_penalty", + self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"], + ) + + return SamplingParams.from_optional( + temperature=temperature, + max_tokens=max_tokens, + seed=self.seed, + top_p=top_p, + top_k=top_k, + min_p=min_p, + frequency_penalty=self.frequency_penalty, + repetition_penalty=repetition_penalty, + presence_penalty=self.presence_penalty, + output_kind=RequestOutputKind.DELTA + if self.stream + else RequestOutputKind.FINAL_ONLY, + extra_args=self.vllm_xargs, + skip_clone=True, # Created fresh per request, safe to skip clone + ) + + @model_validator(mode="before") + @classmethod + def validate_transcription_request(cls, data): + if isinstance(data.get("file"), str): + raise HTTPException( + status_code=HTTPStatus.UNPROCESSABLE_ENTITY, + detail="Expected 'file' to be a file-like object, not 'str'.", + ) + + stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] + stream = data.get("stream", False) + if any(bool(data.get(so, False)) for so in stream_opts) and not stream: + # Find which specific stream option was set + invalid_param = next( + (so for so in stream_opts if data.get(so, False)), + "stream_include_usage", + ) + raise VLLMValidationError( + "Stream options can only be defined when `stream=True`.", + parameter=invalid_param, + ) + + return data + + +# Transcription response objects +class TranscriptionUsageAudio(OpenAIBaseModel): + type: Literal["duration"] = "duration" + seconds: int + + +class TranscriptionResponse(OpenAIBaseModel): + text: str + """The transcribed text.""" + usage: TranscriptionUsageAudio + + +class TranscriptionWord(OpenAIBaseModel): + end: float + """End time of the word in seconds.""" + + start: float + """Start time of the word in seconds.""" + + word: str + """The text content of the word.""" + + +class TranscriptionSegment(OpenAIBaseModel): + id: int + """Unique identifier of the segment.""" + + avg_logprob: float | None = None + """Average logprob of the segment. + + If the value is lower than -1, consider the logprobs failed. + """ + + compression_ratio: float | None = None + """Compression ratio of the segment. + + If the value is greater than 2.4, consider the compression failed. + """ + + end: float + """End time of the segment in seconds.""" + + no_speech_prob: float | None = None + """Probability of no speech in the segment. + + If the value is higher than 1.0 and the `avg_logprob` is below -1, consider + this segment silent. + """ + + seek: int + """Seek offset of the segment.""" + + start: float + """Start time of the segment in seconds.""" + + temperature: float + """Temperature parameter used for generating the segment.""" + + text: str + """Text content of the segment.""" + + tokens: list[int] + """Array of token IDs for the text content.""" + + +class TranscriptionResponseVerbose(OpenAIBaseModel): + duration: str + """The duration of the input audio.""" + + language: str + """The language of the input audio.""" + + text: str + """The transcribed text.""" + + segments: list[TranscriptionSegment] | None = None + """Segments of the transcribed text and their corresponding details.""" + + words: list[TranscriptionWord] | None = None + """Extracted words and their corresponding timestamps.""" + + +TranscriptionResponseVariant: TypeAlias = ( + TranscriptionResponse | TranscriptionResponseVerbose +) + + +class TranslationResponseStreamChoice(OpenAIBaseModel): + delta: DeltaMessage + finish_reason: str | None = None + stop_reason: int | str | None = None + + +class TranslationStreamResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}") + object: Literal["translation.chunk"] = "translation.chunk" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[TranslationResponseStreamChoice] + usage: UsageInfo | None = Field(default=None) + + +class TranslationRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/audio/createTranslation + + file: UploadFile + """ + The audio file object (not file name) to translate, in one of these + formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + """ + + model: str | None = None + """ID of the model to use. + """ + + prompt: str = Field(default="") + """An optional text to guide the model's style or continue a previous audio + segment. + + The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + should match the audio language. + """ + + response_format: AudioResponseFormat = Field(default="json") + """ + The format of the output, in one of these options: `json`, `text`, `srt`, + `verbose_json`, or `vtt`. + """ + + # TODO support additional sampling parameters + # --8<-- [start:translation-sampling-params] + seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) + """The seed to use for sampling.""" + + temperature: float = Field(default=0.0) + """The sampling temperature, between 0 and 1. + + Higher values like 0.8 will make the output more random, while lower values + like 0.2 will make it more focused / deterministic. If set to 0, the model + will use [log probability](https://en.wikipedia.org/wiki/Log_probability) + to automatically increase the temperature until certain thresholds are hit. + """ + # --8<-- [end:translation-sampling-params] + + # --8<-- [start:translation-extra-params] + language: str | None = None + """The language of the input audio we translate from. + + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format + will improve accuracy. + """ + + to_language: str | None = None + """The language of the input audio we translate to. + + Please note that this is not supported by all models, refer to the specific + model documentation for more details. + For instance, Whisper only supports `to_language=en`. + """ + + stream: bool | None = False + """Custom field not present in the original OpenAI definition. When set, + it will enable output to be streamed in a similar fashion as the Chat + Completion endpoint. + """ + # Flattened stream option to simplify form data. + stream_include_usage: bool | None = False + stream_continuous_usage_stats: bool | None = False + + max_completion_tokens: int | None = None + """The maximum number of tokens to generate.""" + # --8<-- [end:translation-extra-params] + + # Default sampling parameters for translation requests. + _DEFAULT_SAMPLING_PARAMS: dict = { + "temperature": 0, + } + + def to_sampling_params( + self, default_max_tokens: int, default_sampling_params: dict | None = None + ) -> SamplingParams: + max_tokens = default_max_tokens + + if default_sampling_params is None: + default_sampling_params = {} + # Default parameters + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] + ) + + return SamplingParams.from_optional( + temperature=temperature, + max_tokens=max_tokens, + seed=self.seed, + output_kind=RequestOutputKind.DELTA + if self.stream + else RequestOutputKind.FINAL_ONLY, + skip_clone=True, # Created fresh per request, safe to skip clone + ) + + @model_validator(mode="before") + @classmethod + def validate_stream_options(cls, data): + stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] + stream = data.get("stream", False) + if any(bool(data.get(so, False)) for so in stream_opts) and not stream: + # Find which specific stream option was set + invalid_param = next( + (so for so in stream_opts if data.get(so, False)), + "stream_include_usage", + ) + raise VLLMValidationError( + "Stream options can only be defined when `stream=True`.", + parameter=invalid_param, + ) + + return data + + +# Translation response objects +class TranslationResponse(OpenAIBaseModel): + text: str + """The translated text.""" + + +class TranslationWord(OpenAIBaseModel): + end: float + """End time of the word in seconds.""" + + start: float + """Start time of the word in seconds.""" + + word: str + """The text content of the word.""" + + +class TranslationSegment(OpenAIBaseModel): + id: int + """Unique identifier of the segment.""" + + avg_logprob: float | None = None + """Average logprob of the segment. + + If the value is lower than -1, consider the logprobs failed. + """ + + compression_ratio: float | None = None + """Compression ratio of the segment. + + If the value is greater than 2.4, consider the compression failed. + """ + + end: float + """End time of the segment in seconds.""" + + no_speech_prob: float | None = None + """Probability of no speech in the segment. + + If the value is higher than 1.0 and the `avg_logprob` is below -1, consider + this segment silent. + """ + + seek: int + """Seek offset of the segment.""" + + start: float + """Start time of the segment in seconds.""" + + temperature: float + """Temperature parameter used for generating the segment.""" + + text: str + """Text content of the segment.""" + + tokens: list[int] + """Array of token IDs for the text content.""" + + +class TranslationResponseVerbose(OpenAIBaseModel): + duration: str + """The duration of the input audio.""" + + language: str + """The language of the input audio.""" + + text: str + """The translated text.""" + + segments: list[TranslationSegment] | None = None + """Segments of the translated text and their corresponding details.""" + + words: list[TranslationWord] | None = None + """Extracted words and their corresponding timestamps.""" + + +TranslationResponseVariant: TypeAlias = TranslationResponse | TranslationResponseVerbose diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/translations/serving.py similarity index 97% rename from vllm/entrypoints/openai/serving_transcription.py rename to vllm/entrypoints/openai/translations/serving.py index ef576153d..6cbd4c265 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/translations/serving.py @@ -9,6 +9,9 @@ from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, RequestResponseMetadata, +) +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.openai.translations.protocol import ( TranscriptionRequest, TranscriptionResponse, TranscriptionResponseStreamChoice, @@ -20,8 +23,7 @@ from vllm.entrypoints.openai.engine.protocol import ( TranslationResponseVerbose, TranslationStreamResponse, ) -from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.entrypoints.openai.speech_to_text import OpenAISpeechToText +from vllm.entrypoints.openai.translations.speech_to_text import OpenAISpeechToText from vllm.logger import init_logger from vllm.outputs import RequestOutput diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/translations/speech_to_text.py similarity index 99% rename from vllm/entrypoints/openai/speech_to_text.py rename to vllm/entrypoints/openai/translations/speech_to_text.py index 6fdfea2f4..08cfd8c29 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/translations/speech_to_text.py @@ -19,6 +19,11 @@ from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ErrorResponse, RequestResponseMetadata, + UsageInfo, +) +from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.entrypoints.openai.translations.protocol import ( TranscriptionResponse, TranscriptionResponseStreamChoice, TranscriptionResponseVerbose, @@ -29,11 +34,8 @@ from vllm.entrypoints.openai.engine.protocol import ( TranslationResponseVerbose, TranslationSegment, TranslationStreamResponse, - UsageInfo, - VLLMValidationError, ) -from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest -from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.exceptions import VLLMValidationError from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.models import SupportsTranscription, supports_transcription