[Frontend] Add /v1/audio/transcriptions OpenAI API endpoint (#12909)
This commit is contained in:
@@ -8,9 +8,10 @@ from argparse import Namespace
|
||||
from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union
|
||||
|
||||
import torch
|
||||
from fastapi import UploadFile
|
||||
from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
|
||||
ValidationInfo, field_validator, model_validator)
|
||||
from typing_extensions import Annotated
|
||||
from typing_extensions import Annotated, TypeAlias
|
||||
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
from vllm.logger import init_logger
|
||||
@@ -1426,3 +1427,163 @@ class LoadLoraAdapterRequest(BaseModel):
|
||||
class UnloadLoraAdapterRequest(BaseModel):
    """Request body for unloading a previously registered LoRA adapter."""
    # Name the adapter was registered under at load time.
    lora_name: str
    # NOTE(review): presumably resolved from `lora_name` when omitted —
    # confirm against the handler that consumes this request.
    lora_int_id: Optional[int] = Field(default=None)
|
||||
|
||||
## Protocols for Audio

# Output formats accepted by the transcription endpoint; mirrors the
# `response_format` values of the OpenAI audio transcription API.
AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json",
                                         "vtt"]
||||
class TranscriptionRequest(OpenAIBaseModel):
    """Speech-to-text request body for ``/v1/audio/transcriptions``.

    Field names and ordering follow the official OpenAI API documentation:
    https://platform.openai.com/docs/api-reference/audio/createTranscription
    """

    file: UploadFile
    """
    The audio file object (not file name) to transcribe, in one of these
    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
    """

    model: str
    """ID of the model to use.
    """

    language: Optional[str] = None
    """The language of the input audio.

    Supplying the input language in
    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
    will improve accuracy and latency.
    """

    prompt: str = Field(default="")
    """An optional text to guide the model's style or continue a previous audio
    segment.

    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
    should match the audio language.
    """

    response_format: AudioResponseFormat = Field(default="json")
    """
    The format of the output, in one of these options: `json`, `text`, `srt`,
    `verbose_json`, or `vtt`.
    """

    ## TODO (varun) : Support if set to 0, certain thresholds are met !!
    temperature: float = Field(default=0.0)
    """The sampling temperature, between 0 and 1.

    Higher values like 0.8 will make the output more random, while lower values
    like 0.2 will make it more focused / deterministic. If set to 0, the model
    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
    to automatically increase the temperature until certain thresholds are hit.
    """

    # `default_factory=list` instead of a shared `[]` literal: behaviorally
    # identical under pydantic but the idiomatic (and linter-clean) spelling.
    # The alias matches OpenAI's form-encoded `timestamp_granularities[]` key.
    timestamp_granularities: List[Literal["word", "segment"]] = Field(
        alias="timestamp_granularities[]", default_factory=list)
    """The timestamp granularities to populate for this transcription.

    `response_format` must be set `verbose_json` to use timestamp granularities.
    Either or both of these options are supported: `word`, or `segment`. Note:
    There is no additional latency for segment timestamps, but generating word
    timestamps incurs additional latency.
    """

    # Default sampling parameters for transcription requests.
    # ClassVar keeps pydantic from interpreting this annotated,
    # underscore-prefixed attribute as a private model attribute.
    _DEFAULT_SAMPLING_PARAMS: ClassVar[dict] = {
        "temperature": 0,
    }

    def to_sampling_params(
            self,
            default_max_tokens: int,
            default_sampling_params: Optional[dict] = None) -> SamplingParams:
        """Build engine ``SamplingParams`` for this transcription request.

        Args:
            default_max_tokens: Token budget to use as ``max_tokens``.
            default_sampling_params: Optional server-side defaults consulted
                when a request value is absent.

        Returns:
            A ``SamplingParams`` carrying the request's temperature and the
            provided token budget.
        """
        # TODO(#9845): remove max_tokens when field is removed from OpenAI API
        max_tokens = default_max_tokens

        if default_sampling_params is None:
            default_sampling_params = {}
        # Defensive: `temperature` is a non-Optional float with a default, so
        # this branch is normally dead; it guards constructions that bypass
        # validation (e.g. `model_construct`).
        if (temperature := self.temperature) is None:
            temperature = default_sampling_params.get(
                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"])

        return SamplingParams.from_optional(temperature=temperature,
                                            max_tokens=max_tokens)
|
||||
|
||||
# Transcription response objects
class TranscriptionResponse(OpenAIBaseModel):
    """Response body when `response_format` is `json` (plain text only)."""

    text: str
    """The transcribed text."""
|
||||
class TranscriptionWord(OpenAIBaseModel):
    """A single word with timestamps, emitted in verbose responses when the
    `word` timestamp granularity is requested."""

    end: float
    """End time of the word in seconds."""

    start: float
    """Start time of the word in seconds."""

    word: str
    """The text content of the word."""
||||
|
||||
class TranscriptionSegment(OpenAIBaseModel):
    """A transcribed segment with timing and decoding diagnostics, emitted in
    verbose responses when the `segment` timestamp granularity is requested."""

    id: int
    """Unique identifier of the segment."""

    avg_logprob: float
    """Average logprob of the segment.

    If the value is lower than -1, consider the logprobs failed.
    """

    compression_ratio: float
    """Compression ratio of the segment.

    If the value is greater than 2.4, consider the compression failed.
    """

    end: float
    """End time of the segment in seconds."""

    no_speech_prob: float
    """Probability of no speech in the segment.

    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
    this segment silent.
    """

    seek: int
    """Seek offset of the segment."""

    start: float
    """Start time of the segment in seconds."""

    temperature: float
    """Temperature parameter used for generating the segment."""

    text: str
    """Text content of the segment."""

    tokens: List[int]
    """Array of token IDs for the text content."""
|
||||
|
||||
class TranscriptionResponseVerbose(OpenAIBaseModel):
    """Response body when `response_format` is `verbose_json`; includes
    per-segment and/or per-word details per `timestamp_granularities`."""

    # NOTE(review): typed `str` to match the OpenAI schema — not a float of
    # seconds; confirm against the producer before changing.
    duration: str
    """The duration of the input audio."""

    language: str
    """The language of the input audio."""

    text: str
    """The transcribed text."""

    segments: Optional[List[TranscriptionSegment]] = None
    """Segments of the transcribed text and their corresponding details."""

    words: Optional[List[TranscriptionWord]] = None
    """Extracted words and their corresponding timestamps."""
|
||||
|
||||
Reference in New Issue
Block a user